Pyspark transformation: Column names to rows - apache-spark-sql

I'm working with pyspark and want to transform this spark data frame:
+----+-------------------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+
| TS | ABC[0].VAL.VAL[0].UNT[0].sth1 | ABC[0].VAL.VAL[0].UNT[1].sth1 | ABC[0].VAL.VAL[1].UNT[0].sth1 | ABC[0].VAL.VAL[1].UNT[1].sth1 | ABC[0].VAL.VAL[0].UNT[0].sth2 | ABC[0].VAL.VAL[0].UNT[1].sth2 | ABC[0].VAL.VAL[1].UNT[0].sth2 | ABC[0].VAL.VAL[1].UNT[1].sth2 |
+----+-------------------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+
| 1 | some_value | some_value | some_value | some_value | some_value | some_value | some_value | some_value |
+----+-------------------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+
to that:
+----+-----+-----+------------+------------+
| TS | VAL | UNT | sth1 | sth2 |
+----+-----+-----+------------+------------+
| 1 | 0 | 0 | some_value | some_value |
| 1 | 0 | 1 | some_value | some_value |
| 1 | 1 | 0 | some_value | some_value |
| 1 | 1 | 1 | some_value | some_value |
+----+-----+-----+------------+------------+
Any idea how I can do that using some fancy transformation?
Edit:
So this is how I could solve it:
from pyspark.sql.functions import array, col, explode, struct, lit
import re
df = sc.parallelize([(1, 0.0, 0.6, 0.1, 0.4, 0.7, 0.2, 0.4, 0.1), (2, 0.6, 0.7, 0.1, 0.5, 0.8, 0.3, 0.1, 0.3)]).toDF(["TS", "ABC[0].VAL.VAL[0].UNT[0].sth1", "ABC[0].VAL.VAL[0].UNT[1].sth1", "ABC[0].VAL.VAL[1].UNT[0].sth1", "ABC[0].VAL.VAL[1].UNT[1].sth1", "ABC[0].VAL.VAL[0].UNT[0].sth2", "ABC[0].VAL.VAL[0].UNT[1].sth2", "ABC[0].VAL.VAL[1].UNT[0].sth2", "ABC[0].VAL.VAL[1].UNT[1].sth2"])
# Replace the dots in the column names with underscores.
# NOTE(review): presumably because Spark treats "." in a column reference as
# nested-field access — confirm.
newcols = list(map(lambda x: x.replace(".", "_"), df.columns))
df = df.toDF(*newcols)
# Every column except TS is a value column to unpivot.
# `dtypes` is not used again in this snippet.
cols, dtypes = zip(*((c, t) for (c, t) in df.dtypes if c not in ["TS"]))
# Build one struct per value column: the VAL and UNT indices and the trailing
# parameter name are parsed out of the column name and attached as literals
# next to the column's value; explode() then emits one row per struct.
# The indices come from group(1) and are therefore strings, and \d{1,2} only
# matches indices of one or two digits.
kvs = explode(array([struct(
lit( re.search(re.compile(r"VAL\[(\d{1,2})\]"),c).group(1) ).alias("VAL"),
lit( re.search(re.compile(r"UNT\[(\d{1,2})\]"),c).group(1) ).alias("UNT"),
lit( re.search(re.compile(r"([^_]+$)"),c).group(1) ).alias("Parameter"),
col(c).alias("data")) for c in cols
])).alias("kvs")
# Flatten the struct, then pivot the parameter names back into columns per
# (TS, VAL, UNT), summing the data values.
# NOTE(review): `display` is not defined in this snippet — presumably the
# notebook environment's built-in; confirm.
display(df.select(["TS"] + [kvs]).select(["TS"] + ["kvs.VAL", "kvs.UNT", "kvs.Parameter", "kvs.data"]).groupBy("TS","VAL","UNT").pivot("Parameter").sum("data").orderBy("TS","VAL","UNT"))
Output:
+----+-----+-----+------+------+
| TS | VAL | UNT | sth1 | sth2 |
+----+-----+-----+------+------+
| 1 | 0 | 0 | 0 | 0.7 |
| 1 | 0 | 1 | 0.6 | 0.2 |
| 1 | 1 | 0 | 0.1 | 0.4 |
| 1 | 1 | 1 | 0.4 | 0.1 |
| 2 | 0 | 0 | 0.6 | 0.8 |
| 2 | 0 | 1 | 0.7 | 0.3 |
| 2 | 1 | 0 | 0.1 | 0.1 |
| 2 | 1 | 1 | 0.5 | 0.3 |
+----+-----+-----+------+------+
How can it be done better?

So this is how I could solve it:
from pyspark.sql.functions import array, col, explode, struct, lit
import re
df = sc.parallelize([(1, 0.0, 0.6, 0.1, 0.4, 0.7, 0.2, 0.4, 0.1), (2, 0.6, 0.7, 0.1, 0.5, 0.8, 0.3, 0.1, 0.3)]).toDF(["TS", "ABC[0].VAL.VAL[0].UNT[0].sth1", "ABC[0].VAL.VAL[0].UNT[1].sth1", "ABC[0].VAL.VAL[1].UNT[0].sth1", "ABC[0].VAL.VAL[1].UNT[1].sth1", "ABC[0].VAL.VAL[0].UNT[0].sth2", "ABC[0].VAL.VAL[0].UNT[1].sth2", "ABC[0].VAL.VAL[1].UNT[0].sth2", "ABC[0].VAL.VAL[1].UNT[1].sth2"])
# Replace the dots in the column names with underscores.
# NOTE(review): presumably because Spark treats "." in a column reference as
# nested-field access — confirm.
newcols = list(map(lambda x: x.replace(".", "_"), df.columns))
df = df.toDF(*newcols)
# Every column except TS is a value column to unpivot.
# `dtypes` is not used again in this snippet.
cols, dtypes = zip(*((c, t) for (c, t) in df.dtypes if c not in ["TS"]))
# Build one struct per value column: the VAL and UNT indices and the trailing
# parameter name are parsed out of the column name and attached as literals
# next to the column's value; explode() then emits one row per struct.
# The indices come from group(1) and are therefore strings, and \d{1,2} only
# matches indices of one or two digits.
kvs = explode(array([struct(
lit( re.search(re.compile(r"VAL\[(\d{1,2})\]"),c).group(1) ).alias("VAL"),
lit( re.search(re.compile(r"UNT\[(\d{1,2})\]"),c).group(1) ).alias("UNT"),
lit( re.search(re.compile(r"([^_]+$)"),c).group(1) ).alias("Parameter"),
col(c).alias("data")) for c in cols
])).alias("kvs")
# Flatten the struct, then pivot the parameter names back into columns per
# (TS, VAL, UNT), summing the data values.
# NOTE(review): `display` is not defined in this snippet — presumably the
# notebook environment's built-in; confirm.
display(df.select(["TS"] + [kvs]).select(["TS"] + ["kvs.VAL", "kvs.UNT", "kvs.Parameter", "kvs.data"]).groupBy("TS","VAL","UNT").pivot("Parameter").sum("data").orderBy("TS","VAL","UNT"))
Output:
+----+-----+-----+------+------+
| TS | VAL | UNT | sth1 | sth2 |
+----+-----+-----+------+------+
| 1 | 0 | 0 | 0 | 0.7 |
| 1 | 0 | 1 | 0.6 | 0.2 |
| 1 | 1 | 0 | 0.1 | 0.4 |
| 1 | 1 | 1 | 0.4 | 0.1 |
| 2 | 0 | 0 | 0.6 | 0.8 |
| 2 | 0 | 1 | 0.7 | 0.3 |
| 2 | 1 | 0 | 0.1 | 0.1 |
| 2 | 1 | 1 | 0.5 | 0.3 |
+----+-----+-----+------+------+
Now at least tell me how it can be done better...

Your approach is good (upvoted). The only thing I would really do is extract the essential parts from the column names in one regex search. I’d also remove a superfluous select in favor of groupBy, but that’s not as important.
import re
from pyspark.sql.functions import lit, explode, array, struct, col
df = sc.parallelize([(1, 0.0, 0.6, 0.1, 0.4, 0.7, 0.2, 0.4, 0.1), (2, 0.6, 0.7, 0.1, 0.5, 0.8, 0.3, 0.1, 0.3)]).toDF(
["TS", "ABC[0].VAL.VAL[0].UNT[0].sth1", "ABC[0].VAL.VAL[0].UNT[1].sth1", "ABC[0].VAL.VAL[1].UNT[0].sth1",
"ABC[0].VAL.VAL[1].UNT[1].sth1", "ABC[0].VAL.VAL[0].UNT[0].sth2", "ABC[0].VAL.VAL[0].UNT[1].sth2",
"ABC[0].VAL.VAL[1].UNT[0].sth2", "ABC[0].VAL.VAL[1].UNT[1].sth2"])
newcols = list(map(lambda x: x.replace(".", "_"), df.columns))
df = df.toDF(*newcols)
def extract_indices_and_label(column_name):
    """Split a flattened column name into its two indices and its label.

    The name is expected to look like ``ABC[0]_VAL_VAL[1]_UNT[0]_sth2``
    (dots already replaced by underscores). The first number is skipped,
    the second and third numbers are returned as integers, and the text
    after the underscore that follows the third number's closing bracket
    is returned as the label.

    Returns:
        A ``(second_index, third_index, label)`` tuple.

    Raises:
        ValueError: if ``column_name`` does not follow that layout.
    """
    s = re.match(r"\D+\d+\D+(\d+)\D+(\d+)[^_]_(.*)$", column_name)
    if s is None:
        # Without this guard a stray column surfaces as an opaque
        # "'NoneType' object has no attribute 'groups'" error.
        raise ValueError("unexpected column name: {!r}".format(column_name))
    m, n, label = s.groups()
    return int(m), int(n), label
def create_struct(column_name):
    """Describe one flattened source column as a struct column.

    The struct holds the two indices and the label parsed from
    ``column_name`` as literal fields (``val``, ``unt``, ``label``) and the
    column's own content as ``value``.
    """
    val_index, unt_index, label = extract_indices_and_label(column_name)
    literal_fields = [
        lit(val_index).alias("val"),
        lit(unt_index).alias("unt"),
        lit(label).alias("label"),
    ]
    return struct(*literal_fields, col(column_name).alias("value"))
# Unpivot: build one struct per column after the first (df.columns[1:] skips
# TS) and explode the array so that every source column becomes its own row.
# The exploded struct ends up in a column named `col` (see the schema below).
df2 = (df.select(
df.TS,
explode(array([create_struct(c) for c in df.columns[1:]]))))
df2.printSchema() # this is instructional: it shows the structure is nearly there
# root
# |-- TS: long (nullable = true)
# |-- col: struct (nullable = false)
# | |-- val: integer (nullable = false)
# | |-- unt: integer (nullable = false)
# | |-- label: string (nullable = false)
# | |-- value: double (nullable = true)
# Pivot back: group on TS and the two indices, turn each label into its own
# column (the labels are listed up front via `values`) and sum the values.
df3 = (df2
.groupBy(df2.TS, df2.col.val.alias("VAL"), df2.col.unt.alias("UNT"))
.pivot("col.label", values=("sth1", "sth2"))
.sum("col.value"))
df3.orderBy("TS", "VAL", "UNT").show()
# +---+---+---+----+----+
# | TS|VAL|UNT|sth1|sth2|
# +---+---+---+----+----+
# | 1| 0| 0| 0.0| 0.7|
# | 1| 0| 1| 0.6| 0.2|
# | 1| 1| 0| 0.1| 0.4|
# | 1| 1| 1| 0.4| 0.1|
# | 2| 0| 0| 0.6| 0.8|
# | 2| 0| 1| 0.7| 0.3|
# | 2| 1| 0| 0.1| 0.1|
# | 2| 1| 1| 0.5| 0.3|
# +---+---+---+----+----+
If you know a priori that only the two labels sth1 and sth2 will be pivoted, pass them to pivot's values parameter (as done above); this improves efficiency further because Spark does not have to compute the distinct labels first.

Related

Swap values between columns based on third column

I have a table like this:
src_id | src_source | dst_id | dst_source | metadata
--------------------------------------------------------
123 | A | 345 | B | some_string
234 | B | 567 | A | some_other_string
498 | A | 432 | A | another_one # this line should be ignored
765 | B | 890 | B | another_one # this line should be ignored
What I would like is:
A_id | B_id | metadata
-----------------------
123 | 345 | some string
567 | 234 | some_other_string
Here's the data to replicate:
data = [
("123", "A", "345", "B", "some_string"),
("234", "B", "567", "A", "some_other_string"),
("498", "A", "432", "A", "another_one"),
("765", "B", "890", "B", "another_two"),
]
cols = ["src_id", "src_source", "dst_id", "dst_source", "metadata"]
df = spark.createDataFrame(data).toDF(*cols)
I am a bit confused as to how to do this - I got to here:
output = (
df
.filter(F.col("src_source") != F.col("dst_source"))
.withColumn("A_id",
F.when(F.col("src_source") == "A", F.col("src_id")))
.withColumn("B_id",
F.when(F.col("src_source") == "B", F.col("src_id")))
)
I think I figured it out - I need to split the df and union the parts again!
# NOTE(review): `F` is not imported in this snippet — presumably
# `from pyspark.sql import functions as F`; confirm.
# Rows going from source A to source B: the ids already sit in the right order.
ab_df = (
df
# NOTE(review): this inequality filter is implied by the A/B filter below.
.filter(F.col("src_source") != F.col("dst_source"))
.filter((F.col("src_source") == "A") & (F.col("dst_source") == "B"))
.select(F.col("src_id").alias("A_id"),
F.col("dst_id").alias("B_id"),
"metadata")
)
# Rows going from source B to source A: same selection with the ids swapped.
ba_df = (
df
# NOTE(review): this inequality filter is implied by the B/A filter below.
.filter(F.col("src_source") != F.col("dst_source"))
.filter((F.col("src_source") == "B") & (F.col("dst_source") == "A"))
.select(F.col("src_id").alias("B_id"),
F.col("dst_id").alias("A_id"),
"metadata")
)
# unionByName matches columns by name, so the differing column order of the
# two selects does not matter.
# NOTE(review): the name `all` shadows Python's built-in all().
all = ab_df.unionByName(ba_df)
You can do it without union, just in one select, without the need to write the same filter twice.
# Keep only the rows that link two different sources, then route each id to
# the column named after its source: when the source side is "A" the ids are
# taken as they are, otherwise they are swapped.
mixed_pairs = df.filter(F.col("src_source") != F.col("dst_source"))
src_is_a = F.col("src_source") == "A"
a_id = F.when(src_is_a, F.col("src_id")).otherwise(F.col("dst_id")).alias("A_id")
b_id = F.when(src_is_a, F.col("dst_id")).otherwise(F.col("src_id")).alias("B_id")
output = mixed_pairs.select(a_id, b_id, "metadata")
output.show()
# +----+----+-----------------+
# |A_id|B_id| metadata|
# +----+----+-----------------+
# | 123| 345| some_string|
# | 567| 234|some_other_string|
# +----+----+-----------------+

Counting longest sequence of specific elements in a list contained within a spark.sql database column

I have the following problem I would like to solve.
I have the following Dataframe that I created from a query
val temp = spark.sql("select Id, collect_list(from) as letter from f group by Id")
|Id| letter|
+-----------+---------------+
| 106| [c]|
| 101| [p]|
| 104|[c, c, c, t, u]|
| 100|[d, t, j, j, c]|
| 110| [p, n, f]|
| 113|[s, c, c, b, ..|
| 115|[u, s, t, c, ..|
| 11| [c, c, i, s]|
| 117| [d, d, p, s]|
| 118|[a, s, c, t, ..|
| 123| [d, n]|
| 125| [n, b]|
| 128| [c]|
| 131| [c, t, c, u]|
| 132| [c, u, i]|
| 134|[c, p, j, u, c]|
| 136|[b, a, t, n, c]|
| 137| [b, a]|
| 138| [b, t, c]|
| 141| [s]|
I would like to create a new column called "n"
This column would contain a numerical value that represents the length of the longest sequence of letters in a cell before "c" appears. The longest sequence can be anywhere in the list.
For example the solution column for this section (assuming nothing is cut off by the ....) would be
0, 1, 3, 5, 3, 2, 4, 4, 4, 4, 2, 2, 1, 4, 2, 5, 5, 2, 3, 1
Any help would be greatly appreciated. Thank you!
Here is how you can do this with Spark's built-in functions: the plain Scala logic can be translated into Spark functions as shown below.
import org.apache.spark.sql.functions._
// Longest run of trips without a "co" stop:
//   1. join the trip array into one space-separated string,
//   2. cut it at every "co", which leaves one segment per run,
//   3. drop the segments that are blank once trimmed (a leading, trailing or
//      repeated "co" leaves "" or " " behind, and a " " segment would
//      otherwise be counted as a run of length 1),
//   4. count the tokens of each remaining segment and keep the maximum.
// NOTE(review): the pattern "co" would also match inside a longer token;
// this assumes no other code contains "co" — confirm against the data.
df.withColumn("n_trip",
    array_max(
      transform(
        filter(
          split(array_join($"trip", " "), "co"),
          (segment: Column) => trim(segment) =!= ""
        ),
        (segment: Column) => size(split(trim(segment), " "))
      )
    ))
  // array_max yields null when no segment survives (e.g. only "co" stops).
  .withColumn("n_trip", when($"n_trip".isNull, 0).otherwise($"n_trip"))
  .show(false)
Update: Easy to understand
// Same computation, one intermediate column per step.
// "split": the joined trip string cut at every "co" (one segment per run).
df.withColumn("split", split(array_join($"trip", " "), "co"))
  // "filter": only segments that still hold a trip once trimmed; a leading,
  // trailing or repeated "co" leaves "" or " " behind, and a " " segment
  // would otherwise be counted as a run of length 1.
  .withColumn("filter", filter($"split", (segment: Column) => trim(segment) =!= ""))
  // "n_trip": token count of the longest remaining segment.
  .withColumn("n_trip", array_max(transform($"filter", (segment: Column) => size(split(trim(segment), " ")))))
  // array_max yields null when no segment survives (e.g. only "co" stops).
  .withColumn("n_trip", when($"n_trip".isNull, 0).otherwise($"n_trip"))
  .drop("split", "filter")
  .show(false)
Output:
+-----------+--------------------+------+
|passengerId|trip |n_trip|
+-----------+--------------------+------+
|10096 |[co] |0 |
|10351 |[pk] |1 |
|10436 |[co, co, cn, tj, us]|3 |
|1090 |[dk, tj, jo, jo, ch]|5 |
|11078 |[pk, no, fr] |3 |
|11332 |[sg, cn, co, bm] |2 |
|11563 |[us, sg, th, cn] |4 |
|1159 |[ca, cl, il, sg] |4 |
|11722 |[dk, dk, pk, sg] |4 |
|11888 |[au, se, ca, tj] |4 |
|12394 |[dk, nl] |2 |
|12529 |[no, be] |2 |
|12847 |[cn] |1 |
|13192 |[cn, tk, cg, uk] |4 |
|13282 |[co, us, iq] |2 |
|13442 |[cn, pk, jo, us, ch]|5 |
|13610 |[be, ar, tj, no, ch]|5 |
|13772 |[be, at] |2 |
|13865 |[be, th, cn] |3 |
|14157 |[sg] |1 |
+-----------+--------------------+------+
You could write a user-defined function (UDF) that computes what you want. There are plenty of ways to compute the longest sequence. One simple way is to split the sequence on "co", compute the size of each subsequence and take the max.
// Length of the longest run of consecutive trips that are not "co".
// The trips are joined with spaces, split on "co" (swallowing the spaces
// around it) and the tokens of each remaining piece are counted.
// mkString copes with an empty list, blank pieces are dropped, and the fold
// starts at 0, so an empty list or a list made only of "co" yields 0 instead
// of throwing on reduce / max.
val longuest_seq = udf((x : Seq[String]) => {
  x.mkString(" ")
    .split(" *co *")
    .filter(_.nonEmpty)
    .map(_.count(_ == ' ') + 1)
    .foldLeft(0)(_ max _)
})
val df = Seq(
(1, Array("x", "y", "co", "z")),
(2, Array("x")),
(3, Array("co", "t")),
(4, Array("a", "b", "c", "d", "co", "e"))
).toDF("id", "trip")
df.withColumn("n_trips", longuest_seq('trip)).show
which yields
+---+-------------------+-------+
| id| trip|n_trips|
+---+-------------------+-------+
| 1| [x, y, co, z]| 2|
| 2| [x]| 1|
| 3| [co, t]| 1|
| 4|[a, b, c, d, co, e]| 4|
+---+-------------------+-------+

How to compute a cumulative sum under a limit with Spark?

After several tries and some research, I'm stuck on trying to solve the following problem with Spark.
I have a Dataframe of elements with a priority and a quantity.
+------+-------+--------+---+
|family|element|priority|qty|
+------+-------+--------+---+
| f1| elmt 1| 1| 20|
| f1| elmt 2| 2| 40|
| f1| elmt 3| 3| 10|
| f1| elmt 4| 4| 50|
| f1| elmt 5| 5| 40|
| f1| elmt 6| 6| 10|
| f1| elmt 7| 7| 20|
| f1| elmt 8| 8| 10|
+------+-------+--------+---+
I have a fixed limit quantity :
+------+--------+
|family|limitQty|
+------+--------+
| f1| 100|
+------+--------+
I want to mark as "ok" the elements for which the cumulative sum stays under the limit. Here is the expected result:
+------+-------+--------+---+---+
|family|element|priority|qty| ok|
+------+-------+--------+---+---+
| f1| elmt 1| 1| 20| 1| -> 20 < 100 => ok
| f1| elmt 2| 2| 40| 1| -> 20 + 40 < 100 => ok
| f1| elmt 3| 3| 10| 1| -> 20 + 40 + 10 < 100 => ok
| f1| elmt 4| 4| 50| 0| -> 20 + 40 + 10 + 50 > 100 => ko
| f1| elmt 5| 5| 40| 0| -> 20 + 40 + 10 + 40 > 100 => ko
| f1| elmt 6| 6| 10| 1| -> 20 + 40 + 10 + 10 < 100 => ok
| f1| elmt 7| 7| 20| 1| -> 20 + 40 + 10 + 10 + 20 < 100 => ok
| f1| elmt 8| 8| 10| 0| -> 20 + 40 + 10 + 10 + 20 + 10 > 100 => ko
+------+-------+--------+---+---+
I tried to solve it with a cumulative sum:
initDF
.join(limitQtyDF, Seq("family"), "left_outer")
.withColumn("cumulSum", sum($"qty").over(Window.partitionBy("family").orderBy("priority")))
.withColumn("ok", when($"cumulSum" <= $"limitQty", 1).otherwise(0))
.drop("cumulSum", "limitQty")
But this is not enough, because the elements that come after the one that exceeds the limit are not taken into account.
I can't find a way to solve it with Spark. Do you have an idea ?
Here is the corresponding Scala code :
val sparkSession = SparkSession.builder()
.master("local[*]")
.getOrCreate()
import sparkSession.implicits._
val initDF = Seq(
("f1", "elmt 1", 1, 20),
("f1", "elmt 2", 2, 40),
("f1", "elmt 3", 3, 10),
("f1", "elmt 4", 4, 50),
("f1", "elmt 5", 5, 40),
("f1", "elmt 6", 6, 10),
("f1", "elmt 7", 7, 20),
("f1", "elmt 8", 8, 10)
).toDF("family", "element", "priority", "qty")
val limitQtyDF = Seq(("f1", 100)).toDF("family", "limitQty")
// Expected result: the rows of initDF plus the "ok" flag.
// show() returns Unit, so it is called on its own line; chaining it onto the
// definition would leave expectedDF holding Unit instead of the DataFrame.
val expectedDF = Seq(
  ("f1", "elmt 1", 1, 20, 1),
  ("f1", "elmt 2", 2, 40, 1),
  ("f1", "elmt 3", 3, 10, 1),
  ("f1", "elmt 4", 4, 50, 0),
  ("f1", "elmt 5", 5, 40, 0),
  ("f1", "elmt 6", 6, 10, 1),
  ("f1", "elmt 7", 7, 20, 1),
  ("f1", "elmt 8", 8, 10, 0)
).toDF("family", "element", "priority", "qty", "ok")
expectedDF.show()
Thank you for your help !
The solution is shown below:
scala> initDF.show
+------+-------+--------+---+
|family|element|priority|qty|
+------+-------+--------+---+
| f1| elmt 1| 1| 20|
| f1| elmt 2| 2| 40|
| f1| elmt 3| 3| 10|
| f1| elmt 4| 4| 50|
| f1| elmt 5| 5| 40|
| f1| elmt 6| 6| 10|
| f1| elmt 7| 7| 20|
| f1| elmt 8| 8| 10|
+------+-------+--------+---+
scala> val df1 = initDF.groupBy("family").agg(collect_list("qty").as("comb_qty"), collect_list("priority").as("comb_prior"), collect_list("element").as("comb_elem"))
df1: org.apache.spark.sql.DataFrame = [family: string, comb_qty: array<int> ... 2 more fields]
scala> df1.show
+------+--------------------+--------------------+--------------------+
|family| comb_qty| comb_prior| comb_elem|
+------+--------------------+--------------------+--------------------+
| f1|[20, 40, 10, 50, ...|[1, 2, 3, 4, 5, 6...|[elmt 1, elmt 2, ...|
+------+--------------------+--------------------+--------------------+
scala> val df2 = df1.join(limitQtyDF, df1("family") === limitQtyDF("family")).drop(limitQtyDF("family"))
df2: org.apache.spark.sql.DataFrame = [family: string, comb_qty: array<int> ... 3 more fields]
scala> df2.show
+------+--------------------+--------------------+--------------------+--------+
|family| comb_qty| comb_prior| comb_elem|limitQty|
+------+--------------------+--------------------+--------------------+--------+
| f1|[20, 40, 10, 50, ...|[1, 2, 3, 4, 5, 6...|[elmt 1, elmt 2, ...| 100|
+------+--------------------+--------------------+--------------------+--------+
scala> def validCheck = (qty: Seq[Int], limit: Int) => {
| var sum = 0
| qty.map(elem => {
| if (elem + sum <= limit) {
| sum = sum + elem
| 1}else{
| 0
| }})}
validCheck: (scala.collection.mutable.Seq[Int], Int) => scala.collection.mutable.Seq[Int]
scala> val newUdf = udf(validCheck)
newUdf: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function2>,ArrayType(IntegerType,false),Some(List(ArrayType(IntegerType,false), IntegerType)))
val df3 = df2.withColumn("valid", newUdf(col("comb_qty"),col("limitQty"))).drop("limitQty")
df3: org.apache.spark.sql.DataFrame = [family: string, comb_qty: array<int> ... 3 more fields]
scala> df3.show
+------+--------------------+--------------------+--------------------+--------------------+
|family| comb_qty| comb_prior| comb_elem| valid|
+------+--------------------+--------------------+--------------------+--------------------+
| f1|[20, 40, 10, 50, ...|[1, 2, 3, 4, 5, 6...|[elmt 1, elmt 2, ...|[1, 1, 1, 0, 0, 1...|
+------+--------------------+--------------------+--------------------+--------------------+
scala> val myUdf = udf((qty: Seq[Int], prior: Seq[Int], elem: Seq[String], valid: Seq[Int]) => {
| elem zip prior zip qty zip valid map{
| case (((a,b),c),d) => (a,b,c,d)}
| }
| )
scala> val df4 = df3.withColumn("combined", myUdf(col("comb_qty"),col("comb_prior"),col("comb_elem"),col("valid")))
df4: org.apache.spark.sql.DataFrame = [family: string, comb_qty: array<int> ... 4 more fields]
scala> val df5 = df4.drop("comb_qty","comb_prior","comb_elem","valid")
df5: org.apache.spark.sql.DataFrame = [family: string, combined: array<struct<_1:string,_2:int,_3:int,_4:int>>]
scala> df5.show(false)
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|family|combined |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|f1 |[[elmt 1, 1, 20, 1], [elmt 2, 2, 40, 1], [elmt 3, 3, 10, 1], [elmt 4, 4, 50, 0], [elmt 5, 5, 40, 0], [elmt 6, 6, 10, 1], [elmt 7, 7, 20, 1], [elmt 8, 8, 10, 0]]|
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
scala> val df6 = df5.withColumn("combined",explode(col("combined")))
df6: org.apache.spark.sql.DataFrame = [family: string, combined: struct<_1: string, _2: int ... 2 more fields>]
scala> df6.show
+------+------------------+
|family| combined|
+------+------------------+
| f1|[elmt 1, 1, 20, 1]|
| f1|[elmt 2, 2, 40, 1]|
| f1|[elmt 3, 3, 10, 1]|
| f1|[elmt 4, 4, 50, 0]|
| f1|[elmt 5, 5, 40, 0]|
| f1|[elmt 6, 6, 10, 1]|
| f1|[elmt 7, 7, 20, 1]|
| f1|[elmt 8, 8, 10, 0]|
+------+------------------+
scala> val df7 = df6.select("family", "combined._1", "combined._2", "combined._3", "combined._4").withColumnRenamed("_1","element").withColumnRenamed("_2","priority").withColumnRenamed("_3", "qty").withColumnRenamed("_4","ok")
df7: org.apache.spark.sql.DataFrame = [family: string, element: string ... 3 more fields]
scala> df7.show
+------+-------+--------+---+---+
|family|element|priority|qty| ok|
+------+-------+--------+---+---+
| f1| elmt 1| 1| 20| 1|
| f1| elmt 2| 2| 40| 1|
| f1| elmt 3| 3| 10| 1|
| f1| elmt 4| 4| 50| 0|
| f1| elmt 5| 5| 40| 0|
| f1| elmt 6| 6| 10| 1|
| f1| elmt 7| 7| 20| 1|
| f1| elmt 8| 8| 10| 0|
+------+-------+--------+---+---+
Let me know if it helps!!
Another way to do it will be an RDD based approach by iterating row by row.
// Walk the collected rows on the driver, keeping a running total of the
// accepted quantities. A row is flagged 1 when adding its qty keeps the total
// at or under 100.0 (and the total then advances); otherwise it is flagged 0
// and the total is left where it was.
var bufferRow: collection.mutable.Buffer[Row] = collection.mutable.Buffer.empty[Row]
var tempSum: Double = 0
for (record <- df.collect) {
  val y = record.getAs[Integer]("qty")
  val candidate = tempSum + y
  print(record)
  val flag = if (candidate <= 100.0) 1 else 0
  bufferRow = bufferRow ++ Seq(transformRow(record, flag))
  if (flag == 1) tempSum = candidate
}
Defining transformRow function which is used to add a column to a row.
def transformRow(row: Row,flag : Int): Row = Row.fromSeq(row.toSeq ++ Array[Integer](flag))
Next thing to do will be adding an additional column to the schema.
// Original schema plus one non-nullable integer column for the flag.
// (The closing parenthesis of StructType(...) was missing.)
val newSchema = StructType(df.schema.fields ++ Array(StructField("C_Sum", IntegerType, false)))
Followed by creating a new dataframe.
val outputdf = spark.createDataFrame(spark.sparkContext.parallelize(bufferRow.toSeq),newSchema)
Output Dataframe :
+------+-------+--------+---+-----+
|family|element|priority|qty|C_Sum|
+------+-------+--------+---+-----+
| f1| elmt1| 1| 20| 1|
| f1| elmt2| 2| 40| 1|
| f1| elmt3| 3| 10| 1|
| f1| elmt4| 4| 50| 0|
| f1| elmt5| 5| 40| 0|
| f1| elmt6| 6| 10| 1|
| f1| elmt7| 7| 20| 1|
| f1| elmt8| 8| 10| 0|
+------+-------+--------+---+-----+
I am new to Spark so this solution may not be optimal. I am assuming the value of 100 is an input to the program here. In that case:
case class Frame(family:String, element : String, priority : Int, qty :Int)
import scala.collection.JavaConverters._
// Walk the rows through a local iterator and fold them into a pair
// (accepted priorities, running qty total): a row's priority is appended and
// its qty added only when the new total stays at or under 100; otherwise the
// accumulator is passed on unchanged. `ans` is the list of accepted priorities.
// NOTE(review): rows are consumed in whatever order the iterator yields them;
// no explicit ordering by priority is visible here — confirm.
val ans = df.as[Frame].toLocalIterator
.asScala
.foldLeft((Seq.empty[Int],0))((acc,a) =>
if(acc._2 + a.qty <= 100) (acc._1 :+ a.priority, acc._2 + a.qty) else acc)._1
// Flag a row with 1 when its priority is among the accepted ones, else 0.
df.withColumn("OK" , when($"priority".isin(ans :_*), 1).otherwise(0)).show
results in:
+------+-------+--------+---+--------+
|family|element|priority|qty|OK |
+------+-------+--------+---+--------+
| f1| elmt 1| 1| 20| 1|
| f1| elmt 2| 2| 40| 1|
| f1| elmt 3| 3| 10| 1|
| f1| elmt 4| 4| 50| 0|
| f1| elmt 5| 5| 40| 0|
| f1| elmt 6| 6| 10| 1|
| f1| elmt 7| 7| 20| 1|
| f1| elmt 8| 8| 10| 0|
+------+-------+--------+---+--------+
The idea is simply to get a Scala iterator and extract the participating priority values from it and then use those values to filter out the participating rows. Given this solution gathers all the data in memory on one machine, it could run into memory problems if the dataframe size is too large to fit in memory.
Cumulative sum for each group
# `f.sum` below needs the functions module; the original snippet never imported it.
from pyspark.sql import functions as f
from pyspark.sql.window import Window as window
from pyspark.sql.types import IntegerType,StringType,FloatType,StructType,StructField,DateType

# Explicit schema for the employee CSV, so column types are not inferred.
schema = StructType() \
    .add(StructField("empno",IntegerType(),True)) \
    .add(StructField("ename",StringType(),True)) \
    .add(StructField("job",StringType(),True)) \
    .add(StructField("mgr",StringType(),True)) \
    .add(StructField("hiredate",DateType(),True)) \
    .add(StructField("sal",FloatType(),True)) \
    .add(StructField("comm",StringType(),True)) \
    .add(StructField("deptno",IntegerType(),True))
emp = spark.read.csv('data/emp.csv',schema)
# One window per department, rows ordered by salary.
dept_partition = window.partitionBy(emp.deptno).orderBy(emp.sal)
# Running total of the salary: sum over every row from the start of the
# department's window up to and including the current row.
emp_win = emp.withColumn("dept_cum_sal",
    f.sum(emp.sal).over(dept_partition.rowsBetween(window.unboundedPreceding, window.currentRow)))
emp_win.show()
Results appear like below:
+-----+------+---------+----+----------+------+-------+------+------------+
|empno| ename| job| mgr| hiredate| sal| comm|deptno|dept_cum_sal|
+-----+------+---------+----+----------+------+-------+------+------------+
| 7369| SMITH| CLERK|7902|1980-12-17| 800.0| null| 20| 800.0|
| 7876| ADAMS| CLERK|7788|1983-01-12|1100.0| null| 20| 1900.0|
| 7566| JONES| MANAGER|7839|1981-04-02|2975.0| null| 20| 4875.0|
| 7788| SCOTT| ANALYST|7566|1982-12-09|3000.0| null| 20| 7875.0|
| 7902| FORD| ANALYST|7566|1981-12-03|3000.0| null| 20| 10875.0|
| 7934|MILLER| CLERK|7782|1982-01-23|1300.0| null| 10| 1300.0|
| 7782| CLARK| MANAGER|7839|1981-06-09|2450.0| null| 10| 3750.0|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.0| null| 10| 8750.0|
| 7900| JAMES| CLERK|7698|1981-12-03| 950.0| null| 30| 950.0|
| 7521| WARD| SALESMAN|7698|1981-02-22|1250.0| 500.00| 30| 2200.0|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.0|1400.00| 30| 3450.0|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.0| 0.00| 30| 4950.0|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.0| 300.00| 30| 6550.0|
| 7698| BLAKE| MANAGER|7839|1981-05-01|2850.0| null| 30| 9400.0|
+-----+------+---------+----+----------+------+-------+------+------------+
PFA the answer
val initDF = Seq(("f1", "elmt 1", 1, 20),("f1", "elmt 2", 2, 40),("f1", "elmt 3", 3, 10),
("f1", "elmt 4", 4, 50),
("f1", "elmt 5", 5, 40),
("f1", "elmt 6", 6, 10),
("f1", "elmt 7", 7, 20),
("f1", "elmt 8", 8, 10)
).toDF("family", "element", "priority", "qty")
val limitQtyDF = Seq(("f1", 100)).toDF("family", "limitQty")
sc.broadcast(limitQtyDF)
val joinedInitDF=initDF.join(limitQtyDF,Seq("family"),"left")
case class dataResult(family:String,element:String,priority:Int, qty:Int, comutedValue:Int, limitQty:Int,controlOut:String)
val familyIDs=initDF.select("family").distinct.collect.map(_(0).toString).toList
// Flags each row of one family as "ok" or "ko".
// The limit is read from column 4 of the first row. For every row the
// candidate total (accepted quantity so far + the row's qty, column 3) is
// compared with the limit: when it fits, the row is "ok" and the accepted
// quantity advances; otherwise the row is "ko" and the accepted quantity is
// left unchanged. The candidate total is recorded in either case.
def checkingUDF(inputRows:List[Row])={
  val limit = inputRows.head.getInt(4)
  var accepted = 0
  val results = collection.mutable.ArrayBuffer[dataResult]()
  inputRows.foreach { row =>
    val qty = row.getInt(3)
    val candidate = accepted + qty
    val fits = candidate <= limit
    if (fits) accepted = candidate
    results += dataResult(row.getString(0), row.getString(1), row.getInt(2), qty, candidate, limit, if (fits) "ok" else "ko")
  }
  results.toList
}
// Evaluate each family on its own: pull its rows to the driver ordered by
// (element, priority), run them through checkingUDF, then show every result
// as a single DataFrame.
val tmpAB=familyIDs.map { familyID =>
  val currentFamily=joinedInitDF.filter(s"family = '${familyID}'").orderBy("element", "priority").collect.toList
  checkingUDF(currentFamily)
}
tmpAB.flatten.toDF.show(false)
This works for me.
+------+-------+--------+---+------------+--------+----------+
|family|element|priority|qty|comutedValue|limitQty|controlOut|
+------+-------+--------+---+------------+--------+----------+
|f1 |elmt 1 |1 |20 |20 |100 |ok |
|f1 |elmt 2 |2 |40 |60 |100 |ok |
|f1 |elmt 3 |3 |10 |70 |100 |ok |
|f1 |elmt 4 |4 |50 |120 |100 |ko |
|f1 |elmt 5 |5 |40 |110 |100 |ko |
|f1 |elmt 6 |6 |10 |80 |100 |ok |
|f1 |elmt 7 |7 |20 |100 |100 |ok |
|f1 |elmt 8 |8 |10 |110 |100 |ko |
+------+-------+--------+---+------------+--------+----------+
Please drop any unnecessary columns from the output.

Writing select queries on dataframe based on where condition from other dataframe, scala

I have two dataframes with the following columns..
DF1 - partitionNum, lowerBound, upperBound
DF2- ID, cumulativeCount
I want a resulting Frame which has - ID, partitionNum
I have done a cross join, shown below, which performs badly:
// Pairs every DF2 row with every DF1 row, keeps the pairs whose cumulativeCount lies
// between lowerBound and upperBound, sorts by cumulativeCount and projects ID and partitionNum.
DF2.crossJoin(DF1).where(col("cumulativeCount").between(col("lowerBound"), col("upperBound"))).orderBy("cumulativeCount")
.select("ID", "partitionNum")
Since DF2 has 5 million rows and DF1 has 50 rows, this cross join yields 250 million rows and the task is dying. How can I write this as a select whose resulting frame has ID from DF2 and partitionNum from DF1, where the condition is: select partitionNum from DF1 WHERE cumulativeCount of DF2 is between lowerBound and upperBound of DF1?
I am looking for something like the query below. Will this work?
// Attempted alternative to the cross join: look up partitionNum through an IN-subquery
// over CumulativeRangeView while selecting from CumulativeCountViewById.
// NOTE(review): the `AS A` alias placed after the IN (...) predicate and the reference to
// A.partitionNum look invalid in Spark SQL — confirm before relying on this query.
sparkSession.sqlContext.sql("SELECT ID, cumulativeCount, A.partitionNum FROM CumulativeCountViewById WHERE cumulativeCount IN " +
"(SELECT partitionNum FROM CumulativeRangeView WHERE cumulativeCount BETWEEN lowerBound and upperBound) AS A")
Try this.
The solution is that you don't need a cross join. Since your DF1 has only 50 rows, convert it to a map with key partitionNum and value Tuple2(lowerBound, upperBound).
Create a UDF which takes a number (your cumulativeCount) and checks it against the map to return the keys (i.e., partitionNums) for which lowerBound < cumulativeCount < upperBound.
You may edit the UDF to return only partitionNumbers and explode the "partNums" array column in the end if you choose to.
scala> DF1.show
+------------+----------+----------+
|partitionNum|lowerBound|upperBound|
+------------+----------+----------+
| 1| 10| 20|
| 2| 5| 10|
| 3| 6| 15|
| 4| 8| 20|
+------------+----------+----------+
scala> DF2.show
+---+---------------+
| ID|cumulativeCount|
+---+---------------+
|100| 5|
|100| 10|
|100| 15|
|100| 20|
|100| 25|
|100| 30|
|100| 6|
|100| 12|
|100| 18|
|100| 24|
|101| 1|
|101| 2|
|101| 3|
|101| 4|
|101| 5|
|101| 6|
|101| 7|
|101| 8|
|101| 9|
|101| 10|
+---+---------------+
// Collect the small DF1 to the driver as a Map of partitionNum -> (lowerBound, upperBound).
scala> val smallData = DF1.collect.map(row => row.getInt(0) -> (row.getInt(1), row.getInt(2))).toMap
smallData: scala.collection.immutable.Map[Int,(Int, Int)] = Map(1 -> (10,20), 2 -> (5,10), 3 -> (6,15), 4 -> (8,20))
// UDF: keep the map entries whose bounds strictly enclose num (lowerBound < num < upperBound).
scala> val myUdf = udf((num:Int) => smallData.filter((v) => v._2._2 > num && num > v._2._1))
myUdf: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,MapType(IntegerType,StructType(StructField(_1,IntegerType,false), StructField(_2,IntegerType,false)),true),Some(List(IntegerType)))
// Attach the matching entries to every DF2 row as a map-typed column named "partNums".
scala> DF2.withColumn("partNums", myUdf($"cumulativeCount")).show(false)
+---+---------------+-------------------------------------------+
|ID |cumulativeCount|partNums |
+---+---------------+-------------------------------------------+
|100|5 |[] |
|100|10 |[3 -> [6, 15], 4 -> [8, 20]] |
|100|15 |[1 -> [10, 20], 4 -> [8, 20]] |
|100|20 |[] |
|100|25 |[] |
|100|30 |[] |
|100|6 |[2 -> [5, 10]] |
|100|12 |[1 -> [10, 20], 3 -> [6, 15], 4 -> [8, 20]]|
|100|18 |[1 -> [10, 20], 4 -> [8, 20]] |
|100|24 |[] |
|101|1 |[] |
|101|2 |[] |
|101|3 |[] |
|101|4 |[] |
|101|5 |[] |
|101|6 |[2 -> [5, 10]] |
|101|7 |[2 -> [5, 10], 3 -> [6, 15]] |
|101|8 |[2 -> [5, 10], 3 -> [6, 15]] |
|101|9 |[2 -> [5, 10], 3 -> [6, 15], 4 -> [8, 20]] |
|101|10 |[3 -> [6, 15], 4 -> [8, 20]] |
+---+---------------+-------------------------------------------+

Printing out a SQL table in an R Sweave PDF

This seems like it should be very simple but I can't seem to find the answer anywhere I look.
This seems like it has just as much chance being easier to solve using clever SQL queries as it is to use R code.
The table is being pulled into the script with this code:
# Open a connection handle (SQLConn_remote is not defined in this snippet).
dbhandle <- SQLConn_remote(DBName = "DATABASE", ServerName = "SERVER")
# Pull the whole table sorted by FileName, Number and Category; stringsAsFactors = FALSE
# keeps text columns as character vectors.
Testdf<-sqlQuery(dbhandle, 'select * from TABLENAME
order by FileName, Number, Category', stringsAsFactors = FALSE)
I want to print out a SQL Table on a R Sweave PDF. I'd like to do it with the following conditions:
Printing only specific columns. This seems simple enough using sqlQuery but I've already created a variable in my script called Testdf that contains all of the table so I'd rather just subset that if I can. The reason I'm not satisfied to simply do this, is because the next condition seems beyond me in queries.
Here's the tricky part. In the sample table I gave below, there is a list of file names that are organized by Version numbers and group Numbers. I'd like to print the table in the .Rnw file so that there are 3 columns. The 1st column is the FileName column, the 2nd column is a column of all Values where Number == 1, and the final (3rd) column is a column of all Values where Number == 2.
Here's what the table looks like:
| Name | Version | Category | Value | Date | Number | Build | Error |
|:-----:|:-------:|:--------:|:-----:|:------:|:------:|:---------:|:-----:|
| File1 | 0.01 | Time | 123 | 1-1-12 | 1 | Iteration | None |
| File1 | 0.01 | Size | 456 | 1-1-12 | 1 | Iteration | None |
| File1 | 0.01 | Final | 789 | 1-1-12 | 1 | Iteration | None |
| File2 | 0.01 | Time | 312 | 1-1-12 | 1 | Iteration | None |
| File2 | 0.01 | Size | 645 | 1-1-12 | 1 | Iteration | None |
| File2 | 0.01 | Final | 978 | 1-1-12 | 1 | Iteration | None |
| File3 | 0.01 | Time | 741 | 1-1-12 | 1 | Iteration | None |
| File3 | 0.01 | Size | 852 | 1-1-12 | 1 | Iteration | None |
| File3 | 0.01 | Final | 963 | 1-1-12 | 1 | Iteration | None |
| File1 | 0.02 | Time | 369 | 1-1-12 | 2 | Iteration | None |
| File1 | 0.02 | Size | 258 | 1-1-12 | 2 | Iteration | None |
| File1 | 0.02 | Final | 147 | 1-1-12 | 2 | Iteration | None |
| File2 | 0.02 | Time | 753 | 1-1-12 | 2 | Iteration | None |
| File2 | 0.02 | Size | 498 | 1-1-12 | 2 | Iteration | None |
| File2 | 0.02 | Final | 951 | 1-1-12 | 2 | Iteration | None |
| File3 | 0.02 | Time | 753 | 1-1-12 | 2 | Iteration | None |
| File3 | 0.02 | Size | 915 | 1-1-12 | 2 | Iteration | None |
| File3 | 0.02 | Final | 438 | 1-1-12 | 2 | Iteration | None |
Here's what I'd like it to look like:
| Name | 0.01 | 0.02 |
|:-----:|:----:|:----:|
| File1 | 123 | 369 |
| File1 | 456 | 258 |
| File1 | 789 | 147 |
| File2 | 312 | 753 |
| File2 | 645 | 498 |
| File2 | 978 | 951 |
| File3 | 741 | 753 |
| File3 | 852 | 915 |
| File3 | 963 | 438 |
The middle and right column titles are derived from the original Version column. The values in the middle column are all of the entries in the Value column that correspond to both 0.01 in the Version column and 1 in the Number column. The values in the right column are all of the entries in the Value column that correspond to both 0.02 in the Version column and 2 in the Number column.
Here's a sample database for reference and if you'd like to reproduce this using R:
# Reproducible sample: 3 files x 3 categories x 3 versions = 27 rows.
# Repeating patterns are generated with rep() instead of being typed out.
rw1 <- rep(rep(c("File1", "File2", "File3"), each = 3), times = 3)
rw2 <- rep(c("0.01", "0.02", "0.03"), each = 9)
rw3 <- rep(c("Time", "Size", "Final"), times = 9)
rw4 <- c(
  123, 456, 789, 312, 645, 978, 741, 852, 963,
  369, 258, 147, 753, 498, 951, 753, 915, 438,
  978, 741, 852, 963, 369, 258, 147, 753, 498
)
rw5 <- rep("01/01/12", 27)
rw6 <- rep(c(1, 2, 3), each = 9)
rw7 <- rep(c("Iteration", "Release"), times = c(18, 9))
# Only the first row of the third version carries an error message.
rw8 <- c(rep("None", 18), "Cannot Connect to Database", rep("None", 8))
Testdf <- data.frame(rw1, rw2, rw3, rw4, rw5, rw6, rw7, rw8)
names(Testdf) <- c("FileName", "Version", "Category", "Value", "Date", "Number", "Build", "Error")
Here's a solution using dplyr and tidyr. The relevant variables are selected. An index column is then added to allow for the data to be spread without issues around duplicate indices. The data is then reshaped with spread, and finally the Index column removed.
library(dplyr)
library(tidyr)
# Keep the three relevant columns, number the rows inside each FileName/Version
# pair so spread() has a unique key, widen Version into columns, then drop the
# helper index.
Testdf %>%
  select(FileName, Version, Value) %>%
  group_by(FileName, Version) %>%
  mutate(Index = row_number()) %>%
  spread(key = Version, value = Value) %>%
  select(-Index)
If it can always be assumed that for each FileName there will be 9 Values, one for each combination of Version and Category, then this would work:
# Category serves as the row identifier while Version is widened into columns,
# and is dropped once the reshape is done.
select(
  spread(
    select(Testdf, FileName, Category, Version, Value),
    key = Version,
    value = Value
  ),
  -Category
)
If you wanted to use data.table, you could do:
# Convert Testdf to a data.table by reference, then split Value by Version
# within each FileName.
setDT(Testdf)
Testdf[, split(Value, Version), by = FileName]
If you want LaTeX output, then you could further pipe the output to xtable::xtable.