Spark code to read properties, define a logger, and run DataFrame queries in multiple threads

Below is the code to run Spark work from multiple threads.
We can run Spark tasks in parallel: each query is submitted as a Spark DataFrame read, and a thread pool (or a Future) lets the reads run concurrently.
The example also sets up a Log4j logger and reads the JDBC connection settings from a properties file.
package Test
import java.util.concurrent.{Executors, TimeUnit}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import scala.io.Source.fromFile
import org.apache.log4j.Logger
object ParallerlExecution {
val log = Logger.getLogger(getClass.getName)
def main(args: Array[String]): Unit = {
log.info("Start of program!!")
val queryList=loadFile()
parallerlExecution(queryList)
log.info("End of program!!!")
}
def loadFile():List[String]={
fromFile("").getLines().toList
}
def parallerlExecution(queryList:List[String]): Unit ={
val spark=SparkSession.builder().appName("test").master("local[*]").getOrCreate()
/*
--properties-file parallelprop.conf
create file and put info like below
spark.jdbc.url <jdbc url >
spark.jdbc.username <user name for db>
spark.jdbc.password <password for db>
*/
val url=spark.sparkContext.getConf.get("spark.jdbc.url")
val username=spark.sparkContext.getConf.get("spark.jdbc.username")
val password=spark.sparkContext.getConf.get("spark.jdbc.password")
val pool= Executors.newFixedThreadPool(3)
for(query<-queryList){
val r= new Runnable {
override def run(): Unit = {
val st = System.currentTimeMillis();
val df = spark.read
.format("jdbc")
.option("url", "jdbc:postgresql:dbserver")
.option("dbtable", query)
.option("user", username)
.option("password", password)
.load()
val count = df.count
val et = System.currentTimeMillis();
val resIntoHdfs=spark.sparkContext.parallelize( Seq(url,count))
resIntoHdfs.coalesce(1).saveAsTextFile("hdfs path to write result example /user/abc/"+et)
val rddOfDataframe = df.rdd.map(_.toString())
val size = calcRDDSize(rddOfDataframe)
val logInput="Thread" + Thread.currentThread().getId() + " Record Count " + count + " StartTime " + st + " Endtime " + et +" Size: "+size+ " Query: " + query
import spark.implicits._
// a case class declared inside a method breaks toDF (no TypeTag), so build the log DataFrame from a plain Seq
val logDF=Seq(logInput).toDF("value")
logDF.coalesce(1).write.mode("append").save("hdfs path to save result example /home/abc/logsqlresult")
println("Thread" + Thread.currentThread().getId() + " Record Count " + count + " StartTime " + st + " Endtime " + et +" Size: "+size+ " Query: " + query)
log.info(logInput)
}
}
pool.execute(r)
}
pool.shutdown()
// wait for all submitted queries to finish before logging the end of the program
pool.awaitTermination(Long.MaxValue, TimeUnit.SECONDS)
}
def calcRDDSize(rdd: RDD[String]): Long = {
rdd.map(_.getBytes("UTF-8").length.toLong)
.reduce(_+_) //add the sizes together
}
}
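As the comment inside parallerlExecution notes, the JDBC settings come from a properties file passed at submit time. A minimal example invocation (the application jar name is a placeholder):
spark-submit \
--class Test.ParallerlExecution \
--properties-file parallelprop.conf \
<application-jar>.jar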
We can run Spark in multiple threads: with a fixed-size thread pool, each query in the loop becomes its own task, and the JDBC reads run in parallel against the shared SparkSession.
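The introduction also mentions Futures. Here is a minimal sketch of the same fan-out with scala.concurrent.Future instead of an explicit Runnable, assuming spark, queryList, url, username and password are defined as in the listing above:

import java.util.concurrent.Executors
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.Duration

// run at most three JDBC reads at a time on a dedicated pool
implicit val ec: ExecutionContext = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(3))
val futures = queryList.map { query =>
  Future {
    spark.read
      .format("jdbc")
      .option("url", url)
      .option("dbtable", query)
      .option("user", username)
      .option("password", password)
      .load()
      .count()
  }
}
// block until every query has finished and collect the row counts
val counts = futures.map(f => Await.result(f, Duration.Inf))

The second listing below is a variant of the same job: instead of taking the JDBC settings from the Spark configuration, it reads a properties file from HDFS and resolves the database password from a Hadoop JCEKS credential store.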

package Test
import java.io.{BufferedReader, InputStreamReader}
import java.util.concurrent.{Executors, TimeUnit}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import scala.io.Source.fromFile
import org.apache.log4j.Logger
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.security.alias.CredentialProviderFactory
import java.util.Properties
import org.apache.hadoop.fs.FSInputStream
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path
object ParallerlExecution {
val log = Logger.getLogger(getClass.getName)
def main(args: Array[String]): Unit = {
val hdfsFilePath=args(0)
log.info("Start of program!!")
val queryList=load_file("")
parallerlExecution(queryList,hdfsFilePath)
log.info("End of program!!!")
}
/* def loadFile():List[String]={
fromFile("").getLines().toList
}
*/
def load_file(path:String)={
val pt=new Path(path)
val fs = FileSystem.get(new Configuration())
val br=new BufferedReader(new InputStreamReader(fs.open(pt)))
var res:List[String]= List()
try {
var line=br.readLine()
while (line != null){
System.out.println(line);
res= res :+ line
line=br.readLine()
}
} finally {
// you should close out the BufferedReader
br.close();
}
res
}
def parallerlExecution(queryList:List[String],hdfsFilePath:String): Unit ={
val spark=SparkSession.builder().appName("test").master("local[*]").getOrCreate()
/*
--properties-file parallelprop.conf
create file and put info like below
spark.jdbc.url <jdbc url >
spark.jdbc.username <user name for db>
spark.jdbc.password <password for db>
*/
/*val url=spark.sparkContext.getConf.get("spark.jdbc.url")
val username=spark.sparkContext.getConf.get("spark.jdbc.username")
val jecksProvider=spark.sparkContext.getConf.get("spark.jecks.provider")
val passwordAlial=spark.sparkContext.getConf.get("spark.password.alias")*/
val prop=readHdfsFile(hdfsFilePath)
val jecksProvider=prop.getProperty("jeck-provider")
val passwordAlial=prop.getProperty("password-alias")
val url=prop.getProperty("url")
val username=prop.getProperty("username")
val password=extractPwdFromJceks(jecksProvider,passwordAlial)
val pool= Executors.newFixedThreadPool(3)
for(query<-queryList){
val r= new Runnable {
override def run(): Unit = {
val st = System.currentTimeMillis();
val tableName=""
val df = spark.read
.format("jdbc")
.option("url", "jdbc:postgresql:dbserver")
.option("dbtable", query)
.option("user", username)
.option("password", password)
.load()
val count = df.count
val et = System.currentTimeMillis();
val resIntoHdfs=spark.sparkContext.parallelize( Seq(url,count))
resIntoHdfs.coalesce(1).saveAsTextFile("hdfs path to write result example /user/abc/"+et)
val rddOfDataframe = df.rdd.map(_.toString())
val size = calcRDDSize(rddOfDataframe)
val logInput="Thread" + Thread.currentThread().getId() + " Record Count " + count + " StartTime " + st + " Endtime " + et +" Size: "+size+ " Query: " + query
import spark.implicits._
df.write.mode("overwrite").save("<path hdfs>"+tableName)
val sizeOfData=getFileSizeByPath("<path hdfs>"+tableName)
// case classes declared inside a method break toDF (no TypeTag), so build the result DataFrame from a tuple
val outDF=Seq((tableName,sizeOfData.toString)).toDF("tableName","sizeOfData")
outDF.coalesce(1).write.mode("append").csv("hdfs path to save result ")
println("Thread" + Thread.currentThread().getId() + " Record Count " + count + " StartTime " + st + " Endtime " + et +" Size: "+size+ " Query: " + query)
log.info(logInput)
}
}
pool.execute(r)
}
pool.shutdown()
// wait for all submitted queries to finish before returning
pool.awaitTermination(Long.MaxValue, TimeUnit.SECONDS)
}
def calcRDDSize(rdd: RDD[String]): Long = {
rdd.map(_.getBytes("UTF-8").length.toLong)
.reduce(_+_) //add the sizes together
}
def extractPwdFromJceks(jceksfile:String, password_alias:String):String = {
val conf:Configuration = new Configuration()
conf.set(CredentialProviderFactory.CREDENTIAL_PROVIDER_PATH, jceksfile)
conf.getPassword(password_alias).mkString("")
}
def readHdfsFile(path:String):Properties={
val prop=new Properties()
val fis=FileSystem.get(new Configuration()).open(new Path(path))
prop.load(fis)
prop
}
private def getFileSizeByPath(filePath : String): Long = {
val path = new Path(filePath)
val hdfs = path.getFileSystem(new Configuration())
val cSummary = hdfs.getContentSummary(path)
val length = cSummary.getLength
length
}
}
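The password alias that extractPwdFromJceks resolves has to exist in the JCEKS provider first. It can be created with the Hadoop credential CLI; the alias name and provider path below are placeholders and must match the values stored in the properties file on HDFS:
hadoop credential create <password-alias> -provider jceks://hdfs/user/abc/credentials.jceks
The command prompts for the password and stores it encrypted in the .jceks file; the provider path and alias then go into the properties file alongside url and username.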

Related

How to use parameters inside a block when defining an extension function?

val test: Int.(String) -> Int = {
plus((this))
}
When defining this type of extension function, how can I use the arguments (here, the argument of type String) inside the block?
When the function is defined together with its declaration like this, is this the only thing that can be accessed?
You can access it using it:
val test: Int.(String) -> Int = {
println("this = $this")
println("it = $it")
42
}
fun main() {
println("result = " + 1.test("a"))
}
This will output
this = 1
it = a
result = 42
An alternative is to introduce a lambda-parameter:
val test: Int.(String) -> Int = { s ->
println("this = $this")
println("s = $s")
42
}
fun main() {
println("result = " + 1.test("a"))
}
This will output
this = 1
s = a
result = 42

Why is Kotlin's generateSequence returning one too many items in the example below?

I'm calculating the projection of instants in time based on a cron expression and returning them as a Sequence. Here's the class:
// (package omitted)
import org.springframework.scheduling.support.CronExpression
import java.time.Instant
import java.time.LocalDate
import java.time.LocalDateTime
import java.time.ZonedDateTime
import java.time.temporal.ChronoUnit
class Recurrence(val cronExpression: String) {
private val cron = CronExpression.parse(cronExpression)
fun instants(
fromInclusive: LocalDate = LocalDate.now(),
toExclusive: LocalDate = fromInclusive.plusMonths(1)
): Sequence<LocalDateTime> = instants(fromInclusive.atStartOfDay(), toExclusive.atStartOfDay())
fun instants(
fromInclusive: LocalDateTime = LocalDateTime.now(),
toExclusive: LocalDateTime = fromInclusive.plusMonths(1)
): Sequence<LocalDateTime> {
return generateSequence(cron.next(fromInclusive.minusNanos(1))) {
if (it.isBefore(toExclusive)) {
cron.next(it)
} else {
null
}
}
}
}
The following test fails because the first assertion is false: the returned list has one extra, unexpected element at the end.
// (package omitted)
import java.time.LocalDate
import java.time.Month
import kotlin.test.Test
import kotlin.test.assertEquals
class RecurrenceTest {
@Test
fun testInstants() {
val r = Recurrence("@daily")
val from = LocalDate.of(2021, Month.JANUARY, 1)
val forDays = 31
val instants = r.instants(from, from.plusDays(forDays.toLong())).toList()
assertEquals(forDays, instants.size)
(1..forDays).forEach {
assertEquals(from.plusDays(it.toLong() - 1).atStartOfDay(), instants[it - 1])
}
}
}
If I reimplement by building an ArrayList instead, it works as expected:
// new collection-based methods in Recurrence
fun instantsList(
fromInclusive: LocalDate = LocalDate.now(),
toExclusive: LocalDate = fromInclusive.plusMonths(1)
): List<LocalDateTime> = instantsList(fromInclusive.atStartOfDay(), toExclusive.atStartOfDay())
fun instantsList(
fromInclusive: LocalDateTime = LocalDateTime.now(),
toExclusive: LocalDateTime = fromInclusive.plusMonths(1)
): List<LocalDateTime> {
val list = arrayListOf<LocalDateTime>()
var it = cron.next(fromInclusive.minusNanos(1))
while (it !== null) {
if (it.isBefore(toExclusive)) {
list.add(it)
it = cron.next(it)
} else {
break
}
}
return list
}
The one line to change in the test is to use the new method:
val instants = r.instantsList(from, from.plusDays(forDays.toLong()))
Why is the sequence-based implementation returning me one more element than the list-based one?
If I read your code correctly, in the list implementation you check whether it.isBefore(toExclusive) and only then add it to the list. In the sequence implementation you do the same check, but then add the next item to the sequence.
The same applies to the first item: in the list implementation you check whether cron.next(fromInclusive.minusNanos(1)) meets the requirement, while in the sequence implementation you always add it.
Thanks, @broot -- you spotted the issue; it just took another set of eyeballs. The correct sequence implementation is:
fun instants(
fromInclusive: LocalDateTime = LocalDateTime.now(),
toExclusive: LocalDateTime = fromInclusive.plusMonths(1)
): Sequence<LocalDateTime> {
val seed = cron.next(fromInclusive.minusNanos(1))
return generateSequence(seed) {
val next = cron.next(it)
if (next.isBefore(toExclusive)) {
next
} else {
null
}
}
}

Redirect print() output into a String instead of stdout

I'm trying to redirect the stdout of an object's print() function into a String variable.
In order to do this I am using the following Java imports (within Kotlin):
import java.io.PipedOutputStream
import java.io.PipedInputStream
import java.io.PrintStream
And the following function:
fun index(): String {
val df_out = DataFrame.fromJson("https://jsonplaceholder.typicode.com/posts")
val pipeOut = PipedOutputStream()
val pipeIn = PipedInputStream(pipeOut)
System.setOut(PrintStream(pipeOut));
df_out.print(maxRows = 10)
val dfAsStr = pipeIn.bufferedReader().use { it.readText() }
return dfAsStr
}
The idea is to capture the output of the print() method in a PipedInputStream so that it can be returned as a (processed) String.
However, this function never terminates.
You can try this one, if I got your question correctly. The piped version hangs because readText() only returns at end of stream and the PipedOutputStream is never closed; writing and reading on the same thread can also deadlock once the pipe's buffer fills. Capturing into an in-memory ByteArrayOutputStream avoids both problems:
fun index(): String {
val df_out = DataFrame.fromJson("https://jsonplaceholder.typicode.com/posts")
val outStream = ByteArrayOutputStream().apply { System.setOut(PrintStream(this)) } // requires java.io.ByteArrayOutputStream to be imported
df_out.print(maxRows = 10)
return outStream.toString()
}
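Note that System.setOut replaces stdout for the whole JVM. If normal console output is needed afterwards, keep a reference to the original System.out before redirecting and restore it once the buffer has been read.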

ojAlgo - Expressing Variables as Boundaries in Optimization?

I've been playing around with ojAlgo and I've been pretty thrilled with it so far. I've worked through a few studies with it, but I'm having trouble with the problem described in this article.
I'm using Kotlin instead of Java, but that shouldn't cause any issues. I'm stuck trying to add an expression to my model that is bounded by a variable rather than a literal numeric value. How do I express that?
Here is my work so far:
import org.ojalgo.optimisation.ExpressionsBasedModel
import org.ojalgo.optimisation.Variable
fun main(args: Array<String>) {
val model = ExpressionsBasedModel()
val ingredients = sequenceOf(
Ingredient("Pork", 4.32, 30),
Ingredient("Wheat", 2.46, 20),
Ingredient("Starch", 1.86, 17)
).map { it.name to it }
.toMap()
val sausageTypes = sequenceOf(
SausageType("Economy", .40),
SausageType("Premium", .60)
).map { it.description to it }
.toMap()
// Map concatenated string keys to variables
val variables = ingredients.values.asSequence().flatMap { ingredient ->
sausageTypes.values.asSequence()
.map { type -> Combo(ingredient,type)}
}.map { it.toString() to Variable.make(it.toString()).lower(0).weight(it.ingredient.cost) }
.toMap()
// add variables to model
model.addVariables(variables.values)
// Pe + We + Se = 350 * 0.05
model.addExpression("EconomyDemand").level(350.0 * 0.05).apply {
set(variables["Pork-Economy"], 1)
set(variables["Wheat-Economy"], 1)
set(variables["Starch-Economy"], 1)
}
// Pp + Wp + Sp = 500 * 0.05
model.addExpression("PremiumDemand").level(500.0 * 0.05).apply {
set(variables["Pork-Premium"], 1)
set(variables["Wheat-Premium"], 1)
set(variables["Starch-Premium"], 1)
}
// Pe >= 0.4(Pe + We + Se)
// compile error?
model.addExpression("EconomyGovRestriction").upper(variables["Pork-Economy"]).apply {
set(variables["Pork-Economy"], .4)
set(variables["Wheat-Economy"], .4)
set(variables["Starch-Economy"], .4)
}
}
data class Combo(val ingredient: Ingredient, val sausageType: SausageType) {
override fun toString() = "$sausageType-$ingredient"
}
data class SausageType(val description: String, val porkRequirement: Double) {
override fun toString() = description
}
data class Ingredient(val name: String, val cost: Double, val availability: Int) {
override fun toString() = name
}
For future readers, here is the full working solution I came up with.
import org.ojalgo.optimisation.ExpressionsBasedModel
import org.ojalgo.optimisation.Variable
import java.math.RoundingMode
fun main(args: Array<String>) {
val model = ExpressionsBasedModel()
val ingredients = sequenceOf(
Ingredient("Pork", 4.32, 30),
Ingredient("Wheat", 2.46, 20),
Ingredient("Starch", 1.86, 17)
).map { it.name to it }
.toMap()
val sausageTypes = sequenceOf(
SausageType("Economy", .40),
SausageType("Premium", .60)
).map { it.description to it }
.toMap()
// Map concatenated string keys to variables
val variables = ingredients.values.asSequence().flatMap { ingredient ->
sausageTypes.values.asSequence()
.map { type -> Combo(ingredient,type)}
}.map { it.toString() to Variable.make(it.toString()).lower(0).weight(it.ingredient.cost) }
.toMap()
// add variables to model
model.addVariables(variables.values)
// Pe + We + Se = 350 * 0.05
model.addExpression("EconomyDemand").level(17.5).apply {
set(variables["Pork-Economy"], 1)
set(variables["Wheat-Economy"], 1)
set(variables["Starch-Economy"], 1)
}
// Pp + Wp + Sp = 500 * 0.05
model.addExpression("PremiumDemand").level(25).apply {
set(variables["Pork-Premium"], 1)
set(variables["Wheat-Premium"], 1)
set(variables["Starch-Premium"], 1)
}
// Pe >= 0.4(Pe + We + Se)
model.addExpression("EconomyPorkRatio").upper(0.0).apply {
set(variables["Pork-Economy"], -0.6)
set(variables["Wheat-Economy"], .4)
set(variables["Starch-Economy"], .4)
}
// Pe >= 0.6(Pp + Wp + Sp)
model.addExpression("PremiumPorkRatio").upper(0.0).apply {
set(variables["Pork-Premium"], -0.4)
set(variables["Wheat-Premium"], .6)
set(variables["Starch-Premium"], .6)
}
// Se <= .25(Pe + We + Se)
// Sp <= .25(Pp + Wp + Sp)
sausageTypes.values.forEach {
model.addExpression("${it}StarchRestriction").lower(0.0).apply {
set(variables["Pork-$it"], .25)
set(variables["Wheat-$it"], .25)
set(variables["Starch-$it"], -0.75)
}
}
// Pe + Pp <= 30
// We + Wp <= 20
// Se + Sp <= 17
ingredients.values.forEach { ingredient ->
model.addExpression("${ingredient}SupplyConstraint").upper(ingredient.availability).apply {
sausageTypes.values.forEach { sausageType ->
set(variables["$ingredient-$sausageType"], 1)
}
}
}
// Pe + Pp >= 23
model.addExpression("ContractPorkRestriction").lower(23).apply {
set(variables["Pork-Economy"], 1)
set(variables["Pork-Premium"], 1)
}
// go!
val result = model.minimise()
println("OPTIMIZED COST: ${result.value}")
model.variables.asSequence()
.map { it.name }
.zip(result.asSequence().map { it.setScale(3, RoundingMode.HALF_DOWN) })
.forEach(::println)
}
data class Combo(val ingredient: Ingredient, val sausageType: SausageType) {
override fun toString() = "$ingredient-$sausageType"
}
data class SausageType(val description: String, val porkRequirement: Double) {
override fun toString() = description
}
data class Ingredient(val name: String, val cost: Double, val availability: Int) {
override fun toString() = name
}
OUTPUT:
OPTIMIZED COST: 140.955
(Pork-Economy, 8.000)
(Pork-Premium, 15.000)
(Wheat-Economy, 5.125)
(Wheat-Premium, 3.750)
(Starch-Economy, 4.375)
(Starch-Premium, 6.250)
You can't do that. You can't directly model expr1 >= expr2. Instead you have to model (expr1 - expr2) >= 0. There is an example on the ojAlgo wiki describing how to model a similar problem: https://github.com/optimatika/ojAlgo/wiki/The-Diet-Problem

org.apache.spark.sql.AnalysisException when calling saveAsTable

How can I resolve this error?
The code below works in Zeppelin but not when it is compiled into an assembly jar and submitted with spark-submit.
The error is:
org.apache.spark.sql.AnalysisException: Specifying database name or
other qualifiers are not allowed for temporary tables. If the table
name has dots (.) in it, please quote the table name with backticks
(`).;
Code:
import org.apache.spark._
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import java.text.SimpleDateFormat
import java.util.Calendar
case class Benchmark(date: String, time: String, start_end: String,
server: String, timestamp: Long, interface: String,
cid: String, raw: String)
object job {
def main(args: Array[String]) {
val sdf = new java.text.SimpleDateFormat("yyyyMMdd")
val sdf1 = new java.text.SimpleDateFormat("yyyy-MM-dd")
val calendar = Calendar.getInstance()
calendar.set(Calendar.DAY_OF_YEAR,
calendar.get(Calendar.DAY_OF_YEAR) -1)
val date = sdf.format(calendar.getTime())
val dt = sdf1.format(calendar.getTime())
val conf = new SparkConf().setAppName("Interface_HtoH_Job")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._
val hiveContext = new HiveContext(sc)
val benchmarkText = sc.textFile(s"hdfs:/rawlogs/prod/log/${date}/*.gz")
val pattern = "([0-9-]{10}) ([0-9:]{8}),[0-9]{1,3} Benchmark..* - (Start|End)<ID=([0-9a-zA-Z_]+)-([0-9]+)><([0-9a-zA-Z.,:!#() =_-]*)><cid=TaskId_([0-9A-Z#_a-z]+),.*><[,0-9:a-zA-Z ]+>".r
benchmarkText.filter { ln => ln.startsWith("2017-") }
.filter { l => l.endsWith(">") }
.filter { k => k.contains("<cid=TaskId") }
.map { line =>
try {
var pattern(date,time,startEnd,server,ts,interface,cid) = line
Benchmark(date,time,startEnd,server,ts.toLong,interface,cid,line)
} catch {
case e: Exception => Benchmark(dt,"00:00:00","bad",e.toString,"0".toLong,"bad","bad",line)
}
}.toDF()
.write
.mode("overwrite")
.saveAsTable("prod_ol_bm.interface_benchmark_tmp") // error here
}
}
Running using spark-submit on:
HDP : 2.5.3.0-37
Spark : 1.6.2.2.5.3.0-37 built for Hadoop 2.7.3.2.5.3.0-37
Change the following line
val sqlContext = new SQLContext(sc)
to
val sqlContext = new HiveContext(sc)
Both the shell and Zeppelin create a HiveContext with the name sqlContext, which is a little bit silly.
You need a HiveContext to connect to Hive.
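For reference, on Spark 2.x and later the SQLContext/HiveContext split is gone and the equivalent is a SparkSession built with Hive support. A minimal sketch, reusing the app name from the question:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("Interface_HtoH_Job")
  .enableHiveSupport()
  .getOrCreate()
// with Hive support enabled, saveAsTable("prod_ol_bm.interface_benchmark_tmp") resolves the Hive database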