org.apache.spark.sql.AnalysisException when calling saveAsTable - hive

How can I resolve this error?
The code below works in Zeppelin, but not when it is compiled into an assembly jar and submitted with spark-submit.
The error is:
org.apache.spark.sql.AnalysisException: Specifying database name or
other qualifiers are not allowed for temporary tables. If the table
name has dots (.) in it, please quote the table name with backticks
(`).;
Code:
import org.apache.spark._
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import java.text.SimpleDateFormat
import java.util.Calendar
case class Benchmark(date: String, time: String, start_end: String,
                     server: String, timestamp: Long, interface: String,
                     cid: String, raw: String)

object job {
  def main(args: Array[String]) {
    val sdf = new java.text.SimpleDateFormat("yyyyMMdd")
    val sdf1 = new java.text.SimpleDateFormat("yyyy-MM-dd")
    val calendar = Calendar.getInstance()
    calendar.set(Calendar.DAY_OF_YEAR, calendar.get(Calendar.DAY_OF_YEAR) - 1)
    val date = sdf.format(calendar.getTime())
    val dt = sdf1.format(calendar.getTime())

    val conf = new SparkConf().setAppName("Interface_HtoH_Job")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    val hiveContext = new HiveContext(sc)

    val benchmarkText = sc.textFile(s"hdfs:/rawlogs/prod/log/${date}/*.gz")
    val pattern = "([0-9-]{10}) ([0-9:]{8}),[0-9]{1,3} Benchmark..* - (Start|End)<ID=([0-9a-zA-Z_]+)-([0-9]+)><([0-9a-zA-Z.,:!#() =_-]*)><cid=TaskId_([0-9A-Z#_a-z]+),.*><[,0-9:a-zA-Z ]+>".r

    benchmarkText.filter { ln => ln.startsWith("2017-") }
      .filter { l => l.endsWith(">") }
      .filter { k => k.contains("<cid=TaskId") }
      .map { line =>
        try {
          val pattern(date, time, startEnd, server, ts, interface, cid) = line
          Benchmark(date, time, startEnd, server, ts.toLong, interface, cid, line)
        } catch {
          case e: Exception => Benchmark(dt, "00:00:00", "bad", e.toString, "0".toLong, "bad", "bad", line)
        }
      }.toDF()
      .write
      .mode("overwrite")
      .saveAsTable("prod_ol_bm.interface_benchmark_tmp") // error here
  }
}
Running with spark-submit on:
HDP : 2.5.3.0-37
Spark : 1.6.2.2.5.3.0-37 built for Hadoop 2.7.3.2.5.3.0-37

Change the following line:
val sqlContext = new SQLContext(sc)
to
val sqlContext = new HiveContext(sc)
Both spark-shell and Zeppelin create a HiveContext under the name sqlContext, which is a little confusing.
You need a HiveContext to connect to Hive.
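For reference, a minimal sketch of the corrected setup (Spark 1.6 style, matching the code in the question; the rest of the job stays the same):

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext

val conf = new SparkConf().setAppName("Interface_HtoH_Job")
val sc = new SparkContext(conf)
// Use a HiveContext so saveAsTable resolves "prod_ol_bm.interface_benchmark_tmp"
// against the Hive metastore instead of treating the dotted name as a temporary table.
val sqlContext = new HiveContext(sc)
import sqlContext.implicits._
// ... build the DataFrame as in the question, then:
// df.write.mode("overwrite").saveAsTable("prod_ol_bm.interface_benchmark_tmp")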

Related

Getting a type mismatch error while using functions in Scala

import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
object sparkcpp
{
  val spark = SparkSession.builder().getOrCreate()
  import spark.implicits._

  def extract(): DataFrame =
  {
    val df = spark.read.option("inferschema","true").option("header","true").csv("olive\\A.csv")
  }

  def transform(df: DataFrame): DataFrame =
  {
    val df = df.select(df("name"),df("age"))
  }

  def load(df: DataFrame): DataFrame =
  {
    val df = df.write.csv("testing.csv")
  }

  def main(args: Array[String]): Unit =
  {
    load(transform(extract()))
  }
}
In the above code I'm getting a 'type mismatch' error when I try to load the DataFrame into a CSV file.
This is the first time I'm working with functions in Scala. Am I doing it right?
The aim of the program is to load the DataFrame to the specified location using functions.
My main concern is the functions I've used: are they right? Please make changes as required.
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
object sparkcpp
{
  val spark = SparkSession.builder().getOrCreate()
  import spark.implicits._

  def extract(): DataFrame =
  {
    // return the DataFrame instead of assigning it to a local val
    spark.read.option("inferschema", "true").option("header", "true").csv("olive\\A.csv")
  }

  def transform(df: DataFrame): DataFrame =
  {
    df.select(df("name"), df("age"))
  }

  // writing the DataFrame returns Unit, so don't declare a DataFrame result here
  def load(df: DataFrame): Unit =
  {
    df.write.csv("testing.csv")
  }

  def main(args: Array[String]): Unit =
  {
    load(transform(extract()))
  }
}
You should return the DataFrame from the extract and transform methods instead of assigning it to a local val; load only writes the DataFrame out, so its result type should be Unit, and declaring DataFrame there is what causes the type mismatch.

How to pass the current ClassLoader to KotlinToJVMBytecodeCompiler for dynamic (runtime) compilation of Kotlin code programmatically?

I created a simple utility for runtime compilation of Kotlin code:
package com.example
import org.jetbrains.kotlin.cli.common.CLIConfigurationKeys
import org.jetbrains.kotlin.cli.common.config.addKotlinSourceRoot
import org.jetbrains.kotlin.cli.common.messages.MessageRenderer
import org.jetbrains.kotlin.cli.common.messages.PrintingMessageCollector
import org.jetbrains.kotlin.cli.jvm.compiler.EnvironmentConfigFiles
import org.jetbrains.kotlin.cli.jvm.compiler.KotlinCoreEnvironment
import org.jetbrains.kotlin.cli.jvm.compiler.KotlinToJVMBytecodeCompiler
import org.jetbrains.kotlin.cli.jvm.config.addJvmClasspathRoots
import org.jetbrains.kotlin.codegen.state.GenerationState
import org.jetbrains.kotlin.com.intellij.openapi.Disposable
import org.jetbrains.kotlin.config.CommonConfigurationKeys
import org.jetbrains.kotlin.config.CompilerConfiguration
import org.jetbrains.kotlin.config.JVMConfigurationKeys
import org.jetbrains.kotlin.config.JvmTarget
import java.io.File
import kotlin.script.experimental.jvm.util.KotlinJars
class KotlinDynamicCompiler {
    fun compileScript(moduleName: String,
                      sourcePath: String,
                      saveClassesDir: File
    ): GenerationState {
        val stubDisposable = StubDisposable()
        val configuration = CompilerConfiguration()
        configuration.put(CommonConfigurationKeys.MODULE_NAME, moduleName)
        configuration.put(CLIConfigurationKeys.MESSAGE_COLLECTOR_KEY, PrintingMessageCollector(System.out, MessageRenderer.PLAIN_FULL_PATHS, true))
        configuration.put(JVMConfigurationKeys.OUTPUT_DIRECTORY, saveClassesDir)
        configuration.put(JVMConfigurationKeys.JVM_TARGET, JvmTarget.JVM_1_8)
        configuration.addKotlinSourceRoot(sourcePath)
        configuration.addJvmClasspathRoots(listOf(KotlinJars.stdlib))
        val env = KotlinCoreEnvironment.createForProduction(stubDisposable, configuration, EnvironmentConfigFiles.JVM_CONFIG_FILES)
        return KotlinToJVMBytecodeCompiler.analyzeAndGenerate(env)!!
    }

    inner class StubDisposable : Disposable {
        @Volatile
        var isDisposed: Boolean = false
            private set

        override fun dispose() {
            isDisposed = true
        }
    }
}
And it works for code such as:
package com.example.kt
class SimpleClass(val str: String) {
    fun test() {
    }
}

class UsedSimpleClass(val simpleClass: SimpleClass, val file: java.io.File) {
}
But it does not work if I want to use classes from outside the base package, such as:
package com.example.kt
import com.example.pojo.TestPojo // class exists in the project that invokes the runtime compilation
class SimpleClass(val str: TestPojo) {
}
or:
package com.example.kt
import com.fasterxml.jackson.databind.ObjectMapper // class is on the classpath of the project that invokes the runtime compilation
class SimpleClass(val str: ObjectMapper) {
}
How can I pass the current ClassLoader to KotlinToJVMBytecodeCompiler for dynamic (runtime) compilation of Kotlin code programmatically?
More details:
Test project on GitHub with a failing test: https://github.com/nekkiy/dynamic-kotlin
Cause:
We need to use code generation and would like to test the generated code, but I don't understand how to pass the current class environment to the compiler.
Thanks for your attention.
Solution:
I used the function fun classpathFromClassloader(currentClassLoader: ClassLoader, unpackJarCollections: Boolean = false): List<File>? from kotlin.script.experimental.jvm.util (jvmClasspathUtil.kt) and it works.
The resulting dynamic compiler:
package com.example
import org.jetbrains.kotlin.cli.common.CLIConfigurationKeys
import org.jetbrains.kotlin.cli.common.config.addKotlinSourceRoots
import org.jetbrains.kotlin.cli.common.messages.MessageRenderer
import org.jetbrains.kotlin.cli.common.messages.PrintingMessageCollector
import org.jetbrains.kotlin.cli.jvm.compiler.EnvironmentConfigFiles
import org.jetbrains.kotlin.cli.jvm.compiler.KotlinCoreEnvironment
import org.jetbrains.kotlin.cli.jvm.compiler.KotlinToJVMBytecodeCompiler
import org.jetbrains.kotlin.cli.jvm.config.addJvmClasspathRoots
import org.jetbrains.kotlin.codegen.state.GenerationState
import org.jetbrains.kotlin.com.intellij.openapi.Disposable
import org.jetbrains.kotlin.config.CommonConfigurationKeys
import org.jetbrains.kotlin.config.CompilerConfiguration
import org.jetbrains.kotlin.config.JVMConfigurationKeys
import org.jetbrains.kotlin.config.JvmTarget
import java.io.ByteArrayOutputStream
import java.io.File
import java.io.PrintStream
import kotlin.script.experimental.jvm.util.KotlinJars
import kotlin.script.experimental.jvm.util.classpathFromClassloader
class KotlinDynamicCompiler {
    fun compileModule(moduleName: String,
                      sourcePath: List<String>,
                      saveClassesDir: File,
                      classLoader: ClassLoader? = null,
                      forcedAddKotlinStd: Boolean = true
    ): GenerationState {
        val stubDisposable = StubDisposable()
        val configuration = CompilerConfiguration()
        configuration.put(CommonConfigurationKeys.MODULE_NAME, moduleName)

        val baos = ByteArrayOutputStream()
        val ps: PrintStream = PrintStream(baos)
        configuration.put(CLIConfigurationKeys.MESSAGE_COLLECTOR_KEY, PrintingMessageCollector(ps, MessageRenderer.PLAIN_FULL_PATHS, true))

        configuration.put(JVMConfigurationKeys.OUTPUT_DIRECTORY, saveClassesDir)
        // configuration.put(JVMConfigurationKeys.RETAIN_OUTPUT_IN_MEMORY, true)
        configuration.put(JVMConfigurationKeys.JVM_TARGET, JvmTarget.JVM_1_8)

        val classPath = mutableSetOf<File>()
        if (classLoader != null) {
            classPath.addAll(classpathFromClassloader(classLoader)!!)
        }
        if (forcedAddKotlinStd) {
            classPath.add(KotlinJars.stdlib)
        }
        configuration.addJvmClasspathRoots(classPath.toList())
        configuration.addKotlinSourceRoots(sourcePath)

        val env = KotlinCoreEnvironment.createForProduction(stubDisposable, configuration, EnvironmentConfigFiles.JVM_CONFIG_FILES)
        val result = KotlinToJVMBytecodeCompiler.analyzeAndGenerate(env)
        ps.flush()
        if (result != null) {
            return result
        } else {
            throw IllegalStateException("Compilation error. Details:\n$baos")
        }
    }

    inner class StubDisposable : Disposable {
        @Volatile
        var isDisposed: Boolean = false
            private set

        override fun dispose() {
            isDisposed = true
        }
    }
}
Note: this function is contained in an experimental package.
P.S. I have also updated the GitHub project.

How to create a DataFrame in Spark Scala for a single column

I am new to Spark with Scala.
I have a DataFrame which contains 10 columns, but I want to add one more column to that DataFrame: a date column whose dates will be generated from random numbers.
import java.util.Date
import java.util.ArrayList
import java.text.SimpleDateFormat
object Datecolumn {
  def main(args: Array[String]) {
    val dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
    val date = new Date();
    //println(dateFormat.format(date));
    val li = new ArrayList[String]
    for (i <- 1 to 10)
    {
      li.add(dateFormat.format(date))
    }
    // val dateColumn =
    val Lii = li.listIterator()
    while (Lii.hasNext())
    {
      println(Lii.next())
    }
    li.toDF("Date") //.toDF is not a member of java.util.ArrayList
  }
}
//Initialise Spark Session
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

val spark = SparkSession
  .builder()
  .master("local")
  .appName("ParquetAppendMode")
  .getOrCreate()

import spark.implicits._

//create a simple dataframe with one column
val dataFrame = spark.sparkContext.parallelize(1 to 10).toDF("number")
dataFrame.show

//add another column with current timestamp
dataFrame.withColumn("timestamp", unix_timestamp()).show()
Hope this helps if I understood you!
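If the new column should be a date driven by random numbers (as the question describes) rather than the current timestamp, here is a minimal sketch building on the dataFrame above; the column name random_date and the one-year range are my own assumptions:

import org.apache.spark.sql.functions._

// Assumes a random date within the last 365 days is wanted.
// expr() lets date_add take a per-row, randomly generated day offset.
val withRandomDate = dataFrame.withColumn(
  "random_date",
  expr("date_add(current_date(), -cast(rand() * 365 as int))")
)
withRandomDate.show()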

Akka HTTP doesn't render collection when using custom ToEntityMarshaller

I have defined a custom ToEntityMarshaller for the type Organisation. When requesting localhost:8080/organisations it returns an empty JSON array; only when I remove the implicit def organisationMarshaller: ToEntityMarshaller[Organisation] does it return the correct representation of the stream.
Does anybody have an idea what is going on here?
import akka.NotUsed
import akka.actor.ActorSystem
import akka.http.scaladsl.Http
import akka.http.scaladsl.common.{EntityStreamingSupport, JsonEntityStreamingSupport}
import akka.http.scaladsl.model.{HttpEntity, StatusCodes, _}
import akka.http.scaladsl.server.Directives._
import akka.stream.ActorMaterializer
import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._
import akka.http.scaladsl.marshalling.{Marshaller, ToEntityMarshaller, ToResponseMarshaller}
import akka.http.scaladsl.model.TransferEncodings.gzip
import akka.http.scaladsl.model.headers.{HttpEncoding, HttpEncodings}
import akka.stream.scaladsl.{Flow, Source}
import akka.util.ByteString
import spray.json.DefaultJsonProtocol
import spray.json.DefaultJsonProtocol._
import scala.concurrent.Future
import scala.io.StdIn
import scala.util.Random
final case class Organisation(name: String, id: String)
trait Protocols extends DefaultJsonProtocol {
  import spray.json._

  implicit val organisationFormat = jsonFormat2(Organisation)

  val `vnd.example.api.v1+json` =
    MediaType.applicationWithFixedCharset("vnd.example.api.v1+json", HttpCharsets.`UTF-8`)

  // -- WORKS AFTER REMOVING THIS DECLARATION --
  implicit def organisationMarshaller: ToEntityMarshaller[Organisation] = Marshaller.oneOf(
    Marshaller.withFixedContentType(`vnd.example.api.v1+json`) { organisation =>
      HttpEntity(`vnd.example.api.v1+json`, organisation.toJson.compactPrint)
    })
}

object Server extends App with Protocols {
  implicit val system = ActorSystem("api")
  implicit val materializer = ActorMaterializer()
  implicit val executionContext = system.dispatcher

  implicit val jsonStreamingSupport: JsonEntityStreamingSupport = EntityStreamingSupport.json()
    .withParallelMarshalling(parallelism = 10, unordered = false)

  // (fake) async database query api
  def dummyOrganisation(id: String) = Organisation(s"Organisation $id", id.toString)
  def fetchOrganisation(id: String): Future[Option[Organisation]] = Future(Some(dummyOrganisation(id)))
  def fetchOrganisations(): Source[Organisation, NotUsed] = Source.fromIterator(() => Iterator.fill(10000) {
    val id = Random.nextInt()
    dummyOrganisation(id.toString)
  })

  val route =
    encodeResponse {
      pathPrefix("organisations") {
        get {
          val organisations = fetchOrganisations()
          complete(organisations)
        }
      }
    }

  val bindingFuture = Http().bindAndHandle(route, "localhost", 8080)
  println(s"Server online at http://localhost:8080/\nPress RETURN to stop...")
  StdIn.readLine()
  bindingFuture.flatMap(_.unbind()).onComplete(_ => system.terminate())
}

How to convert spark SchemaRDD into RDD of my case class?

In the Spark docs it's clear how to create Parquet files from an RDD of your own case classes (from the docs):
val people: RDD[Person] = ??? // An RDD of case class objects, from the previous example.
// The RDD is implicitly converted to a SchemaRDD by createSchemaRDD, allowing it to be stored using Parquet.
people.saveAsParquetFile("people.parquet")
But it's not clear how to convert back; really we want a method readParquetFile where we can do:
val people: RDD[Person] = sc.readParquetFile[Person](path)
where the values of the case class are the ones read by the method.
An easy way is to provide your own converter (Row) => CaseClass. This is a bit more manual, but if you know what you are reading it should be quite straightforward.
Here is an example:
import org.apache.spark.sql.{Row, SchemaRDD}

case class User(data: String, name: String, id: Long)

def sparkSqlToUser(r: Row): Option[User] = {
  r match {
    case Row(time: String, name: String, id: Long) => Some(User(time, name, id))
    case _ => None
  }
}

val parquetData: SchemaRDD = sqlContext.parquetFile("hdfs://localhost/user/data.parquet")

val caseClassRdd: org.apache.spark.rdd.RDD[User] = parquetData.flatMap(sparkSqlToUser)
The best solution I've come up with that requires the least amount of copying and pasting for new classes is as follows (I'd still like to see another solution though).
First you have to define your case class and a (partially) reusable factory method:
import org.apache.spark.sql.catalyst.expressions
case class MyClass(fooBar: Long, fred: Long)
// Here you want to auto gen these functions using macros or something
object Factories extends java.io.Serializable {
  def longLong[T](fac: (Long, Long) => T)(row: expressions.Row): T =
    fac(row(0).asInstanceOf[Long], row(1).asInstanceOf[Long])
}
Some boilerplate which will already be available:
import scala.reflect.runtime.universe._
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.createSchemaRDD
The magic
import scala.reflect.ClassTag
import org.apache.spark.sql.SchemaRDD
def camelToUnderscores(name: String) =
  "[A-Z]".r.replaceAllIn(name, "_" + _.group(0).toLowerCase())

def getCaseMethods[T: TypeTag]: List[String] = typeOf[T].members.sorted.collect {
  case m: MethodSymbol if m.isCaseAccessor => m
}.toList.map(_.toString)

def caseClassToSQLCols[T: TypeTag]: List[String] =
  getCaseMethods[T].map(_.split(" ")(1)).map(camelToUnderscores)

def schemaRDDToRDD[T: TypeTag: ClassTag](schemaRDD: SchemaRDD, fac: expressions.Row => T) = {
  val tmpName = "tmpTableName" // Maybe should use a random string
  schemaRDD.registerAsTable(tmpName)
  sqlContext.sql("SELECT " + caseClassToSQLCols[T].mkString(", ") + " FROM " + tmpName)
    .map(fac)
}
Example use
val parquetFile = sqlContext.parquetFile(path)
val normalRDD: RDD[MyClass] =
  schemaRDDToRDD[MyClass](parquetFile, Factories.longLong[MyClass](MyClass.apply))
See also:
http://apache-spark-user-list.1001560.n3.nabble.com/Spark-SQL-Convert-SchemaRDD-back-to-RDD-td9071.html
Though I failed to find any example or documentation by following the JIRA link.
There is a simple method to convert a SchemaRDD to an RDD using PySpark in Spark 1.2.1:
sc = SparkContext() ## create SparkContext
srdd = sqlContext.sql(sql)
c = srdd.collect() ## convert rdd to list
rdd = sc.parallelize(c)
There must be a similar approach using Scala.
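For completeness, a rough sketch of the same collect-and-reparallelize idea in Scala, reusing the User case class and sqlContext from the first answer (the table name is hypothetical, and collect() pulls every row to the driver, so this only suits small data sets):

import org.apache.spark.sql.Row

val srdd = sqlContext.sql("SELECT data, name, id FROM users") // hypothetical table
val collected = srdd.collect()                                // Array[Row] on the driver
val userRdd: org.apache.spark.rdd.RDD[User] =
  sc.parallelize(collected).map {
    case Row(data: String, name: String, id: Long) => User(data, name, id)
  }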
Very crufty attempt. I'm very unconvinced this will have decent performance. Surely there must be a macro-based alternative...
import scala.reflect.runtime.universe.typeOf
import scala.reflect.runtime.universe.MethodSymbol
import scala.reflect.runtime.universe.NullaryMethodType
import scala.reflect.runtime.universe.TypeRef
import scala.reflect.runtime.universe.Type
import scala.reflect.runtime.universe.NoType
import scala.reflect.runtime.universe.termNames
import scala.reflect.runtime.universe.runtimeMirror
schemaRdd.map(row => RowToCaseClass.rowToCaseClass(row.toSeq, typeOf[X], 0))
object RowToCaseClass {
  // http://dcsobral.blogspot.com/2012/08/json-serialization-with-reflection-in.html
  def rowToCaseClass(record: Seq[_], t: Type, depth: Int): Any = {
    val fields = t.decls.sorted.collect {
      case m: MethodSymbol if m.isCaseAccessor => m
    }
    val values = fields.zipWithIndex.map {
      case (field, i) =>
        field.typeSignature match {
          case NullaryMethodType(sig) if sig =:= typeOf[String] => record(i).asInstanceOf[String]
          case NullaryMethodType(sig) if sig =:= typeOf[Int] => record(i).asInstanceOf[Int]
          case NullaryMethodType(sig) =>
            if (sig.baseType(typeOf[Seq[_]].typeSymbol) != NoType) {
              sig match {
                case TypeRef(_, _, args) =>
                  record(i).asInstanceOf[Seq[Seq[_]]].map {
                    r => rowToCaseClass(r, args(0), depth + 1)
                  }.toSeq
              }
            } else {
              sig match {
                case TypeRef(_, u, _) =>
                  rowToCaseClass(record(i).asInstanceOf[Seq[_]], sig, depth + 1)
              }
            }
        }
    }.asInstanceOf[Seq[Object]]
    val mirror = runtimeMirror(t.getClass.getClassLoader)
    val ctor = t.member(termNames.CONSTRUCTOR).asMethod
    val klass = t.typeSymbol.asClass
    val method = mirror.reflectClass(klass).reflectConstructor(ctor)
    method.apply(values: _*)
  }
}