I am trying to fetch the application_number records from a Hive table and collect them as a list. I then iterate over this list and, for every application_number, call a curl command.
Here is my sample code:
object th extends Serializable
{
def main(args: Array[String]): Unit =
{
val conf = new SparkConf().setAppName("th").setMaster("local")
conf.set("spark.debug.maxToStringFields", "10000000")
val context = new SparkContext(conf)
val sqlCotext = new SQLContext(context)
val hiveContext = new HiveContext(context)
import hiveContext.implicits._
val list = hiveContext.sql("select application_number from tableA").collect().take(100)
val l1=context.parallelize(list)
val stu1 =StructType(
StructField("application_number", LongType, true) ::
StructField("event_code", StringType, true) ::
StructField("event_description", StringType, true) ::
StructField("event_recorded_date", StringType, true) :: Nil)
var initialDF1 = sqlCotext.createDataFrame(context.emptyRDD[Row], stu1)
l1.repartition(10).foreachPartition(f=>{f.foreach(f=>
{
val schema=StructType(List(
StructField("queryResults",StructType(
List(StructField("searchResponse",StructType(
List(StructField("response",StructType(
List(StructField("docs",ArrayType(StructType(
List(
StructField("transactions",ArrayType(StructType(
List
(
StructField("code", StringType, nullable = true),
StructField("description", StringType, nullable = true),
StructField("recordDate", StringType, nullable = true)
)
)))
)
))))
)))
)))
))
))
val z = f.toString().replace("[","").replace("]","").replace(" ","").replace("(","").replace(")","")
if(z!= null)
{
val cmd = Seq("curl", "-X", "POST", "--insecure", "--header", "Content-Type: application/json", "--header", "Accept: application/json", "-d", "{\"searchText\":\""+z+"\",\"qf\":\"applId\"}", "https://ped.uspto.gov/api/queries") //cmd.!
val r = cmd.!!
val r1 = r.toString()
val rdd = context.parallelize(Seq(r1))
val dff = sqlCotext.read.schema(schema).json(rdd.toDS)
val dfContent = dff.select(explode(dff("queryResults.searchResponse.response.docs.transactions"))).toDF("transaction")
val a1 = dfContent.select("transaction.code").collect()
val a2 = dfContent.select("transaction.description").collect()
val a3 = dfContent.select("transaction.recordDate").collect()
for (mmm1 <- a1; mm2 <- a2; mm3 <- a3)
{
val ress1 = mmm1.toString().replace("[", " ").replace("]", " ").replace("WrappedArray(","").replace(")","")
val res2 = mm2.toString().replace("[", " ").replace("]", " ").replace("WrappedArray(","").replace(")","")
val res3 = mm3.toString().replace("[", " ").replace("]", " ").replace("WrappedArray(","").replace(")","")
initialDF1 = initialDF1.union(Seq((z, ress1, res2, res3)).toDF("application_number", "event_code", "event_description", "event_recorded_date"))
}
}
})})
initialDF1.registerTempTable("curlTH")
hiveContext.sql("insert into table default.ipg_tableB select application_number,event_code,event_description,event_recorded_date from curlTH")
}
}
I am getting a Task not serializable exception.
Here is my error trace:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:924)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:923)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:923)
at newipg170103.th$.main(th.scala:58)
at newipg170103.th.main(th.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:738)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:187)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:212)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:126)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.NotSerializableException: org.apache.spark.SparkContext
Serialization stack:
- object not serializable (class: org.apache.spark.SparkContext, value: org.apache.spark.SparkContext@1e592ef2)
- field (class: newipg170103.th$$anonfun$main$1, name: context$1, type: class org.apache.spark.SparkContext)
- object (class newipg170103.th$$anonfun$main$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
... 20 more
In Apache Spark it is not permitted to use a SQLContext, SparkContext or SparkSession inside an action or transformation (map, foreach, mapPartitions, foreachPartition, and so on): those objects exist only on the driver and cannot be serialized into the closure that is shipped to the executors, which is exactly what the NotSerializableException: org.apache.spark.SparkContext in your trace is saying.
Therefore
l1.repartition(10).foreachPartition(f=>{f.foreach(f=>
...
val rdd = context.parallelize(Seq(r1))
val dff = sqlCotext.read.schema(schema).json(rdd.toDS)
)})
is not valid Spark code.
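One way to restructure it, sketched below rather than tested (it assumes the JSON response schema from the original code is built once on the driver instead of inside foreachPartition): since the application numbers are already brought to the driver by collect().take(100), the curl calls can simply run in a plain driver-side loop, where hiveContext, sqlCotext and context are all legal to use.
import scala.sys.process._
import org.apache.spark.sql.functions.explode

var resultDF = sqlCotext.createDataFrame(context.emptyRDD[Row], stu1)
for (row <- list) {                 // `list` is the Array[Row] collected on the driver above
  val z = row(0).toString           // the application_number value
  val cmd = Seq("curl", "-X", "POST", "--insecure",
    "--header", "Content-Type: application/json",
    "--header", "Accept: application/json",
    "-d", "{\"searchText\":\"" + z + "\",\"qf\":\"applId\"}",
    "https://ped.uspto.gov/api/queries")
  val r = cmd.!!
  // Safe here: we are on the driver, not inside an RDD transformation
  val dff = sqlCotext.read.schema(schema).json(context.parallelize(Seq(r)).toDS)
  val dfContent = dff.select(explode(dff("queryResults.searchResponse.response.docs.transactions"))).toDF("transaction")
  // ... extract code / description / recordDate exactly as in the original code
  // and union the resulting rows into resultDF ...
}
resultDF.registerTempTable("curlTH")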
Below is code to run Spark work in threads. We can run Spark tasks in parallel: the example submits each query to a thread pool (running each Spark DataFrame query in a Future would give the same kind of parallelism). A logger and a properties file for reading the connection settings are included.
package Test
import java.util.concurrent.Executors
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import scala.io.Source.fromFile
import org.apache.log4j.Logger
object ParallerlExecution {
val log = Logger.getLogger(getClass.getName)
// Defined at object level (not inside run()) so that Spark's toDF can derive an encoder for it
case class LogOut(value: String)
def main(args: Array[String]): Unit = {
log.info("Start of program!!")
val queryList=loadFile()
parallerlExecution(queryList)
log.info("End of program!!!")
}
def loadFile():List[String]={
fromFile("").getLines().toList
}
def parallerlExecution(queryList:List[String]): Unit ={
val spark=SparkSession.builder().appName("test").master("local[*]").getOrCreate()
/*
--properties-file parallelprop.conf
create file and put info like below
spark.jdbc.url <jdbc url >
spark.jdbc.username <user name for db>
spark.jdbc.password <password for db>
*/
val url=spark.sparkContext.getConf.get("spark.jdbc.url")
val username=spark.sparkContext.getConf.get("spark.jdbc.username")
val password=spark.sparkContext.getConf.get("spark.jdbc.password")
val pool= Executors.newFixedThreadPool(3)
for(query<-queryList){
val r= new Runnable {
override def run(): Unit = {
val st = System.currentTimeMillis();
val df = spark.read
.format("jdbc")
.option("url", "jdbc:postgresql:dbserver")
.option("dbtable", query)
.option("user", username)
.option("password", password)
.load()
val count = df.count
val et = System.currentTimeMillis();
val resIntoHdfs=spark.sparkContext.parallelize( Seq(url,count))
resIntoHdfs.coalesce(1).saveAsTextFile("hdfs path to write result example /user/abc/"+et)
val rddOfDataframe = df.rdd.map(_.toString())
val size = calcRDDSize(rddOfDataframe)
val logInput="Thread" + Thread.currentThread().getId() + " Record Count " + count + " StartTime " + st + " Endtime " + et +" Size: "+size+ " Query: " + query
import spark.implicits._
val logDF=spark.sparkContext.parallelize( Seq(LogOut(logInput))).toDF
logDF.coalesce(1).write.mode("append").save("hdfs path to save result example /home/abc/logsqlresult")
println("Thread" + Thread.currentThread().getId() + " Record Count " + count + " StartTime " + st + " Endtime " + et +" Size: "+size+ " Query: " + query)
log.info(logInput)
}
}
pool.execute(r)
}
pool.shutdown()
}
def calcRDDSize(rdd: RDD[String]): Long = {
rdd.map(_.getBytes("UTF-8").length.toLong)
.reduce(_+_) //add the sizes together
}
}
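One caveat with the code above: pool.shutdown() only stops the pool from accepting new tasks, it does not wait for the queries that are already running to finish. If the driver needs to block until all threads are done (for example before stopping the SparkSession), termination can additionally be awaited, roughly like this:
import java.util.concurrent.TimeUnit

pool.shutdown()
// Block until every submitted Runnable has completed; the timeout here is effectively "forever"
pool.awaitTermination(Long.MaxValue, TimeUnit.NANOSECONDS)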
We can run Spark in multiple threads: define a thread pool and, inside the loop, submit each Spark job so the jobs execute in parallel. The variant below additionally reads the connection properties from a file on HDFS and pulls the database password out of a JCEKS credential provider.
package Test
import java.io.{BufferedReader, InputStreamReader}
import java.util.concurrent.Executors
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import scala.io.Source.fromFile
import org.apache.log4j.Logger
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.security.alias.CredentialProviderFactory
import java.util.Properties
import org.apache.hadoop.fs.FSInputStream
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path
object ParallerlExecution {
val log = Logger.getLogger(getClass.getName)
// Case classes defined at object level (not inside run()) so that Spark's toDF can derive encoders for them
case class LogOut(value: String)
case class Out(tableName: String, sizeOfData: String)
def main(args: Array[String]): Unit = {
val hdfsFilePath=args(0)
log.info("Start of program!!")
val queryList=load_file("")
parallerlExecution(queryList,hdfsFilePath)
log.info("End of program!!!")
}
/* def loadFile():List[String]={
fromFile("").getLines().toList
}
*/
def load_file(path:String)={
val pt=new Path(path)
val fs = FileSystem.get(new Configuration())
val br=new BufferedReader(new InputStreamReader(fs.open(pt)))
var res:List[String]= List()
try {
var line=br.readLine()
while (line != null){
System.out.println(line);
res= res :+ line
line=br.readLine()
}
} finally {
// you should close out the BufferedReader
br.close();
}
res
}
def parallerlExecution(queryList:List[String],hdfsFilePath:String): Unit ={
val spark=SparkSession.builder().appName("test").master("local[*]").getOrCreate()
/*
--properties-file parallelprop.conf
create file and put info like below
spark.jdbc.url <jdbc url >
spark.jdbc.username <user name for db>
spark.jdbc.password <password for db>
*/
/*val url=spark.sparkContext.getConf.get("spark.jdbc.url")
val username=spark.sparkContext.getConf.get("spark.jdbc.username")
val jecksProvider=spark.sparkContext.getConf.get("spark.jecks.provider")
val passwordAlial=spark.sparkContext.getConf.get("spark.password.alias")*/
val prop=readHdfsFile(hdfsFilePath)
val jecksProvider=prop.getProperty("jeck-provider")
val passwordAlial=prop.getProperty("password-alias")
val url=prop.getProperty("url")
val username=prop.getProperty("username")
val password=extractPwdFromJceks(jecksProvider,passwordAlial)
val pool= Executors.newFixedThreadPool(3)
for(query<-queryList){
val r= new Runnable {
override def run(): Unit = {
val st = System.currentTimeMillis();
val tableName=""
val df = spark.read
.format("jdbc")
.option("url", "jdbc:postgresql:dbserver")
.option("dbtable", query)
.option("user", username)
.option("password", password)
.load()
val count = df.count
val et = System.currentTimeMillis();
val resIntoHdfs=spark.sparkContext.parallelize( Seq(url,count))
resIntoHdfs.coalesce(1).saveAsTextFile("hdfs path to write result example /user/abc/"+et)
val rddOfDataframe = df.rdd.map(_.toString())
val size = calcRDDSize(rddOfDataframe)
val logInput="Thread" + Thread.currentThread().getId() + " Record Count " + count + " StartTime " + st + " Endtime " + et +" Size: "+size+ " Query: " + query
import spark.implicits._
df.write.mode("overwrite").save("<path hdfs>"+tableName)
val sizeOfData=getFileSizeByPath("<path hdfs>"+tableName)
val outDF=spark.sparkContext.parallelize( Seq(Out(tableName,sizeOfData.toString))).toDF
outDF.coalesce(1).write.mode("append").csv("hdfs path to save result ")
println("Thread" + Thread.currentThread().getId() + " Record Count " + count + " StartTime " + st + " Endtime " + et +" Size: "+size+ " Query: " + query)
log.info(logInput)
}
}
pool.execute(r)
}
pool.shutdown()
}
def calcRDDSize(rdd: RDD[String]): Long = {
rdd.map(_.getBytes("UTF-8").length.toLong)
.reduce(_+_) //add the sizes together
}
def extractPwdFromJceks(jceksfile:String, password_alias:String):String = {
val conf:Configuration = new Configuration()
conf.set(CredentialProviderFactory.CREDENTIAL_PROVIDER_PATH, jceksfile)
conf.getPassword(password_alias).mkString("")
}
def readHdfsFile(path:String):Properties={
val prop=new Properties()
val fis=FileSystem.get(new Configuration()).open(new Path(path))
prop.load(fis)
prop
}
private def getFileSizeByPath(filePath : String): Long = {
val path = new Path(filePath)
val hdfs = path.getFileSystem(new Configuration())
val cSummary = hdfs.getContentSummary(path)
val length = cSummary.getLength
length
}
}
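For reference, readHdfsFile expects a plain java.util.Properties file on HDFS whose key names come from the getProperty calls above; the values below are placeholders, not real settings:
jeck-provider=jceks://hdfs/path/to/credentials.jceks
password-alias=db.password.alias
url=jdbc:postgresql://dbhost:5432/dbname
username=dbuser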
I'm trying to convert a SQL query to a relational algebra expression using the Apache Calcite SqlToRelConverter.
It works fine for this query (quotes are for ensuring lowercase):
queryToRelationalAlgebraRoot("SELECT \"country\" FROM \"mytable\"")
But on this query it fails:
queryToRelationalAlgebraRoot("SELECT \"country\", SUM(\"salary\") FROM \"mytable\" GROUP BY \"country\"")
with this error:
org.apache.calcite.sql.validate.SqlValidatorException: No match found for function signature SUM(<NUMERIC>)
It seems that somehow the SQL validator doesn't have aggregation functions like sum or count registered.
case class Income(id: Int, salary: Double, country: String)
class SparkDataFrameTable(df: DataFrame) extends AbstractTable {
def getRowType(typeFactory: RelDataTypeFactory): RelDataType = {
val typeList = df.schema.fields.map {
field => field.dataType match {
case t: StringType => typeFactory.createSqlType(SqlTypeName.VARCHAR)
case t: IntegerType => typeFactory.createSqlType(SqlTypeName.INTEGER)
case t: DoubleType => typeFactory.createSqlType(SqlTypeName.DOUBLE)
}
}.toList.asJava
val fieldNameList = df.schema.fieldNames.toList.asJava
typeFactory.createStructType(typeList, fieldNameList)
}
}
object RelationalAlgebra {
def queryToRelationalAlgebraRoot(query: String): RelRoot = {
val sqlParser = SqlParser.create(query)
val sqlParseTree = sqlParser.parseQuery()
val frameworkConfig = Frameworks.newConfigBuilder().build()
val planner = new PlannerImpl(frameworkConfig)
val rootSchema = CalciteSchema.createRootSchema(true, true)
// some sample data for testing
val inc1 = new Income(1, 100000, "USA")
val inc2 = new Income(2, 110000, "USA")
val inc3 = new Income(3, 80000, "Canada")
val spark = SparkSession.builder().master("local").getOrCreate()
import spark.implicits._
val df = Seq(inc1, inc2, inc3).toDF()
rootSchema.add("mytable", new SparkDataFrameTable(df))
val defaultSchema = List[String]().asJava
val calciteConnectionConfigProperties = new Properties()
val calciteConnectionConfigImpl = new CalciteConnectionConfigImpl(calciteConnectionConfigProperties)
val sqlTypeFactoryImpl = new SqlTypeFactoryImpl(RelDataTypeSystem.DEFAULT)
val calciteCatelogReader = new CalciteCatalogReader(rootSchema, defaultSchema, sqlTypeFactoryImpl, calciteConnectionConfigImpl)
val defaultValidator = SqlValidatorUtil.newValidator(new SqlStdOperatorTable(), calciteCatelogReader, sqlTypeFactoryImpl, SqlConformanceEnum.LENIENT)
val relExpressionOptimizationCluster = RelOptCluster.create(new VolcanoPlanner(), new RexBuilder(sqlTypeFactoryImpl))
val sqlToRelConfig = SqlToRelConverter.configBuilder().build()
val sqlToRelConverter = new SqlToRelConverter(planner, defaultValidator, calciteCatelogReader, relExpressionOptimizationCluster, StandardConvertletTable.INSTANCE, sqlToRelConfig)
sqlToRelConverter.convertQuery(sqlParseTree, true, true)
}
}
The problem with the code is that new SqlStdOperatorTable() creates an operator table that is not initialized, so the validator built from it does not know about the standard functions such as SUM. The correct way to use SqlStdOperatorTable is SqlStdOperatorTable.instance().
I found the solution after emailing the dev@calcite.apache.org mailing list. I would like to thank Yuzhao Chen for looking into the question I had and pointing out the problem with my code.
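Concretely, only the validator construction in the code above needs to change (a sketch of the fix):
// Use the initialized singleton operator table instead of `new SqlStdOperatorTable()`,
// so that SUM, COUNT and the other standard operators are registered with the validator
val defaultValidator = SqlValidatorUtil.newValidator(
  SqlStdOperatorTable.instance(),
  calciteCatelogReader,
  sqlTypeFactoryImpl,
  SqlConformanceEnum.LENIENT)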
I am not familiar with the API, but your SQL needs the GROUP BY on country. And if a tool is to take this output and use it, it will probably also require that you name the aggregated column with an alias, as in the example below.
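For example, keeping the quoting convention from the question and adding a (hypothetical) alias for the aggregate:
queryToRelationalAlgebraRoot("SELECT \"country\", SUM(\"salary\") AS \"total_salary\" FROM \"mytable\" GROUP BY \"country\"")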
How can I resolve this error?
The code below works in Zeppelin but not when compiled into assembly jar and submitted with spark-submit.
Error is:
org.apache.spark.sql.AnalysisException: Specifying database name or
other qualifiers are not allowed for temporary tables. If the table
name has dots (.) in it, please quote the table name with backticks
(`).;
Code:
import org.apache.spark._
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import java.text.SimpleDateFormat
import java.util.Calendar
case class Benchmark(date: String, time: String, start_end: String,
server: String, timestamp: Long, interface: String,
cid: String, raw: String)
object job {
def main(args: Array[String]) {
val sdf = new java.text.SimpleDateFormat("yyyyMMdd")
val sdf1 = new java.text.SimpleDateFormat("yyyy-MM-dd")
val calendar = Calendar.getInstance()
calendar.set(Calendar.DAY_OF_YEAR,
calendar.get(Calendar.DAY_OF_YEAR) -1)
val date = sdf.format(calendar.getTime())
val dt = sdf1.format(calendar.getTime())
val conf = new SparkConf().setAppName("Interface_HtoH_Job")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._
val hiveContext = new HiveContext(sc)
val benchmarkText = sc.textFile(s"hdfs:/rawlogs/prod/log/${date}/*.gz")
val pattern = "([0-9-]{10}) ([0-9:]{8}),[0-9]{1,3} Benchmark..* - (Start|End)<ID=([0-9a-zA-Z_]+)-([0-9]+)><([0-9a-zA-Z.,:!#() =_-]*)><cid=TaskId_([0-9A-Z#_a-z]+),.*><[,0-9:a-zA-Z ]+>".r
benchmarkText.filter { ln => ln.startsWith("2017-") }
.filter { l => l.endsWith(">") }
.filter { k => k.contains("<cid=TaskId") }
.map { line =>
try {
var pattern(date,time,startEnd,server,ts,interface,cid) = line
Benchmark(date,time,startEnd,server,ts.toLong,interface,cid,line)
} catch {
case e: Exception => Benchmark(dt,"00:00:00","bad",e.toString,"0".toLong,"bad","bad",line)
}
}.toDF()
.write
.mode("overwrite")
.saveAsTable("prod_ol_bm.interface_benchmark_tmp") // error here
}
}
Running using spark-submit on:
HDP : 2.5.3.0-37
Spark : 1.6.2.2.5.3.0-37 built for Hadoop 2.7.3.2.5.3.0-37
Change following line
val sqlContext = new SQLContext(sc)
to
val sqlContext = new HiveContext(sc)
Both the shell and Zeppelin create a HiveContext with the name sqlContext, which is a little bit silly.
You need HiveContext to connect to hive.
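So in the job above only the context creation changes; the rest of the pipeline, including saveAsTable into prod_ol_bm.interface_benchmark_tmp, stays exactly as it is:
import org.apache.spark.sql.hive.HiveContext

val sc = new SparkContext(conf)
// A HiveContext talks to the Hive metastore, so a database-qualified name such as
// prod_ol_bm.interface_benchmark_tmp resolves as a Hive table instead of being
// rejected as an illegal temporary-table name
val sqlContext = new HiveContext(sc)
import sqlContext.implicits._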
I have the following simplified definition of an addition operation over a field:
import inox._
import inox.trees.{forall => _, _}
import inox.trees.dsl._
object Field {
val element = FreshIdentifier("element")
val zero = FreshIdentifier("zero")
val one = FreshIdentifier("one")
val elementADT = mkSort(element)()(Seq(zero, one))
val zeroADT = mkConstructor(zero)()(Some(element)) {_ => Seq()}
val oneADT = mkConstructor(one)()(Some(element)) {_ => Seq()}
val addID = FreshIdentifier("add")
val addFunction = mkFunDef(addID)("element") { case Seq(eT) =>
val args: Seq[ValDef] = Seq("f1" :: eT, "f2" :: eT)
val retType: Type = eT
val body: Seq[Variable] => Expr = { case Seq(f1,f2) =>
//do the addition for this field
f1 //do something better...
}
(args, retType, body)
}
//-------Helper functions for arithmetic operations and zero element of field----------------
implicit class ExprOperands(private val lhs: Expr) extends AnyVal{
def +(rhs: Expr): Expr = E(addID)(T(element)())(lhs, rhs)
}
}
I'd like this operation to be usable with infix notation; the solution I found for doing so in Scala is given here, which is why I'm including the implicit class at the bottom.
Say now I want to use this definition of addition:
import inox._
import inox.trees.{forall => _, _}
import inox.trees.dsl._
import welder._
object Curve{
val affinePoint = FreshIdentifier("affinePoint")
val infinitePoint = FreshIdentifier("infinitePoint")
val finitePoint = FreshIdentifier("finitePoint")
val first = FreshIdentifier("first")
val second = FreshIdentifier("second")
val affinePointADT = mkSort(affinePoint)("F")(Seq(infinitePoint,finitePoint))
val infiniteADT = mkConstructor(infinitePoint)("F")(Some(affinePoint))(_ => Seq())
val finiteADT = mkConstructor(finitePoint)("F")(Some(affinePoint)){ case Seq(fT) =>
Seq(ValDef(first, fT), ValDef(second, fT))
}
val F = T(Field.element)()
val affine = T(affinePoint)(F)
val infinite = T(infinitePoint)(F)
val finite = T(finitePoint)(F)
val onCurveID = FreshIdentifier("onCurve")
val onCurveFunction = mkFunDef(onCurveID)() { case Seq() =>
val args: Seq[ValDef] = Seq("p" :: affine, "a" :: F, "b" :: F)
val retType: Type = BooleanType
val body: Seq[Variable] => Expr = { case Seq(p,a,b) =>
if_(p.isInstOf(finite)){
val x: Expr = p.asInstOf(finite).getField(first)
val y: Expr = p.asInstOf(finite).getField(second)
x === y+y
} else_ {
BooleanLiteral(true)
}
}
(args, retType, body)
}
//---------------------------Registering elements-----------------------------------
val symbols = NoSymbols
.withADTs(Seq(affinePointADT,
infiniteADT,
finiteADT,
Field.zeroADT,
Field.oneADT,
Field.elementADT))
.withFunctions(Seq(Field.addFunction,
onCurveFunction))
val program = InoxProgram(Context.empty, symbols)
val theory = theoryOf(program)
import theory._
val expr = (E(BigInt(1)) + E(BigInt(1))) === E(BigInt(2))
val theorem: Theorem = prove(expr)
}
This fails at runtime with the following error:
java.lang.ExceptionInInitializerError
at Main$.main(Main.scala:4)
at Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
Caused by: inox.ast.TypeOps$TypeErrorException: Type error: if (p.isInstanceOf[finitePoint[element]]) {
p.asInstanceOf[finitePoint[element]].first == p.asInstanceOf[finitePoint[element]].second + p.asInstanceOf[finitePoint[element]].second
} else {
true
}, expected Boolean, found <untyped>
at inox.ast.TypeOps$TypeErrorException$.apply(TypeOps.scala:24)
at inox.ast.TypeOps$class.typeCheck(TypeOps.scala:264)
at inox.ast.SimpleSymbols$SimpleSymbols.typeCheck(SimpleSymbols.scala:12)
at inox.ast.Definitions$AbstractSymbols$$anonfun$ensureWellFormed$2.apply(Definitions.scala:166)
at inox.ast.Definitions$AbstractSymbols$$anonfun$ensureWellFormed$2.apply(Definitions.scala:165)
at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:733)
at scala.collection.immutable.Map$Map2.foreach(Map.scala:137)
at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:732)
at inox.ast.Definitions$AbstractSymbols$class.ensureWellFormed(Definitions.scala:165)
at inox.ast.SimpleSymbols$SimpleSymbols.ensureWellFormed$lzycompute(SimpleSymbols.scala:12)
at inox.ast.SimpleSymbols$SimpleSymbols.ensureWellFormed(SimpleSymbols.scala:12)
at inox.solvers.unrolling.AbstractUnrollingSolver$class.assertCnstr(UnrollingSolver.scala:129)
at inox.solvers.SolverFactory$$anonfun$getFromName$1$$anon$1.inox$tip$TipDebugger$$super$assertCnstr(SolverFactory.scala:115)
at inox.tip.TipDebugger$class.assertCnstr(TipDebugger.scala:52)
at inox.solvers.SolverFactory$$anonfun$getFromName$1$$anon$1.assertCnstr(SolverFactory.scala:115)
at inox.solvers.SolverFactory$$anonfun$getFromName$1$$anon$1.assertCnstr(SolverFactory.scala:115)
at welder.Solvers$class.prove(Solvers.scala:51)
at welder.package$$anon$1.prove(package.scala:10)
at welder.Solvers$class.prove(Solvers.scala:23)
at welder.package$$anon$1.prove(package.scala:10)
at Curve$.<init>(curve.scala:61)
at Curve$.<clinit>(curve.scala)
at Main$.main(Main.scala:4)
at Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
In fact, what is happening is that in the expression x === y+y the + is not being resolved to my field addition, so the expression ends up untyped. I recall that inside Welder proof objects one cannot define nested objects or classes; I don't know whether that has anything to do with it.
Anyway, do I have to give up on using infix notation in my code for Welder, or is there a solution to this?
The issue here is that the implicit class you defined is not visible when you create y+y (you would need to import Field._ for it to be visible).
I don't remember exactly how implicit resolution takes place in Scala, so maybe adding import Field._ inside the Curve object will override the + that comes from the inox DSL (that's the one being applied when you write y+y, giving you an arithmetic plus expression that expects integer arguments, hence the type error). Otherwise, you'll unfortunately have ambiguity in the implicit resolution, and I'm not sure it's possible to use the infix + operator in that case without giving up the whole dsl.
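If importing Field._ leads to that ambiguity, one fallback (a sketch, not tested against inox/welder) is to drop the infix operator at this one spot and write out exactly the application that the ExprOperands implicit would have produced:
// Inside onCurveFunction's body, spelled out without the implicit `+`
val x: Expr = p.asInstOf(finite).getField(first)
val y: Expr = p.asInstOf(finite).getField(second)
x === E(Field.addID)(T(Field.element)())(y, y)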
When trying to run this code:
val conf = new SparkConf()
.setMaster("local[1]")
.setAppName("Small")
.set("spark.executor.memory", "2g")
val sc = new SparkContext(conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
val df = sc.parallelize(Array((1,30),(2,10),(3,20),(1,10)(2,30))).toDF("books","readers")
val results = df.join(
df.select($"books" as "r_books", $"readers" as "r_readers"),
$"readers" === $"r_readers" and $"books" < $"r_books"
)
.groupBy($"books", $"r_books")
.agg($"books", $"r_books", count($"readers"))
Under SBT console started with the following build.sbt:
name := "Small"
version := "1.0"
scalaVersion := "2.10.4"
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.3.1"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.3.1"
Returns error:
scala.reflect.internal.MissingRequirementError: class org.apache.spark.sql.catalyst.ScalaReflection in JavaMirror with java.net.URLClassLoader@13a9a4f9 of ...
Any ideas?
You have some typos (there is a missing comma in the Array literal), and the count function at the end cannot be used without import org.apache.spark.sql.functions._.
Check this; I corrected the typos and added the imports:
import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.sql.functions._
/**
* Created by anquegi on 01/06/15.
*/
object QSpark162015 extends App {
val conf = new SparkConf()
.setMaster("local[2]")
.setAppName("QSpark162015")
.set("spark.executor.memory", "2g")
val sc = new SparkContext(conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
val df = sc.parallelize(Array((1, 30), (2, 10), (3, 20), (1, 10), (2, 30))).toDF("books", "readers")
val results = df.join(
df.select($"books" as "r_books", $"readers" as "r_readers"),
$"readers" === $"r_readers" and $"books" < $"r_books"
)
.groupBy($"books", $"r_books")
.agg($"books", $"r_books", count($"readers"))
results.foreach(println _)
sc.stop()
}