Lead/lag function in Apache Pig - SQL

Is there a function in Apache Pig that's similar to the LEAD/LAG functions in SQL? Or any Pig function that can look back to the previous row of a record?

Yes, there is pre-defined functionality. See the Over() and Stitch() methods in Piggybank. Over() has examples listed in the documentation.

Here is an alternative:
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;

public class GenericLag2 extends EvalFunc<Tuple> {
    // Holds the previous row's values between calls to exec().
    private List<String> lagObjects = null;

    @Override
    public Tuple exec(Tuple input) throws IOException {
        if (lagObjects == null) {
            lagObjects = new ArrayList<String>();
            return null;
        }
        try {
            Tuple output = TupleFactory.getInstance().newTuple(lagObjects.size());
            for (int i = 0; i < lagObjects.size(); i++) {
                output.set(i, lagObjects.get(i));
            }
            lagObjects.clear();
            for (int i = 0; i < input.size(); i++) {
                lagObjects.add(input.get(i).toString());
            }
            return output;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    @Override
    public Schema outputSchema(Schema input) {
        Schema tupleSchema = new Schema();
        try {
            for (int i = 0; i < input.size(); i++) {
                tupleSchema.add(new FieldSchema("lag_" + i, DataType.CHARARRAY));
            }
            return new Schema(new FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
                    tupleSchema, DataType.TUPLE));
        } catch (FrontendException e) {
            e.printStackTrace();
            return null;
        }
    }
}
I assume this would be faster, but I'm not sure, as you would have to do the following:
...
C = ORDER A BY important_order_by_field, second_important_order_by_field;
D = FOREACH C GENERATE
     important_order_by_field
    ,second_important_order_by_field
    ,...
    ,FLATTEN(LAG(
         string_field_to_lag
        ,int_field_to_lag
        ,date_field_to_lag
    ))
;
E = FOREACH D GENERATE
     important_order_by_field
    ,second_important_order_by_field
    ,...
    ,string_field_to_lag
    ,(int) int_field_to_lag
    ,(date_field_to_lag IS NULL ?
        null :
        ToDate(SUBSTRING(REPLACE(date_field_to_lag, 'T', ' '), 0, 19), 'yyyy-MM-dd HH:mm:ss'))
        AS date_field_to_lag
;
DUMP E;

Ok here is my first shot at this. Mind you, I just started learning how to code UDFs today.
Maven's pom.xml file contains:
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.0.0-cdh4.1.0</version>
</dependency>
...
Java UDF Class:
import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

public class GenericLag extends EvalFunc<String> {
    // Holds the previous row's value between calls to exec().
    private String lagObject = null;

    @Override
    public String exec(Tuple input) throws IOException {
        try {
            String returnObject = getLagObject();
            setLagObject(input.get(0).toString());
            return returnObject;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    public String getLagObject() {
        return lagObject;
    }

    public void setLagObject(String lagObject) {
        this.lagObject = lagObject;
    }
}
Initially, I had used Object instead of String everywhere that you see "String" above, but I received this error:
ERROR org.apache.pig.tools.grunt.Grunt - ERROR 2080: Foreach currently does not handle type Unknown
I had to issue setLagObject(input.get(0).toString()); instead of setLagObject(input.get(0)); or I would have received errors like:
java.lang.ClassCastException: java.lang.Integer cannot be cast to java.lang.String
java.lang.ClassCastException: java.lang.Double cannot be cast to java.lang.String
java.lang.ClassCastException: org.joda.time.DateTime cannot be cast to java.lang.String
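If the lagged field can itself be null on some rows, input.get(0).toString() will also throw a NullPointerException. A minimal null-safe variant of that assignment (illustrative only, still storing everything as a String):

Object current = input.get(0);  // may be null on some rows
setLagObject(current == null ? null : current.toString());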
Here is how I use it in Pig:
REGISTER /path/to/compiled/file.jar;
DEFINE LAG fully.qualified.domain.name.GenericLag();

A = LOAD '/hdfs/path/to/directory' USING PigStorage(',') AS (
     important_order_by_field:int
    ,second_important_order_by_field:chararray
    ,...
    ,string_field_to_lag:chararray
    ,int_field_to_lag:int
    ,date_field_to_lag:chararray
);
B = FOREACH A GENERATE
     important_order_by_field
    ,second_important_order_by_field
    ,...
    ,string_field_to_lag
    ,int_field_to_lag
    ,ToDate(date_field_to_lag, 'yyyy-MM-dd HH:mm:ss') AS date_field_to_lag
;
C = ORDER B BY important_order_by_field, second_important_order_by_field;
D = FOREACH C GENERATE
     important_order_by_field
    ,second_important_order_by_field
    ,...
    ,LAG(string_field_to_lag) AS lag_string
    ,(int) LAG(int_field_to_lag) AS lag_int
    ,(date_field_to_lag IS NULL ?
        null :
        ToDate(SUBSTRING(REPLACE(LAG(date_field_to_lag), 'T', ' '), 0, 19), 'yyyy-MM-dd HH:mm:ss')) AS lag_date
;
DUMP D;
If I did the last line like this:
ToDate(SUBSTRING(REPLACE(LAG(date_field_to_lag), 'T', ' ') ,0,19), 'yyyy-MM-dd HH:mm:ss') AS lag_date
It would return the following error
ERROR org.apache.pig.tools.grunt.Grunt - ERROR 1066: Unable to open iterator for alias LAGGED_RHODES. Backend error : null
Which when checking the logs reveals:
java.lang.NullPointerException
at org.joda.time.format.DateTimeFormatterBuilder$NumberFormatter.parseInto(DateTimeFormatterBuilder.java:1200)
because the first row will contain a null value.

Related

AbstractStringBuilder.ensureCapacityInternal gets a NullPointerException in a Storm bolt

In an online system, the Storm bolt gets a NullPointerException, though I think I check it before line 61; it gets the NullPointerException once in a while.
import ***.KeyUtils;
import ***.redis.PipelineHelper;
import ***.redis.PipelinedCacheClusterClient;
import **.redis.R2mClusterClient;
import org.apache.commons.lang3.StringUtils;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Tuple;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

import java.util.Map;

/**
 * RedisBolt batch operate
 */
public class RedisBolt implements IRichBolt {
    static final long serialVersionUID = 737015318988609460L;
    private static ApplicationContext applicationContext;
    private static long logEmitNumber = 0;
    private static StringBuffer totalCmds = new StringBuffer();
    private Logger logger = LoggerFactory.getLogger(getClass());
    private OutputCollector _collector;
    private R2mClusterClient r2mClusterClient;

    @Override
    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        _collector = outputCollector;
        if (applicationContext == null) {
            applicationContext = new ClassPathXmlApplicationContext("spring/spring-config-redisbolt.xml");
        }
        if (r2mClusterClient == null) {
            r2mClusterClient = (R2mClusterClient) applicationContext.getBean("r2mClusterClient");
        }
    }

    @Override
    public void execute(Tuple tuple) {
        String log = tuple.getString(0);
        String lastCommands = tuple.getString(1);
        try {
            // log count
            if (StringUtils.isNotEmpty(log)) {
                logEmitNumber++;
            }
            if (StringUtils.isNotEmpty(lastCommands)) {
                if (totalCmds == null) {
                    totalCmds = new StringBuffer();
                }
                totalCmds.append(lastCommands); // line 61
            }
            // control the number of logs
            int numberLimit = 1;
            String flow_log_limit = r2mClusterClient.get(KeyUtils.KEY_PIPELINE_LIMIT);
            if (StringUtils.isNotEmpty(flow_log_limit)) {
                try {
                    numberLimit = Integer.parseInt(flow_log_limit);
                } catch (Exception e) {
                    numberLimit = 1;
                    logger.error("error", e);
                }
            }
            if (logEmitNumber >= numberLimit) {
                StringBuffer _totalCmds = new StringBuffer(totalCmds);
                try {
                    // pipeline submit
                    PipelinedCacheClusterClient pip = r2mClusterClient.pipelined();
                    String[] commandArray = _totalCmds.toString().split(KeyUtils.REDIS_CMD_SPILT);
                    PipelineHelper.cmd(pip, commandArray);
                    pip.sync();
                    pip.close();
                    totalCmds = new StringBuffer();
                } catch (Exception e) {
                    logger.error("error", e);
                }
                logEmitNumber = 0;
            }
        } catch (Exception e) {
            logger.error(new StringBuffer("====RedisBolt error for log=[ ").append(log).append("] \n commands=[").append(lastCommands).append("]").toString(), e);
            _collector.reportError(e);
            _collector.fail(tuple);
        }
        _collector.ack(tuple);
    }

    @Override
    public void cleanup() {
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
    }

    @Override
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}
exception info:
java.lang.NullPointerException
    at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:113)
    at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:415)
    at java.lang.StringBuffer.append(StringBuffer.java:237)
    at com.jd.jr.dataeye.storm.bolt.RedisBolt.execute(RedisBolt.java:61)
    at org.apache.storm.daemon.executor$fn__5044$tuple_action_fn__5046.invoke(executor.clj:727)
    at org.apache.storm.daemon.executor$mk_task_receiver$fn__4965.invoke(executor.clj:459)
    at org.apache.storm.disruptor$clojure_handler$reify__4480.onEvent(disruptor.clj:40)
    at org.apache.storm.utils.DisruptorQueue.consumeBatchToCursor(DisruptorQueue.java:472)
    at org.apache.storm.utils.DisruptorQueue.consumeBatchWhenAvailable(DisruptorQueue.java:451)
    at org.apache.storm.disruptor$consume_batch_when_available.invoke(disruptor.clj:73)
    at org.apache.storm.daemon.executor$fn__5044$fn__5057$fn__5110.invoke(executor.clj:846)
    at org.apache.storm.util$async_loop$fn__557.invoke(util.clj:484)
    at clojure.lang.AFn.run(AFn.java:22)
    at java.lang.Thread.run(Thread.java:745)
Can anyone give me some advice on finding the reason?
That is a really odd thing to happen. Please read the code for these two classes:
https://github.com/openjdk-mirror/jdk7u-jdk/blob/master/src/share/classes/java/lang/AbstractStringBuilder.java
https://github.com/openjdk-mirror/jdk7u-jdk/blob/master/src/share/classes/java/lang/StringBuffer.java
AbstractStringBuilder has a no-args constructor which doesn't allocate the 'value' field, so accessing 'value' throws an NPE. But no constructor in StringBuffer uses that constructor. So maybe something odd happens during serialization/deserialization and, unfortunately, the 'value' field in AbstractStringBuilder ends up null.
Maybe initializing totalCmds in prepare() would be better, and you also need to consider synchronization (thread safety) between bolts. prepare() is called per bolt instance, so instance fields are thread-safe, but class (static) fields are not.
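A minimal sketch of that suggestion, assuming Storm 1.x-style APIs and one bolt instance per executor; the class name and the trimmed-down execute() body are illustrative only, not the original topology code:

import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;

public class RedisBoltPerInstanceState extends BaseRichBolt {
    private transient OutputCollector collector;
    // Instance state instead of static fields: each bolt instance
    // (and therefore each executor thread) gets its own buffer and counter.
    private transient StringBuilder totalCmds;
    private long logEmitNumber;

    @Override
    public void prepare(Map conf, TopologyContext context, OutputCollector outputCollector) {
        this.collector = outputCollector;
        this.totalCmds = new StringBuilder(); // never null once prepare() has run
        this.logEmitNumber = 0;
    }

    @Override
    public void execute(Tuple tuple) {
        String lastCommands = tuple.getString(1);
        if (lastCommands != null && !lastCommands.isEmpty()) {
            totalCmds.append(lastCommands);
            logEmitNumber++;
        }
        // ... flush totalCmds to Redis here as in the original execute(),
        // then reset the instance fields instead of the static ones ...
        collector.ack(tuple);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // this bolt emits nothing downstream
    }
}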
I think I may have found the problem.
The key point is
"StringBuffer _totalCmds = new StringBuffer(totalCmds);" and "totalCmds.append(lastCommands); //line 61"
When a new object is created, it takes several steps:
(1) allocate memory and return the reference
(2) initialize
If the append happens after (1) and before (2), then, because StringBuffer extends AbstractStringBuilder and that class declares the field
/**
* The value is used for character storage.
*/
char[] value;
'value' is not yet initialized, so this will dereference null:
@Override
public synchronized void ensureCapacity(int minimumCapacity) {
    if (minimumCapacity > value.length) {
        expandCapacity(minimumCapacity);
    }
}
This bolt also has another problem: some data may be lost in a multithreaded environment.

How do I configure spring-kafka to ignore messages in the wrong format?

We have an issue with one of our Kafka topics which is consumed by the DefaultKafkaConsumerFactory & ConcurrentMessageListenerContainer combination described here with a JsonDeserializer used by the Factory. Unfortunately someone got a little enthusiastic and published some invalid messages onto the topic. It appears that spring-kafka silently fails to process past the first of these messages. Is it possible to have spring-kafka log an error and continue? Looking at the error messages which are logged it seems that perhaps the Apache kafka-clients library should deal with the case that when iterating a batch of messages one or more of them may fail to parse?
The below code is an example test case illustrating this issue:
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.common.serialization.Serializer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.kafka.common.serialization.StringSerializer;
import org.junit.ClassRule;
import org.junit.Test;
import org.springframework.kafka.core.DefaultKafkaConsumerFactory;
import org.springframework.kafka.core.DefaultKafkaProducerFactory;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.kafka.listener.KafkaMessageListenerContainer;
import org.springframework.kafka.listener.MessageListener;
import org.springframework.kafka.listener.config.ContainerProperties;
import org.springframework.kafka.support.SendResult;
import org.springframework.kafka.support.serializer.JsonDeserializer;
import org.springframework.kafka.support.serializer.JsonSerializer;
import org.springframework.kafka.test.rule.KafkaEmbedded;
import org.springframework.kafka.test.utils.ContainerTestUtils;
import org.springframework.util.concurrent.ListenableFuture;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.springframework.kafka.test.hamcrest.KafkaMatchers.hasKey;
import static org.springframework.kafka.test.hamcrest.KafkaMatchers.hasValue;
/**
* @author jfreedman
*/
public class TestSpringKafka {
private static final String TOPIC1 = "spring.kafka.1.t";
@ClassRule
public static KafkaEmbedded embeddedKafka = new KafkaEmbedded(1, true, 1, TOPIC1);
@Test
public void submitMessageThenGarbageThenAnotherMessage() throws Exception {
final BlockingQueue<ConsumerRecord<String, JsonObject>> records = createListener(TOPIC1);
final KafkaTemplate<String, JsonObject> objectTemplate = createPublisher("json", new JsonSerializer<JsonObject>());
sendAndVerifyMessage(records, objectTemplate, "foo", new JsonObject("foo"), 0L);
// push some garbage text to Kafka which cannot be marshalled, this should not interrupt processing
final KafkaTemplate<String, String> garbageTemplate = createPublisher("garbage", new StringSerializer());
final SendResult<String, String> garbageResult = garbageTemplate.send(TOPIC1, "bar","bar").get(5, TimeUnit.SECONDS);
assertEquals(1L, garbageResult.getRecordMetadata().offset());
sendAndVerifyMessage(records, objectTemplate, "baz", new JsonObject("baz"), 2L);
}
private <T> KafkaTemplate<String, T> createPublisher(final String label, final Serializer<T> serializer) {
final Map<String, Object> producerProps = new HashMap<>();
producerProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, embeddedKafka.getBrokersAsString());
producerProps.put(ProducerConfig.CLIENT_ID_CONFIG, "TestPublisher-" + label);
producerProps.put(ProducerConfig.ACKS_CONFIG, "all");
producerProps.put(ProducerConfig.RETRIES_CONFIG, 2);
producerProps.put(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, 1);
producerProps.put(ProducerConfig.REQUEST_TIMEOUT_MS_CONFIG, 5000);
producerProps.put(ProducerConfig.MAX_BLOCK_MS_CONFIG, 5000);
producerProps.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
producerProps.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, serializer.getClass());
final DefaultKafkaProducerFactory<String, T> pf = new DefaultKafkaProducerFactory<>(producerProps);
pf.setValueSerializer(serializer);
return new KafkaTemplate<>(pf);
}
private BlockingQueue<ConsumerRecord<String, JsonObject>> createListener(final String topic) throws Exception {
final Map<String, Object> consumerProps = new HashMap<>();
consumerProps.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, embeddedKafka.getBrokersAsString());
consumerProps.put(ConsumerConfig.GROUP_ID_CONFIG, "TestConsumer");
consumerProps.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, true);
consumerProps.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "100");
consumerProps.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 15000);
consumerProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
consumerProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, JsonDeserializer.class);
final DefaultKafkaConsumerFactory<String, JsonObject> cf = new DefaultKafkaConsumerFactory<>(consumerProps);
cf.setValueDeserializer(new JsonDeserializer<>(JsonObject.class));
final KafkaMessageListenerContainer<String, JsonObject> container = new KafkaMessageListenerContainer<>(cf, new ContainerProperties(topic));
final BlockingQueue<ConsumerRecord<String, JsonObject>> records = new LinkedBlockingQueue<>();
container.setupMessageListener((MessageListener<String, JsonObject>) records::add);
container.setBeanName("TestListener");
container.start();
ContainerTestUtils.waitForAssignment(container, embeddedKafka.getPartitionsPerTopic());
return records;
}
private void sendAndVerifyMessage(final BlockingQueue<ConsumerRecord<String, JsonObject>> records,
final KafkaTemplate<String, JsonObject> template,
final String key, final JsonObject value,
final long expectedOffset) throws InterruptedException, ExecutionException, TimeoutException {
final ListenableFuture<SendResult<String, JsonObject>> future = template.send(TOPIC1, key, value);
final ConsumerRecord<String, JsonObject> record = records.poll(5, TimeUnit.SECONDS);
assertThat(record, hasKey(key));
assertThat(record, hasValue(value));
assertEquals(expectedOffset, future.get(5, TimeUnit.SECONDS).getRecordMetadata().offset());
}
public static final class JsonObject {
private String value;
public JsonObject() {}
JsonObject(final String value) {
this.value = value;
}
public String getValue() {
return value;
}
public void setValue(final String value) {
this.value = value;
}
@Override
public boolean equals(final Object o) {
if (this == o) { return true; }
if (o == null || getClass() != o.getClass()) { return false; }
final JsonObject that = (JsonObject) o;
return Objects.equals(value, that.value);
}
@Override
public int hashCode() {
return Objects.hash(value);
}
@Override
public String toString() {
return "JsonObject{" +
"value='" + value + '\'' +
'}';
}
}
}
I have a solution, but I don't know if it's the best one. I extended JsonDeserializer as follows, which results in a null value being consumed by spring-kafka and requires the necessary downstream changes to handle that case.
class SafeJsonDeserializer[A >: Null](targetType: Class[A], objectMapper: ObjectMapper) extends JsonDeserializer[A](targetType, objectMapper) with Logging {
  override def deserialize(topic: String, data: Array[Byte]): A = try {
    super.deserialize(topic, data)
  } catch {
    case e: Exception =>
      logger.error("Failed to deserialize data [%s] from topic [%s]".format(new String(data), topic), e)
      null
  }
}
Starting with spring-kafka 2.x.x, we now have the option of declaring beans in the config class for the interface KafkaListenerErrorHandler, with an implementation such as:
@Bean
public ConsumerAwareListenerErrorHandler listen3ErrorHandler() {
    return (m, e, c) -> {
        this.listen3Exception = e;
        MessageHeaders headers = m.getHeaders();
        c.seek(new org.apache.kafka.common.TopicPartition(
                headers.get(KafkaHeaders.RECEIVED_TOPIC, String.class),
                headers.get(KafkaHeaders.RECEIVED_PARTITION_ID, Integer.class)),
                headers.get(KafkaHeaders.OFFSET, Long.class));
        return null;
    };
}
More resources can be found at https://docs.spring.io/spring-kafka/reference/htmlsingle/#annotation-error-handling. There are also other links with similar issues: Spring Kafka error handling - v1.1.x and How to handle SerializationException after deserialization.
Use ErrorHandlingDeserializer2. This is a delegating key/value deserializer that catches exceptions, returning them in the headers as serialized java objects.
Under consumer configuration, add/update the below lines:
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.springframework.kafka.support.serializer.ErrorHandlingDeserializer2
configProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
classOf[ErrorHandlingDeserializer2[JsonDeserializer]].getName)
configProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, classOf[ErrorHandlingDeserializer2[StringDeserializer]].getName)
configProps.put(ErrorHandlingDeserializer2.KEY_DESERIALIZER_CLASS, classOf[StringDeserializer].getName)
configProps.put(ErrorHandlingDeserializer2.VALUE_DESERIALIZER_CLASS, classOf[JsonDeserializer].getName)
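For reference, a rough Java equivalent of that consumer configuration. This is only a sketch: the helper class, method name and the com.example.JsonObject target type are made up for illustration, and it assumes a spring-kafka version (2.2+) where ErrorHandlingDeserializer2 and JsonDeserializer.VALUE_DEFAULT_TYPE exist.

import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.kafka.support.serializer.ErrorHandlingDeserializer2;
import org.springframework.kafka.support.serializer.JsonDeserializer;

public class ErrorTolerantConsumerConfig {

    /** Consumer properties that survive garbage records instead of looping on them. */
    public static Map<String, Object> consumerProps(String bootstrapServers) {
        Map<String, Object> configProps = new HashMap<>();
        configProps.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
        // The consumer instantiates ErrorHandlingDeserializer2, which delegates to the
        // real deserializers configured below and converts deserialization failures
        // into record headers instead of repeatedly failing the poll loop.
        configProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ErrorHandlingDeserializer2.class);
        configProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ErrorHandlingDeserializer2.class);
        configProps.put(ErrorHandlingDeserializer2.KEY_DESERIALIZER_CLASS, StringDeserializer.class);
        configProps.put(ErrorHandlingDeserializer2.VALUE_DESERIALIZER_CLASS, JsonDeserializer.class);
        // Tell the delegated JsonDeserializer which payload type to bind to
        // (depending on the payload's package, JsonDeserializer.TRUSTED_PACKAGES may also need setting).
        configProps.put(JsonDeserializer.VALUE_DEFAULT_TYPE, "com.example.JsonObject");
        return configProps;
    }
}

Records that fail deserialization then arrive with a null value and a header describing the failure, rather than blocking the listener container.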

How to serialize a Predicate<T> from the Nashorn engine in Java 8

How can I serialize a predicate obtained from the Java ScriptEngine Nashorn? Or how can I cast jdk.nashorn.javaadapters.java.util.function.Predicate to Serializable?
Here is the case:
I have this class
import java.io.Serializable;
import java.util.function.Predicate;

public class Filter implements Serializable {
    private Predicate<Object> filter;

    public Predicate<Object> getFilter() {
        return filter;
    }

    public void setFilter(Predicate<Object> filter) {
        this.filter = filter;
    }

    public boolean evaluate(int value) {
        return filter.test(value);
    }
}
and
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.function.Predicate;

import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;

public class TestFilterSer {
    public static void main(String[] args) throws ScriptException {
        Filter f = new Filter();
        // This works
        //f.setFilter(getCastedPred());
        // But I want this to work
        f.setFilter(getScriptEnginePred());
        System.out.println(f.evaluate(6));
        try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(new File("pred.ser")))) {
            oos.writeObject(f);
        } catch (IOException e) {
            e.printStackTrace();
        }
        f = null;
        try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(new File("pred.ser")))) {
            f = (Filter) ois.readObject();
        } catch (IOException | ClassNotFoundException e) {
            e.printStackTrace();
        }
        System.out.println(f.evaluate(7));
    }

    public static Predicate<Object> getCastedPred() {
        Predicate<Object> isEven = (Predicate<Object> & Serializable) (i) -> (Integer) i % 2 == 0;
        return isEven;
    }

    public static Predicate<Object> getScriptEnginePred() throws ScriptException {
        ScriptEngine engine = new ScriptEngineManager().getEngineByName("nashorn");
        Predicate<Object> p = (Predicate<Object> & Serializable) engine.eval(
                String.format("new java.util.function.Predicate(%s)", "function(i) i%2==0")
        );
        return p;
    }
}
Requirement: to be able to serialize the Predicate obtained from the Nashorn engine.
Observation: when I get the Predicate from the method getCastedPred(), it works because it is a plain java.util.function.Predicate; it does serialize after casting to Serializable. But when I get the Predicate from the Nashorn engine, it internally returns a jdk.nashorn.javaadapters.java.util.function.Predicate; this one doesn't serialize, and casting it to Serializable doesn't work.
Any idea how I can serialize this type of Predicate?
The problem is your API uses Predicate, not AggregateFilter. So the target type for the lambda in
setAggregatePredicate(x -> true)
will be Predicate, not AggregateFilter -- and the compiler won't know to make it serializable. If you change your API to use the more specific functional interface, serializable lambdas will be generated.
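A minimal sketch of that idea (the SerializablePredicate name and demo class are illustrative, not from the original post): declare a functional interface that extends both Predicate and Serializable, and make it the parameter type of your API, so lambdas targeting it are compiled as serializable.

import java.io.Serializable;
import java.util.function.Predicate;

public class SerializablePredicateDemo {

    // A functional interface that is both a Predicate and Serializable.
    @FunctionalInterface
    interface SerializablePredicate<T> extends Predicate<T>, Serializable {
    }

    // Because the parameter type is SerializablePredicate, the compiler
    // generates a serializable lambda at the call site.
    static void setFilter(SerializablePredicate<Object> filter) {
        // store or serialize the predicate here
    }

    public static void main(String[] args) {
        setFilter(i -> (Integer) i % 2 == 0); // serializable lambda
    }
}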

PIG UDF to convert tuple to multiple tuple output

I am new to Pig and I am trying to create a UDF which gets a tuple and returns multiple values based on a delimiter. So I have written one UDF to read the below data file:
2012/01/01 Name1 Category1|Category2|Category3
2012/01/01 Name2 Category2|Category3
2012/01/01 Name3 Category1|Category5
Basically I am trying to read the $2 field:
Category1|Category2|Category3
Category2|Category3
Category1|Category5
to get the output as:
Category1, Category2, Category3
Category2, Category3
Category1, Category5
Below is the UDF code I have written:
package com.test.multipleTuple;

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class TupleToMultipleTuple extends EvalFunc<String> {

    @Override
    public String exec(Tuple input) throws IOException {
        // Collect each delimited token of the first field into an auxiliary tuple.
        Tuple aux = TupleFactory.getInstance().newTuple();
        if (input == null || input.size() == 0)
            return null;
        try {
            String del = "\\|";
            String str = (String) input.get(0);
            String field[] = str.split(del);
            for (String nxt : field) {
                aux.append(nxt.trim().toString());
            }
        } catch (Exception e) {
            throw new IOException("Caught exception processing input row ", e);
        }
        return aux.toDelimitedString(",");
    }
}
created Jar --> TupleToMultipleTuple.jar
But I am getting the below error while executing it.
Pig Stack Trace
---------------
ERROR 1066: Unable to open iterator for alias B
org.apache.pig.impl.logicalLayer.FrontendException: ERROR 1066: Unable to open iterator for alias B
at org.apache.pig.PigServer.openIterator(PigServer.java:892)
at org.apache.pig.tools.grunt.GruntParser.processDump(GruntParser.java:774)
at org.apache.pig.tools.pigscript.parser.PigScriptParser.parse(PigScriptParser.java:372)
at org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:198)
at org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:173)
at org.apache.pig.tools.grunt.Grunt.run(Grunt.java:69)
at org.apache.pig.Main.run(Main.java:547)
at org.apache.pig.Main.main(Main.java:158)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
Caused by: java.io.IOException: Job terminated with anomalous status FAILED
at org.apache.pig.PigServer.openIterator(PigServer.java:884)
... 13 more
Can you please help me in rectifying the issue. Thanks.
Pig script for applying the UDF:
REGISTER TupleToMultipleTuple.jar;
DEFINE myFunc com.test.multipleTuple.TupleToMultipleTuple();
A = load 'data.txt' USING PigStorage(' ');
B = foreach A generate myFunc($2);
dump B;
You can use the built-in STRSPLIT function like this:
flatten(STRSPLIT($2, '[|]', 3)) as (cat1:chararray, cat2:chararray, cat3:chararray)
and you will get three fields named cat1, cat2 and cat3, typed as chararray and delimited by the current delimiter of the relation they belong to.
Found the issue. The problem was in converting a DataByteArray to String; I used toString() to fix it:
package com.test.multipleTuple;

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class TupleToMultipleTuple extends EvalFunc<String> {

    @Override
    public String exec(Tuple input) throws IOException {
        // Collect each delimited token of the first field into an auxiliary tuple.
        Tuple aux = TupleFactory.getInstance().newTuple();
        if (input == null || input.size() == 0)
            return null;
        try {
            String del = "\\|";
            String str = (String) input.get(0).toString();
            String field[] = str.split(del);
            for (String nxt : field) {
                aux.append(nxt.trim().toString());
            }
        } catch (Exception e) {
            throw new IOException("Caught exception processing input row ", e);
        }
        return aux.toDelimitedString(",");
    }
}

Why does Hibernate HQL distinct cause an SQL distinct on left join?

I've got this test HQL:
select distinct o from Order o left join fetch o.lineItems
and it generates an SQL distinct for no obvious reason:
select distinct order0_.id as id61_0_, orderline1_.order_id as order1_62_1_...
The SQL resultset is always the same (with and without an SQL distinct):
order id | order name | orderline id | orderline name
---------+------------+--------------+---------------
1 | foo | 1 | foo item
1 | foo | 2 | bar item
1 | foo | 3 | test item
2 | empty | NULL | NULL
3 | bar | 4 | qwerty item
3 | bar | 5 | asdfgh item
Why does hibernate generate the SQL distinct? The SQL distinct doesn't make any sense and makes the query slower than needed.
This is contrary to the FAQ which mentions that hql distinct in this case is just a shortcut for the result transformer:
session.createQuery("select distinct o
from Order o left join fetch
o.lineItems").list();
It looks like you are using the SQL DISTINCT keyword here. Of course, this is not SQL, this is HQL. This distinct is just a shortcut for the result transformer, in this case. Yes, in other cases an HQL distinct will translate straight into a SQL DISTINCT. Not in this case: you can not filter out duplicates at the SQL level, the very nature of a product/join forbids this - you want the duplicates or you don't get all the data you need.
thanks
Have a closer look at the sql statement that hibernate generates - yes it does use the "distinct" keyword but not in the way I think you are expecting it to (or the way that the Hibernate FAQ is implying) i.e. to return a set of "distinct" or "unique" orders.
It doesn't use the distinct keyword to return distinct orders, as that wouldn't make sense in that SQL query, considering the join that you have also specified.
The resulting sql set still needs processing by the ResultTransformer, as clearly the sql set contains duplicate orders. That's why they say that the HQL distinct keyword doesn't directly map to the SQL distinct keyword.
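In other words, the deduplication happens in memory after the SQL runs. A sketch that makes the same thing explicit with the classic org.hibernate.Query API (assuming the Order entity from the question and a Hibernate 4.x-era API; DISTINCT_ROOT_ENTITY is the stock root-entity transformer):

import java.util.List;

import org.hibernate.Criteria;
import org.hibernate.Query;
import org.hibernate.Session;

public class DistinctOrdersExample {

    /** Loads each Order once, deduplicating the join-fetch rows in memory rather than in SQL. */
    public static List<?> loadOrders(Session session) {
        // No "distinct" in the HQL: the SQL result set still contains one row per line item,
        // and the transformer collapses them to unique root entities after the query runs.
        Query query = session.createQuery("select o from Order o left join fetch o.lineItems");
        query.setResultTransformer(Criteria.DISTINCT_ROOT_ENTITY);
        return query.list();
    }
}

The generated SQL then has no DISTINCT, and each Order still comes back exactly once.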
I had the exact same problem and I think this is a Hibernate issue (not a bug, because the code doesn't fail). However, I have to dig deeper to make sure it's an issue.
Hibernate (at least in version 4, which is the version I'm using on my project, specifically 4.3.11) uses the concept of an SPI; long story short, it's like an API to extend or modify the framework.
I took advantage of this feature to replace the classes org.hibernate.hql.internal.ast.ASTQueryTranslatorFactory (This class is called by Hibernate and delegates the job of generating the SQL query) and org.hibernate.hql.internal.ast.QueryTranslatorImpl (This is sort of an internal class which is called by org.hibernate.hql.internal.ast.ASTQueryTranslatorFactory and generates the actual SQL query). I did it as follows:
Replacement for org.hibernate.hql.internal.ast.ASTQueryTranslatorFactory:
package org.hibernate.hql.internal.ast;

import java.util.Map;

import org.hibernate.engine.query.spi.EntityGraphQueryHint;
import org.hibernate.engine.spi.SessionFactoryImplementor;
import org.hibernate.hql.spi.QueryTranslator;

public class NoDistinctInSQLASTQueryTranslatorFactory extends ASTQueryTranslatorFactory {

    @Override
    public QueryTranslator createQueryTranslator(String queryIdentifier, String queryString, Map filters, SessionFactoryImplementor factory, EntityGraphQueryHint entityGraphQueryHint) {
        return new NoDistinctInSQLQueryTranslatorImpl(queryIdentifier, queryString, filters, factory, entityGraphQueryHint);
    }
}
Replacement for org.hibernate.hql.internal.ast.QueryTranslatorImpl:
package org.hibernate.hql.internal.ast;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.hibernate.HibernateException;
import org.hibernate.MappingException;
import org.hibernate.QueryException;
import org.hibernate.ScrollableResults;
import org.hibernate.engine.query.spi.EntityGraphQueryHint;
import org.hibernate.engine.spi.QueryParameters;
import org.hibernate.engine.spi.RowSelection;
import org.hibernate.engine.spi.SessionFactoryImplementor;
import org.hibernate.engine.spi.SessionImplementor;
import org.hibernate.event.spi.EventSource;
import org.hibernate.hql.internal.QueryExecutionRequestException;
import org.hibernate.hql.internal.antlr.HqlSqlTokenTypes;
import org.hibernate.hql.internal.antlr.HqlTokenTypes;
import org.hibernate.hql.internal.antlr.SqlTokenTypes;
import org.hibernate.hql.internal.ast.exec.BasicExecutor;
import org.hibernate.hql.internal.ast.exec.DeleteExecutor;
import org.hibernate.hql.internal.ast.exec.MultiTableDeleteExecutor;
import org.hibernate.hql.internal.ast.exec.MultiTableUpdateExecutor;
import org.hibernate.hql.internal.ast.exec.StatementExecutor;
import org.hibernate.hql.internal.ast.tree.AggregatedSelectExpression;
import org.hibernate.hql.internal.ast.tree.FromElement;
import org.hibernate.hql.internal.ast.tree.InsertStatement;
import org.hibernate.hql.internal.ast.tree.QueryNode;
import org.hibernate.hql.internal.ast.tree.Statement;
import org.hibernate.hql.internal.ast.util.ASTPrinter;
import org.hibernate.hql.internal.ast.util.ASTUtil;
import org.hibernate.hql.internal.ast.util.NodeTraverser;
import org.hibernate.hql.spi.FilterTranslator;
import org.hibernate.hql.spi.ParameterTranslations;
import org.hibernate.internal.CoreMessageLogger;
import org.hibernate.internal.util.ReflectHelper;
import org.hibernate.internal.util.StringHelper;
import org.hibernate.internal.util.collections.IdentitySet;
import org.hibernate.loader.hql.QueryLoader;
import org.hibernate.param.ParameterSpecification;
import org.hibernate.persister.entity.Queryable;
import org.hibernate.type.Type;
import org.jboss.logging.Logger;
import antlr.ANTLRException;
import antlr.RecognitionException;
import antlr.TokenStreamException;
import antlr.collections.AST;
/**
* A QueryTranslator that uses an Antlr-based parser.
*
* @author Joshua Davis (pgmjsd@sourceforge.net)
*/
public class NoDistinctInSQLQueryTranslatorImpl extends QueryTranslatorImpl implements FilterTranslator {
private static final CoreMessageLogger LOG = Logger.getMessageLogger(
CoreMessageLogger.class,
QueryTranslatorImpl.class.getName()
);
private SessionFactoryImplementor factory;
private final String queryIdentifier;
private String hql;
private boolean shallowQuery;
private Map tokenReplacements;
//TODO:this is only needed during compilation .. can we eliminate the instvar?
private Map enabledFilters;
private boolean compiled;
private QueryLoader queryLoader;
private StatementExecutor statementExecutor;
private Statement sqlAst;
private String sql;
private ParameterTranslations paramTranslations;
private List<ParameterSpecification> collectedParameterSpecifications;
private EntityGraphQueryHint entityGraphQueryHint;
/**
* Creates a new AST-based query translator.
*
* @param queryIdentifier The query-identifier (used in stats collection)
* @param query The hql query to translate
* @param enabledFilters Currently enabled filters
* @param factory The session factory constructing this translator instance.
*/
public NoDistinctInSQLQueryTranslatorImpl(
String queryIdentifier,
String query,
Map enabledFilters,
SessionFactoryImplementor factory) {
super(queryIdentifier, query, enabledFilters, factory);
this.queryIdentifier = queryIdentifier;
this.hql = query;
this.compiled = false;
this.shallowQuery = false;
this.enabledFilters = enabledFilters;
this.factory = factory;
}
public NoDistinctInSQLQueryTranslatorImpl(
String queryIdentifier,
String query,
Map enabledFilters,
SessionFactoryImplementor factory,
EntityGraphQueryHint entityGraphQueryHint) {
this(queryIdentifier, query, enabledFilters, factory);
this.entityGraphQueryHint = entityGraphQueryHint;
}
/**
* Compile a "normal" query. This method may be called multiple times.
* Subsequent invocations are no-ops.
*
* @param replacements Defined query substitutions.
* @param shallow Does this represent a shallow (scalar or entity-id)
* select?
* @throws QueryException There was a problem parsing the query string.
* @throws MappingException There was a problem querying defined mappings.
*/
@Override
public void compile(
Map replacements,
boolean shallow) throws QueryException, MappingException {
doCompile(replacements, shallow, null);
}
/**
* Compile a filter. This method may be called multiple times. Subsequent
* invocations are no-ops.
*
* @param collectionRole the role name of the collection used as the basis
* for the filter.
* @param replacements Defined query substitutions.
* @param shallow Does this represent a shallow (scalar or entity-id)
* select?
* @throws QueryException There was a problem parsing the query string.
* @throws MappingException There was a problem querying defined mappings.
*/
@Override
public void compile(
String collectionRole,
Map replacements,
boolean shallow) throws QueryException, MappingException {
doCompile(replacements, shallow, collectionRole);
}
/**
* Performs both filter and non-filter compiling.
*
* @param replacements Defined query substitutions.
* @param shallow Does this represent a shallow (scalar or entity-id)
* select?
* @param collectionRole the role name of the collection used as the basis
* for the filter, NULL if this is not a filter.
*/
private synchronized void doCompile(Map replacements, boolean shallow, String collectionRole) {
// If the query is already compiled, skip the compilation.
if (compiled) {
LOG.debug("compile() : The query is already compiled, skipping...");
return;
}
// Remember the parameters for the compilation.
this.tokenReplacements = replacements;
if (tokenReplacements == null) {
tokenReplacements = new HashMap();
}
this.shallowQuery = shallow;
try {
// PHASE 1 : Parse the HQL into an AST.
final HqlParser parser = parse(true);
// PHASE 2 : Analyze the HQL AST, and produce an SQL AST.
final HqlSqlWalker w = analyze(parser, collectionRole);
sqlAst = (Statement) w.getAST();
// at some point the generate phase needs to be moved out of here,
// because a single object-level DML might spawn multiple SQL DML
// command executions.
//
// Possible to just move the sql generation for dml stuff, but for
// consistency-sake probably best to just move responsiblity for
// the generation phase completely into the delegates
// (QueryLoader/StatementExecutor) themselves. Also, not sure why
// QueryLoader currently even has a dependency on this at all; does
// it need it? Ideally like to see the walker itself given to the delegates directly...
if (sqlAst.needsExecutor()) {
statementExecutor = buildAppropriateStatementExecutor(w);
} else {
// PHASE 3 : Generate the SQL.
generate((QueryNode) sqlAst);
queryLoader = new QueryLoader(this, factory, w.getSelectClause());
}
compiled = true;
} catch (QueryException qe) {
if (qe.getQueryString() == null) {
throw qe.wrapWithQueryString(hql);
} else {
throw qe;
}
} catch (RecognitionException e) {
// we do not actually propagate ANTLRExceptions as a cause, so
// log it here for diagnostic purposes
LOG.trace("Converted antlr.RecognitionException", e);
throw QuerySyntaxException.convert(e, hql);
} catch (ANTLRException e) {
// we do not actually propagate ANTLRExceptions as a cause, so
// log it here for diagnostic purposes
LOG.trace("Converted antlr.ANTLRException", e);
throw new QueryException(e.getMessage(), hql);
}
//only needed during compilation phase...
this.enabledFilters = null;
}
private void generate(AST sqlAst) throws QueryException, RecognitionException {
if (sql == null) {
final SqlGenerator gen = new SqlGenerator(factory);
gen.statement(sqlAst);
sql = gen.getSQL();
//Hack: The distinct operator is removed from the sql
//string to avoid executing a distinct query in the db server when
//the distinct is used in hql.
sql = sql.replace("distinct", "");
if (LOG.isDebugEnabled()) {
LOG.debugf("HQL: %s", hql);
LOG.debugf("SQL: %s", sql);
}
gen.getParseErrorHandler().throwQueryException();
collectedParameterSpecifications = gen.getCollectedParameters();
}
}
private static final ASTPrinter SQL_TOKEN_PRINTER = new ASTPrinter(SqlTokenTypes.class);
private HqlSqlWalker analyze(HqlParser parser, String collectionRole) throws QueryException, RecognitionException {
final HqlSqlWalker w = new HqlSqlWalker(this, factory, parser, tokenReplacements, collectionRole);
final AST hqlAst = parser.getAST();
// Transform the tree.
w.statement(hqlAst);
if (LOG.isDebugEnabled()) {
LOG.debug(SQL_TOKEN_PRINTER.showAsString(w.getAST(), "--- SQL AST ---"));
}
w.getParseErrorHandler().throwQueryException();
return w;
}
private HqlParser parse(boolean filter) throws TokenStreamException, RecognitionException {
// Parse the query string into an HQL AST.
final HqlParser parser = HqlParser.getInstance(hql);
parser.setFilter(filter);
LOG.debugf("parse() - HQL: %s", hql);
parser.statement();
final AST hqlAst = parser.getAST();
final NodeTraverser walker = new NodeTraverser(new JavaConstantConverter());
walker.traverseDepthFirst(hqlAst);
showHqlAst(hqlAst);
parser.getParseErrorHandler().throwQueryException();
return parser;
}
private static final ASTPrinter HQL_TOKEN_PRINTER = new ASTPrinter(HqlTokenTypes.class);
@Override
void showHqlAst(AST hqlAst) {
if (LOG.isDebugEnabled()) {
LOG.debug(HQL_TOKEN_PRINTER.showAsString(hqlAst, "--- HQL AST ---"));
}
}
private void errorIfDML() throws HibernateException {
if (sqlAst.needsExecutor()) {
throw new QueryExecutionRequestException("Not supported for DML operations", hql);
}
}
private void errorIfSelect() throws HibernateException {
if (!sqlAst.needsExecutor()) {
throw new QueryExecutionRequestException("Not supported for select queries", hql);
}
}
@Override
public String getQueryIdentifier() {
return queryIdentifier;
}
@Override
public Statement getSqlAST() {
return sqlAst;
}
private HqlSqlWalker getWalker() {
return sqlAst.getWalker();
}
/**
* Types of the return values of an <tt>iterate()</tt> style query.
*
* @return an array of <tt>Type</tt>s.
*/
@Override
public Type[] getReturnTypes() {
errorIfDML();
return getWalker().getReturnTypes();
}
@Override
public String[] getReturnAliases() {
errorIfDML();
return getWalker().getReturnAliases();
}
@Override
public String[][] getColumnNames() {
errorIfDML();
return getWalker().getSelectClause().getColumnNames();
}
@Override
public Set<Serializable> getQuerySpaces() {
return getWalker().getQuerySpaces();
}
@Override
public List list(SessionImplementor session, QueryParameters queryParameters)
throws HibernateException {
// Delegate to the QueryLoader...
errorIfDML();
final QueryNode query = (QueryNode) sqlAst;
final boolean hasLimit = queryParameters.getRowSelection() != null && queryParameters.getRowSelection().definesLimits();
final boolean needsDistincting = (query.getSelectClause().isDistinct() || hasLimit) && containsCollectionFetches();
QueryParameters queryParametersToUse;
if (hasLimit && containsCollectionFetches()) {
LOG.firstOrMaxResultsSpecifiedWithCollectionFetch();
RowSelection selection = new RowSelection();
selection.setFetchSize(queryParameters.getRowSelection().getFetchSize());
selection.setTimeout(queryParameters.getRowSelection().getTimeout());
queryParametersToUse = queryParameters.createCopyUsing(selection);
} else {
queryParametersToUse = queryParameters;
}
List results = queryLoader.list(session, queryParametersToUse);
if (needsDistincting) {
int includedCount = -1;
// NOTE : firstRow is zero-based
int first = !hasLimit || queryParameters.getRowSelection().getFirstRow() == null
? 0
: queryParameters.getRowSelection().getFirstRow();
int max = !hasLimit || queryParameters.getRowSelection().getMaxRows() == null
? -1
: queryParameters.getRowSelection().getMaxRows();
List tmp = new ArrayList();
IdentitySet distinction = new IdentitySet();
for (final Object result : results) {
if (!distinction.add(result)) {
continue;
}
includedCount++;
if (includedCount < first) {
continue;
}
tmp.add(result);
// NOTE : ( max - 1 ) because first is zero-based while max is not...
if (max >= 0 && (includedCount - first) >= (max - 1)) {
break;
}
}
results = tmp;
}
return results;
}
/**
* Return the query results as an iterator
*/
@Override
public Iterator iterate(QueryParameters queryParameters, EventSource session)
throws HibernateException {
// Delegate to the QueryLoader...
errorIfDML();
return queryLoader.iterate(queryParameters, session);
}
/**
* Return the query results, as an instance of <tt>ScrollableResults</tt>
*/
@Override
public ScrollableResults scroll(QueryParameters queryParameters, SessionImplementor session)
throws HibernateException {
// Delegate to the QueryLoader...
errorIfDML();
return queryLoader.scroll(queryParameters, session);
}
@Override
public int executeUpdate(QueryParameters queryParameters, SessionImplementor session)
throws HibernateException {
errorIfSelect();
return statementExecutor.execute(queryParameters, session);
}
/**
* The SQL query string to be called; implemented by all subclasses
*/
@Override
public String getSQLString() {
return sql;
}
@Override
public List<String> collectSqlStrings() {
ArrayList<String> list = new ArrayList<>();
if (isManipulationStatement()) {
String[] sqlStatements = statementExecutor.getSqlStatements();
Collections.addAll(list, sqlStatements);
} else {
list.add(sql);
}
return list;
}
// -- Package local methods for the QueryLoader delegate --
@Override
public boolean isShallowQuery() {
return shallowQuery;
}
@Override
public String getQueryString() {
return hql;
}
@Override
public Map getEnabledFilters() {
return enabledFilters;
}
@Override
public int[] getNamedParameterLocs(String name) {
return getWalker().getNamedParameterLocations(name);
}
@Override
public boolean containsCollectionFetches() {
errorIfDML();
List collectionFetches = ((QueryNode) sqlAst).getFromClause().getCollectionFetches();
return collectionFetches != null && collectionFetches.size() > 0;
}
@Override
public boolean isManipulationStatement() {
return sqlAst.needsExecutor();
}
@Override
public void validateScrollability() throws HibernateException {
// Impl Note: allows multiple collection fetches as long as the
// entire fecthed graph still "points back" to a single
// root entity for return
errorIfDML();
final QueryNode query = (QueryNode) sqlAst;
// If there are no collection fetches, then no further checks are needed
List collectionFetches = query.getFromClause().getCollectionFetches();
if (collectionFetches.isEmpty()) {
return;
}
// A shallow query is ok (although technically there should be no fetching here...)
if (isShallowQuery()) {
return;
}
// Otherwise, we have a non-scalar select with defined collection fetch(es).
// Make sure that there is only a single root entity in the return (no tuples)
if (getReturnTypes().length > 1) {
throw new HibernateException("cannot scroll with collection fetches and returned tuples");
}
FromElement owner = null;
for (Object o : query.getSelectClause().getFromElementsForLoad()) {
// should be the first, but just to be safe...
final FromElement fromElement = (FromElement) o;
if (fromElement.getOrigin() == null) {
owner = fromElement;
break;
}
}
if (owner == null) {
throw new HibernateException("unable to locate collection fetch(es) owner for scrollability checks");
}
// This is not strictly true. We actually just need to make sure that
// it is ordered by root-entity PK and that that order-by comes before
// any non-root-entity ordering...
AST primaryOrdering = query.getOrderByClause().getFirstChild();
if (primaryOrdering != null) {
// TODO : this is a bit dodgy, come up with a better way to check this (plus see above comment)
String[] idColNames = owner.getQueryable().getIdentifierColumnNames();
String expectedPrimaryOrderSeq = StringHelper.join(
", ",
StringHelper.qualify(owner.getTableAlias(), idColNames)
);
if (!primaryOrdering.getText().startsWith(expectedPrimaryOrderSeq)) {
throw new HibernateException("cannot scroll results with collection fetches which are not ordered primarily by the root entity's PK");
}
}
}
private StatementExecutor buildAppropriateStatementExecutor(HqlSqlWalker walker) {
final Statement statement = (Statement) walker.getAST();
switch (walker.getStatementType()) {
case HqlSqlTokenTypes.DELETE: {
final FromElement fromElement = walker.getFinalFromClause().getFromElement();
final Queryable persister = fromElement.getQueryable();
if (persister.isMultiTable()) {
return new MultiTableDeleteExecutor(walker);
} else {
return new DeleteExecutor(walker, persister);
}
}
case HqlSqlTokenTypes.UPDATE: {
final FromElement fromElement = walker.getFinalFromClause().getFromElement();
final Queryable persister = fromElement.getQueryable();
if (persister.isMultiTable()) {
// even here, if only properties mapped to the "base table" are referenced
// in the set and where clauses, this could be handled by the BasicDelegate.
// TODO : decide if it is better performance-wise to doAfterTransactionCompletion that check, or to simply use the MultiTableUpdateDelegate
return new MultiTableUpdateExecutor(walker);
} else {
return new BasicExecutor(walker, persister);
}
}
case HqlSqlTokenTypes.INSERT:
return new BasicExecutor(walker, ((InsertStatement) statement).getIntoClause().getQueryable());
default:
throw new QueryException("Unexpected statement type");
}
}
@Override
public ParameterTranslations getParameterTranslations() {
if (paramTranslations == null) {
paramTranslations = new ParameterTranslationsImpl(getWalker().getParameters());
}
return paramTranslations;
}
@Override
public List<ParameterSpecification> getCollectedParameterSpecifications() {
return collectedParameterSpecifications;
}
@Override
public Class getDynamicInstantiationResultType() {
AggregatedSelectExpression aggregation = queryLoader.getAggregatedSelectExpression();
return aggregation == null ? null : aggregation.getAggregationResultType();
}
public static class JavaConstantConverter implements NodeTraverser.VisitationStrategy {
private AST dotRoot;
@Override
public void visit(AST node) {
if (dotRoot != null) {
// we are already processing a dot-structure
if (ASTUtil.isSubtreeChild(dotRoot, node)) {
return;
}
// we are now at a new tree level
dotRoot = null;
}
if (node.getType() == HqlTokenTypes.DOT) {
dotRoot = node;
handleDotStructure(dotRoot);
}
}
private void handleDotStructure(AST dotStructureRoot) {
final String expression = ASTUtil.getPathText(dotStructureRoot);
final Object constant = ReflectHelper.getConstantValue(expression);
if (constant != null) {
dotStructureRoot.setFirstChild(null);
dotStructureRoot.setType(HqlTokenTypes.JAVA_CONSTANT);
dotStructureRoot.setText(expression);
}
}
}
@Override
public EntityGraphQueryHint getEntityGraphQueryHint() {
return entityGraphQueryHint;
}
@Override
public void setEntityGraphQueryHint(EntityGraphQueryHint entityGraphQueryHint) {
this.entityGraphQueryHint = entityGraphQueryHint;
}
}
If you follow the code flow you will notice that I just modified the method private void generate(AST sqlAst) throws QueryException, RecognitionException and added the following lines:
//Hack: The distinct keyword is removed from the sql string to
//avoid executing a distinct query in the DBMS when the distinct
//is used in hql.
sql = sql.replace("distinct", "");
What I do with this code is to remove the distinct keyword from the generated SQL query.
After creating the classes above, I added the following line in the hibernate configuration file:
<property name="hibernate.query.factory_class">org.hibernate.hql.internal.ast.NoDistinctInSQLASTQueryTranslatorFactory</property>
This line tells hibernate to use my custom class to parse HQL queries and generate SQL queries without the distinct keyword. Notice I created my custom classes in the same package where the original HQL parser resides.