Accessing TableRow columns in BigQuery Apache Beam - google-bigquery

I am trying to:
1. Read JSON events from Cloud Pub/Sub.
2. Load the events from Cloud Pub/Sub to BigQuery every 15 minutes using file loads to save cost on streaming inserts.
3. Route each event to a destination based on the "user_id" and "campaign_id" fields in the JSON event: "user_id" will be the dataset name and "campaign_id" will be the table name. The partition name comes from the event timestamp.
4. The schema for all tables stays the same.
I am new to Java and Beam. I think my code mostly does what I am trying to do, and I just need a little help here.
But I am unable to access the "campaign_id" and "user_id" fields in the JSON message.
So my events are not being routed to the correct table.
package ...;
import com.google.api.services.bigquery.model.TableSchema;
import javafx.scene.control.TableRow;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations;
import org.apache.beam.sdk.io.gcp.bigquery.TableDestination;
import org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.ValueInSingleWindow;
import org.joda.time.Duration;
import org.joda.time.Instant;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method.FILE_LOADS;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition.WRITE_APPEND;
public class ClickLogConsumer {
private static final int BATCH_INTERVAL_SECS = 15 * 60;
private static final String PROJECT = "pure-app";
public static PTransform<PCollection<String>, PCollection<com.google.api.services.bigquery.model.TableRow>> jsonToTableRow() {
return new JsonToTableRow();
}
private static class JsonToTableRow
extends PTransform<PCollection<String>, PCollection<com.google.api.services.bigquery.model.TableRow>> {
@Override
public PCollection<com.google.api.services.bigquery.model.TableRow> expand(PCollection<String> stringPCollection) {
return stringPCollection.apply("JsonToTableRow", MapElements.<String, com.google.api.services.bigquery.model.TableRow>via(
new SimpleFunction<String, com.google.api.services.bigquery.model.TableRow>() {
@Override
public com.google.api.services.bigquery.model.TableRow apply(String json) {
try {
InputStream inputStream = new ByteArrayInputStream(
json.getBytes(StandardCharsets.UTF_8.name()));
//OUTER is used here to prevent EOF exception
return TableRowJsonCoder.of().decode(inputStream, Coder.Context.OUTER);
} catch (IOException e) {
throw new RuntimeException("Unable to parse input", e);
}
}
}));
}
}
public static void main(String[] args) throws Exception {
Pipeline pipeline = Pipeline.create(options);
pipeline
.apply(PubsubIO.readStrings().withTimestampAttribute("timestamp").fromTopic("projects/pureapp-199410/topics/clicks"))
.apply(jsonToTableRow())
.apply("WriteToBQ",
BigQueryIO.writeTableRows()
.withMethod(FILE_LOADS)
.withWriteDisposition(WRITE_APPEND)
.withCreateDisposition(CREATE_IF_NEEDED)
.withTriggeringFrequency(Duration.standardSeconds(BATCH_INTERVAL_SECS))
.withoutValidation()
.to(new DynamicDestinations<TableRow, String>() {
@Override
public String getDestination(ValueInSingleWindow<TableRow> element) {
String tableName = "campaign_id"; // JSON message in Pub/Sub has "campaign_id" field, how do I access it here?
String datasetName = "user_id"; // JSON message in Pub/Sub has "user_id" field, how do I access it here?
Instant eventTimestamp = element.getTimestamp();
String partition = new SimpleDateFormat("yyyyMMdd").format(eventTimestamp);
return String.format("%s:%s.%s$%s", PROJECT, datasetName, tableName, partition);
}
@Override
public TableDestination getTable(String table) {
return new TableDestination(table, null);
}
@Override
public TableSchema getSchema(String destination) {
return getTableSchema();
}
}));
pipeline.run();
}
}
I arrived at the above code based on reading:
1. https://medium.com/myheritage-engineering/kafka-to-bigquery-load-a-guide-for-streaming-billions-of-daily-events-cbbf31f4b737
2. https://shinesolutions.com/2017/12/05/fun-with-serializable-functions-and-dynamic-destinations-in-cloud-dataflow/
3. https://beam.apache.org/documentation/sdks/javadoc/2.0.0/org/apache/beam/sdk/io/gcp/bigquery/DynamicDestinations.html
4. BigQueryIO - Write performance with streaming and FILE_LOADS
5. Inserting into BigQuery via load jobs (not streaming)
Update
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.api.services.bigquery.model.TimePartitioning;
import com.google.common.collect.ImmutableList;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.TableDestination;
import org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method.FILE_LOADS;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition.WRITE_APPEND;
public class ClickLogConsumer {
private static final int BATCH_INTERVAL_SECS = 15 * 60;
private static final String PROJECT = "pure-app";
public static PTransform<PCollection<String>, PCollection<TableRow>> jsonToTableRow() {
return new JsonToTableRow();
}
private static class JsonToTableRow
extends PTransform<PCollection<String>, PCollection<TableRow>> {
@Override
public PCollection<TableRow> expand(PCollection<String> stringPCollection) {
return stringPCollection.apply("JsonToTableRow", MapElements.<String, com.google.api.services.bigquery.model.TableRow>via(
new SimpleFunction<String, TableRow>() {
@Override
public TableRow apply(String json) {
try {
InputStream inputStream = new ByteArrayInputStream(
json.getBytes(StandardCharsets.UTF_8.name()));
//OUTER is used here to prevent EOF exception
return TableRowJsonCoder.of().decode(inputStream, Coder.Context.OUTER);
} catch (IOException e) {
throw new RuntimeException("Unable to parse input", e);
}
}
}));
}
}
public static void main(String[] args) throws Exception {
Pipeline pipeline = Pipeline.create(options);
pipeline
.apply(PubsubIO.readStrings().withTimestampAttribute("timestamp").fromTopic("projects/pureapp-199410/topics/clicks"))
.apply(jsonToTableRow())
.apply(BigQueryIO.write()
.withTriggeringFrequency(Duration.standardSeconds(BATCH_INTERVAL_SECS))
.withMethod(FILE_LOADS)
.withWriteDisposition(WRITE_APPEND)
.withCreateDisposition(CREATE_IF_NEEDED)
.withSchema(new TableSchema().setFields(
ImmutableList.of(
new TableFieldSchema().setName("timestamp").setType("TIMESTAMP"),
new TableFieldSchema().setName("exchange").setType("STRING"))))
.to((row) -> {
String datasetName = row.getValue().get("user_id").toString();
String tableName = row.getValue().get("campaign_id").toString();
return new TableDestination(String.format("%s:%s.%s", PROJECT, datasetName, tableName), "Some destination");
})
.withTimePartitioning(new TimePartitioning().setField("timestamp")));
pipeline.run();
}
}

How about: String tableName = element.getValue().get("campaign_id").toString() and likewise for the dataset.
Besides, for inserting into time-partitioned tables, I strongly recommend using BigQuery's Column-Based Partitioning, instead of using a partition decorator in the table name. Please see "Loading historical data into time-partitioned BigQuery tables" in the javadoc - you'll need a timestamp column. (note that the javadoc has a typo: "time" vs "timestamp")
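For concreteness, here is a minimal sketch of the getDestination() this suggestion leads to, written against the anonymous DynamicDestinations in your original pipeline (everything else in that class stays as posted); the "user_id" and "campaign_id" field names come from your events:
@Override
public String getDestination(ValueInSingleWindow<com.google.api.services.bigquery.model.TableRow> element) {
    String datasetName = element.getValue().get("user_id").toString();
    String tableName = element.getValue().get("campaign_id").toString();
    // No partition decorator here: with column-based partitioning, BigQuery places
    // each row using its timestamp column instead of a $yyyyMMdd table suffix.
    return String.format("%s:%s.%s", PROJECT, datasetName, tableName);
}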

Related

Error in BigQuery Snippets

I'm new to Dataflow and am trying to get the schema of a BigQuery table dynamically.
I also need to get the name of the destination table dynamically, for which I'm using the dynamic destinations class in BigQueryIO.write().to(). It works if the schema is provided for the destination table before executing the pipeline. But to get the schema dynamically I'm using the BigQuery Snippets, which take a datasetId and tableId as input and return the schema for a given table. The pipeline fails with the errors mentioned below when run with the Snippets.
Any help is appreciated.
Thanks in advance.
Exception in thread "main" java.lang.NoSuchMethodError: com.google.api.client.googleapis.services.json.AbstractGoogleJsonClient$Builder.setBatchPath(Ljava/lang/String;)Lcom/google/api/client/googleapis/services/AbstractGoogleClient$Builder;
at com.google.api.services.bigquery.Bigquery$Builder.setBatchPath(Bigquery.java:3519)
at com.google.api.services.bigquery.Bigquery$Builder.<init>(Bigquery.java:3498)
at org.apache.beam.sdk.io.gcp.bigquery.BigQueryServicesImpl.newBigQueryClient(BigQueryServicesImpl.java:881)
at org.apache.beam.sdk.io.gcp.bigquery.BigQueryServicesImpl.access$200(BigQueryServicesImpl.java:79)
at org.apache.beam.sdk.io.gcp.bigquery.BigQueryServicesImpl$DatasetServiceImpl.<init>(BigQueryServicesImpl.java:388)
at org.apache.beam.sdk.io.gcp.bigquery.BigQueryServicesImpl$DatasetServiceImpl.<init>(BigQueryServicesImpl.java:345)
at org.apache.beam.sdk.io.gcp.bigquery.BigQueryServicesImpl.getDatasetService(BigQueryServicesImpl.java:105)
at org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO$TypedRead.validate(BigQueryIO.java:676)
at org.apache.beam.sdk.Pipeline$ValidateVisitor.enterCompositeTransform(Pipeline.java:640)
at org.apache.beam.sdk.runners.TransformHierarchy$Node.visit(TransformHierarchy.java:656)
at org.apache.beam.sdk.runners.TransformHierarchy$Node.visit(TransformHierarchy.java:660)
at org.apache.beam.sdk.runners.TransformHierarchy$Node.access$600(TransformHierarchy.java:311)
at org.apache.beam.sdk.runners.TransformHierarchy.visit(TransformHierarchy.java:245)
at org.apache.beam.sdk.Pipeline.traverseTopologically(Pipeline.java:458)
at org.apache.beam.sdk.Pipeline.validate(Pipeline.java:575)
at org.apache.beam.sdk.Pipeline.run(Pipeline.java:310)
at org.apache.beam.sdk.Pipeline.run(Pipeline.java:297)
at project2.configTable.main(configTable.java:146)
Code:
package project2;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.beam.runners.dataflow.DataflowRunner;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations;
import org.apache.beam.sdk.io.gcp.bigquery.TableDestination;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.transforms.DoFn.ProcessContext;
import org.apache.beam.sdk.transforms.DoFn.ProcessElement;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.ValueInSingleWindow;
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.Field;
import com.google.cloud.bigquery.FieldList;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.DatasetInfo;
import com.google.cloud.bigquery.Field;
import com.google.cloud.bigquery.FieldValueList;
import com.google.cloud.bigquery.InsertAllRequest;
import com.google.cloud.bigquery.InsertAllResponse;
import com.google.cloud.bigquery.LegacySQLTypeName;
import com.google.cloud.bigquery.QueryJobConfiguration;
import com.google.cloud.bigquery.StandardTableDefinition;
import com.google.cloud.bigquery.TableId;
import com.google.cloud.bigquery.TableInfo;
import java.util.HashMap;
import java.util.Map;
import avro.shaded.com.google.common.collect.ImmutableList;
public class configTable {
public static void main(String[] args) {
// TODO Auto-generated method stub
customInt op=PipelineOptionsFactory.as(customInt.class);
op.setProject("my-new-project");
op.setTempLocation("gs://train-10/projects");
op.setWorkerMachineType("n1-standard-1");
op.setTemplateLocation("gs://train-10/main-template-with-snippets");
op.setRunner(DataflowRunner.class);
org.apache.beam.sdk.Pipeline p=org.apache.beam.sdk.Pipeline.create(op);
PCollection<TableRow> indata=p.apply("Taking side input",BigQueryIO.readTableRows().from("my-new-project:training.config"));
PCollectionView<String> view=indata.apply("Convert to view",ParDo.of(new DoFn<TableRow, String>() {
@ProcessElement
public void processElement(ProcessContext c) {
TableRow row=c.element();
c.output(row.get("file").toString());
}
})).apply(View.asSingleton());
PCollection<TableRow> mainop = p.apply("Taking input",TextIO.read().from(NestedValueProvider.of(op.getInputFile(), new SerializableFunction<String, String>() {
public String apply(String input) {
// TODO Auto-generated method stub
return "gs://train-10/projects/"+input;
}
} ))).apply("Transform",ParDo.of(new DoFn<String, TableRow>() {
@ProcessElement
public void processElement(ProcessContext c ) {
c.output(new TableRow().set("data", c.element()));
}
}));
mainop.apply("Write data",BigQueryIO.writeTableRows().to(new DynamicDestinations<TableRow, String>() {
@Override
public String getDestination(ValueInSingleWindow<TableRow> element) {
// TODO Auto-generated method stub
String d=sideInput(view);
String tablespec="my-new-project:training."+d;
return tablespec;
}
@Override
public List<PCollectionView<?>> getSideInputs() {
return ImmutableList.of(view);
}
@Override
public TableDestination getTable(String destination) {
// TODO Auto-generated method stub
//String dest=String.format("%s:%s.%s","my-new-project","training", destination);
String dest=destination;
return new TableDestination(dest, dest);
}
@Override
public TableSchema getSchema(String destination) {
BigQuery bigquery = BigQueryOptions.getDefaultInstance().getService();
com.google.cloud.bigquery.Table table=bigquery.getTable("training", destination);
com.google.cloud.bigquery.Schema tbschema=table.getDefinition().getSchema();
FieldList tfld=tbschema.getFields();
List<TableFieldSchema> flds=new ArrayList<>();
for (Field each : tfld) {
flds.add(new TableFieldSchema().setName(each.getName()).setType(each.getType().toString()));
}
return new TableSchema().setFields(flds);
}
}).withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED).withWriteDisposition(WriteDisposition.WRITE_TRUNCATE));
p.run();
}
}
I don't think you can do both WRITE_TRUNCATE
.withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED).withWriteDisposition(WriteDisposition.WRITE_TRUNCATE))
and get the table's definition
com.google.cloud.bigquery.Table table=bigquery.getTable("training", destination);
com.google.cloud.bigquery.Schema tbschema=table.getDefinition().getSchema();
Because even if the table exists, it may be recreated when paired with a BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE and at that point, the getTable call will fail. In other words, WRITE_TRUNCATE is not an atomic operation.
I suggest that you have the table (with the right schema) created beforehand (CREATE_NEVER), or append to the table if it exists (WRITE_EMPTY or WRITE_APPEND), or store the schema outside of the Dataflow pipeline and read it in.
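If you go with the last option, here is a minimal sketch of the idea (SchemaPrefetch, fetchFieldTypes and toTableSchema are illustrative names, and it assumes the destination tables are known when the pipeline is constructed): read each schema with the same BigQuery client calls you already use, flatten it into a plain serializable map, hand that map to your DynamicDestinations (for example via a constructor argument of a named subclass), and rebuild the TableSchema inside getSchema().
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.Field;
public class SchemaPrefetch {
    // Runs on the launcher, before the pipeline is built: flattens a table's schema
    // into an ordered name -> type map, which is trivially serializable.
    static Map<String, String> fetchFieldTypes(String dataset, String table) {
        BigQuery bigquery = BigQueryOptions.getDefaultInstance().getService();
        com.google.cloud.bigquery.Schema schema =
                bigquery.getTable(dataset, table).getDefinition().getSchema();
        Map<String, String> fieldTypes = new LinkedHashMap<>();
        for (Field field : schema.getFields()) {
            fieldTypes.put(field.getName(), field.getType().toString());
        }
        return fieldTypes;
    }
    // Called from getSchema(): rebuilds the Beam-side TableSchema from the map.
    static TableSchema toTableSchema(Map<String, String> fieldTypes) {
        List<TableFieldSchema> fields = new ArrayList<>();
        for (Map.Entry<String, String> entry : fieldTypes.entrySet()) {
            fields.add(new TableFieldSchema().setName(entry.getKey()).setType(entry.getValue()));
        }
        return new TableSchema().setFields(fields);
    }
}
Because the map is built once on the launcher, getSchema() never has to call the BigQuery client from inside the running pipeline.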

How do I configure spring-kafka to ignore messages in the wrong format?

We have an issue with one of our Kafka topics which is consumed by the DefaultKafkaConsumerFactory & ConcurrentMessageListenerContainer combination described here with a JsonDeserializer used by the Factory. Unfortunately someone got a little enthusiastic and published some invalid messages onto the topic. It appears that spring-kafka silently fails to process past the first of these messages. Is it possible to have spring-kafka log an error and continue? Looking at the error messages which are logged it seems that perhaps the Apache kafka-clients library should deal with the case that when iterating a batch of messages one or more of them may fail to parse?
The below code is an example test case illustrating this issue:
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.common.serialization.Serializer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.kafka.common.serialization.StringSerializer;
import org.junit.ClassRule;
import org.junit.Test;
import org.springframework.kafka.core.DefaultKafkaConsumerFactory;
import org.springframework.kafka.core.DefaultKafkaProducerFactory;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.kafka.listener.KafkaMessageListenerContainer;
import org.springframework.kafka.listener.MessageListener;
import org.springframework.kafka.listener.config.ContainerProperties;
import org.springframework.kafka.support.SendResult;
import org.springframework.kafka.support.serializer.JsonDeserializer;
import org.springframework.kafka.support.serializer.JsonSerializer;
import org.springframework.kafka.test.rule.KafkaEmbedded;
import org.springframework.kafka.test.utils.ContainerTestUtils;
import org.springframework.util.concurrent.ListenableFuture;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.springframework.kafka.test.hamcrest.KafkaMatchers.hasKey;
import static org.springframework.kafka.test.hamcrest.KafkaMatchers.hasValue;
/**
* @author jfreedman
*/
public class TestSpringKafka {
private static final String TOPIC1 = "spring.kafka.1.t";
@ClassRule
public static KafkaEmbedded embeddedKafka = new KafkaEmbedded(1, true, 1, TOPIC1);
@Test
public void submitMessageThenGarbageThenAnotherMessage() throws Exception {
final BlockingQueue<ConsumerRecord<String, JsonObject>> records = createListener(TOPIC1);
final KafkaTemplate<String, JsonObject> objectTemplate = createPublisher("json", new JsonSerializer<JsonObject>());
sendAndVerifyMessage(records, objectTemplate, "foo", new JsonObject("foo"), 0L);
// push some garbage text to Kafka which cannot be marshalled, this should not interrupt processing
final KafkaTemplate<String, String> garbageTemplate = createPublisher("garbage", new StringSerializer());
final SendResult<String, String> garbageResult = garbageTemplate.send(TOPIC1, "bar","bar").get(5, TimeUnit.SECONDS);
assertEquals(1L, garbageResult.getRecordMetadata().offset());
sendAndVerifyMessage(records, objectTemplate, "baz", new JsonObject("baz"), 2L);
}
private <T> KafkaTemplate<String, T> createPublisher(final String label, final Serializer<T> serializer) {
final Map<String, Object> producerProps = new HashMap<>();
producerProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, embeddedKafka.getBrokersAsString());
producerProps.put(ProducerConfig.CLIENT_ID_CONFIG, "TestPublisher-" + label);
producerProps.put(ProducerConfig.ACKS_CONFIG, "all");
producerProps.put(ProducerConfig.RETRIES_CONFIG, 2);
producerProps.put(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, 1);
producerProps.put(ProducerConfig.REQUEST_TIMEOUT_MS_CONFIG, 5000);
producerProps.put(ProducerConfig.MAX_BLOCK_MS_CONFIG, 5000);
producerProps.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
producerProps.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, serializer.getClass());
final DefaultKafkaProducerFactory<String, T> pf = new DefaultKafkaProducerFactory<>(producerProps);
pf.setValueSerializer(serializer);
return new KafkaTemplate<>(pf);
}
private BlockingQueue<ConsumerRecord<String, JsonObject>> createListener(final String topic) throws Exception {
final Map<String, Object> consumerProps = new HashMap<>();
consumerProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, embeddedKafka.getBrokersAsString());
consumerProps.put(ConsumerConfig.GROUP_ID_CONFIG, "TestConsumer");
consumerProps.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, true);
consumerProps.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "100");
consumerProps.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 15000);
consumerProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
consumerProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, JsonDeserializer.class);
final DefaultKafkaConsumerFactory<String, JsonObject> cf = new DefaultKafkaConsumerFactory<>(consumerProps);
cf.setValueDeserializer(new JsonDeserializer<>(JsonObject.class));
final KafkaMessageListenerContainer<String, JsonObject> container = new KafkaMessageListenerContainer<>(cf, new ContainerProperties(topic));
final BlockingQueue<ConsumerRecord<String, JsonObject>> records = new LinkedBlockingQueue<>();
container.setupMessageListener((MessageListener<String, JsonObject>) records::add);
container.setBeanName("TestListener");
container.start();
ContainerTestUtils.waitForAssignment(container, embeddedKafka.getPartitionsPerTopic());
return records;
}
private void sendAndVerifyMessage(final BlockingQueue<ConsumerRecord<String, JsonObject>> records,
final KafkaTemplate<String, JsonObject> template,
final String key, final JsonObject value,
final long expectedOffset) throws InterruptedException, ExecutionException, TimeoutException {
final ListenableFuture<SendResult<String, JsonObject>> future = template.send(TOPIC1, key, value);
final ConsumerRecord<String, JsonObject> record = records.poll(5, TimeUnit.SECONDS);
assertThat(record, hasKey(key));
assertThat(record, hasValue(value));
assertEquals(expectedOffset, future.get(5, TimeUnit.SECONDS).getRecordMetadata().offset());
}
public static final class JsonObject {
private String value;
public JsonObject() {}
JsonObject(final String value) {
this.value = value;
}
public String getValue() {
return value;
}
public void setValue(final String value) {
this.value = value;
}
@Override
public boolean equals(final Object o) {
if (this == o) { return true; }
if (o == null || getClass() != o.getClass()) { return false; }
final JsonObject that = (JsonObject) o;
return Objects.equals(value, that.value);
}
@Override
public int hashCode() {
return Objects.hash(value);
}
@Override
public String toString() {
return "JsonObject{" +
"value='" + value + '\'' +
'}';
}
}
}
I have a solution, but I don't know if it's the best one. I extended JsonDeserializer as follows, which results in a null value being consumed by spring-kafka and requires the necessary downstream changes to handle that case.
class SafeJsonDeserializer[A >: Null](targetType: Class[A], objectMapper: ObjectMapper) extends JsonDeserializer[A](targetType, objectMapper) with Logging {
override def deserialize(topic: String, data: Array[Byte]): A = try {
super.deserialize(topic, data)
} catch {
case e: Exception =>
logger.error("Failed to deserialize data [%s] from topic [%s]".format(new String(data), topic), e)
null
}
}
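The same idea in Java, for anyone using the Java listener setup from the question (SafeJsonDeserializer is just an illustrative name; it wraps spring-kafka's JsonDeserializer and turns parse failures into null values):
import org.springframework.kafka.support.serializer.JsonDeserializer;
public class SafeJsonDeserializer<T> extends JsonDeserializer<T> {
    public SafeJsonDeserializer(Class<T> targetType) {
        super(targetType);
    }
    @Override
    public T deserialize(String topic, byte[] data) {
        try {
            return super.deserialize(topic, data);
        } catch (Exception e) {
            // Log and skip: the listener receives null for this record and must handle it.
            System.err.printf("Failed to deserialize data [%s] from topic [%s]: %s%n",
                    new String(data), topic, e);
            return null;
        }
    }
}
In the test above you would plug it in with cf.setValueDeserializer(new SafeJsonDeserializer<>(JsonObject.class)).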
Starting from spring-kafka 2.x, you can declare a bean in your configuration for the KafkaListenerErrorHandler interface, with an implementation something like:
@Bean
public ConsumerAwareListenerErrorHandler listen3ErrorHandler() {
return (m, e, c) -> {
this.listen3Exception = e;
MessageHeaders headers = m.getHeaders();
c.seek(new org.apache.kafka.common.TopicPartition(
headers.get(KafkaHeaders.RECEIVED_TOPIC, String.class),
headers.get(KafkaHeaders.RECEIVED_PARTITION_ID, Integer.class)),
headers.get(KafkaHeaders.OFFSET, Long.class));
return null;
};
}
More resources can be found at https://docs.spring.io/spring-kafka/reference/htmlsingle/#annotation-error-handling. There are also other questions on similar issues: Spring Kafka error handling - v1.1.x and How to handle SerializationException after deserialization.
Use ErrorHandlingDeserializer2. This is a delegating key/value deserializer that catches exceptions, returning them in the headers as serialized java objects.
Under consumer configuration, add/update the below lines:
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.springframework.kafka.support.serializer.ErrorHandlingDeserializer2
configProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
classOf[ErrorHandlingDeserializer2[JsonDeserializer]].getName)
configProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, classOf[ErrorHandlingDeserializer2[StringDeserializer]].getName)
configProps.put(ErrorHandlingDeserializer2.KEY_DESERIALIZER_CLASS, classOf[StringDeserializer].getName)
configProps.put(ErrorHandlingDeserializer2.VALUE_DESERIALIZER_CLASS, classOf[JsonDeserializer].getName)
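For the Java test case in the question, the equivalent change to the consumerProps built in createListener() would look roughly like this (a sketch assuming spring-kafka 2.2+, where ErrorHandlingDeserializer2 lives in org.springframework.kafka.support.serializer):
consumerProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ErrorHandlingDeserializer2.class);
consumerProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ErrorHandlingDeserializer2.class);
// The real deserializers are configured as delegates:
consumerProps.put(ErrorHandlingDeserializer2.KEY_DESERIALIZER_CLASS, StringDeserializer.class.getName());
consumerProps.put(ErrorHandlingDeserializer2.VALUE_DESERIALIZER_CLASS, JsonDeserializer.class.getName());
// When the delegate JsonDeserializer is created by class name it needs the target type
// (and possibly JsonDeserializer.TRUSTED_PACKAGES, depending on your package):
consumerProps.put(JsonDeserializer.VALUE_DEFAULT_TYPE, JsonObject.class.getName());
Deserialization failures then no longer break the poll loop; the failing record is delivered with a null value and the exception is available in the record headers for your listener or error handler to inspect.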

Exception in thread "pool-1-thread-7" java.lang.NullPointerException

I'm trying to figure out why I get this exception, as written in the title.
I'm building a multi-threaded MMU (memory management unit) project. First I initialize and add some pages to the RAM and to my HD, and then I create processes and run the whole system. A bit of information about the project:
In runConfig I'm reading a JSON file with a list of processCycles; every ProcessCycles contains a list of ProcessCycle objects, each of which contains a list of pageIds along with the data for those pages.
I think I did everything right, but I still get this exception. Can somebody help me?
MMUDriver is the class that runs the whole system:
package hit.driver;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import com.google.gson.Gson;
import com.google.gson.JsonIOException;
import com.google.gson.JsonSyntaxException;
import com.google.gson.stream.JsonReader;
import com.hit.algorithm.IAlgoCache;
import com.hit.algorithm.LRUAlgoCacheImpl;
import com.hit.algorithm.MRUAlgoCacheImpl;
import com.hit.algorithm.RandomReplacementAlgoCacheImpl;
import hit.memoryunits.HardDisk;
import hit.memoryunits.MemoryManagementUnit;
import hit.memoryunits.MemoryManagementUnitTest;
import hit.memoryunits.Page;
import hit.processes.ProcessCycles;
import hit.processes.Process;
import hit.processes.RunConfiguration;
#SuppressWarnings("unused")
public class MMUDriver {
private static final String CONFIG_FILE_NAME="Configuration.json";
// private static final String CONFIG_FILE_NAME="test.json";
//read from JSON file the lists of processCycles
private static RunConfiguration readConfigurationFile() throws JsonIOException, JsonSyntaxException, FileNotFoundException{
RunConfiguration runConfig = new Gson().fromJson(new JsonReader(new FileReader(CONFIG_FILE_NAME)), RunConfiguration.class);
return runConfig;
}
//
private static List<Process> createProcesses(List<ProcessCycles> processCycles,MemoryManagementUnit mmu ){
int i=0;
List<Process> processList = new ArrayList<Process>();
for ( ProcessCycles currentListProcess : processCycles) {
Process process = new Process(++i , mmu , currentListProcess);
processList.add(process);
}
return processList;
}
//
private static void runProcesses(List<Process> processes){
//Executor is an interface that takes care of threads: when a thread starts, when it ends, how they run in parallel, etc.
//thread pool - a collection of threads
ExecutorService executorservice = Executors.newCachedThreadPool();
for (Process process : processes)
executorservice.execute(process);
executorservice.shutdown();//
}
//
public static void main(String[] args) throws java.lang.InterruptedException,
InvocationTargetException, JsonIOException, JsonSyntaxException, ClassNotFoundException, IOException{
CLI cli = new CLI(System.in,System.out);
String [] configuration;
while((configuration = cli.getConfiguration()) != null){
IAlgoCache<Long, Long> algo = null;
int capacity = Integer.valueOf(configuration[1]);
switch(configuration[0]){ //which configuration to play on LRU|MRU|RR
case "LRU":
algo = new LRUAlgoCacheImpl<Long, Long>(capacity);
break;
case "MRU":
algo = new MRUAlgoCacheImpl<Long, Long>(capacity);
break;
case "RR":
algo = new RandomReplacementAlgoCacheImpl<Long, Long>(capacity);
break;
}
MemoryManagementUnit mmu = new MemoryManagementUnit(algo, capacity);
MemoryManagementUnitTest.test(mmu);//add some pages to ram for test
RunConfiguration runConfig = readConfigurationFile(); //Getting the user-requested pages through the JSON file
List<ProcessCycles> processCycles = runConfig.getProcessesCycles();
List<Process> processes = createProcesses(processCycles , mmu);
runProcesses(processes);
Map<Long, Page<byte[]>> bla= mmu.getRam().getRamPages();
System.out.println("1");
// mmu.shoutDown();
}
}
}
Process is like a thread in this project
package hit.processes;
import java.io.IOException;
import java.util.List;
import hit.memoryunits.MemoryManagementUnit;
import hit.memoryunits.Page;
#SuppressWarnings("unused")
public class Process implements Runnable {
private int id;
private MemoryManagementUnit mmu;
private ProcessCycles processCycles;
private Thread processThread=null;
public Process(int id, MemoryManagementUnit mmu, ProcessCycles processCycles){
this.id=id;
this.mmu=mmu;
this.processCycles=processCycles;
}
public int getId(){
return id;
}
public void setId(int id){
this.id=id;
}
/*
public void start(){
if(processThread==null)
processThread = new Thread(this,"process"); //
processThread.start();
}*/
@Override
public void run() {
synchronized(mmu){
//running over the whole list of processCycles
for (ProcessCycle currentProcessCycle : processCycles.getProcessCycles()) {
List<Long> pages = currentProcessCycle.getPages();
List<byte[]> data = currentProcessCycle.getData();
Page<byte[]>[] pagesList = null;
//creating a boolean array to know if the page is for writing or reading in size of pages.size()
boolean [] writePages = new boolean[pages.size()];
for (int i = 0; i < pages.size(); i++)
if(data.get(i) == null) //if the page is empty
writePages[i] = false;
else{
writePages[i] = true ;//if there is data in the page and we want to write it to the ram
}
try {
// Getting the pages from ram to see which we shall update
pagesList = mmu.getPages(pages.toArray(new Long[pages.size()]), writePages);
}
catch (ClassNotFoundException e1) {
e1.printStackTrace();
} catch (IOException e1) {
e1.printStackTrace();
}
for (int j = 0; j < pages.size(); j++)
if(writePages[j] == true) //if the page isn't for read only
pagesList[j].setContent(data.get(j));
try {
Thread.sleep(currentProcessCycle.getSleepMs());
}
catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
}
The MMU class takes care of all the requests from the user and manages all the paging:
package hit.memoryunits;
import java.io.FileNotFoundException;
import java.io.IOException;
import com.hit.algorithm.IAlgoCache;
public class MemoryManagementUnit {
private IAlgoCache<Long,Long> algo;
private RAM<Long, Page<byte[]>> ram;
public MemoryManagementUnit(IAlgoCache<Long, Long> algo, int ramCapacity) {
this.algo = algo;
this.ram= new RAM<Long, Page<byte[]>>(ramCapacity);
}
#SuppressWarnings("null")
public Page<byte[]> [] getPages(Long [] pageIds , boolean [] writePages) throws IOException, ClassNotFoundException{
for (int i = 0; i < pageIds.length; i++) {
if(algo.getElement((long) pageIds[i].hashCode())== null)//the page doesn't exist in the cache
if(!ram.isFull())
{
if(writePages[i] == true){//if the page is for write we need to overwrite data
Page<byte[]> pageToRam = null;
pageToRam.setPageId(pageIds[i]);
pageToRam.setContent(null);
ram.addPage(pageToRam);// ram is not full need to upload from hd
}
else{//if the page is for read-only so we need to upload the page from HD
ram.addPage(HardDisk.getInstance().pageFault(pageIds[i]));// ram is not full need to upload from hd
}
algo.putElement(Long.valueOf(pageIds[i].hashCode()),pageIds[i]);//update cache
}
else
{
Page<byte[]> pageToRemove = ram.getPage(algo.putElement(Long.valueOf(pageIds[i].hashCode()),pageIds[i]));
ram.removePage(pageToRemove);
ram.addPage(HardDisk.getInstance().pageReplacement(pageToRemove,pageIds[i]));// ram full need to do replacement
algo.putElement(Long.valueOf(pageIds[i].hashCode()),pageIds[i]);//update cache
}
}
return ram.getPages(pageIds);
}
public RAM<Long, Page<byte[]>> getRam(){
return ram;
}
public IAlgoCache<Long,Long> getAlgo(){
return algo;
}
public void setAlgo(IAlgoCache<Long,Long> algo){
this.algo = algo;
}
public void setRam(RAM<Long, Page<byte[]>> ram){
this.ram = ram;
}
public void shoutDown() throws FileNotFoundException, IOException, ClassNotFoundException{
HardDisk.getInstance().saveToDisk(this.getRam().getRamPages());
}
}

Neo4j error caused by Lucene (Too many open files)

I've just started evaluating Neo4j to see how well it fits our use case.
I'm using the embedded Java API to insert edges and nodes into a graph.
After creating around 5000 nodes I get the following error (using Neo4j 2.1.6 and 2.1.7 on OS X Yosemite)
org.neo4j.graphdb.TransactionFailureException: Unable to commit transaction
Caused by: javax.transaction.xa.XAException
Caused by: org.neo4j.kernel.impl.nioneo.store.UnderlyingStorageException: java.io.FileNotFoundException: /Users/mihir.k/IdeaProjects/Turant/target/neo4j-hello-db/schema/label/lucene/_8zr.frq (Too many open files)
Caused by: java.io.FileNotFoundException: /Users/mihir.k/IdeaProjects/Turant/target/neo4j-hello-db/schema/label/lucene/_8zr.frq (Too many open files)
I've looked at numerous similar StackOverFlow questions and other related threads online. They all suggest increasing the max open files limit.
I've tried doing that.
These are my settings:
kern.maxfiles: 65536
kern.maxfilesperproc: 65536
However this hasn't fixed the error.
While the Neo4j code runs, I tried using the lsof | wc -l command. The code always breaks when around 10,000 files are open.
The following is the main class that deals with Neo4j:
import java.io.File;
import java.io.Serializable;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.neo4j.cypher.internal.compiler.v1_9.commands.True;
import org.neo4j.cypher.internal.compiler.v2_0.ast.False;
import org.neo4j.graphdb.*;
import org.neo4j.graphdb.factory.GraphDatabaseFactory;
import org.neo4j.graphdb.schema.Schema;
import org.neo4j.graphdb.schema.IndexDefinition;
import org.neo4j.graphdb.index.UniqueFactory;
import org.neo4j.graphdb.index.Index;
import org.neo4j.graphdb.index.IndexHits;
public class Neo4jDB implements Serializable {
private static final String DB_PATH = "target/neo4j-hello-db-spark";
IndexDefinition indexDefinition;
private static GraphDatabaseFactory dbFactory;
public static GraphDatabaseService db;
public void main(String[] args) {
System.out.println("Life is a disease, sexually transmitted and irrevocably fatal. Stop coding and read some Neil Gaiman.");
}
public void startDbInstance() {
db =new GraphDatabaseFactory().newEmbeddedDatabase(DB_PATH);
}
public Node createOrGetNode ( LabelsUser360 label , String key, String nodeName ,Map<String,Object> propertyMap)
{
System.out.println("Creating/Getting node");
try ( Transaction tx = db.beginTx() ) {
Node node;
if (db.findNodesByLabelAndProperty(label, key, nodeName).iterator().hasNext()) {
node = db.findNodesByLabelAndProperty(label, key, nodeName).iterator().next();
} else {
node = db.createNode(label);
node.setProperty(key, nodeName);
}
for (Map.Entry<String, Object> entry : propertyMap.entrySet()) {
node.setProperty(entry.getKey(), entry.getValue());
}
tx.success();
return node;
}
}
public void createUniquenessConstraint(LabelsUser360 label , String property)
{
try ( Transaction tx = db.beginTx() )
{
db.schema()
.constraintFor(label)
.assertPropertyIsUnique(property)
.create();
tx.success();
}
}
public void createOrUpdateRelationship(RelationshipsUser360 relationshipType ,Node startNode, Node endNode, Map<String,Object> propertyMap)
{
try ( Transaction tx = db.beginTx() ) {
if (startNode.hasRelationship(relationshipType, Direction.OUTGOING)) {
Relationship relationship = startNode.getSingleRelationship(relationshipType, Direction.OUTGOING);
for (Map.Entry<String, Object> entry : propertyMap.entrySet()) {
relationship.setProperty(entry.getKey(), entry.getValue());
}
} else {
Relationship relationship = startNode.createRelationshipTo(endNode, relationshipType);
for (Map.Entry<String, Object> entry : propertyMap.entrySet()) {
relationship.setProperty(entry.getKey(), entry.getValue());
}
}
tx.success();
}
}
public void registerShutdownHook( final GraphDatabaseService graphDb )
{
Runtime.getRuntime().addShutdownHook( new Thread()
{
@Override
public void run()
{
db.shutdown();
}
} );
}
}
There is another Neo4jAdapter class that is used to implement domain-specific logic. It uses the Neo4jDB class to add/update nodes, properties, and relationships:
import org.apache.lucene.index.IndexWriter;
import org.codehaus.jackson.map.ObjectMapper;
import org.json.*;
import org.neo4j.graphdb.*;
import org.neo4j.graphdb.factory.GraphDatabaseFactory;
import org.neo4j.graphdb.schema.IndexDefinition;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
public class Neo4jAdapter implements Serializable {
static Neo4jDB n4j = new Neo4jDB();
public static GraphDatabaseService db = Neo4jDB.db ;
public void begin() {
n4j.startDbInstance();
}
public static void main(String[] args) {}
public String graphPut(String jsonString) {
System.out.println("graphput called");
HashMap<String, Object> map = jsonToMap(jsonString); //Json deserializer
Node startNode = n4j.createOrGetNode(...);
Node endNode = n4j.createOrGetNode(...);
propertyMap = new HashMap<String, Object>();
propertyMap.put(....);
try (Transaction tx = Neo4jDB.db.beginTx()) {
Relationship relationship = startNode.getSingleRelationship(...);
if (relationship != null) {
Integer currentCount = (Integer) relationship.getProperty("count");
Integer updatedCount = currentCount + 1;
propertyMap.put("count", updatedCount);
} else {
Integer updatedCount = 1;
propertyMap.put("count", updatedCount);
}
tx.success();
}
n4j.createOrUpdateRelationship(RelationshipsUser360.BLAH, startNode, endNode, propertyMap);
return "Are you sponge worthy??";
}
}
Finally, there is a Spark app that calls the "graphPut" method of the Neo4jAdapter class. The relevant code snippet is (the following is Scala + Spark code):
val graphdb : Neo4jAdapter = new Neo4jAdapter()
graphdb.begin()
linesEnriched.foreach(a=>graphdb.graphPutMap(a))
where 'a' is a json string and linesEnriched is a Spark RDD (basically a set of strings)

Primavera API create udfcode local mode

Does anyone know where I can find a sample of adding a Primavera UDF through the API?
My code is below, but nothing happens when I run it.
I can connect to the database, read the UDFs, and delete the UDFs, but not create one.
Thank you
package apitest;
import com.primavera.bo.base.u;
import com.primavera.common.value.ObjectId;
import com.primavera.integration.client.*;
import com.primavera.integration.client.Session;
import com.primavera.integration.client.EnterpriseLoadManager;
import com.primavera.integration.client.RMIURL;
import com.primavera.integration.common.DatabaseInstance;
import com.primavera.integration.client.bo.BOIterator;
import com.primavera.integration.client.bo.object.Project;
import com.primavera.integration.client.bo.object.UDFCode;
import com.primavera.integration.client.bo.InternalBOHelper;
import com.primavera.integration.client.bo.helper.UDFCodeHelper;
public class API
{
public static void main( String[] args )
{
System.setProperty("primavera.bootstrap.home","C:\\P6IntegrationAPI_1");
Session session = null;
try
{
DatabaseInstance[] dbInstances = Session.getDatabaseInstances(
RMIURL.getRmiUrl( RMIURL.LOCAL_SERVICE ) );
// Assume only one database instance for now, and hardcode the username and
// password for this sample code
session = Session.login( RMIURL.getRmiUrl( RMIURL.LOCAL_SERVICE ),
dbInstances[0].getDatabaseId(), "admin", "admin" );
//u.delete(session, (UDFCode)(new String ("High")));
UDFCode u = new UDFCode(session);
u.setCodeValue("cdc");
u.setDescription("cdcds");
u.setObjectId(ObjectId.USESSION_OVERRIDE_ID);
u.setCodeTypeObjectId(ObjectId.USESSION_OVERRIDE_ID);
u.setSequenceNumber(0);
u.create();
System.out.println("cdcx");
}
catch ( Exception e )
{
System.out.println(e.getCause());
}
finally
{
if ( session != null )
session.logout();
}
}
}
A UDF field in P6 is in fact three types of tables/classes:
UdfType - the type of the UDF (name, data type, etc.)
UdfValue - this is the actual value you want to set, I guess?
UdfCodes - these are user-defined codes, so it's a bit more complex; it is in fact a code and not a field!
So if you want a simple UDF, just create a UdfType; this is part of the global data and must exist before you add values. You can also add it via the client/web and fill it with values later.
Next, try to insert some UdfValues on activities. That's the way I go every time ;)
Use the following template code to connect to the DB. Make sure you change the bootstrap, username, and password values before using it.
import java.util.Date;
import java.util.Iterator;
import com.primavera.PrimaveraException;
import com.primavera.ServerException;
import com.primavera.bo.events.enm.SpreadPeriodType;
import com.primavera.common.value.Duration;
import com.primavera.common.value.ObjectId;
import com.primavera.common.value.spread.ActivitySpread;
import com.primavera.common.value.spread.ActivitySpreadPeriod;
import com.primavera.integration.client.GlobalObjectManager;
import com.primavera.integration.client.Session;
import com.primavera.integration.client.bo.BOIterator;
import com.primavera.integration.client.bo.BusinessObjectException;
import com.primavera.integration.client.bo.enm.ActivityStatus;
import com.primavera.integration.client.bo.enm.ProjectStatus;
import com.primavera.integration.client.bo.object.Activity;
import com.primavera.integration.client.bo.object.ActivityNote;
import com.primavera.integration.client.bo.object.EPS;
import com.primavera.integration.client.bo.object.NotebookTopic;
import com.primavera.integration.client.bo.object.Project;
import com.primavera.integration.client.bo.object.BaselineProject;
import com.primavera.integration.common.CopyActivityOptions;
import com.primavera.integration.common.CopyProjectOptions;
import com.primavera.integration.common.CopyWBSOptions;
import com.primavera.integration.common.DatabaseInstance;
import com.primavera.integration.client.bo.object.*;
import com.primavera.integration.client.*;
import com.primavera.integration.network.NetworkException;
import com.primavera.integration.*;
public class EPSFilter {
static Session session = null;
//static Session session = null;
private static final String PRIMAVERA_BOOTSTRAP_HOME = "primavera.bootstrap.home";
static final String PRIMAVERA_BOOTSTRAP_HOME_VALUE = "C:\\P6EPPM_832\\p6";
static final String PRIMAVERA_USERNAME = "admin";
static final String PRIMAVERA_PASSWORD = "admin";
static final String PRIMAVERA_SYSTEM = "pmdb832_n1";
static void openSession() throws PrimaveraException {
for (DatabaseInstance dbi : Session.getDatabaseInstances(null)) {
if (PRIMAVERA_SYSTEM.equalsIgnoreCase(dbi.getDatabaseName())) {
session = Session.login(null, dbi.getDatabaseId(), PRIMAVERA_USERNAME, PRIMAVERA_PASSWORD);
//session = Session.login (RMIURL.getRmiUrl( RMIURL.LOCAL_SERVICE ), "1", "admin", "admin");
System.out.println("Connected to Primavera instance " + PRIMAVERA_SYSTEM);
return;
}
}
throw new RuntimeException("Error, Primavera instance " + PRIMAVERA_SYSTEM + " not found");
}
public static void main(String[] args) throws BusinessObjectException, ServerException, NetworkException {
try {
System.setProperty(PRIMAVERA_BOOTSTRAP_HOME, PRIMAVERA_BOOTSTRAP_HOME_VALUE);
openSession();
} catch (PrimaveraException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
Your code to create the UDFs should follow from here. If you still face issues, please post the error you are getting.