Dataflow to BigQuery and storage system lag is very high - google-bigquery

We are creating a data pipeline in GCP and facing some issue during testing. Our current architecture is on AWS, to test we are pushing one copy of data to pubsub from Lambda realtime.
Facing latency issue from pubsub to BigQuery and storage via dataflow (Is there a way to do bulk load as per table instead of inserting one event at a time) We have a windowing of 5 min and after 5min we group data by event key for storage purpose and write all event in that duration in single file can we do something similar in BigQuery and define schema only once for one event type instead of all event.
Auto Scale of worker is not happening min 2 and max 10 is given
All services used are in asia-northeast1
We receive generally 3million records per day what would be the best server config for dataflow.
package purplle.datapipeline;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.net.SocketTimeoutException;
import java.time.LocalDateTime;
import java.time.ZoneId;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations;
import org.apache.beam.sdk.io.gcp.bigquery.InsertRetryPolicy;
import org.apache.beam.sdk.io.gcp.bigquery.TableDestination;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.Repeatedly;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.ValueInSingleWindow;
import org.joda.time.Duration;
import org.json.JSONException;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.BlobId;
import com.google.cloud.storage.BlobInfo;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;
import purplle.datapipeline.buisness.EventSchemaBuilder;
import purplle.datapipeline.buisness.Ordering;
import purplle.datapipeline.common.Constants;
import purplle.datapipeline.helpers.Event_ordering;
import purplle.datapipeline.helpers.Event_schema;
import purplle.datapipeline.helpers.JSON_helper;
public class StarterPipeline {
public interface StarterPipelineOption extends PipelineOptions {
/**
* Set this required option to specify where to read the input.
*/
#Description("Path of the file to read from")
#Default.String(Constants.pubsub_event_pipeline_url)
String getInputFile();
void setInputFile(String value);
}
#SuppressWarnings("serial")
static class ParseJsonData_storage extends DoFn<String, KV<String, String>> {
#ProcessElement
public void processElement(ProcessContext c) throws JSONException {
Logger log = LoggerFactory.getLogger(StarterPipeline.class);
if (c.element().length() > 0 && JSON_helper.isJSONValid(c.element())) {
JSONObject event_obj = new JSONObject(c.element());
if (event_obj.length() > 0 && event_obj.has("event")) {
JSONObject ob2 = JSON_helper.flatJsonConvertKeyToLower(event_obj);
if (ob2.length() > 0 && ob2.has("event")) {
// Reorder the json object then pass to create pipe saperated string.
KV<String, String> event_kv_pair = Event_ordering.order_event_columns(ob2, "storage");
if (!event_kv_pair.getKey().isEmpty() && event_kv_pair.getKey().length() > 0) {
c.output(event_kv_pair);
} else {
log = LoggerFactory.getLogger(StarterPipeline.class);
log.error("Storage string empty = " + c.element());
}
} else {
log = LoggerFactory.getLogger(StarterPipeline.class);
log.error("Storage object error = " + c.element());
}
} else {
log = LoggerFactory.getLogger(StarterPipeline.class);
log.error("Storage object error = " + c.element());
}
} else {
log = LoggerFactory.getLogger(StarterPipeline.class);
log.error("Storage empty element = " + c.element());
}
}
}
#SuppressWarnings("serial")
static class ParseJsonData_bigquery extends DoFn<String, TableRow> {
#ProcessElement
public void processElement(ProcessContext c) throws JSONException {
Logger log = LoggerFactory.getLogger(StarterPipeline.class);
log.info("Event json = " + c.element());
if (!c.element().isEmpty() && JSON_helper.isJSONValid(c.element())) {
JSONObject event_obj = new JSONObject(c.element());
if (event_obj.length() > 0 && event_obj.has("event")) {
JSONObject ob2 = JSON_helper.flatJsonConvertKeyToLower(event_obj);
if (ob2.length() > 0 && ob2.has("event")) {
TableRow event_row = EventSchemaBuilder.get_event_row(ob2, "bigquery");
if (!event_row.isEmpty()) {
c.output(event_row);
} else {
log = LoggerFactory.getLogger(StarterPipeline.class);
log.error("Bigquery set event ordering schema error = " + c.element());
}
} else {
log = LoggerFactory.getLogger(StarterPipeline.class);
log.error("Bigquery set event ordering object error = " + c.element());
}
} else {
log = LoggerFactory.getLogger(StarterPipeline.class);
log.error("Bigquery event item object error = " + c.element());
}
} else {
log = LoggerFactory.getLogger(StarterPipeline.class);
log.error("Bigquery event item error = " + c.element());
}
}
}
#SuppressWarnings("serial")
static class Write_to_GCS extends DoFn<KV<String, String>, TextIO.Write> {
#ProcessElement
public void processElement(ProcessContext c) throws JSONException {
String event_string = c.element().getValue();
String event_name = c.element().getKey();
LocalDateTime now = LocalDateTime.now(ZoneId.of("Asia/Kolkata"));
int year = now.getYear();
int month = now.getMonthValue();
int day = now.getDayOfMonth();
int hour = now.getHour();
int minute = now.getMinute();
int second = now.getSecond();
String storage_file_path = event_name + "/" + year + "/" + month + "/" + day + "/" + hour + "/" + event_name
+ "-" + year + "-" + month + "-" + day + "-" + hour + "-" + minute + "-" + second + ".txt";
Logger log = LoggerFactory.getLogger(StarterPipeline.class);
log.info("Writing file to location = " + storage_file_path);
// Create your service object
Storage storage = StorageOptions.getDefaultInstance().getService();
// Upload a blob to the newly created bucket
BlobId blobId = BlobId.of(Constants.gcp_events_bucket_name, storage_file_path);
BlobInfo blobInfo = BlobInfo.newBuilder(blobId).setContentType("text/plain").build();
#SuppressWarnings("unused")
Blob blob = storage.create(blobInfo, event_string.getBytes(UTF_8));
}
}
#SuppressWarnings("serial")
public static class ReadEventJson_storage extends PTransform<PCollection<String>, PCollection<KV<String, String>>> {
#Override
public PCollection<KV<String, String>> expand(PCollection<String> lines) {
Logger log = LoggerFactory.getLogger(StarterPipeline.class);
log.info("Storage workflow started");
#SuppressWarnings("unused")
Boolean tempbool = Event_ordering.setEventsOrdering();
// Convert lines of text into individual words.
PCollection<KV<String, String>> words = lines.apply(ParDo.of(new ParseJsonData_storage()));
return words;
}
}
#SuppressWarnings("serial")
public static class ReadEventJson_bigquery extends PTransform<PCollection<String>, PCollection<TableRow>> {
#Override
public PCollection<TableRow> expand(PCollection<String> lines) {
Logger log = LoggerFactory.getLogger(StarterPipeline.class);
log.info("Bigquery workflow started");
#SuppressWarnings("unused")
Boolean tempbool = Event_ordering.setEventsOrdering();
log.info("Bigquery get event ordering");
Ordering events_ordering = Event_ordering.getEventsOrdering();
Event_schema es = new Event_schema();
es.setEventSchema(events_ordering);
// Convert lines of text into individual words.
PCollection<TableRow> table_row = lines.apply(ParDo.of(new ParseJsonData_bigquery()));
log.info("Bigquery workflow rows prepared");
return table_row;
}
}
/** A SimpleFunction that converts a Word and Count into a printable string. */
#SuppressWarnings("serial")
public static class CombineEventStrings extends SimpleFunction<KV<String, Iterable<String>>, KV<String, String>> {
#Override
public KV<String, String> apply(KV<String, Iterable<String>> input) {
String combined_event = "";
for (String combined_str : input.getValue()) {
combined_event += combined_str + "\n";
}
Logger log = LoggerFactory.getLogger(StarterPipeline.class);
log.info("combined_event = " + combined_event);
KV<String, String> return_kv = KV.of(input.getKey(), combined_event);
return return_kv;
}
}
#SuppressWarnings("serial")
public static void main(String[] args) throws SocketTimeoutException {
Logger log = LoggerFactory.getLogger(StarterPipeline.class);
log.info("Events pipeline job started");
StarterPipelineOption options = PipelineOptionsFactory.fromArgs(args).withValidation()
.as(StarterPipelineOption.class);
Pipeline p = Pipeline.create(options);
log.info("Pipeline created");
log.info("Pipeline Started");
PCollection<String> datastream = p.apply("Read Events From Pubsub",
PubsubIO.readStrings().fromSubscription(Constants.pubsub_event_pipeline_url));
// PCollection<String> windowed_items =
// datastream.apply(Window.<String>into(FixedWindows.of(Duration.standardMinutes(1))));
// PCollection<String> windowed_items = datastream.apply(
// Window.<String>into(SlidingWindows.of(Duration.standardMinutes(1)).every(Duration.standardSeconds(10))));
PCollection<String> windowed_items = datastream.apply(Window.<String>into(new GlobalWindows())
.triggering(Repeatedly.forever(
AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.standardSeconds(300))))
.withAllowedLateness(Duration.standardDays(10)).discardingFiredPanes());
// Write to storage
windowed_items.apply("Read and make pipe separated event string", new ReadEventJson_storage())
.apply("Combine events by keys", GroupByKey.<String, String>create())
.apply("Combine events strings by event name", MapElements.via(new CombineEventStrings()))
.apply("Manually write events to GCS", ParDo.of(new Write_to_GCS()));
// Write into Big Query
windowed_items.apply("Read and make event table row", new ReadEventJson_bigquery())
.apply("Write_events_to_BQ",
BigQueryIO.writeTableRows().to(new DynamicDestinations<TableRow, String>() {
public String getDestination(ValueInSingleWindow<TableRow> element) {
String destination = EventSchemaBuilder
.fetch_destination_based_on_event(element.getValue().get("event").toString());
return destination;
}
#Override
public TableDestination getTable(String table) {
String destination = EventSchemaBuilder.fetch_table_name_based_on_event(table);
return new TableDestination(destination, destination);
}
#Override
public TableSchema getSchema(String table) {
TableSchema table_schema = EventSchemaBuilder.fetch_table_schema_based_on_event(table);
return table_schema;
}
}).withCreateDisposition(CreateDisposition.CREATE_NEVER)
.withWriteDisposition(WriteDisposition.WRITE_APPEND)
.withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
);
p.run().waitUntilFinish();
log.info("Events Pipeline Job Stopped");
}
}
Image : Dataflow Progress 1 |
Dataflow Progress 2 |
Dataflow Job Description

Check this post out:
https://medium.com/teads-engineering/give-meaning-to-100-billion-analytics-events-a-day-d6ba09aa8f44
They are handling 100 billion events a day with Dataflow.
Instead of streaming, they opted for batch. Note that they chose a hard way to batch, currently Dataflow has a way easier and faster path.
Their described latency "oscillates between 3 min (minimum duration of the Write BQ phase) and 30 min".
This latency could be way shorter if they moved to the new Dataflow "easy" batch to BigQuery mode.
(the connector deserves a deeper appreciation post, but in the meantime check this slide out https://twitter.com/felipehoffa/status/1000024539944902656)

Related

Spring AOP - passing arguments between annotated methods

i've written a utility to monitor individual business transactions. For example, Alice calls a method which calls more methods and i want info on just Alice's call, separate from Bob's call to the same method.
Right now the entry point creates a Transaction object and it's passed as an argument to each method:
class Example {
public Item getOrderEntryPoint(int orderId) {
Transaction transaction = transactionManager.create();
transaction.trace("getOrderEntryPoint");
Order order = getOrder(orderId, transaction);
transaction.stop();
logger.info(transaction);
return item;
}
private Order getOrder(int orderId, Transaction t) {
t.trace("getOrder");
Order order = getItems(itemId, t);
t.addStat("number of items", order.getItems().size());
for (Item item : order.getItems()) {
SpecialOffer offer = getSpecialOffer(item, t);
if (null != offer) {
t.incrementStat("offers", 1);
}
}
t.stop();
return order;
}
private SpecialOffer getSpecialOffer(Item item, Transaction t) {
t.trace("getSpecialOffer(" + item.id + ")", TraceCategory.Database);
return offerRepository.getByItem(item);
t.stop();
}
}
This will print to the log something like:
Transaction started by Alice at 10:42
Statistics:
number of items : 3
offers : 1
Category Timings (longest first):
DB : 2s 903ms
code : 187ms
Timings (longest first):
getSpecialOffer(1013) : 626ms
getItems : 594ms
Trace:
getOrderEntryPoint (7ms)
getOrder (594ms)
getSpecialOffer(911) (90ms)
getSpecialOffer(1013) (626ms)
getSpecialOffer(2942) (113ms)
It works great but passing the transaction object around is ugly. Someone suggested AOP but i don't see how to pass the transaction created in the first method to all the other methods.
The Transaction object is pretty simple:
public class Transaction {
private String uuid = UUID.createRandom();
private List<TraceEvent> events = new ArrayList<>();
private Map<String,Int> stats = new HashMap<>();
}
class TraceEvent {
private String name;
private long durationInMs;
}
The app that uses it is a Web app, and this multi-threaded, but the individual transactions are on a single thread - no multi-threading, async code, competition for resources, etc.
My attempt at an annotation:
#Around("execution(* *(..)) && #annotation(Trace)")
public Object around(ProceedingJoinPoint point) {
String methodName = MethodSignature.class.cast(point.getSignature()).getMethod().getName();
//--- Where do i get this call's instance of TRANSACTION from?
if (null == transaction) {
transaction = TransactionManager.createTransaction();
}
transaction.trace(methodName);
Object result = point.proceed();
transaction.stop();
return result;
Introduction
Unfortunately, your pseudo code does not compile. It contains several syntactical and logical errors. Furthermore, some helper classes are missing. If I did not have spare time today and was looking for a puzzle to solve, I would not have bothered making my own MCVE out of it, because that would actually have been your job. Please do read the MCVE article and learn to create one next time, otherwise you will not get a lot of qualified help here. This was your free shot because you are new on SO.
Original situation: passing through transaction objects in method calls
Application helper classes:
package de.scrum_master.app;
public class Item {
private int id;
public Item(int id) {
this.id = id;
}
public int getId() {
return id;
}
#Override
public String toString() {
return "Item[id=" + id + "]";
}
}
package de.scrum_master.app;
public class SpecialOffer {}
package de.scrum_master.app;
public class OfferRepository {
public SpecialOffer getByItem(Item item) {
if (item.getId() < 30)
return new SpecialOffer();
return null;
}
}
package de.scrum_master.app;
import java.util.ArrayList;
import java.util.List;
public class Order {
private int id;
public Order(int id) {
this.id = id;
}
public List<Item> getItems() {
List<Item> items = new ArrayList<>();
int offset = id == 12345 ? 0 : 1;
items.add(new Item(11 + offset, this));
items.add(new Item(22 + offset, this));
items.add(new Item(33 + offset, this));
return items;
}
}
Trace classes:
package de.scrum_master.trace;
public enum TraceCategory {
Code, Database
}
package de.scrum_master.trace;
class TraceEvent {
private String name;
private TraceCategory category;
private long durationInMs;
private boolean finished = false;
public TraceEvent(String name, TraceCategory category, long startTime) {
this.name = name;
this.category = category;
this.durationInMs = startTime;
}
public long getDurationInMs() {
return durationInMs;
}
public void setDurationInMs(long durationInMs) {
this.durationInMs = durationInMs;
}
public boolean isFinished() {
return finished;
}
public void setFinished(boolean finished) {
this.finished = finished;
}
#Override
public String toString() {
return "TraceEvent[name=" + name + ", category=" + category +
", durationInMs=" + durationInMs + ", finished=" + finished + "]";
}
}
Transaction classes:
Here I tried to mimic your own Transaction class with as few as possible changes, but there was a lot I had to add and modify in order to emulate a simplified version of your trace output. This is not thread-safe and the way I am locating the last unfinished TraceEvent is not nice and only works cleanly if there are not exceptions. But you get the idea, I hope. The point is to just make it basically work and subsequently get log output similar to your example. If this was originally my code, I would have solved it differently.
package de.scrum_master.trace;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.UUID;
public class Transaction {
private String uuid = UUID.randomUUID().toString();
private List<TraceEvent> events = new ArrayList<>();
private Map<String, Integer> stats = new HashMap<>();
public void trace(String message) {
trace(message, TraceCategory.Code);
}
public void trace(String message, TraceCategory category) {
events.add(new TraceEvent(message, category, System.currentTimeMillis()));
}
public void stop() {
TraceEvent event = getLastUnfinishedEvent();
event.setDurationInMs(System.currentTimeMillis() - event.getDurationInMs());
event.setFinished(true);
}
private TraceEvent getLastUnfinishedEvent() {
return events
.stream()
.filter(event -> !event.isFinished())
.reduce((first, second) -> second)
.orElse(null);
}
public void addStat(String text, int size) {
stats.put(text, size);
}
public void incrementStat(String text, int increment) {
Integer currentCount = stats.get(text);
if (currentCount == null)
currentCount = 0;
stats.put(text, currentCount + increment);
}
#Override
public String toString() {
return "Transaction {" +
toStringUUID() +
toStringStats() +
toStringEvents() +
"\n}\n";
}
private String toStringUUID() {
return "\n uuid = " + uuid;
}
private String toStringStats() {
String result = "\n stats = {";
for (Entry<String, Integer> statEntry : stats.entrySet())
result += "\n " + statEntry;
return result + "\n }";
}
private String toStringEvents() {
String result = "\n events = {";
for (TraceEvent event : events)
result += "\n " + event;
return result + "\n }";
}
}
package de.scrum_master.trace;
public class TransactionManager {
public Transaction create() {
return new Transaction();
}
}
Example driver application:
package de.scrum_master.app;
import de.scrum_master.trace.TraceCategory;
import de.scrum_master.trace.Transaction;
import de.scrum_master.trace.TransactionManager;
public class Example {
private TransactionManager transactionManager = new TransactionManager();
private OfferRepository offerRepository = new OfferRepository();
public Order getOrderEntryPoint(int orderId) {
Transaction transaction = transactionManager.create();
transaction.trace("getOrderEntryPoint");
sleep(100);
Order order = getOrder(orderId, transaction);
transaction.stop();
System.out.println(transaction);
return order;
}
private Order getOrder(int orderId, Transaction t) {
t.trace("getOrder");
sleep(200);
Order order = new Order(orderId);
t.addStat("number of items", order.getItems().size());
for (Item item : order.getItems()) {
SpecialOffer offer = getSpecialOffer(item, t);
if (null != offer)
t.incrementStat("special offers", 1);
}
t.stop();
return order;
}
private SpecialOffer getSpecialOffer(Item item, Transaction t) {
t.trace("getSpecialOffer(" + item.getId() + ")", TraceCategory.Database);
sleep(50);
SpecialOffer specialOffer = offerRepository.getByItem(item);
t.stop();
return specialOffer;
}
private void sleep(long millis) {
try {
Thread.sleep(millis);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
new Example().getOrderEntryPoint(12345);
new Example().getOrderEntryPoint(23456);
}
}
If you run this code, the output is as follows:
Transaction {
uuid = 62ec9739-bd32-4a56-b6b3-a8a13624961a
stats = {
special offers=2
number of items=3
}
events = {
TraceEvent[name=getOrderEntryPoint, category=Code, durationInMs=561, finished=true]
TraceEvent[name=getOrder, category=Code, durationInMs=451, finished=true]
TraceEvent[name=getSpecialOffer(11), category=Database, durationInMs=117, finished=true]
TraceEvent[name=getSpecialOffer(22), category=Database, durationInMs=69, finished=true]
TraceEvent[name=getSpecialOffer(33), category=Database, durationInMs=63, finished=true]
}
}
Transaction {
uuid = a420cd70-96e5-44c4-a0a4-87e421d05e87
stats = {
special offers=2
number of items=3
}
events = {
TraceEvent[name=getOrderEntryPoint, category=Code, durationInMs=469, finished=true]
TraceEvent[name=getOrder, category=Code, durationInMs=369, finished=true]
TraceEvent[name=getSpecialOffer(12), category=Database, durationInMs=53, finished=true]
TraceEvent[name=getSpecialOffer(23), category=Database, durationInMs=63, finished=true]
TraceEvent[name=getSpecialOffer(34), category=Database, durationInMs=53, finished=true]
}
}
AOP refactoring
Preface
Please note that I am using AspectJ here because two things about your code would never work with Spring AOP because it works with a delegation pattern based on dynamic proxies:
self-invocation (internally calling a method of the same class or super-class)
intercepting private methods
Because of these Spring AOP limitations I advise you to either refactor your code so as to avoid the two issues above or to configure your Spring applications to use full AspectJ via LTW (load-time weaving) instead.
As you noticed, my sample code does not use Spring at all because AspectJ is completely independent of Spring and works with any Java application (or other JVM languages, too).
Refactoring idea
Now what should you do in order to get rid of passing around tracing information (Transaction objects), polluting your core application code and tangling it with trace calls?
You extract transaction tracing into an aspect taking care of all trace(..) and stop() calls.
Unfortunately your Transaction class contains different types of information and does different things, so you cannot completely get rid of context information about how to trace for each affected method. But at least you can extract that context information from the method bodies and transform it into a declarative form using annotations with parameters.
These annotations can be targeted by an aspect taking care of handling transaction tracing.
Added and updated code, iteration 1
Annotations related to transaction tracing:
package de.scrum_master.trace;
import static java.lang.annotation.ElementType.METHOD;
import static java.lang.annotation.RetentionPolicy.RUNTIME;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
#Retention(RUNTIME)
#Target(METHOD)
public #interface TransactionEntryPoint {}
package de.scrum_master.trace;
import static java.lang.annotation.ElementType.METHOD;
import static java.lang.annotation.RetentionPolicy.RUNTIME;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
#Retention(RUNTIME)
#Target(METHOD)
public #interface TransactionTrace {
String message() default "__METHOD_NAME__";
TraceCategory category() default TraceCategory.Code;
String addStat() default "";
String incrementStat() default "";
}
Refactored application classes with annotations:
package de.scrum_master.app;
import java.util.ArrayList;
import java.util.List;
import de.scrum_master.trace.TransactionTrace;
public class Order {
private int id;
public Order(int id) {
this.id = id;
}
#TransactionTrace(message = "", addStat = "number of items")
public List<Item> getItems() {
List<Item> items = new ArrayList<>();
int offset = id == 12345 ? 0 : 1;
items.add(new Item(11 + offset));
items.add(new Item(22 + offset));
items.add(new Item(33 + offset));
return items;
}
}
Nothing much here, only added an annotation to getItems(). But the sample application class changes massively, getting much cleaner and simpler:
package de.scrum_master.app;
import de.scrum_master.trace.TraceCategory;
import de.scrum_master.trace.TransactionEntryPoint;
import de.scrum_master.trace.TransactionTrace;
public class Example {
private OfferRepository offerRepository = new OfferRepository();
#TransactionEntryPoint
#TransactionTrace
public Order getOrderEntryPoint(int orderId) {
sleep(100);
Order order = getOrder(orderId);
return order;
}
#TransactionTrace
private Order getOrder(int orderId) {
sleep(200);
Order order = new Order(orderId);
for (Item item : order.getItems()) {
SpecialOffer offer = getSpecialOffer(item);
// Do something with special offers
}
return order;
}
#TransactionTrace(category = TraceCategory.Database, incrementStat = "specialOffers")
private SpecialOffer getSpecialOffer(Item item) {
sleep(50);
SpecialOffer specialOffer = offerRepository.getByItem(item);
return specialOffer;
}
private void sleep(long millis) {
try {
Thread.sleep(millis);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
new Example().getOrderEntryPoint(12345);
new Example().getOrderEntryPoint(23456);
}
}
See? Except for a few annotations there is nothing left of the transaction tracing logic, the application code only takes care of its core concern. If you also remove the sleep() method which only makes the application slower for demonstration purposes (because we want some nice statistics with measured times >0 ms), the class gets even more compact.
But of course we need to put the transaction tracing logic somewhere, more precisely modularise it into an AspectJ aspect:
Transaction tracing aspect:
package de.scrum_master.trace;
import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.Collection;
import java.util.stream.Collectors;
import org.aspectj.lang.JoinPoint;
import org.aspectj.lang.ProceedingJoinPoint;
import org.aspectj.lang.annotation.After;
import org.aspectj.lang.annotation.Around;
import org.aspectj.lang.annotation.Aspect;
import org.aspectj.lang.annotation.Pointcut;
import org.aspectj.lang.reflect.MethodSignature;
#Aspect("percflow(entryPoint())")
public class TransactionTraceAspect {
private static TransactionManager transactionManager = new TransactionManager();
private Transaction transaction = transactionManager.create();
#Pointcut("execution(* *(..)) && #annotation(de.scrum_master.trace.TransactionEntryPoint)")
private static void entryPoint() {}
#Around("execution(* *(..)) && #annotation(transactionTrace)")
public Object doTrace(ProceedingJoinPoint joinPoint, TransactionTrace transactionTrace) throws Throwable {
preTrace(transactionTrace, joinPoint);
Object result = joinPoint.proceed();
postTrace(transactionTrace);
addStat(transactionTrace, result);
incrementStat(transactionTrace, result);
return result;
}
private void preTrace(TransactionTrace transactionTrace, ProceedingJoinPoint joinPoint) {
String traceMessage = transactionTrace.message();
if ("".equals(traceMessage))
return;
MethodSignature signature = (MethodSignature) joinPoint.getSignature();
if ("__METHOD_NAME__".equals(traceMessage)) {
traceMessage = signature.getName() + "(";
traceMessage += Arrays.stream(joinPoint.getArgs()).map(arg -> arg.toString()).collect(Collectors.joining(", "));
traceMessage += ")";
}
transaction.trace(traceMessage, transactionTrace.category());
}
private void postTrace(TransactionTrace transactionTrace) {
if ("".equals(transactionTrace.message()))
return;
transaction.stop();
}
private void addStat(TransactionTrace transactionTrace, Object result) {
if ("".equals(transactionTrace.addStat()) || result == null)
return;
if (result instanceof Collection)
transaction.addStat(transactionTrace.addStat(), ((Collection<?>) result).size());
else if (result.getClass().isArray())
transaction.addStat(transactionTrace.addStat(), Array.getLength(result));
}
private void incrementStat(TransactionTrace transactionTrace, Object result) {
if ("".equals(transactionTrace.incrementStat()) || result == null)
return;
transaction.incrementStat(transactionTrace.incrementStat(), 1);
}
#After("entryPoint()")
public void logFinishedTransaction(JoinPoint joinPoint) {
System.out.println(transaction);
}
}
Let me explain what this aspect does:
#Pointcut(..) entryPoint() says: Find me all methods in the code annotated by #TransactionEntryPoint. This pointcut is used in two places:
#Aspect("percflow(entryPoint())") says: Create one aspect instance for each control flow beginning at a transaction entry point.
#After("entryPoint()") logFinishedTransaction(..) says: Execute this advice (AOP terminology for a method linked to a pointcut) after an entry point methods is finished. The corresponding method just prints the transaction statistics just like in the original code at the end of Example.getOrderEntryPoint(..).
#Around("execution(* *(..)) && #annotation(transactionTrace)") doTrace(..)says: Wrap methods annotated by TransactionTrace and do the following (method body):
add new trace element and start measuring time
execute original (wrapped) method and store result
update trace element with measured time
add one type of statistics (optional)
increment another type of statistics (optional)
return wrapped method's result to its caller
The private methods are just helpers for the #Around advice.
The console log when running the updated Example class and active AspectJ is:
Transaction {
uuid = 4529d325-c604-441d-8997-45ca659abb14
stats = {
specialOffers=2
number of items=3
}
events = {
TraceEvent[name=getOrderEntryPoint(12345), category=Code, durationInMs=468, finished=true]
TraceEvent[name=getOrder(12345), category=Code, durationInMs=366, finished=true]
TraceEvent[name=getSpecialOffer(Item[id=11]), category=Database, durationInMs=59, finished=true]
TraceEvent[name=getSpecialOffer(Item[id=22]), category=Database, durationInMs=50, finished=true]
TraceEvent[name=getSpecialOffer(Item[id=33]), category=Database, durationInMs=51, finished=true]
}
}
Transaction {
uuid = ef76a996-8621-478b-a376-e9f7a729a501
stats = {
specialOffers=2
number of items=3
}
events = {
TraceEvent[name=getOrderEntryPoint(23456), category=Code, durationInMs=452, finished=true]
TraceEvent[name=getOrder(23456), category=Code, durationInMs=351, finished=true]
TraceEvent[name=getSpecialOffer(Item[id=12]), category=Database, durationInMs=50, finished=true]
TraceEvent[name=getSpecialOffer(Item[id=23]), category=Database, durationInMs=50, finished=true]
TraceEvent[name=getSpecialOffer(Item[id=34]), category=Database, durationInMs=50, finished=true]
}
}
You see, it looks almost identical to the original application.
Idea for further simplification, iteration 2
When reading method Example.getOrder(int orderId) I was wondering why you are calling order.getItems(), looping over it and calling getSpecialOffer(item) inside the loop. In your sample code you do not use the results for anything other than updating the transaction trace object. I am assuming that in your real code you do something with the order and with the special offers in that method.
But just in case you really do not need those calls inside that method, I suggest
you factor the calls out right into the aspect, getting rid of the TransactionTrace annotation parameters String addStat() and String incrementStat().
The Example code would get even simpler and
the annotation #TransactionTrace(message = "", addStat = "number of items") in class would go away, too.
I am leaving this refactoring to you if you think it makes sense.

Controlling job execution based on exceptions in simple chunk job in Spring Batch

I'm having simple chunk CSV processing job.
I would like to change execution flow when there is particular type of error during processing (eg. invalid line structure)
In order to prevent throwing errors I need to provide custom exceptionHandler that will swallow parsing exception:
#Bean
fun processCsvStep(
stepBuilderFactory: StepBuilderFactory,
reader: ItemReader<InputRow>,
processor: ItemProcessor<InputRow, OutputObject>,
writer: ItemWriter<OutputObject>
) = stepBuilderFactory.get(PROCESS_CSV_STEP)
.chunk<InputRow, OutputObject>(
CHUNKS_NUMBER
)
.reader(reader)
.processor(processor)
.writer(writer)
.exceptionHandler { context: RepeatContext, throwable: Throwable ->
context.setTerminateOnly()
logger.error { "Exception during parsing: ${throwable.message}" }
}
.build()!!
Then in my Job I can rely only on rollback count:
#Bean
fun createCsvJob(jobs: JobBuilderFactory, processCsvStep: Step, moveCsvStep: Step, moveFailedCsvStep: Step) = jobs.get(PROCESS_CSV_JOB)
.start(processCsvStep)
.next { jobExecution: JobExecution, stepExecution: StepExecution ->
return#next when (stepExecution.rollbackCount) {
0 -> FlowExecutionStatus.COMPLETED
else -> FlowExecutionStatus.FAILED
}
}
.on(FlowExecutionStatus.FAILED.name)
.to(moveFailedCsvStep)
.on(FlowExecutionStatus.COMPLETED.name)
.to(moveCsvStep)
.end()
.build()!!
Is there any way to pass information from exception handler to JobExecutionDecider? I would like to make execution decision based on type of exception that happened during parsing. Is this possible?
I would like to make execution decision based on type of exception that happened during parsing. Is this possible?
You can get access to the exception that happened during the step from the decider through stepExecution#getFailureExceptions. Here is an example:
import java.util.Arrays;
import java.util.List;
import org.springframework.batch.core.Job;
import org.springframework.batch.core.JobParameters;
import org.springframework.batch.core.Step;
import org.springframework.batch.core.configuration.annotation.EnableBatchProcessing;
import org.springframework.batch.core.configuration.annotation.JobBuilderFactory;
import org.springframework.batch.core.configuration.annotation.StepBuilderFactory;
import org.springframework.batch.core.job.flow.FlowExecutionStatus;
import org.springframework.batch.core.job.flow.JobExecutionDecider;
import org.springframework.batch.core.launch.JobLauncher;
import org.springframework.batch.item.ItemReader;
import org.springframework.batch.item.ItemWriter;
import org.springframework.batch.item.support.ListItemReader;
import org.springframework.batch.repeat.RepeatStatus;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.AnnotationConfigApplicationContext;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
#Configuration
#EnableBatchProcessing
public class MyJob {
#Autowired
private JobBuilderFactory jobs;
#Autowired
private StepBuilderFactory steps;
#Bean
public ItemReader<Integer> itemReader() {
return new ListItemReader<>(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
}
#Bean
public ItemWriter<Integer> itemWriter() {
return items -> {
for (Integer item : items) {
if (items.contains(3)) {
throw new IllegalArgumentException("no 3!");
}
System.out.println("item = " + item);
}
};
}
#Bean
public Step step1() {
return steps.get("step1")
.<Integer, Integer>chunk(5)
.reader(itemReader())
.writer(itemWriter())
.build();
}
#Bean
public Step step2() {
return steps.get("step2")
.tasklet((contribution, chunkContext) -> {
System.out.println("step2");
return RepeatStatus.FINISHED;
})
.build();
}
#Bean
public Step step3() {
return steps.get("step3")
.tasklet((contribution, chunkContext) -> {
System.out.println("step3");
return RepeatStatus.FINISHED;
})
.build();
}
#Bean
public JobExecutionDecider decider() {
return (jobExecution, stepExecution) -> {
int rollbackCount = stepExecution.getRollbackCount();
List<Throwable> failureExceptions = stepExecution.getFailureExceptions();
System.out.println("rollbackCount = " + rollbackCount);
System.out.println("failureExceptions = " + failureExceptions);
// make the decision based on rollbackCount and/or failureExceptions and return status accordingly
return FlowExecutionStatus.COMPLETED;
};
}
#Bean
public Job job() {
return jobs.get("job")
.start(step1())
.on("*").to(decider())
.from(decider()).on("COMPLETED").to(step2())
.from(decider()).on("FAILED").to(step3())
.build()
.build();
}
public static void main(String[] args) throws Exception {
ApplicationContext context = new AnnotationConfigApplicationContext(MyJob.class);
JobLauncher jobLauncher = context.getBean(JobLauncher.class);
Job job = context.getBean(Job.class);
jobLauncher.run(job, new JobParameters());
}
}
In this example, if an exception occurs during step1, the decider can get it from the step execution and make the decision accordingly (go to step2 or step3).
So I'm not sure you really need an exception handler and a way to pass information to the decider. The same idea applies of you want to make the decision based on the rollbackCount, commitCount, readCount, or any other metric.
Hope this helps.

How do I configure spring-kafka to ignore messages in the wrong format?

We have an issue with one of our Kafka topics which is consumed by the DefaultKafkaConsumerFactory & ConcurrentMessageListenerContainer combination described here with a JsonDeserializer used by the Factory. Unfortunately someone got a little enthusiastic and published some invalid messages onto the topic. It appears that spring-kafka silently fails to process past the first of these messages. Is it possible to have spring-kafka log an error and continue? Looking at the error messages which are logged it seems that perhaps the Apache kafka-clients library should deal with the case that when iterating a batch of messages one or more of them may fail to parse?
The below code is an example test case illustrating this issue:
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.common.serialization.Serializer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.kafka.common.serialization.StringSerializer;
import org.junit.ClassRule;
import org.junit.Test;
import org.springframework.kafka.core.DefaultKafkaConsumerFactory;
import org.springframework.kafka.core.DefaultKafkaProducerFactory;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.kafka.listener.KafkaMessageListenerContainer;
import org.springframework.kafka.listener.MessageListener;
import org.springframework.kafka.listener.config.ContainerProperties;
import org.springframework.kafka.support.SendResult;
import org.springframework.kafka.support.serializer.JsonDeserializer;
import org.springframework.kafka.support.serializer.JsonSerializer;
import org.springframework.kafka.test.rule.KafkaEmbedded;
import org.springframework.kafka.test.utils.ContainerTestUtils;
import org.springframework.util.concurrent.ListenableFuture;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.springframework.kafka.test.hamcrest.KafkaMatchers.hasKey;
import static org.springframework.kafka.test.hamcrest.KafkaMatchers.hasValue;
/**
* #author jfreedman
*/
public class TestSpringKafka {
private static final String TOPIC1 = "spring.kafka.1.t";
#ClassRule
public static KafkaEmbedded embeddedKafka = new KafkaEmbedded(1, true, 1, TOPIC1);
#Test
public void submitMessageThenGarbageThenAnotherMessage() throws Exception {
final BlockingQueue<ConsumerRecord<String, JsonObject>> records = createListener(TOPIC1);
final KafkaTemplate<String, JsonObject> objectTemplate = createPublisher("json", new JsonSerializer<JsonObject>());
sendAndVerifyMessage(records, objectTemplate, "foo", new JsonObject("foo"), 0L);
// push some garbage text to Kafka which cannot be marshalled, this should not interrupt processing
final KafkaTemplate<String, String> garbageTemplate = createPublisher("garbage", new StringSerializer());
final SendResult<String, String> garbageResult = garbageTemplate.send(TOPIC1, "bar","bar").get(5, TimeUnit.SECONDS);
assertEquals(1L, garbageResult.getRecordMetadata().offset());
sendAndVerifyMessage(records, objectTemplate, "baz", new JsonObject("baz"), 2L);
}
private <T> KafkaTemplate<String, T> createPublisher(final String label, final Serializer<T> serializer) {
final Map<String, Object> producerProps = new HashMap<>();
producerProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, embeddedKafka.getBrokersAsString());
producerProps.put(ProducerConfig.CLIENT_ID_CONFIG, "TestPublisher-" + label);
producerProps.put(ProducerConfig.ACKS_CONFIG, "all");
producerProps.put(ProducerConfig.RETRIES_CONFIG, 2);
producerProps.put(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, 1);
producerProps.put(ProducerConfig.REQUEST_TIMEOUT_MS_CONFIG, 5000);
producerProps.put(ProducerConfig.MAX_BLOCK_MS_CONFIG, 5000);
producerProps.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
producerProps.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, serializer.getClass());
final DefaultKafkaProducerFactory<String, T> pf = new DefaultKafkaProducerFactory<>(producerProps);
pf.setValueSerializer(serializer);
return new KafkaTemplate<>(pf);
}
private BlockingQueue<ConsumerRecord<String, JsonObject>> createListener(final String topic) throws Exception {
final Map<String, Object> consumerProps = new HashMap<>();
consumerProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, embeddedKafka.getBrokersAsString());
consumerProps.put(ConsumerConfig.GROUP_ID_CONFIG, "TestConsumer");
consumerProps.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, true);
consumerProps.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "100");
consumerProps.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 15000);
consumerProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
consumerProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, JsonDeserializer.class);
final DefaultKafkaConsumerFactory<String, JsonObject> cf = new DefaultKafkaConsumerFactory<>(consumerProps);
cf.setValueDeserializer(new JsonDeserializer<>(JsonObject.class));
final KafkaMessageListenerContainer<String, JsonObject> container = new KafkaMessageListenerContainer<>(cf, new ContainerProperties(topic));
final BlockingQueue<ConsumerRecord<String, JsonObject>> records = new LinkedBlockingQueue<>();
container.setupMessageListener((MessageListener<String, JsonObject>) records::add);
container.setBeanName("TestListener");
container.start();
ContainerTestUtils.waitForAssignment(container, embeddedKafka.getPartitionsPerTopic());
return records;
}
private void sendAndVerifyMessage(final BlockingQueue<ConsumerRecord<String, JsonObject>> records,
final KafkaTemplate<String, JsonObject> template,
final String key, final JsonObject value,
final long expectedOffset) throws InterruptedException, ExecutionException, TimeoutException {
final ListenableFuture<SendResult<String, JsonObject>> future = template.send(TOPIC1, key, value);
final ConsumerRecord<String, JsonObject> record = records.poll(5, TimeUnit.SECONDS);
assertThat(record, hasKey(key));
assertThat(record, hasValue(value));
assertEquals(expectedOffset, future.get(5, TimeUnit.SECONDS).getRecordMetadata().offset());
}
public static final class JsonObject {
private String value;
public JsonObject() {}
JsonObject(final String value) {
this.value = value;
}
public String getValue() {
return value;
}
public void setValue(final String value) {
this.value = value;
}
#Override
public boolean equals(final Object o) {
if (this == o) { return true; }
if (o == null || getClass() != o.getClass()) { return false; }
final JsonObject that = (JsonObject) o;
return Objects.equals(value, that.value);
}
#Override
public int hashCode() {
return Objects.hash(value);
}
#Override
public String toString() {
return "JsonObject{" +
"value='" + value + '\'' +
'}';
}
}
}
I have a solution but I don't know if it's the best one, I extended JsonDeserializer as follows which results in a null value being consumed by spring-kafka and requires the necessary downstream changes to handle that case.
class SafeJsonDeserializer[A >: Null](targetType: Class[A], objectMapper: ObjectMapper) extends JsonDeserializer[A](targetType, objectMapper) with Logging {
override def deserialize(topic: String, data: Array[Byte]): A = try {
super.deserialize(topic, data)
} catch {
case e: Exception =>
logger.error("Failed to deserialize data [%s] from topic [%s]".format(new String(data), topic), e)
null
}
}
Starting from the spring-kafka-2.x.x, we now have the comfort of declaring beans in the config file for the interface KafkaListenerErrorHandler with a implementation something as
#Bean
public ConsumerAwareListenerErrorHandler listen3ErrorHandler() {
return (m, e, c) -> {
this.listen3Exception = e;
MessageHeaders headers = m.getHeaders();
c.seek(new org.apache.kafka.common.TopicPartition(
headers.get(KafkaHeaders.RECEIVED_TOPIC, String.class),
headers.get(KafkaHeaders.RECEIVED_PARTITION_ID, Integer.class)),
headers.get(KafkaHeaders.OFFSET, Long.class));
return null;
};
}
more resources can be found at https://docs.spring.io/spring-kafka/reference/htmlsingle/#annotation-error-handling There is also another link with the similar issue: Spring Kafka error handling - v1.1.x and How to handle SerializationException after deserialization
Use ErrorHandlingDeserializer2. This is a delegating key/value deserializer that catches exceptions, returning them in the headers as serialized java objects.
Under consumer configuration, add/update the below lines:
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.springframework.kafka.support.serializer.ErrorHandlingDeserializer2
configProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
classOf[ErrorHandlingDeserializer2[JsonDeserializer]].getName)
configProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, classOf[ErrorHandlingDeserializer2[StringDeserializer]].getName)
configProps.put(ErrorHandlingDeserializer2.KEY_DESERIALIZER_CLASS, classOf[StringDeserializer].getName)
configProps.put(ErrorHandlingDeserializer2.VALUE_DESERIALIZER_CLASS, classOf[JsonDeserializer].getName)

Exception in thread "pool-1-thread-7" java.lang.NullPointerException

I'm trying to figure it out y i get this exception as i wrote in the title.
i'm building a multi thread MMU (memory management unit) project, first i initialize and add some pages into the ram and to my HD also and then i create processes and running all the system. a bit of information about the project:
into runConfig i;m reading a json file with a list of processCycles and every processCycles including a list of processCycle that including a list of pageIds with a list of data of those pages.
i think i did everything ok but still i get this exception, can somebody help me?
MMUDriver is the class that run all the systems:
package hit.driver;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import com.google.gson.Gson;
import com.google.gson.JsonIOException;
import com.google.gson.JsonSyntaxException;
import com.google.gson.stream.JsonReader;
import com.hit.algorithm.IAlgoCache;
import com.hit.algorithm.LRUAlgoCacheImpl;
import com.hit.algorithm.MRUAlgoCacheImpl;
import com.hit.algorithm.RandomReplacementAlgoCacheImpl;
import hit.memoryunits.HardDisk;
import hit.memoryunits.MemoryManagementUnit;
import hit.memoryunits.MemoryManagementUnitTest;
import hit.memoryunits.Page;
import hit.processes.ProcessCycles;
import hit.processes.Process;
import hit.processes.RunConfiguration;
#SuppressWarnings("unused")
public class MMUDriver {
private static final String CONFIG_FILE_NAME="Configuration.json";
// private static final String CONFIG_FILE_NAME="test.json";
//read from JSON file the lists of processCycles
private static RunConfiguration readConfigurationFile() throws JsonIOException, JsonSyntaxException, FileNotFoundException{
RunConfiguration runConfig = new Gson().fromJson(new JsonReader(new FileReader(CONFIG_FILE_NAME)), RunConfiguration.class);
return runConfig;
}
//
private static List<Process> createProcesses(List<ProcessCycles> processCycles,MemoryManagementUnit mmu ){
int i=0;
List<Process> processList = new ArrayList<Process>();
for ( ProcessCycles currentListProcess : processCycles) {
Process process = new Process(++i , mmu , currentListProcess);
processList.add(process);
}
return processList;
}
//
private static void runProcesses(List<Process> processes){
//Executor is a interface that take care of threads מתי מתחיל טרד מתי נגמר איך הם ירוצו במקביל וכו'
//thread pool - אוסף של threads
ExecutorService executorservice = Executors.newCachedThreadPool();
for (Process process : processes)
executorservice.execute(process);
executorservice.shutdown();//
}
//
public static void main(String[] args) throws java.lang.InterruptedException,
InvocationTargetException, JsonIOException, JsonSyntaxException, ClassNotFoundException, IOException{
CLI cli = new CLI(System.in,System.out);
String [] configuration;
while((configuration = cli.getConfiguration()) != null){
IAlgoCache<Long, Long> algo = null;
int capacity = Integer.valueOf(configuration[1]);
switch(configuration[0]){ //which configuration to play on LRU|MRU|RR
case "LRU":
algo = new LRUAlgoCacheImpl<Long, Long>(capacity);
break;
case "MRU":
algo = new MRUAlgoCacheImpl<Long, Long>(capacity);
break;
case "RR":
algo = new RandomReplacementAlgoCacheImpl<Long, Long>(capacity);
break;
}
MemoryManagementUnit mmu = new MemoryManagementUnit(algo, capacity);
MemoryManagementUnitTest.test(mmu);//add some pages to ram for test
RunConfiguration runConfig = readConfigurationFile(); //Getting the user request pages throw json file
List<ProcessCycles> processCycles = runConfig.getProcessesCycles();
List<Process> processes = createProcesses(processCycles , mmu);
runProcesses(processes);
Map<Long, Page<byte[]>> bla= mmu.getRam().getRamPages();
System.out.println("1");
// mmu.shoutDown();
}
}
}
Process is like a thread in this project
package hit.processes;
import java.io.IOException;
import java.util.List;
import hit.memoryunits.MemoryManagementUnit;
import hit.memoryunits.Page;
#SuppressWarnings("unused")
public class Process implements Runnable {
private int id;
private MemoryManagementUnit mmu;
private ProcessCycles processCycles;
private Thread processThread=null;
public Process(int id, MemoryManagementUnit mmu, ProcessCycles processCycles){
this.id=id;
this.mmu=mmu;
this.processCycles=processCycles;
}
public int getId(){
return id;
}
public void setId(int id){
this.id=id;
}
/*
public void start(){
if(processThread==null)
processThread = new Thread(this,"process"); //
processThread.start();
}*/
#Override
public void run() {
synchronized(mmu){
//runing on all the list of processCycles
for (ProcessCycle currentProcessCycle : processCycles.getProcessCycles()) {
List<Long> pages = currentProcessCycle.getPages();
List<byte[]> data = currentProcessCycle.getData();
Page<byte[]>[] pagesList = null;
//creating a boolean array to know if the page is for writing or reading in size of pages.size()
boolean [] writePages = new boolean[pages.size()];
for (int i = 0; i < pages.size(); i++)
if(data.get(i) == null) //if the page is empty
writePages[i] = false;
else{
writePages[i] = true ;//if there is data in the page and we want to write it to the ram
}
try {
// Getting the pages from ram to see which we shall update
pagesList = mmu.getPages(pages.toArray(new Long[pages.size()]), writePages);
}
catch (ClassNotFoundException e1) {
e1.printStackTrace();
} catch (IOException e1) {
e1.printStackTrace();
}
for (int j = 0; j < pages.size(); j++)
if(writePages[j] == true) //if the page isn't for read only
pagesList[j].setContent(data.get(j));
try {
Thread.sleep(currentProcessCycle.getSleepMs());
}
catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
}
MMU class is that class that take care of all the requests from the user and manage all the pagging
package hit.memoryunits;
import java.io.FileNotFoundException;
import java.io.IOException;
import com.hit.algorithm.IAlgoCache;
public class MemoryManagementUnit {
private IAlgoCache<Long,Long> algo;
private RAM<Long, Page<byte[]>> ram;
public MemoryManagementUnit(IAlgoCache<Long, Long> algo, int ramCapacity) {
this.algo = algo;
this.ram= new RAM<Long, Page<byte[]>>(ramCapacity);
}
#SuppressWarnings("null")
public Page<byte[]> [] getPages(Long [] pageIds , boolean [] writePages) throws IOException, ClassNotFoundException{
for (int i = 0; i < pageIds.length; i++) {
if(algo.getElement((long) pageIds[i].hashCode())== null)//the page isn't exist in the cache
if(!ram.isFull())
{
if(writePages[i] == true){//if the page is for write we need to overwrite data
Page<byte[]> pageToRam = null;
pageToRam.setPageId(pageIds[i]);
pageToRam.setContent(null);
ram.addPage(pageToRam);// ram is not full need to upload from hd
}
else{//if the page is for read-only so we need to upload the page from HD
ram.addPage(HardDisk.getInstance().pageFault(pageIds[i]));// ram is not full need to upload from hd
}
algo.putElement(Long.valueOf(pageIds[i].hashCode()),pageIds[i]);//update cache
}
else
{
Page<byte[]> pageToRemove = ram.getPage(algo.putElement(Long.valueOf(pageIds[i].hashCode()),pageIds[i]));
ram.removePage(pageToRemove);
ram.addPage(HardDisk.getInstance().pageReplacement(pageToRemove,pageIds[i]));// ram full need to do replacement
algo.putElement(Long.valueOf(pageIds[i].hashCode()),pageIds[i]);//update cache
}
}
return ram.getPages(pageIds);
}
public RAM<Long, Page<byte[]>> getRam(){
return ram;
}
public IAlgoCache<Long,Long> getAlgo(){
return algo;
}
public void setAlgo(IAlgoCache<Long,Long> algo){
this.algo = algo;
}
public void setRam(RAM<Long, Page<byte[]>> ram){
this.ram = ram;
}
public void shoutDown() throws FileNotFoundException, IOException, ClassNotFoundException{
HardDisk.getInstance().saveToDisk(this.getRam().getRamPages());
}
}

Neo4j error caused by Lucene (Too many open files)

I've just started evaluating Neo4j to see how well its fits our use case.
I'm using the embedded Java API to insert edges and nodes into a graph.
After creating around 5000 nodes I get the following error (using Neo4j 2.1.6 and 2.1.7 on OS X Yosemite)
org.neo4j.graphdb.TransactionFailureException: Unable to commit transaction
Caused by: javax.transaction.xa.XAException
Caused by: org.neo4j.kernel.impl.nioneo.store.UnderlyingStorageException: java.io.FileNotFoundException: /Users/mihir.k/IdeaProjects/Turant/target/neo4j-hello-db/schema/label/lucene/_8zr.frq (Too many open files)
Caused by: java.io.FileNotFoundException: /Users/mihir.k/IdeaProjects/Turant/target/neo4j-hello-db/schema/label/lucene/_8zr.frq (Too many open files)
I've looked at numerous similar StackOverFlow questions and other related threads online. They all suggest increasing the max open files limit.
I've tried doing that.
These are my settings:
kern.maxfiles: 65536
kern.maxfilesperproc: 65536
However this hasn't fixed the error.
While the Neo4j code runs I tried using the lsof|wc -l command. The code always breaks when around 10000 files are open.
The following is the main class that deals with Neo4j:
import java.io.File;
import java.io.Serializable;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.neo4j.cypher.internal.compiler.v1_9.commands.True;
import org.neo4j.cypher.internal.compiler.v2_0.ast.False;
import org.neo4j.graphdb.*;
import org.neo4j.graphdb.factory.GraphDatabaseFactory;
import org.neo4j.graphdb.schema.Schema;
import org.neo4j.graphdb.schema.IndexDefinition;
import org.neo4j.graphdb.index.UniqueFactory;
import org.neo4j.graphdb.index.Index;
import org.neo4j.graphdb.index.IndexHits;
public class Neo4jDB implements Serializable {
private static final String DB_PATH = "target/neo4j-hello-db-spark";
IndexDefinition indexDefinition;
private static GraphDatabaseFactory dbFactory;
public static GraphDatabaseService db;
public void main(String[] args) {
System.out.println("Life is a disease, sexually transmitted and irrevocably fatal. Stop coding and read some Neil Gaiman.");
}
public void startDbInstance() {
db =new GraphDatabaseFactory().newEmbeddedDatabase(DB_PATH);
}
public Node createOrGetNode ( LabelsUser360 label , String key, String nodeName ,Map<String,Object> propertyMap)
{
System.out.println("Creating/Getting node");
try ( Transaction tx = db.beginTx() ) {
Node node;
if (db.findNodesByLabelAndProperty(label, key, nodeName).iterator().hasNext()) {
node = db.findNodesByLabelAndProperty(label, key, nodeName).iterator().next();
} else {
node = db.createNode(label);
node.setProperty(key, nodeName);
}
for (Map.Entry<String, Object> entry : propertyMap.entrySet()) {
node.setProperty(entry.getKey(), entry.getValue());
}
tx.success();
return node;
}
}
public void createUniquenessConstraint(LabelsUser360 label , String property)
{
try ( Transaction tx = db.beginTx() )
{
db.schema()
.constraintFor(label)
.assertPropertyIsUnique(property)
.create();
tx.success();
}
}
public void createOrUpdateRelationship(RelationshipsUser360 relationshipType ,Node startNode, Node endNode, Map<String,Object> propertyMap)
{
try ( Transaction tx = db.beginTx() ) {
if (startNode.hasRelationship(relationshipType, Direction.OUTGOING)) {
Relationship relationship = startNode.getSingleRelationship(relationshipType, Direction.OUTGOING);
for (Map.Entry<String, Object> entry : propertyMap.entrySet()) {
relationship.setProperty(entry.getKey(), entry.getValue());
}
} else {
Relationship relationship = startNode.createRelationshipTo(endNode, relationshipType);
for (Map.Entry<String, Object> entry : propertyMap.entrySet()) {
relationship.setProperty(entry.getKey(), entry.getValue());
}
}
tx.success();
}
}
public void registerShutdownHook( final GraphDatabaseService graphDb )
{
Runtime.getRuntime().addShutdownHook( new Thread()
{
#Override
public void run()
{
db.shutdown();
}
} );
}
}
There is another Neo4jAdapter class that is used to implement domain specific logic. It uses the Neo4jDB class to do add/update nodes/properties/relationships
import org.apache.lucene.index.IndexWriter;
import org.codehaus.jackson.map.ObjectMapper;
import org.json.*;
import org.neo4j.graphdb.*;
import org.neo4j.graphdb.factory.GraphDatabaseFactory;
import org.neo4j.graphdb.schema.IndexDefinition;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
public class Neo4jAdapter implements Serializable {
static Neo4jDB n4j = new Neo4jDB();
public static GraphDatabaseService db = Neo4jDB.db ;
public void begin() {
n4j.startDbInstance();
}
public static void main(String[] args) {}
public String graphPut(String jsonString) {
System.out.println("graphput called");
HashMap<String, Object> map = jsonToMap(jsonString); //Json deserializer
Node startNode = n4j.createOrGetNode(...);
Node endNode = n4j.createOrGetNode(...);
propertyMap = new HashMap<String, Object>();
propertyMap.put(....);
try (Transaction tx = Neo4jDB.db.beginTx()) {
Relationship relationship = startNode.getSingleRelationship(...);
if (relationship != null) {
Integer currentCount = (Integer) relationship.getProperty("count");
Integer updatedCount = currentCount + 1;
propertyMap.put("count", updatedCount);
} else {
Integer updatedCount = 1;
propertyMap.put("count", updatedCount);
}
tx.success();
}
n4j.createOrUpdateRelationship(RelationshipsUser360.BLAH, startNode, endNode, propertyMap);
}
}
}
return "Are you sponge worthy??";
}
}
Finally, there is a Sprak App that calls the "graphput" method of the Neo4jAdapter class. The relevant code snippet is (the following is scala+spark code) :
val graphdb : Neo4jAdapter = new Neo4jAdapter()
graphdb.begin()
linesEnriched.foreach(a=>graphdb.graphPutMap(a))
where 'a' is a json string and linesEnriched is a Spark RDD (basically a set of strings)