Please forgive me if this has been asked but I have not found any matches yet.
I have some PDF files where images are duplicated in each page's resources but never used in its content stream. I think this is causing the PDFSplit command to create very bloated pages. Is there any utility code or example for cleaning up unused resources like this, or at least a starting point I could work from?
I was able to clean up the resources for each page by gathering a list of the images used inside the page's content stream. With that list, I then checked the page's resources and removed any images that weren't used. See PageExtractor.stripUnusedImages below for implementation details.
The resources object can be shared between pages, so I also had to make sure each page had its own copy of it before removing images. See PageExtractor.copyResources below for implementation details.
The page splitter:
package org.apache.pdfbox.examples;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
public class PageExtractor {
private final Logger log = LoggerFactory.getLogger(this.getClass());
public PDDocument extractPage(PDDocument source, Integer pageNumber) throws IOException {
PDDocument targetPdf = new PDDocument();
targetPdf.getDocument().setVersion(source.getVersion());
targetPdf.setDocumentInformation(source.getDocumentInformation());
targetPdf.getDocumentCatalog().setViewerPreferences(source.getDocumentCatalog().getViewerPreferences());
PDPage sourcePage = source.getPage(pageNumber);
PDPage targetPage = targetPdf.importPage(sourcePage);
targetPage.setResources(sourcePage.getResources());
stripUnusedImages(targetPage);
stripPageLinks(targetPage);
return targetPdf;
}
/**
* Collect the images used from a custom PDFStreamEngine (BI and DO operators)
* Create an empty COSDictionary
* Loop through the page's XObjects that are images and add them to the new COSDictionary if they were found in the PDFStreamEngine
* Assign the newly filled COSDictionary to the page's resource as COSName.XOBJECT
*/
protected void stripUnusedImages(PDPage page) throws IOException {
PDResources resources = copyResources(page);
COSDictionary pageObjects = (COSDictionary) resources.getCOSObject().getDictionaryObject(COSName.XOBJECT);
COSDictionary newObjects = new COSDictionary();
Set<String> imageNames = findImageNames(page);
Iterable<COSName> xObjectNames = resources.getXObjectNames();
for (COSName xObjectName : xObjectNames) {
if (resources.isImageXObject(xObjectName)) {
Boolean used = imageNames.contains(xObjectName.getName());
if (used) {
newObjects.setItem(xObjectName, pageObjects.getItem(xObjectName));
} else {
log.info("Found unused image: name={}", xObjectName.getName());
}
} else {
newObjects.setItem(xObjectName, pageObjects.getItem(xObjectName));
}
}
resources.getCOSObject().setItem(COSName.XOBJECT, newObjects);
page.setResources(resources);
}
/**
* It is necessary to copy the page's resources since it can be shared with other pages. We must ensure changes
* to the resources are scoped to the current page.
*/
protected PDResources copyResources(PDPage page) {
return new PDResources(new COSDictionary(page.getResources().getCOSObject()));
}
protected Set<String> findImageNames(PDPage page) throws IOException {
Set<String> imageNames = new HashSet<>();
PdfImageStreamEngine engine = new PdfImageStreamEngine() {
@Override
void handleImage(Operator operator, List<COSBase> operands) {
COSName name = (COSName) operands.get(0);
imageNames.add(name.getName());
}
};
engine.processPage(page);
return imageNames;
}
/**
* Borrowed from PDFBox page splitter
*
* @see org.apache.pdfbox.multipdf.Splitter#processAnnotations(org.apache.pdfbox.pdmodel.PDPage)
*/
protected void stripPageLinks(PDPage imported) throws IOException {
List<PDAnnotation> annotations = imported.getAnnotations();
for (PDAnnotation annotation : annotations) {
if (annotation instanceof PDAnnotationLink) {
PDAnnotationLink link = (PDAnnotationLink) annotation;
PDDestination destination = link.getDestination();
if (destination == null && link.getAction() != null) {
PDAction action = link.getAction();
if (action instanceof PDActionGoTo) {
destination = ((PDActionGoTo) action).getDestination();
}
}
if (destination instanceof PDPageDestination) {
// TODO preserve links to pages within the splitted result
((PDPageDestination) destination).setPage(null);
}
}
// TODO preserve links to pages within the splitted result
annotation.setPage(null);
}
}
}
The stream reader used to analyze the page's images:
package org.apache.pdfbox.examples;
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorProcessor;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup;
import java.io.IOException;
import java.util.List;
abstract public class PdfImageStreamEngine extends PDFStreamEngine {
PdfImageStreamEngine() {
addOperator(new DrawObjectCounter());
}
abstract void handleImage(Operator operator, List<COSBase> operands);
protected class DrawObjectCounter extends OperatorProcessor {
@Override
public void process(Operator operator, List<COSBase> operands) throws IOException {
if (operands != null && isImage(operands.get(0))) {
handleImage(operator, operands);
}
}
protected Boolean isImage(COSBase base) throws IOException {
if (!(base instanceof COSName)) {
return false;
}
COSName name = (COSName)base;
if (context.getResources().isImageXObject(name)) {
return true;
}
PDXObject xObject = context.getResources().getXObject(name);
if (xObject instanceof PDTransparencyGroup) {
context.showTransparencyGroup((PDTransparencyGroup)xObject);
} else if (xObject instanceof PDFormXObject) {
context.showForm((PDFormXObject)xObject);
}
return false;
}
@Override
public String getName() {
return "Do";
}
}
}
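For anyone looking for a starting point to drive the classes above, here is a minimal usage sketch. It is not part of the original code: the PageExtractorDemo class and the file names are placeholders, and it assumes PDFBox 2.x, where PDDocument.load(File) is available.
package org.apache.pdfbox.examples;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
public class PageExtractorDemo {
    public static void main(String[] args) throws IOException {
        // Hypothetical input/output file names, for illustration only.
        try (PDDocument source = PDDocument.load(new File("input.pdf"))) {
            PageExtractor extractor = new PageExtractor();
            for (int i = 0; i < source.getNumberOfPages(); i++) {
                // extractPage() imports one page into a new document and strips
                // image XObjects never referenced by the page's content stream.
                try (PDDocument singlePage = extractor.extractPage(source, i)) {
                    singlePage.save(new File("page-" + (i + 1) + ".pdf"));
                }
            }
        }
    }
}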
On our online system, a Storm bolt gets a NullPointerException, even though I think I check for null before line 61. It only gets the NullPointerException once in a while.
import ***.KeyUtils;
import ***.redis.PipelineHelper;
import ***.redis.PipelinedCacheClusterClient;
import **.redis.R2mClusterClient;
import org.apache.commons.lang3.StringUtils;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Tuple;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import java.util.Map;
/**
* RedisBolt - batched Redis operations
*/
public class RedisBolt implements IRichBolt {
static final long serialVersionUID = 737015318988609460L;
private static ApplicationContext applicationContext;
private static long logEmitNumber = 0;
private static StringBuffer totalCmds = new StringBuffer();
private Logger logger = LoggerFactory.getLogger(getClass());
private OutputCollector _collector;
private R2mClusterClient r2mClusterClient;
@Override
public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
_collector = outputCollector;
if (applicationContext == null) {
applicationContext = new ClassPathXmlApplicationContext("spring/spring-config-redisbolt.xml");
}
if (r2mClusterClient == null) {
r2mClusterClient = (R2mClusterClient) applicationContext.getBean("r2mClusterClient");
}
}
@Override
public void execute(Tuple tuple) {
String log = tuple.getString(0);
String lastCommands = tuple.getString(1);
try {
//log count
if (StringUtils.isNotEmpty(log)) {
logEmitNumber++;
}
if (StringUtils.isNotEmpty(lastCommands)) {
if(totalCmds==null){
totalCmds = new StringBuffer();
}
totalCmds.append(lastCommands);//line 61
}
//log count limit
int numberLimit = 1;
String flow_log_limit = r2mClusterClient.get(KeyUtils.KEY_PIPELINE_LIMIT);
if (StringUtils.isNotEmpty(flow_log_limit)) {
try {
numberLimit = Integer.parseInt(flow_log_limit);
} catch (Exception e) {
numberLimit = 1;
logger.error("error", e);
}
}
if (logEmitNumber >= numberLimit) {
StringBuffer _totalCmds = new StringBuffer(totalCmds);
try {
//pipeline submit
PipelinedCacheClusterClient pip = r2mClusterClient.pipelined();
String[] commandArray = _totalCmds.toString().split(KeyUtils.REDIS_CMD_SPILT);
PipelineHelper.cmd(pip, commandArray);
pip.sync();
pip.close();
totalCmds = new StringBuffer();
} catch (Exception e) {
logger.error("error", e);
}
logEmitNumber = 0;
}
} catch (Exception e) {
logger.error(new StringBuffer("====RedisBolt error for log=[ ").append(log).append("] \n commands=[").append(lastCommands).append("]").toString(), e);
_collector.reportError(e);
_collector.fail(tuple);
}
_collector.ack(tuple);
}
@Override
public void cleanup() {
}
@Override
public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
}
@Override
public Map<String, Object> getComponentConfiguration() {
return null;
}
}
exception info:
java.lang.NullPointerException
    at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:113)
    at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:415)
    at java.lang.StringBuffer.append(StringBuffer.java:237)
    at com.jd.jr.dataeye.storm.bolt.RedisBolt.execute(RedisBolt.java:61)
    at org.apache.storm.daemon.executor$fn__5044$tuple_action_fn__5046.invoke(executor.clj:727)
    at org.apache.storm.daemon.executor$mk_task_receiver$fn__4965.invoke(executor.clj:459)
    at org.apache.storm.disruptor$clojure_handler$reify__4480.onEvent(disruptor.clj:40)
    at org.apache.storm.utils.DisruptorQueue.consumeBatchToCursor(DisruptorQueue.java:472)
    at org.apache.storm.utils.DisruptorQueue.consumeBatchWhenAvailable(DisruptorQueue.java:451)
    at org.apache.storm.disruptor$consume_batch_when_available.invoke(disruptor.clj:73)
    at org.apache.storm.daemon.executor$fn__5044$fn__5057$fn__5110.invoke(executor.clj:846)
    at org.apache.storm.util$async_loop$fn__557.invoke(util.clj:484)
    at clojure.lang.AFn.run(AFn.java:22)
    at java.lang.Thread.run(Thread.java:745)
Can anyone give me some advice on how to find the cause?
That is a really odd thing to happen. Please read the code of these two classes:
https://github.com/openjdk-mirror/jdk7u-jdk/blob/master/src/share/classes/java/lang/AbstractStringBuilder.java
https://github.com/openjdk-mirror/jdk7u-jdk/blob/master/src/share/classes/java/lang/StringBuffer.java
AbstractStringBuilder has a no-arg constructor which doesn't allocate the 'value' field, so accessing 'value' on such an instance throws an NPE. No constructor in StringBuffer uses that no-arg constructor, though, so 'value' should always be allocated. Maybe something odd happens during serialization/deserialization and, unfortunately, the 'value' field in AbstractStringBuilder ends up being null.
Maybe initializing totalCmds in prepare() would be better, and you also need to consider synchronization (thread-safety) between bolts. prepare() is called per bolt instance, so instance fields are effectively confined to one executor, but static (class) fields are shared and not thread-safe.
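A minimal sketch of that idea follows. The class name RedisBoltSketch is just illustrative and the Spring/Redis wiring is elided; the point is that the accumulator and counter become per-instance state created in prepare() rather than shared static fields populated during deserialization.
import java.util.Map;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Tuple;
public class RedisBoltSketch implements IRichBolt {
    private static final long serialVersionUID = 1L;
    // Instance fields (not static): each executor works on its own state,
    // created in prepare() after the bolt has been deserialized on the worker.
    private transient StringBuffer totalCmds;
    private transient long logEmitNumber;
    private transient OutputCollector collector;
    @Override
    public void prepare(Map conf, TopologyContext context, OutputCollector outputCollector) {
        collector = outputCollector;
        totalCmds = new StringBuffer();
        logEmitNumber = 0;
        // ... Spring context / Redis client initialization as in the original bolt ...
    }
    @Override
    public void execute(Tuple tuple) {
        String lastCommands = tuple.getString(1);
        if (lastCommands != null && !lastCommands.isEmpty()) {
            totalCmds.append(lastCommands); // no shared static buffer, no cross-thread races
        }
        // ... flush logic as in the original bolt ...
        collector.ack(tuple);
    }
    @Override public void cleanup() {}
    @Override public void declareOutputFields(OutputFieldsDeclarer declarer) {}
    @Override public Map<String, Object> getComponentConfiguration() { return null; }
}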
I think I may have found the problem.
The key point is
"StringBuffer _totalCmds = new StringBuffer(totalCmds);" and "totalCmds.append(lastCommands); //line 61"
Creating a new object takes several steps:
(1) allocate memory and publish the reference
(2) run the constructor/initialization
If append() runs after (1) but before (2), then, since StringBuffer extends AbstractStringBuilder, whose character storage is
/**
* The value is used for character storage.
*/
char[] value;
the 'value' array is not yet initialized, so this method dereferences null:
@Override
public synchronized void ensureCapacity(int minimumCapacity) {
if (minimumCapacity > value.length) {
expandCapacity(minimumCapacity);
}
}
This bolt also has another problem: some data may be lost in a multithreaded environment.
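If the buffer really must be shared between executors, one possible way to avoid losing appends during the flush is to append and drain under a common lock, so the snapshot-and-reset step is atomic. A small sketch (CommandAccumulator is a hypothetical helper, not part of the original bolt):
/** Minimal sketch of a drain-safe accumulator shared between executor threads. */
public class CommandAccumulator {
    private final Object lock = new Object();
    // StringBuilder is fine here because all access happens under the lock.
    private StringBuilder buffer = new StringBuilder();
    public void append(String commands) {
        synchronized (lock) {
            buffer.append(commands);
        }
    }
    /** Atomically takes the current contents and resets the buffer, so no append is lost. */
    public String drain() {
        synchronized (lock) {
            String snapshot = buffer.toString();
            buffer = new StringBuilder();
            return snapshot;
        }
    }
}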
We have an issue with one of our Kafka topics, which is consumed by the DefaultKafkaConsumerFactory and ConcurrentMessageListenerContainer combination described here, with a JsonDeserializer used by the factory. Unfortunately, someone got a little enthusiastic and published some invalid messages onto the topic. It appears that spring-kafka silently fails to process past the first of these messages. Is it possible to have spring-kafka log an error and continue? Looking at the error messages that are logged, it seems that perhaps the Apache kafka-clients library should handle the case where, when iterating a batch of messages, one or more of them fails to parse.
The below code is an example test case illustrating this issue:
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.common.serialization.Serializer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.kafka.common.serialization.StringSerializer;
import org.junit.ClassRule;
import org.junit.Test;
import org.springframework.kafka.core.DefaultKafkaConsumerFactory;
import org.springframework.kafka.core.DefaultKafkaProducerFactory;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.kafka.listener.KafkaMessageListenerContainer;
import org.springframework.kafka.listener.MessageListener;
import org.springframework.kafka.listener.config.ContainerProperties;
import org.springframework.kafka.support.SendResult;
import org.springframework.kafka.support.serializer.JsonDeserializer;
import org.springframework.kafka.support.serializer.JsonSerializer;
import org.springframework.kafka.test.rule.KafkaEmbedded;
import org.springframework.kafka.test.utils.ContainerTestUtils;
import org.springframework.util.concurrent.ListenableFuture;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.springframework.kafka.test.hamcrest.KafkaMatchers.hasKey;
import static org.springframework.kafka.test.hamcrest.KafkaMatchers.hasValue;
/**
* @author jfreedman
*/
public class TestSpringKafka {
private static final String TOPIC1 = "spring.kafka.1.t";
@ClassRule
public static KafkaEmbedded embeddedKafka = new KafkaEmbedded(1, true, 1, TOPIC1);
@Test
public void submitMessageThenGarbageThenAnotherMessage() throws Exception {
final BlockingQueue<ConsumerRecord<String, JsonObject>> records = createListener(TOPIC1);
final KafkaTemplate<String, JsonObject> objectTemplate = createPublisher("json", new JsonSerializer<JsonObject>());
sendAndVerifyMessage(records, objectTemplate, "foo", new JsonObject("foo"), 0L);
// push some garbage text to Kafka which cannot be marshalled, this should not interrupt processing
final KafkaTemplate<String, String> garbageTemplate = createPublisher("garbage", new StringSerializer());
final SendResult<String, String> garbageResult = garbageTemplate.send(TOPIC1, "bar","bar").get(5, TimeUnit.SECONDS);
assertEquals(1L, garbageResult.getRecordMetadata().offset());
sendAndVerifyMessage(records, objectTemplate, "baz", new JsonObject("baz"), 2L);
}
private <T> KafkaTemplate<String, T> createPublisher(final String label, final Serializer<T> serializer) {
final Map<String, Object> producerProps = new HashMap<>();
producerProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, embeddedKafka.getBrokersAsString());
producerProps.put(ProducerConfig.CLIENT_ID_CONFIG, "TestPublisher-" + label);
producerProps.put(ProducerConfig.ACKS_CONFIG, "all");
producerProps.put(ProducerConfig.RETRIES_CONFIG, 2);
producerProps.put(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, 1);
producerProps.put(ProducerConfig.REQUEST_TIMEOUT_MS_CONFIG, 5000);
producerProps.put(ProducerConfig.MAX_BLOCK_MS_CONFIG, 5000);
producerProps.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
producerProps.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, serializer.getClass());
final DefaultKafkaProducerFactory<String, T> pf = new DefaultKafkaProducerFactory<>(producerProps);
pf.setValueSerializer(serializer);
return new KafkaTemplate<>(pf);
}
private BlockingQueue<ConsumerRecord<String, JsonObject>> createListener(final String topic) throws Exception {
final Map<String, Object> consumerProps = new HashMap<>();
consumerProps.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, embeddedKafka.getBrokersAsString());
consumerProps.put(ConsumerConfig.GROUP_ID_CONFIG, "TestConsumer");
consumerProps.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, true);
consumerProps.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "100");
consumerProps.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 15000);
consumerProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
consumerProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, JsonDeserializer.class);
final DefaultKafkaConsumerFactory<String, JsonObject> cf = new DefaultKafkaConsumerFactory<>(consumerProps);
cf.setValueDeserializer(new JsonDeserializer<>(JsonObject.class));
final KafkaMessageListenerContainer<String, JsonObject> container = new KafkaMessageListenerContainer<>(cf, new ContainerProperties(topic));
final BlockingQueue<ConsumerRecord<String, JsonObject>> records = new LinkedBlockingQueue<>();
container.setupMessageListener((MessageListener<String, JsonObject>) records::add);
container.setBeanName("TestListener");
container.start();
ContainerTestUtils.waitForAssignment(container, embeddedKafka.getPartitionsPerTopic());
return records;
}
private void sendAndVerifyMessage(final BlockingQueue<ConsumerRecord<String, JsonObject>> records,
final KafkaTemplate<String, JsonObject> template,
final String key, final JsonObject value,
final long expectedOffset) throws InterruptedException, ExecutionException, TimeoutException {
final ListenableFuture<SendResult<String, JsonObject>> future = template.send(TOPIC1, key, value);
final ConsumerRecord<String, JsonObject> record = records.poll(5, TimeUnit.SECONDS);
assertThat(record, hasKey(key));
assertThat(record, hasValue(value));
assertEquals(expectedOffset, future.get(5, TimeUnit.SECONDS).getRecordMetadata().offset());
}
public static final class JsonObject {
private String value;
public JsonObject() {}
JsonObject(final String value) {
this.value = value;
}
public String getValue() {
return value;
}
public void setValue(final String value) {
this.value = value;
}
@Override
public boolean equals(final Object o) {
if (this == o) { return true; }
if (o == null || getClass() != o.getClass()) { return false; }
final JsonObject that = (JsonObject) o;
return Objects.equals(value, that.value);
}
@Override
public int hashCode() {
return Objects.hash(value);
}
@Override
public String toString() {
return "JsonObject{" +
"value='" + value + '\'' +
'}';
}
}
}
I have a solution, but I don't know if it's the best one. I extended JsonDeserializer as follows, which results in a null value being consumed by spring-kafka and requires the necessary downstream changes to handle that case.
class SafeJsonDeserializer[A >: Null](targetType: Class[A], objectMapper: ObjectMapper) extends JsonDeserializer[A](targetType, objectMapper) with Logging {
override def deserialize(topic: String, data: Array[Byte]): A = try {
super.deserialize(topic, data)
} catch {
case e: Exception =>
logger.error("Failed to deserialize data [%s] from topic [%s]".format(new String(data), topic), e)
null
}
}
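For a Java project, a rough equivalent of that Scala class might look like the sketch below. It is only an illustration: it assumes spring-kafka's JsonDeserializer(Class, ObjectMapper) constructor and that downstream code tolerates null payloads.
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.kafka.support.serializer.JsonDeserializer;
public class SafeJsonDeserializer<T> extends JsonDeserializer<T> {
    private static final Logger LOG = LoggerFactory.getLogger(SafeJsonDeserializer.class);
    public SafeJsonDeserializer(Class<T> targetType, ObjectMapper objectMapper) {
        super(targetType, objectMapper);
    }
    @Override
    public T deserialize(String topic, byte[] data) {
        try {
            return super.deserialize(topic, data);
        } catch (Exception e) {
            // Log and swallow the failure so one bad record does not stall the container.
            LOG.error("Failed to deserialize data [{}] from topic [{}]", new String(data), topic, e);
            return null; // downstream must be prepared for null values
        }
    }
}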
Starting from spring-kafka 2.x, we can declare a bean in the configuration for the KafkaListenerErrorHandler interface, with an implementation something like this:
@Bean
public ConsumerAwareListenerErrorHandler listen3ErrorHandler() {
return (m, e, c) -> {
this.listen3Exception = e;
MessageHeaders headers = m.getHeaders();
c.seek(new org.apache.kafka.common.TopicPartition(
headers.get(KafkaHeaders.RECEIVED_TOPIC, String.class),
headers.get(KafkaHeaders.RECEIVED_PARTITION_ID, Integer.class)),
headers.get(KafkaHeaders.OFFSET, Long.class));
return null;
};
}
More resources can be found at https://docs.spring.io/spring-kafka/reference/htmlsingle/#annotation-error-handling. There are also other threads covering similar issues: Spring Kafka error handling - v1.1.x and How to handle SerializationException after deserialization.
Use ErrorHandlingDeserializer2. This is a delegating key/value deserializer that catches exceptions, returning them in the headers as serialized Java objects.
In the consumer configuration, add or update the lines below:
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.springframework.kafka.support.serializer.ErrorHandlingDeserializer2
configProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
classOf[ErrorHandlingDeserializer2[JsonDeserializer]].getName)
configProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, classOf[ErrorHandlingDeserializer2[StringDeserializer]].getName)
configProps.put(ErrorHandlingDeserializer2.KEY_DESERIALIZER_CLASS, classOf[StringDeserializer].getName)
configProps.put(ErrorHandlingDeserializer2.VALUE_DESERIALIZER_CLASS, classOf[JsonDeserializer].getName)
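For a plain Java configuration (matching the Java test above), the same idea would look roughly like the sketch below. It assumes spring-kafka 2.2+, where ErrorHandlingDeserializer2 and these property constants exist; the broker address is a placeholder.
import java.util.HashMap;
import java.util.Map;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.kafka.support.serializer.ErrorHandlingDeserializer2;
import org.springframework.kafka.support.serializer.JsonDeserializer;
public class ErrorHandlingConsumerProps {
    /** Builds consumer properties that wrap the real deserializers in ErrorHandlingDeserializer2. */
    public static Map<String, Object> consumerProps(String brokers) {
        Map<String, Object> props = new HashMap<>();
        props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
        // The outer deserializers catch exceptions instead of letting them break the poll loop.
        props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ErrorHandlingDeserializer2.class);
        props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ErrorHandlingDeserializer2.class);
        // The delegates that do the actual deserialization work.
        props.put(ErrorHandlingDeserializer2.KEY_DESERIALIZER_CLASS, StringDeserializer.class);
        props.put(ErrorHandlingDeserializer2.VALUE_DESERIALIZER_CLASS, JsonDeserializer.class);
        // Note: the JsonDeserializer delegate typically also needs its target type configured
        // (e.g. via the spring.json.value.default.type property).
        return props;
    }
}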
I am trying to build a simple Java program that obtains flight options for a flight from New York to London using the Google QPX Express API. I signed up with Google and got an API key.
I read the documentation, but unfortunately I couldn't find any example showing how to do it.
Here is what I have tried so far:
import com.google.api.services.qpxExpress.model.*;
import java.util.*;
public class Main
{
public static void main(String[] args)
{
// Passengers
PassengerCounts passengers = new PassengerCounts();
passengers.setAdultCount(2);
// Slice
List<SliceInput> slices = new ArrayList<SliceInput>();
SliceInput slice = new SliceInput();
slice.setOrigin("JFK"); // John Kennedy Airport in Ney York
slice.setDestination("LHR"); // London Heathrow
slice.setDate("2015-07-01");
slices.add(slice);
// Options request
TripOptionsRequest tripOptions = new TripOptionsRequest();
tripOptions.setPassengers(passengers);
tripOptions.setSlice(slices);
// Search request
TripsSearchRequest tripSearchReq = new TripsSearchRequest();
tripSearchReq.setRequest(tripOptions);
// Next steps?
// Setting up QPXExpress?
}
}
I would appreciate it if someone could help me complete the code.
Thanks in advance.
This is a simple piece of code, successfully tested; I hope it helps.
package com.airline.api;
import java.io.IOException;
import java.util.*;
import com.google.api.client.auth.oauth2.Credential;
import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport;
import com.google.api.client.http.HttpRequestInitializer;
import com.google.api.client.http.HttpTransport;
import com.google.api.client.json.JsonFactory;
import com.google.api.client.json.jackson2.JacksonFactory;
import com.google.api.services.qpxExpress.QPXExpressRequestInitializer;
import com.google.api.services.qpxExpress.QPXExpress;
import com.google.api.services.qpxExpress.model.FlightInfo;
import com.google.api.services.qpxExpress.model.LegInfo;
import com.google.api.services.qpxExpress.model.PassengerCounts;
import com.google.api.services.qpxExpress.model.PricingInfo;
import com.google.api.services.qpxExpress.model.SegmentInfo;
import com.google.api.services.qpxExpress.model.SliceInfo;
import com.google.api.services.qpxExpress.model.TripOption;
import com.google.api.services.qpxExpress.model.TripOptionsRequest;
import com.google.api.services.qpxExpress.model.TripsSearchRequest;
import com.google.api.services.qpxExpress.model.SliceInput;
import com.google.api.services.qpxExpress.model.TripsSearchResponse;
public class AirlineReservation {
/**
* @param args
*/
private static final String APPLICATION_NAME = "MyFlightApplication";
private static final String API_KEY = "AIzaSyDnBCdsmTnrL5XFrO2TjJyvFioswjakNYU";
/** Global instance of the HTTP transport. */
private static HttpTransport httpTransport;
/** Global instance of the JSON factory. */
private static final JsonFactory JSON_FACTORY = JacksonFactory.getDefaultInstance();
public static void main(String[] args) {
// TODO Auto-generated method stub
try {
httpTransport = GoogleNetHttpTransport.newTrustedTransport();
PassengerCounts passengers= new PassengerCounts();
passengers.setAdultCount(1);
List<SliceInput> slices = new ArrayList<SliceInput>();
SliceInput slice = new SliceInput();
slice.setOrigin("NYC");
slice.setDestination("LGA");
slice.setDate("2015-04-29");
slices.add(slice);
TripOptionsRequest request= new TripOptionsRequest();
request.setSolutions(10);
request.setPassengers(passengers);
request.setSlice(slices);
TripsSearchRequest parameters = new TripsSearchRequest();
parameters.setRequest(request);
QPXExpress qpXExpress= new QPXExpress.Builder(httpTransport, JSON_FACTORY, null).setApplicationName(APPLICATION_NAME)
.setGoogleClientRequestInitializer(new QPXExpressRequestInitializer(API_KEY)).build();
TripsSearchResponse list= qpXExpress.trips().search(parameters).execute();
List<TripOption> tripResults=list.getTrips().getTripOption();
String id;
for(int i=0; i<tripResults.size(); i++){
//Trip Option ID
id= tripResults.get(i).getId();
System.out.println("id "+id);
//Slice
List<SliceInfo> sliceInfo= tripResults.get(i).getSlice();
for(int j=0; j<sliceInfo.size(); j++){
int duration= sliceInfo.get(j).getDuration();
System.out.println("duration "+duration);
List<SegmentInfo> segInfo= sliceInfo.get(j).getSegment();
for(int k=0; k<segInfo.size(); k++){
String bookingCode= segInfo.get(k).getBookingCode();
System.out.println("bookingCode "+bookingCode);
FlightInfo flightInfo=segInfo.get(k).getFlight();
String flightNum= flightInfo.getNumber();
System.out.println("flightNum "+flightNum);
String flightCarrier= flightInfo.getCarrier();
System.out.println("flightCarrier "+flightCarrier);
List<LegInfo> leg=segInfo.get(k).getLeg();
for(int l=0; l<leg.size(); l++){
String aircraft= leg.get(l).getAircraft();
System.out.println("aircraft "+aircraft);
String arrivalTime= leg.get(l).getArrivalTime();
System.out.println("arrivalTime "+arrivalTime);
String departTime=leg.get(l).getDepartureTime();
System.out.println("departTime "+departTime);
String dest=leg.get(l).getDestination();
System.out.println("Destination "+dest);
String destTer= leg.get(l).getDestinationTerminal();
System.out.println("DestTer "+destTer);
String origin=leg.get(l).getOrigin();
System.out.println("origun "+origin);
String originTer=leg.get(l).getOriginTerminal();
System.out.println("OriginTer "+originTer);
int durationLeg= leg.get(l).getDuration();
System.out.println("durationleg "+durationLeg);
int mil= leg.get(l).getMileage();
System.out.println("Milleage "+mil);
}
}
}
//Pricing
List<PricingInfo> priceInfo= tripResults.get(i).getPricing();
for(int p=0; p<priceInfo.size(); p++){
String price= priceInfo.get(p).getSaleTotal();
System.out.println("Price "+price);
}
}
return;
} catch (IOException e) {
System.err.println(e.getMessage());
} catch (Throwable t) {
t.printStackTrace();
}
System.exit(1);
}
}
I've just started evaluating Neo4j to see how well its fits our use case.
I'm using the embedded Java API to insert edges and nodes into a graph.
After creating around 5000 nodes, I get the following error (using Neo4j 2.1.6 and 2.1.7 on OS X Yosemite):
org.neo4j.graphdb.TransactionFailureException: Unable to commit transaction
Caused by: javax.transaction.xa.XAException
Caused by: org.neo4j.kernel.impl.nioneo.store.UnderlyingStorageException: java.io.FileNotFoundException: /Users/mihir.k/IdeaProjects/Turant/target/neo4j-hello-db/schema/label/lucene/_8zr.frq (Too many open files)
Caused by: java.io.FileNotFoundException: /Users/mihir.k/IdeaProjects/Turant/target/neo4j-hello-db/schema/label/lucene/_8zr.frq (Too many open files)
I've looked at numerous similar Stack Overflow questions and other related threads online. They all suggest increasing the maximum open files limit.
I've tried doing that.
These are my settings:
kern.maxfiles: 65536
kern.maxfilesperproc: 65536
However, this hasn't fixed the error.
While the Neo4j code runs, I monitored open files using the lsof | wc -l command. The code always breaks when around 10,000 files are open.
The following is the main class that deals with Neo4j:
import java.io.File;
import java.io.Serializable;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.neo4j.graphdb.*;
import org.neo4j.graphdb.factory.GraphDatabaseFactory;
import org.neo4j.graphdb.schema.Schema;
import org.neo4j.graphdb.schema.IndexDefinition;
import org.neo4j.graphdb.index.UniqueFactory;
import org.neo4j.graphdb.index.Index;
import org.neo4j.graphdb.index.IndexHits;
public class Neo4jDB implements Serializable {
private static final String DB_PATH = "target/neo4j-hello-db-spark";
IndexDefinition indexDefinition;
private static GraphDatabaseFactory dbFactory;
public static GraphDatabaseService db;
public void main(String[] args) {
System.out.println("Life is a disease, sexually transmitted and irrevocably fatal. Stop coding and read some Neil Gaiman.");
}
public void startDbInstance() {
db =new GraphDatabaseFactory().newEmbeddedDatabase(DB_PATH);
}
public Node createOrGetNode ( LabelsUser360 label , String key, String nodeName ,Map<String,Object> propertyMap)
{
System.out.println("Creating/Getting node");
try ( Transaction tx = db.beginTx() ) {
Node node;
if (db.findNodesByLabelAndProperty(label, key, nodeName).iterator().hasNext()) {
node = db.findNodesByLabelAndProperty(label, key, nodeName).iterator().next();
} else {
node = db.createNode(label);
node.setProperty(key, nodeName);
}
for (Map.Entry<String, Object> entry : propertyMap.entrySet()) {
node.setProperty(entry.getKey(), entry.getValue());
}
tx.success();
return node;
}
}
public void createUniquenessConstraint(LabelsUser360 label , String property)
{
try ( Transaction tx = db.beginTx() )
{
db.schema()
.constraintFor(label)
.assertPropertyIsUnique(property)
.create();
tx.success();
}
}
public void createOrUpdateRelationship(RelationshipsUser360 relationshipType ,Node startNode, Node endNode, Map<String,Object> propertyMap)
{
try ( Transaction tx = db.beginTx() ) {
if (startNode.hasRelationship(relationshipType, Direction.OUTGOING)) {
Relationship relationship = startNode.getSingleRelationship(relationshipType, Direction.OUTGOING);
for (Map.Entry<String, Object> entry : propertyMap.entrySet()) {
relationship.setProperty(entry.getKey(), entry.getValue());
}
} else {
Relationship relationship = startNode.createRelationshipTo(endNode, relationshipType);
for (Map.Entry<String, Object> entry : propertyMap.entrySet()) {
relationship.setProperty(entry.getKey(), entry.getValue());
}
}
tx.success();
}
}
public void registerShutdownHook( final GraphDatabaseService graphDb )
{
Runtime.getRuntime().addShutdownHook( new Thread()
{
@Override
public void run()
{
db.shutdown();
}
} );
}
}
There is another class, Neo4jAdapter, that implements the domain-specific logic. It uses the Neo4jDB class to add/update nodes, properties, and relationships:
import org.apache.lucene.index.IndexWriter;
import org.codehaus.jackson.map.ObjectMapper;
import org.json.*;
import org.neo4j.graphdb.*;
import org.neo4j.graphdb.factory.GraphDatabaseFactory;
import org.neo4j.graphdb.schema.IndexDefinition;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
public class Neo4jAdapter implements Serializable {
static Neo4jDB n4j = new Neo4jDB();
public static GraphDatabaseService db = Neo4jDB.db ;
public void begin() {
n4j.startDbInstance();
}
public static void main(String[] args) {}
public String graphPut(String jsonString) {
System.out.println("graphput called");
HashMap<String, Object> map = jsonToMap(jsonString); //Json deserializer
Node startNode = n4j.createOrGetNode(...);
Node endNode = n4j.createOrGetNode(...);
propertyMap = new HashMap<String, Object>();
propertyMap.put(....);
try (Transaction tx = Neo4jDB.db.beginTx()) {
Relationship relationship = startNode.getSingleRelationship(...);
if (relationship != null) {
Integer currentCount = (Integer) relationship.getProperty("count");
Integer updatedCount = currentCount + 1;
propertyMap.put("count", updatedCount);
} else {
Integer updatedCount = 1;
propertyMap.put("count", updatedCount);
}
tx.success();
}
n4j.createOrUpdateRelationship(RelationshipsUser360.BLAH, startNode, endNode, propertyMap);
return "Are you sponge worthy??";
}
}
Finally, there is a Spark app that calls the "graphPut" method of the Neo4jAdapter class. The relevant code snippet (Scala + Spark) is:
val graphdb : Neo4jAdapter = new Neo4jAdapter()
graphdb.begin()
linesEnriched.foreach(a=>graphdb.graphPutMap(a))
where 'a' is a JSON string and linesEnriched is a Spark RDD (basically a set of strings).