Full text search on Neo4j over rich text with HTML markup - Lucene

In my Neo4j application I have a Product entity with name and description fields. Both of these fields are used in legacy (Lucene-based) indexing.
Product.name is plain text and there are no issues there, but Product.description can contain HTML markup and elements.
Right now my index uses StandardAnalyzer(Version.LUCENE_36). What analyzer should I use in order to skip all HTML elements?
How can I tell the Neo4j Lucene index to ignore HTML elements in Product.description? I'd like to index only words.
UPDATED:
I have found the HTMLStripCharFilter class and reimplemented my Analyzer as follows:
public final class StandardAnalyzerV36 extends Analyzer {
    private Analyzer analyzer;

    public StandardAnalyzerV36() {
        analyzer = new StandardAnalyzer(Version.LUCENE_36);
    }

    public StandardAnalyzerV36(Set<?> stopWords) {
        analyzer = new StandardAnalyzer(Version.LUCENE_36, stopWords);
    }

    @Override
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        return analyzer.tokenStream(fieldName, new HTMLStripCharFilter(CharReader.get(reader)));
    }

    @Override
    public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
        return analyzer.reusableTokenStream(fieldName, reader);
    }
}
I have also added a new Maven dependency to my Neo4j project:
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers</artifactId>
    <version>3.6.2</version>
</dependency>
Everything works fine right now, but I'm not sure that the method
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
    return analyzer.tokenStream(fieldName, new HTMLStripCharFilter(CharReader.get(reader)));
}
is a proper place for HTMLStripCharFilter initialization.
Please correct me if I'm wrong.
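One thing I'm also unsure about is whether reusableTokenStream() should wrap the reader the same way, since streams reused by the IndexWriter would otherwise still see the raw HTML. Maybe something like this (an untested guess on my side):
// Untested guess: apply the same char filter on the reuse path as well
@Override
public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    return analyzer.reusableTokenStream(fieldName, new HTMLStripCharFilter(CharReader.get(reader)));
}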

I have added the following init method:
@PostConstruct
public void init() {
    GraphDatabaseService graphDb = template.getGraphDatabaseService();
    try (Transaction t = graphDb.beginTx()) {
        Index<Node> autoIndex = graphDb.index().forNodes("node_auto_index");
        graphDb.index().setConfiguration(autoIndex, "type", "fulltext");
        graphDb.index().setConfiguration(autoIndex, "to_lower_case", "true");
        graphDb.index().setConfiguration(autoIndex, "analyzer", StandardAnalyzerV36.class.getName());
        t.success();
    }
}
and created the following class:
public final class StandardAnalyzerV36 extends Analyzer {
    private Analyzer analyzer;

    public StandardAnalyzerV36() {
        analyzer = new StandardAnalyzer(Version.LUCENE_36);
    }

    public StandardAnalyzerV36(Set<?> stopWords) {
        analyzer = new StandardAnalyzer(Version.LUCENE_36, stopWords);
    }

    @Override
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        return analyzer.tokenStream(fieldName, new HTMLStripCharFilter(CharReader.get(reader)));
    }

    @Override
    public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
        return analyzer.reusableTokenStream(fieldName, reader);
    }
}
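For completeness, here is a rough sketch of how the resulting full-text auto-index can be queried with plain Lucene syntax, using the same graphDb as above (the field name and query string are examples only):
try (Transaction tx = graphDb.beginTx()) {
    Index<Node> autoIndex = graphDb.index().forNodes("node_auto_index");
    // plain Lucene query against the analyzed description field; HTML markup is no longer indexed
    for (Node product : autoIndex.query("description", "chair*")) {
        System.out.println(product.getProperty("name"));
    }
    tx.success();
}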
Now everything works as expected. Hope it will help someone else. Good luck.

Related

Add weights to documents Lucene 8 + Solr 8 while indexing

I am working on migrating Solr from 5.4.3 to 8.11 for one of my search apps and have successfully upgraded to 7.7.3. With further upgrades, however, the order of the response data changes compared to what it was earlier. I am trying to use FunctionScoreQuery along with DoubleValuesSource, since CustomScoreQuery is deprecated in 7.7.3 and removed in 8.
Below is my code snippet (currently I am using Solr 8.5.2 and Lucene 8.5.2):
public class CustomQueryParser extends QParserPlugin {

    @Override
    public QParser createParser(final String qstr, final SolrParams localParams, final SolrParams params,
            final SolrQueryRequest req) {
        return new MyParser(qstr, localParams, params, req);
    }

    private static class MyParser extends QParser {
        private Query innerQuery;
        private String queryString;

        public MyParser(final String qstr, final SolrParams localParams, final SolrParams params,
                final SolrQueryRequest req) {
            super(qstr, localParams, params, req);
            if (qstr == null || qstr.trim().length() == 0) {
                this.queryString = DEFAULT_SEARCH_QUERY;
                setString(this.queryString);
            } else {
                this.queryString = qstr;
            }
            try {
                if (queryString.contains(":")) {
                    final QParser parser = getParser(queryString, "edismax", getReq());
                    this.innerQuery = parser.parse();
                } else {
                    final QParser parser = getParser(queryString, "dismax", getReq());
                    this.innerQuery = parser.parse();
                }
            } catch (final SyntaxError ex) {
                throw new RuntimeException("Error parsing query", ex);
            }
        }

        @Override
        public Query parse() throws SyntaxError {
            final Query query = new MyCustomQuery(innerQuery);
            final CustomValuesSource customValuesSource = new CustomValuesSource(queryString, innerQuery);
            final FunctionScoreQuery fsq = FunctionScoreQuery.boostByValue(query, customValuesSource.fromFloatField("score"));
            return fsq;
        }
    }
}
public class MyCustomQuery extends Query {

    @Override
    public Weight createWeight(final IndexSearcher searcher, final ScoreMode scoreMode, final float boost) throws IOException {
        Weight weight;
        if (query == null) {
            weight = new ConstantScoreWeight(this, boost) {
                @Override
                public Scorer scorer(final LeafReaderContext context) throws IOException {
                    return new ConstantScoreScorer(this, score(), scoreMode, DocIdSetIterator.all(context.reader().maxDoc()));
                }

                @Override
                public boolean isCacheable(final LeafReaderContext leafReaderContext) {
                    return false;
                }
            };
        } else {
            weight = searcher.createWeight(query, scoreMode, boost);
        }
        return weight;
    }
}
public class CustomValuesSource extends DoubleValuesSource {

    @Override
    public DoubleValues getValues(final LeafReaderContext leafReaderContext, final DoubleValues doubleValues) throws IOException {
        final DoubleValues dv = new CustomDoubleValues(leafReaderContext);
        return dv;
    }

    class CustomDoubleValues extends DoubleValues {

        @Override
        public boolean advanceExact(final int doc) throws IOException {
            final Document document = leafReaderContext.reader().document(doc);
            final List<IndexableField> fields = document.getFields();
            for (final IndexableField field : fields) {
                // total_score is being calculated with my own preferences
                document.add(new FloatDocValuesField("score", total_score));
                // can we include the **score** here?
                // this custom logic which includes the score is not even being called
            }
        }
    }
I have been trying for a long time but have not found a single working example. Can anybody help me here?
Thank you,
Syamala.

Lucene 7.5.0 how to set lowercase expanded terms to true

I have implemented my own Analyzer, QueryParser and PerFieldAnalyzerWrapper to implement the ElasticSearch ${field}.raw feature. Everything seems to be working OK, except when I test using wildcards, etc. on StringField types.
I understand this is because these queries don't use the analyzer at all.
In previous versions of Lucene there was a config option to enable lowercasing of these queries.
I can't find how to do this in the latest version, 7.5.0. Can anyone shed some light on this?
Expanded terms are processed by Analyzer.normalize. Since you have implemented your own Analyzer, add an implementation of the normalize method which runs the tokenStream through a LowerCaseFilter.
It can be as simple as:
public class MyAnalyzer extends Analyzer {

    protected TokenStreamComponents createComponents(String fieldName) {
        // Your createComponents implementation
    }

    protected TokenStream normalize(String fieldName, TokenStream in) {
        return new LowerCaseFilter(in);
    }
}
You can set up an analyzer like this. For more details you can check out this link:
Git link for CJK Bigram Plugin
@BeforeClass
public static void setUp() throws Exception {
    analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            TokenStream result = new CJKBigramFilter(source);
            return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
        }
    };
    analyzer2 = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            TokenStream result = new IcuNormalizerFilter(source,
                    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
            result = new CJKBigramFilter(result);
            return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
        }
    };
}

No converter found capable of converting from type [java.lang.String] to type [org.springframework.data.solr.core.geo.Point]

I am trying to use spring-data-solr in order to access my Solr instance through my Spring Boot application. I have the following bean class:
@SolrDocument(solrCoreName = "associations")
public class Association implements PlusimpleEntityI {

    @Id
    @Indexed
    private String id;

    @Indexed
    private String name;

    @Indexed
    private Point location;

    @Indexed
    private String description;

    @Indexed
    private Set<String> tags;

    @Indexed
    private Set<String> topics;

    @Indexed
    private Set<String> professionals;

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public Point getLocation() {
        return location;
    }

    public void setLocation(Point location) {
        this.location = location;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public Set<String> getTags() {
        return tags;
    }

    public void setTags(Set<String> tags) {
        this.tags = tags;
    }

    public Set<String> getTopics() {
        return topics;
    }

    public void setTopics(Set<String> topics) {
        this.topics = topics;
    }

    public Set<String> getProfessionals() {
        return professionals;
    }

    public void setProfessionals(Set<String> professionals) {
        this.professionals = professionals;
    }
}
I have implemented the following repository in order to access the related information:
public interface AssociationsRepository extends SolrCrudRepository<Association, String> {
}
I have created a configuration class which looks like the following one:
@Configuration
@EnableSolrRepositories(basePackages = {"com.package.repositories"}, multicoreSupport = true)
public class SolrRepositoryConfig {

    @Value("${solr.url}")
    private String solrHost;

    @Bean
    public SolrConverter solrConverter() {
        MappingSolrConverter solrConverter = new MappingSolrConverter(new SimpleSolrMappingContext());
        solrConverter.setCustomConversions(new CustomConversions(null));
        return solrConverter;
    }

    @Bean
    public SolrClientFactory solrClientFactory() throws Exception {
        return new MulticoreSolrClientFactory(solrClient());
    }

    @Bean
    public SolrClient solrClient() throws Exception {
        return new HttpSolrClient.Builder(solrHost).build();
    }

    @Bean
    public SolrOperations associationsTemplate() throws Exception {
        SolrTemplate solrTemplate = new SolrTemplate(solrClient());
        solrTemplate.setSolrConverter(solrConverter());
        return solrTemplate;
    }
}
Unfortunately, when I try to read an association from my Solr instance I get the following error:
org.springframework.core.convert.ConverterNotFoundException: No converter found capable of converting from type [java.lang.String] to type [org.springframework.data.solr.core.geo.Point]
I don't understand why it is not able to find a converter if I have explicitly defined it in the solrTemplate() method.
This is my POM definition:
<dependency>
    <groupId>org.springframework.data</groupId>
    <artifactId>spring-data-solr</artifactId>
    <version>2.1.4.RELEASE</version>
</dependency>
Thank you for your help.
EDIT:
I've also tried with different BUILD-RELEASEs but they are highly unstable and I've found a lot of errors using them.
Alessandro, as you can see directly in the GeoConverters class on GitHub, the implemented converters are only for:
org.springframework.data.geo.Point
and not for:
org.springframework.data.solr.core.geo.Point
Simply use this class and you don't even need a custom converter for this. Spring Data for Solr will perform the conversion for you.
I'm using a slightly patched version of the 3.0.0 M4, but I'm pretty sure this solution should apply seamlessly also to your case.
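So in the bean it is enough to switch to the supported class; a minimal sketch, assuming the rest of the Association class stays exactly as posted:
import org.springframework.data.geo.Point;   // instead of org.springframework.data.solr.core.geo.Point

@SolrDocument(solrCoreName = "associations")
public class Association implements PlusimpleEntityI {

    @Indexed
    private Point location;   // now handled by the converters Spring Data Solr ships with

    // ... the remaining fields, getters and setters stay unchanged
}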

How to populate a Lucene 5.3 index

I have written the following class to populate a Lucene index. I want to build an index so that I can query for specific documents. Unfortunately, my documents are not added to the index.
Here is my code:
public class LuceneIndexer {

    private IndexWriter indexWriter;
    private IndexReader indexReader;

    public LuceneIndexer() throws Exception {
        Directory indexDir = FSDirectory.open(Paths.get("./index-directory"));
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        config.setCommitOnClose(true);
        config.setOpenMode(OpenMode.CREATE);
        this.indexWriter = new IndexWriter(indexDir, config);
        indexReader = DirectoryReader.open(this.indexWriter, true);
    }

    public void indexRelation(String subject, String description, String object) throws IOException {
        System.out.println("Indexing relation between: " + subject + " and " + object);
        Document doc = new Document();
        doc.add(new TextField("subject", subject, Field.Store.YES));
        doc.add(new TextField("description", description, Field.Store.YES));
        doc.add(new TextField("object", object, Field.Store.YES));
        indexWriter.addDocument(doc);
    }

    public void commit() throws Exception {
        indexWriter.commit();
    }

    public int getNumberOfRelations() {
        return indexReader.numDocs();
    }
}
I am trying to get the following test case to pass:
public class LuceneIndexerTest {

    private LuceneIndexer instance;

    @Before
    public void setUp() throws SQLException, IOException {
        instance = new LuceneIndexer();
        instance.indexRelation("subject1", "descr1", "object1");
        instance.indexRelation("subject2", "descr2", "object2");
        instance.indexRelation("subject3", "descr3", "object3");
        instance.commit();
    }

    @After
    public void tearDown() throws IOException {
        instance.close();
    }

    @Test
    public void testIndexing() {
        Assert.assertEquals(3, instance.getNumberOfRelations());
        Assert.assertEquals(3, instance.getNumberOfRelations("subject"));
    }
}
Unfortunately the test case says there are 0 documents in the index.
From Lucene's javadoc: "Any changes made to the index via IndexWriter will not be visible until a new IndexReader is opened".
The indexReader keeps a view of your index as it was at the time the IndexReader object was created. Just create a new one after each commit, and your indexReader will work as expected.
Here is the fix for your LuceneIndexer class:
public void commit() throws Exception {
    indexWriter.commit();
    if (indexReader != null) {
        indexReader.close();
    }
    indexReader = DirectoryReader.open(this.indexWriter, true);
}
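A variant worth considering (just a sketch, assuming the indexReader field is declared as a DirectoryReader) is to refresh the reader only when the index has actually changed:
public void commit() throws Exception {
    indexWriter.commit();
    // openIfChanged returns null when the index has not changed since the reader was opened
    DirectoryReader newReader = DirectoryReader.openIfChanged(indexReader, indexWriter, true);
    if (newReader != null) {
        indexReader.close();
        indexReader = newReader;
    }
}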

Vaadin - Lazy Query Container

I'm doing my project in Vaadin 7. I need to implement a Lazy Query Container for a TreeTable. I will get the data for the TreeTable from a web service.
Could someone please show how to use a Lazy Query Container with a web service as my data source?
Please let me know the steps required to implement this or show sample code to get me started.
There is good documentation for LQC here: https://vaadin.com/wiki/-/wiki/Main/Lazy%20Query%20Container
The examples in the documentation implement a MovieQuery using the javax.persistence API, but it might be easier to use the simple MockQuery example as a basis and replace the actual data fetching with web service calls.
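As a rough sketch only: the Query interface below comes from the Lazy Query Container add-on (org.vaadin.addons.lazyquerycontainer), while MyWebService and NodeDto are hypothetical stand-ins for your web service client and its DTOs. A read-only query backed by a web service could look roughly like this:
import java.util.ArrayList;
import java.util.List;

import org.vaadin.addons.lazyquerycontainer.Query;

import com.vaadin.data.Item;
import com.vaadin.data.util.BeanItem;

public class WebServiceQuery implements Query {

    private final MyWebService service;   // hypothetical web service client

    public WebServiceQuery(MyWebService service) {
        this.service = service;
    }

    @Override
    public int size() {
        return service.countNodes();                       // total row count from the web service
    }

    @Override
    public List<Item> loadItems(int startIndex, int count) {
        List<Item> items = new ArrayList<>();
        for (NodeDto dto : service.fetchNodes(startIndex, count)) {   // fetch one page of rows
            items.add(new BeanItem<NodeDto>(dto));
        }
        return items;
    }

    @Override
    public void saveItems(List<Item> added, List<Item> modified, List<Item> removed) {
        throw new UnsupportedOperationException();          // read-only container
    }

    @Override
    public boolean deleteAllItems() {
        throw new UnsupportedOperationException();
    }

    @Override
    public Item constructItem() {
        return new BeanItem<NodeDto>(new NodeDto());
    }
}
The container is then built with a QueryFactory that returns this query, as in the MockQuery example from the documentation.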
Have a look at the following lazy loading Hierarchical interface. All data is read from a webservice IViewService. The example uses the Tree component but it also works for TreeTable.
It's very important to store all elements in a local structure (in my case in the HashMap hierarchy) and not to read elements multiple times; that does not work, I think because Vaadin does not use equals() and hashCode().
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.softmodeler.common.CommonPlugin;
import com.softmodeler.model.OutputNode;
import com.softmodeler.service.IViewService;
import com.vaadin.data.Container.Hierarchical;
import com.vaadin.data.Item;
import com.vaadin.data.Property;
import com.vaadin.data.util.BeanItem;
/**
 * @author Flavio Donzé
 * @version 1.0
 */
public class OutputNodeHierachical implements Hierarchical {
    private static final long serialVersionUID = 8289589835030184018L;

    /** the view service */
    private IViewService service = CommonPlugin.getService(IViewService.class);
    /** collection of all root nodes */
    private List<OutputNode> rootNodes = null;
    /** parent=>children mapping */
    private Map<OutputNode, List<OutputNode>> hierarchy = new HashMap<>();

    /**
     * constructor
     *
     * @param rootNodes collection of all root nodes
     */
    public OutputNodeHierachical(List<OutputNode> rootNodes) {
        this.rootNodes = Collections.unmodifiableList(rootNodes);
        addToHierarchy(rootNodes);
    }

    @Override
    public Collection<?> getChildren(Object itemId) {
        try {
            List<OutputNode> children = hierarchy.get(itemId);
            if (children == null) {
                OutputNode node = (OutputNode) itemId;
                children = service.getChildren(node.getNodeId(), false);
                hierarchy.put(node, children);
                // add children to hierarchy, their children will be added on click
                addToHierarchy(children);
            }
            return children;
        } catch (Exception e) {
            VaadinUtil.handleException(e);
        }
        return null;
    }

    /**
     * add each element to the hierarchy without their children hierarchy(child=>null)
     *
     * @param children elements to add
     */
    private void addToHierarchy(List<OutputNode> children) {
        for (OutputNode child : children) {
            hierarchy.put(child, null);
        }
    }

    @Override
    public boolean areChildrenAllowed(Object itemId) {
        return !((OutputNode) itemId).getChilds().isEmpty();
    }

    @Override
    public boolean hasChildren(Object itemId) {
        return !((OutputNode) itemId).getChilds().isEmpty();
    }

    @Override
    public Object getParent(Object itemId) {
        String parentId = ((OutputNode) itemId).getParentId();
        for (OutputNode node : hierarchy.keySet()) {
            if (node.getNodeId().equals(parentId)) {
                return node;
            }
        }
        return null;
    }

    @Override
    public Collection<?> rootItemIds() {
        return rootNodes;
    }

    @Override
    public boolean isRoot(Object itemId) {
        return rootNodes.contains(itemId);
    }

    @Override
    public Item getItem(Object itemId) {
        return new BeanItem<OutputNode>((OutputNode) itemId);
    }

    @Override
    public boolean containsId(Object itemId) {
        return hierarchy.containsKey(itemId);
    }

    @Override
    public Collection<?> getItemIds() {
        return hierarchy.keySet();
    }

    @Override
    public int size() {
        return hierarchy.size();
    }

    @Override
    public boolean setParent(Object itemId, Object newParentId) throws UnsupportedOperationException {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean setChildrenAllowed(Object itemId, boolean areChildrenAllowed) throws UnsupportedOperationException {
        throw new UnsupportedOperationException();
    }

    @Override
    public Item addItem(Object itemId) throws UnsupportedOperationException {
        throw new UnsupportedOperationException();
    }

    @Override
    public Object addItem() throws UnsupportedOperationException {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean removeItem(Object itemId) throws UnsupportedOperationException {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean removeAllItems() throws UnsupportedOperationException {
        throw new UnsupportedOperationException();
    }

    @Override
    public Class<?> getType(Object propertyId) {
        throw new UnsupportedOperationException();
    }

    @Override
    public Collection<?> getContainerPropertyIds() {
        throw new UnsupportedOperationException();
    }

    @Override
    public Property<?> getContainerProperty(Object itemId, Object propertyId) {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean addContainerProperty(Object propertyId, Class<?> type, Object defaultValue) throws UnsupportedOperationException {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean removeContainerProperty(Object propertyId) throws UnsupportedOperationException {
        throw new UnsupportedOperationException();
    }
}
Adding the container to the Tree like this:
OutputNodeHierachical dataSource = new OutputNodeHierachical(rootNodes);
Tree mainTree = new Tree();
mainTree.setSizeFull();
mainTree.setContainerDataSource(dataSource);
mainTree.addItemClickListener(new ItemClickListener() {
    private static final long serialVersionUID = -413371711541672605L;

    @Override
    public void itemClick(ItemClickEvent event) {
        OutputNode node = (OutputNode) event.getItemId();
        openObject(node.getObjectId());
    }
});