one field is not getting indexed in solr - indexing

I am trying to index data using the DataImportHandler (DIH) with a SQL query.
In dataconfig.xml I have this:
<!-- DIH entity "slot_data_sunday" on data source "dineout" (onError="skip").
     The query selects tl_id (aliased event_id) plus one TRUE/FALSE column per
     day-of-week value, dow_0 .. dow_6, from ndf_ticket_slots.
     Configured with cacheImpl SortedMapBackedCache, cacheKey event_id and
     cacheLookup ticket.event_id.
     NOTE(review): dow_2 and dow_5 are plain CASE expressions while the other
     five columns are wrapped in MAX(), and the query has no GROUP BY clause;
     confirm this is intended.
     NOTE(review): the start tag is never terminated with '>' before the
     closing tag on the last line; presumably lost in the paste, confirm. -->
<entity
name="slot_data_sunday"
dataSource="dineout"
onError="skip"
query="SELECT
tl_id AS event_id,
MAX(CASE
WHEN (dow = 0) THEN TRUE
ELSE FALSE
END) AS dow_0,
MAX(CASE
WHEN (dow = 1) THEN TRUE
ELSE FALSE
END) AS dow_1,
(CASE
WHEN (dow = 2) THEN TRUE
ELSE FALSE
END) AS dow_2,
MAX(CASE
WHEN (dow = 3) THEN TRUE
ELSE FALSE
END) AS dow_3,
MAX(CASE
WHEN (dow = 4) THEN TRUE
ELSE FALSE
END) AS dow_4,
(CASE
WHEN (dow = 5) THEN TRUE
ELSE FALSE
END) AS dow_5,
MAX(CASE
WHEN (dow = 6) THEN TRUE
ELSE FALSE
END) AS dow_6
FROM
ndf_ticket_slots"
cacheKey="event_id"
cacheLookup="ticket.event_id"
cacheImpl="SortedMapBackedCache" </entity>
In schema.xml, I am putting
<!-- One indexed and stored boolean field per day-of-week column (dow_0 .. dow_6);
     the names match the column aliases produced by the DIH query. -->
<field name="dow_0" type="boolean" indexed="true" stored="true" />
<field name="dow_1" type="boolean" indexed="true" stored="true" />
<field name="dow_2" type="boolean" indexed="true" stored="true" />
<field name="dow_3" type="boolean" indexed="true" stored="true" />
<field name="dow_4" type="boolean" indexed="true" stored="true" />
<field name="dow_5" type="boolean" indexed="true" stored="true" />
<field name="dow_6" type="boolean" indexed="true" stored="true" />
But these fields do not appear in my index. Can anyone tell me what the issue is?

I have found that whenever an aggregate function such as MAX/MIN is used in the SQL query that DIH runs for indexing, the resulting column does not get indexed, so try to avoid using them. Thanks.

Related

Parse image as well as text from pdf using tika in solr(Request Handler)

I am trying to index a PDF file using Solr 6 and want to extract any images it contains and save them to some location. I am using the configuration below but am not able to extract the images. I have successfully indexed the PDF's text content.
schema.xml
<?xml version="1.0" encoding="UTF-8" ?>
<schema name="Breast-Cancer_PDFSchema" version="1.6">
<uniqueKey>id</uniqueKey>
<field name="id" type="strings" multiValued="false" indexed="true" required="true" stored="true"/>
<field name="_version_" type="long" indexed="true" stored="true"/>
<field name="date" type="tdates" indexed="true" stored="true"/>
<field name="pdf_pdfversion" type="strings" indexed="true" stored="true"/>
<field name="stream_content_type" type="strings" indexed="true" stored="true"/>
<field name="access_permission_modify_annotations" type="strings" indexed="true" stored="true"/>
<field name="access_permission_can_print_degraded" type="strings" indexed="true" stored="true"/>
<field name="dcterms_created" type="strings" indexed="true" stored="true"/>
<field name="last_modified" type="strings" indexed="true" stored="true"/>
<field name="dcterms_modified" type="strings" indexed="true" stored="true"/>
<field name="dc_format" type="strings" indexed="true" stored="true"/>
<field name="last_save_date" type="strings" indexed="true" stored="true"/>
<field name="access_permission_fill_in_form" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_modified" type="strings" indexed="true" stored="true"/>
<field name="stream_name" type="strings" indexed="true" stored="true"/>
<field name="meta_save_date" type="strings" indexed="true" stored="true"/>
<field name="pdf_encrypted" type="strings" indexed="true" stored="true"/>
<field name="modified" type="strings" indexed="true" stored="true"/>
<field name="content_type" type="strings" indexed="true" stored="true"/>
<field name="stream_size" type="strings" indexed="true" stored="true"/>
<field name="x_parsed_by" type="strings" indexed="true" stored="true"/>
<field name="meta_creation_date" type="strings" indexed="true" stored="true"/>
<field name="stream_source_info" type="strings" indexed="true" stored="true"/>
<field name="created" type="strings" indexed="true" stored="true"/>
<field name="access_permission_extract_for_accessibility" type="strings" indexed="true" stored="true"/>
<field name="access_permission_assemble_document" type="strings" indexed="true" stored="true"/>
<field name="xmptpg_npages" type="strings" indexed="true" stored="true"/>
<field name="creation_date" type="strings" indexed="true" stored="true"/>
<field name="access_permission_extract_content" type="strings" indexed="true" stored="true"/>
<field name="access_permission_can_print" type="strings" indexed="true" stored="true"/>
<field name="producer" type="strings" indexed="true" stored="true"/>
<field name="subject" type="strings" indexed="true" stored="true"/>
<field name="dc_creator" type="strings" indexed="true" stored="true"/>
<field name="aapl_keywords" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_producer" type="strings" indexed="true" stored="true"/>
<field name="resourcename" type="strings" indexed="true" stored="true"/>
<field name="access_permission_can_modify" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_created" type="strings" indexed="true" stored="true"/>
<field name="_text_" type="strings" indexed="true" stored="true"/>
<fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
<fieldType name="strings" class="solr.TextField" sortMissingLast="true" multiValued="true" />
<fieldType name="long" class="solr.TrieLongField" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
<fieldType name="tdates" class="solr.TrieDateField" positionIncrementGap="0" multiValued="true" precisionStep="6"/>
<fieldType name="tlongs" class="solr.TrieLongField" positionIncrementGap="0" multiValued="true" precisionStep="8"/>
<fieldType name="tdoubles" class="solr.TrieDoubleField" positionIncrementGap="0" multiValued="true" precisionStep="8"/>
solr-config.xml
<!-- ExtractingRequestHandler registered at /update/extract with startup="lazy".
     Defaults: lowernames=true; fmap.meta is set to ignored_, fmap.content to
     _text_, and fmap.id to id.
     NOTE(review): the <entries> element names the class
     org.apache.tika.parser.pdf.AutoDetectParser; confirm that this class exists
     under that package and that the handler actually reads <entries>. -->
<requestHandler name="/update/extract" startup="lazy" class="org.apache.solr.handler.extraction.ExtractingRequestHandler" >
<entries>
<entry class="org.apache.tika.parser.pdf.AutoDetectParser"> </entry>
</entries>
<lst name="defaults">
<str name="lowernames">true</str>
<str name="fmap.meta">ignored_</str>
<str name="fmap.content">_text_</str>
<str name="fmap.id">id</str>
</lst>
</requestHandler>
I have followed the official Apache Solr documentation and changed solr-config.xml accordingly, but I still have the same issue.
If I understand your question correctly, you are posting a PDF file (which contains some text and images) to Solr. Solr will only index that document; it will not extract the images and store them in another location.
Solr internally uses the Tika libraries for parsing documents, but that alone cannot meet your requirement.
To achieve your requirement:
Parse your PDF yourself, extract all the images and other content, and store them.
Then index all the extracted content and the PDF in Solr.

Solr Apache config with NULL value on foreign key in INNER JOIN

I'm trying to configure Solr so that I can query data from my DB. After configuring it, I added a new field that is a foreign key to another table.
Old records have this field NULL.
Schema DB
Table: offers
Fields: id, type_material (foreign key), (other fields omitted)
Table: materials
Fields: id, name
Solr config
File db-data-config.xml:
<!-- DIH configuration for the "offers" document: one root entity (offers) read
     over JDBC from MySQL, with three child entities that look up related rows
     per offer using ${offers.*} variables. -->
<dataConfig>
<dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver" url="jdbc:mysql://path" user="user" password="pwd" />
    <document name="offers">
       <entity name="offers"
query="SELECT o.* FROM offers o inner join offer_group g on o.offer_group_id = g.id where g.status = 0"
deltaQuery="select id from offers where updated_at > '${dataimporter.last_index_time}'">
<!-- Root entity: every offers column is selected (o.*), restricted to offers
     whose offer_group has status = 0. The columns below are mapped to Solr
     fields of the same name. -->
<field column="id" name="id" />
<field column="product_code" name="product_code" />
<field column="gender" name="gender" />
<field column="colors" name="colors" />
<field column="year" name="year" />
<field column="tags" name="tags" />
<field column="size" name="size" />
<field column="size_typology" name="size_typology" />
<field column="season" name="season" />
<field column="quantity" name="quantity" />
<field column="price" name="price" />
<field column="typology" name="typology" />
<field column="model" name="model" />
<!-- Child entity: brand name looked up by the offer's brand_id. -->
<entity name="brands"
query="select name from brands where id='${offers.brand_id}'"
deltaQuery="select id from brands where updated_at > '${dataimporter.last_index_time}'" >
<field name="brand_name" column="name" />
</entity>
<!-- Child entity: material name looked up by the offer's type_material.
     NOTE(review): the second condition compares with "= NULL" rather than
     "IS NULL"; in standard SQL that comparison is never true, so confirm
     what this clause is meant to match. This entity also has no deltaQuery,
     unlike its siblings. -->
<entity name="materials"
query="select name from materials where id='${offers.type_material}' OR '${offers.type_material}' = NULL">
<field name="material_name" column="name" />
</entity>
<!-- Child entity: shop_id looked up by the offer's offer_group_id. -->
<entity name="offer_group"
query="select shop_id from offer_group where id='${offers.offer_group_id}'"
deltaQuery="select id from offer_group where updated_at > '${dataimporter.last_index_time}'" >
<field name="shop_id" column="shop_id" />
</entity>
</entity>
    </document>
</dataConfig>
File schema.xml:
<?xml version="1.0" encoding="UTF-8" ?>
<schema name="offers" version="1.5">
<fieldType name="string" class="solr.StrField"></fieldType>
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>
<!-- text_general: tokenized text type with separate index and query analyzers.
     Index time: StandardTokenizer, StandardFilter, ASCIIFolding, LowerCase,
     ReversedWildcard (withOriginal="true"), then EdgeNGram (1 to 15 chars).
     Query time: StandardTokenizer, ASCIIFolding, StandardFilter, LowerCase;
     no reversal or n-gramming is configured on the query side.
     The earlier wording of this comment said the reversed tokens are there to
     make leading wildcard queries more efficient. -->
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory" />
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33" minTrailing="3" />
<filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="15" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory" />
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="random" class="solr.RandomSortField" indexed="true" />
<dynamicField name="random_*" type="random" indexed="true" stored="false"/>
<!-- End randomize offers-->
<field name="_version_" type="long" indexed="true" stored="true" required="false"/>
<field name="id" type="long" indexed="true" stored="true" required="true" />
<field name="brand_id" type="long" indexed="true" stored="true" required="true" />
<field name="shop_id" type="long" indexed="true" stored="true" required="true" />
<field name="brand_name" type="text_general" indexed="true" stored="true" required="true" />
<field name="type_material" type="long" indexed="true" stored="true" default="NULL" />
<field name="material_name" type="text_general" indexed="true" stored="true" default="NULL" />
<field name="offer_group_id" type="long" indexed="true" stored="true" required="true" />
<field name="product_code" type="text_general" indexed="true" stored="true" default="NULL" />
<field name="gender" type="string" indexed="true" stored="true" default="NULL" />
<field name="colors" type="text_general" indexed="true" stored="true" default="NULL" />
<field name="year" type="text_general" indexed="true" stored="true" default="NULL" />
<field name="tags" type="text_general" indexed="true" stored="true" default="NULL" />
<field name="size" type="string" indexed="true" stored="true" default="NULL" />
<field name="size_typology" type="string" indexed="true" stored="true" default="NULL" />
<field name="season" type="text_general" indexed="true" stored="true" default="NULL" />
<field name="quantity" type="string" indexed="true" stored="true" default="NULL" />
<field name="price" type="float" indexed="true" stored="true" default="NULL" />
<field name="typology" type="text_general" indexed="true" stored="true" default="NULL" />
<field name="photo_url" type="string" indexed="true" stored="true" required="true" />
<field name="model" type="text_general" indexed="true" stored="true" default="NULL" />
<field name="created_at" type="date" indexed="true" stored="true"/>
<field name="updated_at" type="date" indexed="true" stored="true"/>
<field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
<uniqueKey>id</uniqueKey>
<copyField source="colors" dest="text"/>
<copyField source="year" dest="text"/>
<copyField source="season" dest="text"/>
<copyField source="typology" dest="text"/>
<copyField source="model" dest="text"/>
<copyField source="tags" dest="text"/>
<copyField source="product_code" dest="text"/>
<copyField source="brand_name" dest="text"/>
<copyField source="material_name" dest="text" />
<copyField source="gender" dest="text"/>
</schema>
When a search query runs, it returns only the offers whose type_material field is not NULL.
I want to retrieve the offers with a NULL type_material as well.
Just use a filter query &fq=type_material:NULL

How to reduce Qtime (query time) in Solr

I need to reduce QTime in Solr from 24 ms to 10 ms. I have tried with the following parameters:
responseHeader: {
status: 0,
QTime: 24,
params: {
json.wrf: "jQuery111205813984868582338_1454571708911",
sort: "id desc",
indent: "on",
hl.simple.pre: "<b>",
hl.fl: "title keywords model_name",
wt: "json",
hl: "true",
rows: "100",
hl.highlightMultiTerm: "true",
hl.snippets: "1",
start: "0",
q: "{!q.op=AND df=title}pl~1",
_: "1454571708913",
hl.simple.post: "</b>",
qt: "spellchecker",
hl.usePhraseHighlighter: "true"
}
}
Solr URL:
http://www.example.com:8983/solr/mycoll/select?q={!q.op=AND%20df=title}pl~1&indent=on&sort=id%20desc&qt=spellchecker&wt=json&hl=true&hl.fl=title+keywords+model_name&hl.simple.pre=%3Cb%3E&hl.simple.post=%3C%2Fb%3E&hl.usePhraseHighlighter=true&hl.highlightMultiTerm=true&hl.snippets=1&start=0&rows=100&json.wrf=jQuery111205813984868582338_1454571708911&_=1454571708913
schema.xml
<field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" />
<field name="title" type="text_general" indexed="true" stored="true" multiValued="true" />
<field name="subject" type="text_general" indexed="true" stored="true" />
<field name="description" type="text_general" indexed="true" stored="true" />
<field name="comments" type="text_general" indexed="true" stored="true" />
<field name="author" type="text_general" indexed="true" stored="true" />
<field name="keywords" type="text_general" indexed="true" stored="true" />
<field name="category" type="text_general" indexed="true" stored="true" />
<field name="resourcename" type="text_general" indexed="true" stored="true" />
<field name="url" type="text_general" indexed="true" stored="true" />
<field name="content_type" type="string" indexed="true" stored="true" multiValued="true" />
<field name="last_modified" type="date" indexed="true" stored="true" />
<field name="links" type="string" indexed="true" stored="true" multiValued="true" />
Please see these reference links:
http://wiki.apache.org/solr/SolrPerformanceFactors
https://issues.apache.org/jira/browse/SOLR-2218

Solr 4 Lucene - Error loading class 'Solr.UUIDField'

Trying to create a UUID field in my schema.xml, I just get this error when starting Solr:
Plugin init failure for [schema.xml] fieldType "uuid": Error loading class 'Solr.UUIDField'
My schema looks like:
<fields>
<field name="uuid" type="uuid" indexed="true" stored="true" />
<updateRequestProcessorChain name="uuid">
<processor class="solr.UUIDUpdateProcessorFactory">
<str name="fieldName">uuid</str>
</processor>
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
<field name="id" type="string" indexed="true" stored="true" required="true"/>
<field name="address" type="text_general" indexed="true" stored="true"/>
<field name="city" type="text_general" indexed="true" stored="true" />
<field name="county" type="string" indexed="true" stored="true" />
<field name="lat" type="text_general" indexed="true" stored="true" />
<field name="lng" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
<field name="price" type="float" indexed="true" stored="true"/>
<field name="bedrooms" type="float" indexed="true" stored="true" />
<field name="image" type="string" indexed="true" stored="true"/>
<field name="region" type="location_rpt" indexed="true" stored="true" />
<defaultSearchField>address</defaultSearchField>
<field name="_version_" type="long" indexed="true" stored="true"/>
<field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
And then in
<fieldType name="uuid" class="Solr.UUIDField" indexed="true" />
From the docs
I'm confused as to the location of the <updateRequestProcessorChain/> section. I feel it shouldn't go in the field declaration part.
The class name is probably case-sensitive; try it with a lowercase "solr" prefix, i.e. solr.UUIDField:
<!-- "uuid" field type; the class is written with a lowercase "solr." prefix
     (the reported load error was for the capitalized 'Solr.UUIDField'). -->
<fieldType name="uuid" class="solr.UUIDField" indexed="true" />

Solr 4 - missing required field: uuid

I'm having issues generating a UUID using the DataImportHandler in Solr 4. I'm trying to import from an existing MySQL database.
My schema.xml contains:
<!-- Field declarations for the imported documents. "uuid" is declared
     required="true" and is also the uniqueKey, so any document that reaches
     the index without a uuid value is rejected as missing a required field. -->
<fields>
<field name="uuid" type="uuid" indexed="true" stored="true" required="true" />
<field name="id" type="string" indexed="true" stored="true" required="true"/>
<field name="address" type="text_general" indexed="true" stored="true"/>
<field name="city" type="text_general" indexed="true" stored="true" />
<field name="county" type="string" indexed="true" stored="true" />
<field name="lat" type="text_general" indexed="true" stored="true" />
<field name="lng" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
<field name="price" type="float" indexed="true" stored="true"/>
<field name="bedrooms" type="float" indexed="true" stored="true" />
<field name="image" type="string" indexed="true" stored="true"/>
<field name="region" type="location_rpt" indexed="true" stored="true" />
<defaultSearchField>address</defaultSearchField>
<field name="_version_" type="long" indexed="true" stored="true"/>
<!-- "text" is indexed but not stored, and multi-valued. -->
<field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
</fields>
<uniqueKey>uuid</uniqueKey>
and then in <types>
<fieldType name="uuid" class="solr.UUIDField" indexed="true" />
My Solrconfig.xml contains:
<requestHandler name="/dataimport" class="org.apache.solr.handler.dataimport.DataImportHandler">  
<updateRequestProcessorChain name="uuid">
<processor class="solr.UUIDUpdateProcessorFactory">
<str name="fieldName">uuid</str>
</processor>
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
<lst name="defaults">
<str name="config">data-config.xml</str>
</lst>
Whenever I run the update, some docs are inserted OK, but many fail with:
org.apache.solr.common.SolrException: [doc=204] missing required field: uuid
Going by the example at the link, it should be:
<!-- DataImportHandler at /dataimport. Its defaults name data-config.xml as the
     import configuration and set update.chain to "uuid", the name of the
     processor chain declared after this handler. -->
<requestHandler name="/dataimport" class="org.apache.solr.handler.dataimport.DataImportHandler">
.........
<lst name="defaults">
<str name="config">data-config.xml</str>
<str name="update.chain">uuid</str>
</lst>
</requestHandler>
<!-- Update chain "uuid", declared as a sibling of the handler rather than
     nested inside it: UUIDUpdateProcessorFactory configured with fieldName
     "uuid", followed by RunUpdateProcessorFactory. -->
<updateRequestProcessorChain name="uuid">
<processor class="solr.UUIDUpdateProcessorFactory">
<str name="fieldName">uuid</str>
</processor>
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>