I need to reduce QTime in Solr from 24 ms to 10 ms. I have tried the following request parameters:
responseHeader: {
status: 0,
QTime: 24,
params: {
json.wrf: "jQuery111205813984868582338_1454571708911",
sort: "id desc",
indent: "on",
hl.simple.pre: "<b>",
hl.fl: "title keywords model_name",
wt: "json",
hl: "true",
rows: "100",
hl.highlightMultiTerm: "true",
hl.snippets: "1",
start: "0",
q: "{!q.op=AND df=title}pl~1",
_: "1454571708913",
hl.simple.post: "</b>",
qt: "spellchecker",
hl.usePhraseHighlighter: "true"
}
}
Solr URL:
http://www.example.com:8983/solr/mycoll/select?q={!q.op=AND%20df=title}pl~1&indent=on&sort=id%20desc&qt=spellchecker&wt=json&hl=true&hl.fl=title+keywords+model_name&hl.simple.pre=%3Cb%3E&hl.simple.post=%3C%2Fb%3E&hl.usePhraseHighlighter=true&hl.highlightMultiTerm=true&hl.snippets=1&start=0&rows=100&json.wrf=jQuery111205813984868582338_1454571708911&_=1454571708913
schema.xml
<!-- Field declarations from the asker's schema.xml. The field types they use (string, text_general, date) are defined outside this excerpt. -->
<!-- "id" is the field the request sorts on (sort=id desc). -->
<field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" />
<!-- "title" is the default search field of the request (df=title) and one of the highlighted fields (hl.fl). It is multi-valued. -->
<field name="title" type="text_general" indexed="true" stored="true" multiValued="true" />
<field name="subject" type="text_general" indexed="true" stored="true" />
<field name="description" type="text_general" indexed="true" stored="true" />
<field name="comments" type="text_general" indexed="true" stored="true" />
<field name="author" type="text_general" indexed="true" stored="true" />
<!-- "keywords" is also listed in hl.fl. -->
<!-- NOTE(review): hl.fl additionally names "model_name", which is not declared among the fields shown here; confirm it exists elsewhere in the schema. -->
<field name="keywords" type="text_general" indexed="true" stored="true" />
<field name="category" type="text_general" indexed="true" stored="true" />
<field name="resourcename" type="text_general" indexed="true" stored="true" />
<field name="url" type="text_general" indexed="true" stored="true" />
<field name="content_type" type="string" indexed="true" stored="true" multiValued="true" />
<field name="last_modified" type="date" indexed="true" stored="true" />
<field name="links" type="string" indexed="true" stored="true" multiValued="true" />
Please see these reference links:
http://wiki.apache.org/solr/SolrPerformanceFactors
https://issues.apache.org/jira/browse/SOLR-2218
Related
I am trying to index a PDF file using Solr 6 and want to extract any images it contains and save them to some location. I am using the configuration below but am not able to extract the images. I have successfully indexed the PDF's text content.
schema.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!-- Schema for indexing PDF text together with extracted metadata. Every field except _version_ and date uses the "strings" type declared near the end of this excerpt. -->
<!-- NOTE(review): the excerpt ends without a closing </schema> tag; presumably the remainder of the file was omitted. -->
<schema name="Breast-Cancer_PDFSchema" version="1.6">
<uniqueKey>id</uniqueKey>
<!-- Unique key field. Its type "strings" is a multiValued solr.TextField; multiValued is overridden to false on this field. NOTE(review): confirm a TextField-backed uniqueKey is intended. -->
<field name="id" type="strings" multiValued="false" indexed="true" required="true" stored="true"/>
<field name="_version_" type="long" indexed="true" stored="true"/>
<field name="date" type="tdates" indexed="true" stored="true"/>
<!-- Metadata fields, all indexed and stored with the "strings" type. -->
<field name="pdf_pdfversion" type="strings" indexed="true" stored="true"/>
<field name="stream_content_type" type="strings" indexed="true" stored="true"/>
<field name="access_permission_modify_annotations" type="strings" indexed="true" stored="true"/>
<field name="access_permission_can_print_degraded" type="strings" indexed="true" stored="true"/>
<field name="dcterms_created" type="strings" indexed="true" stored="true"/>
<field name="last_modified" type="strings" indexed="true" stored="true"/>
<field name="dcterms_modified" type="strings" indexed="true" stored="true"/>
<field name="dc_format" type="strings" indexed="true" stored="true"/>
<field name="last_save_date" type="strings" indexed="true" stored="true"/>
<field name="access_permission_fill_in_form" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_modified" type="strings" indexed="true" stored="true"/>
<field name="stream_name" type="strings" indexed="true" stored="true"/>
<field name="meta_save_date" type="strings" indexed="true" stored="true"/>
<field name="pdf_encrypted" type="strings" indexed="true" stored="true"/>
<field name="modified" type="strings" indexed="true" stored="true"/>
<field name="content_type" type="strings" indexed="true" stored="true"/>
<field name="stream_size" type="strings" indexed="true" stored="true"/>
<field name="x_parsed_by" type="strings" indexed="true" stored="true"/>
<field name="meta_creation_date" type="strings" indexed="true" stored="true"/>
<field name="stream_source_info" type="strings" indexed="true" stored="true"/>
<field name="created" type="strings" indexed="true" stored="true"/>
<field name="access_permission_extract_for_accessibility" type="strings" indexed="true" stored="true"/>
<field name="access_permission_assemble_document" type="strings" indexed="true" stored="true"/>
<field name="xmptpg_npages" type="strings" indexed="true" stored="true"/>
<field name="creation_date" type="strings" indexed="true" stored="true"/>
<field name="access_permission_extract_content" type="strings" indexed="true" stored="true"/>
<field name="access_permission_can_print" type="strings" indexed="true" stored="true"/>
<field name="producer" type="strings" indexed="true" stored="true"/>
<field name="subject" type="strings" indexed="true" stored="true"/>
<field name="dc_creator" type="strings" indexed="true" stored="true"/>
<field name="aapl_keywords" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_producer" type="strings" indexed="true" stored="true"/>
<field name="resourcename" type="strings" indexed="true" stored="true"/>
<field name="access_permission_can_modify" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_created" type="strings" indexed="true" stored="true"/>
<!-- Field named by fmap.content in the /update/extract handler defaults of this question. -->
<field name="_text_" type="strings" indexed="true" stored="true"/>
<!-- Field types. The "string", "booleans", "tlongs" and "tdoubles" types are declared here but not referenced by any field shown in this excerpt. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
<!-- NOTE(review): "strings" is a solr.TextField with no analyzer element shown in this excerpt; confirm how its values are meant to be tokenized. -->
<fieldType name="strings" class="solr.TextField" sortMissingLast="true" multiValued="true" />
<fieldType name="long" class="solr.TrieLongField" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
<fieldType name="tdates" class="solr.TrieDateField" positionIncrementGap="0" multiValued="true" precisionStep="6"/>
<fieldType name="tlongs" class="solr.TrieLongField" positionIncrementGap="0" multiValued="true" precisionStep="8"/>
<fieldType name="tdoubles" class="solr.TrieDoubleField" positionIncrementGap="0" multiValued="true" precisionStep="8"/>
solr-config.xml
<!-- Extraction endpoint backed by ExtractingRequestHandler; startup="lazy" defers loading of the handler. -->
<requestHandler name="/update/extract"
                startup="lazy"
                class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
  <!-- NOTE(review): this entries/entry element is the asker's attempt to enable image extraction.
       It is presumably not a setting the handler reads, and the class name should be checked:
       Tika's AutoDetectParser normally lives in org.apache.tika.parser, not in the pdf sub-package. -->
  <entries>
    <entry class="org.apache.tika.parser.pdf.AutoDetectParser"> </entry>
  </entries>
  <lst name="defaults">
    <str name="lowernames">true</str>
    <!-- fmap.* defaults: meta goes to ignored_, content goes to _text_, id goes to id. -->
    <str name="fmap.meta">ignored_</str>
    <str name="fmap.content">_text_</str>
    <str name="fmap.id">id</str>
  </lst>
</requestHandler>
I followed the official Apache Solr documentation and changed solr-config.xml accordingly, but I still have the same issue.
If I understand your question correctly, you are posting a PDF file (which contains some text and images) to Solr. Solr will only index that document; it will not extract the images and store them in another location.
Solr internally uses the Tika libraries to parse documents, but that cannot be used to meet your requirement.
To achieve your requirement:
1. Parse your PDF yourself, extract all the images and other content, and store them.
2. Index all the extracted content and the PDF in Solr.
I'm trying to configure Solr so that I can query data from my DB. After configuring it, I added a new field that is a foreign key to another table.
Old records have NULL in this field.
Schema DB
Table: offers
Fields: id, type_material (foreign key), (others fields not need to show)
Table: materials
Fields: id, name
Solr config
File db-data-config.xml:
<!-- Import configuration: reads rows of the "offers" table and, for each row, runs nested lookups
     for the brand name, the material name and the shop id of its offer group. -->
<dataConfig>
  <dataSource type="JdbcDataSource"
              driver="com.mysql.jdbc.Driver"
              url="jdbc:mysql://path"
              user="user"
              password="pwd" />
  <document name="offers">
    <!-- Root entity: offers joined to offer_group, restricted to groups with status 0.
         deltaQuery selects the ids of offers updated since the last index time. -->
    <entity name="offers"
            query="SELECT o.* FROM offers o inner join offer_group g on o.offer_group_id = g.id where g.status = 0"
            deltaQuery="select id from offers where updated_at > '${dataimporter.last_index_time}'">
      <field column="id" name="id" />
      <field column="product_code" name="product_code" />
      <field column="gender" name="gender" />
      <field column="colors" name="colors" />
      <field column="year" name="year" />
      <field column="tags" name="tags" />
      <field column="size" name="size" />
      <field column="size_typology" name="size_typology" />
      <field column="season" name="season" />
      <field column="quantity" name="quantity" />
      <field column="price" name="price" />
      <field column="typology" name="typology" />
      <field column="model" name="model" />

      <!-- Brand lookup keyed on the parent row's brand_id; the name column becomes brand_name. -->
      <entity name="brands"
              query="select name from brands where id='${offers.brand_id}'"
              deltaQuery="select id from brands where updated_at > '${dataimporter.last_index_time}'" >
        <field name="brand_name" column="name" />
      </entity>

      <!-- Material lookup keyed on the parent row's type_material; the name column becomes material_name.
           NOTE(review): in SQL a comparison written as "= NULL" does not evaluate to true, so the OR
           branch presumably never matches and offers with a NULL type_material get no material_name
           from this entity; confirm against MySQL. -->
      <entity name="materials"
              query="select name from materials where id='${offers.type_material}' OR '${offers.type_material}' = NULL">
        <field name="material_name" column="name" />
      </entity>

      <!-- Offer-group lookup keyed on the parent row's offer_group_id; supplies shop_id. -->
      <entity name="offer_group"
              query="select shop_id from offer_group where id='${offers.offer_group_id}'"
              deltaQuery="select id from offer_group where updated_at > '${dataimporter.last_index_time}'" >
        <field name="shop_id" column="shop_id" />
      </entity>
    </entity>
  </document>
</dataConfig>
File schema.xml:
<?xml version="1.0" encoding="UTF-8" ?>
<!-- Schema of the "offers" index. -->
<schema name="offers" version="1.5">

  <!-- Primitive field types. -->
  <fieldType name="string" class="solr.StrField"/>
  <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
  <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
  <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
  <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
  <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
  <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>

  <!-- General text type. The index analyzer folds tokens to ASCII and lower-cases them, then applies
       ReversedWildcardFilterFactory (withOriginal="true") and EdgeNGramFilterFactory (gram sizes 1 to 15).
       The query analyzer only tokenizes, folds to ASCII and lower-cases. -->
  <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
    <analyzer type="index">
      <tokenizer class="solr.StandardTokenizerFactory"/>
      <filter class="solr.StandardFilterFactory"/>
      <filter class="solr.ASCIIFoldingFilterFactory" />
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.ReversedWildcardFilterFactory"
              withOriginal="true"
              maxPosAsterisk="3"
              maxPosQuestion="2"
              maxFractionAsterisk="0.33"
              minTrailing="3" />
      <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="15" />
    </analyzer>
    <analyzer type="query">
      <tokenizer class="solr.StandardTokenizerFactory"/>
      <filter class="solr.ASCIIFoldingFilterFactory" />
      <filter class="solr.StandardFilterFactory"/>
      <filter class="solr.LowerCaseFilterFactory"/>
    </analyzer>
  </fieldType>

  <!-- Randomized ordering of offers: any random_* field is backed by RandomSortField. -->
  <fieldType name="random" class="solr.RandomSortField" indexed="true" />
  <dynamicField name="random_*" type="random" indexed="true" stored="false"/>
  <!-- End randomize offers-->

  <!-- Document fields. Optional fields carry the literal default "NULL". -->
  <field name="_version_" type="long" indexed="true" stored="true" required="false"/>
  <field name="id" type="long" indexed="true" stored="true" required="true" />
  <field name="brand_id" type="long" indexed="true" stored="true" required="true" />
  <field name="shop_id" type="long" indexed="true" stored="true" required="true" />
  <field name="brand_name" type="text_general" indexed="true" stored="true" required="true" />
  <!-- NOTE(review): type_material is a long (TrieLongField) but its default is the text "NULL",
       which is not a numeric literal; confirm Solr accepts this default. The same applies to price below. -->
  <field name="type_material" type="long" indexed="true" stored="true" default="NULL" />
  <field name="material_name" type="text_general" indexed="true" stored="true" default="NULL" />
  <field name="offer_group_id" type="long" indexed="true" stored="true" required="true" />
  <field name="product_code" type="text_general" indexed="true" stored="true" default="NULL" />
  <field name="gender" type="string" indexed="true" stored="true" default="NULL" />
  <field name="colors" type="text_general" indexed="true" stored="true" default="NULL" />
  <field name="year" type="text_general" indexed="true" stored="true" default="NULL" />
  <field name="tags" type="text_general" indexed="true" stored="true" default="NULL" />
  <field name="size" type="string" indexed="true" stored="true" default="NULL" />
  <field name="size_typology" type="string" indexed="true" stored="true" default="NULL" />
  <field name="season" type="text_general" indexed="true" stored="true" default="NULL" />
  <field name="quantity" type="string" indexed="true" stored="true" default="NULL" />
  <field name="price" type="float" indexed="true" stored="true" default="NULL" />
  <field name="typology" type="text_general" indexed="true" stored="true" default="NULL" />
  <field name="photo_url" type="string" indexed="true" stored="true" required="true" />
  <field name="model" type="text_general" indexed="true" stored="true" default="NULL" />
  <field name="created_at" type="date" indexed="true" stored="true"/>
  <field name="updated_at" type="date" indexed="true" stored="true"/>
  <!-- Catch-all search field: indexed, not stored, filled by the copyField rules below. -->
  <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>

  <uniqueKey>id</uniqueKey>

  <!-- Copy the searchable attributes into the catch-all "text" field. -->
  <copyField source="colors" dest="text"/>
  <copyField source="year" dest="text"/>
  <copyField source="season" dest="text"/>
  <copyField source="typology" dest="text"/>
  <copyField source="model" dest="text"/>
  <copyField source="tags" dest="text"/>
  <copyField source="product_code" dest="text"/>
  <copyField source="brand_name" dest="text"/>
  <copyField source="material_name" dest="text" />
  <copyField source="gender" dest="text"/>
</schema>
When a search query runs, it returns only the offers whose type_material field is not NULL.
I want to retrieve the offers with a NULL type_material as well.
Just use a filter query &fq=type_material:NULL
Trying to create a UUID field in my schema.xml, I just get this error when starting Solr:
Plugin init failure for [schema.xml] fieldType "uuid": Error loading class 'Solr.UUIDField'
My schema looks like:
<!-- Field declarations from the asker's schema.xml. NOTE(review): the fields element is never closed in this excerpt. -->
<fields>
<!-- Field whose type "uuid" is the one that fails to load in the reported error. -->
<field name="uuid" type="uuid" indexed="true" stored="true" />
<!-- NOTE(review): this update chain sits among the schema field declarations; an updateRequestProcessorChain presumably belongs in solrconfig.xml instead. Confirm before reusing this snippet. -->
<updateRequestProcessorChain name="uuid">
<processor class="solr.UUIDUpdateProcessorFactory">
<str name="fieldName">uuid</str>
</processor>
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
<field name="id" type="string" indexed="true" stored="true" required="true"/>
<field name="address" type="text_general" indexed="true" stored="true"/>
<field name="city" type="text_general" indexed="true" stored="true" />
<field name="county" type="string" indexed="true" stored="true" />
<field name="lat" type="text_general" indexed="true" stored="true" />
<field name="lng" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
<field name="price" type="float" indexed="true" stored="true"/>
<field name="bedrooms" type="float" indexed="true" stored="true" />
<field name="image" type="string" indexed="true" stored="true"/>
<field name="region" type="location_rpt" indexed="true" stored="true" />
<defaultSearchField>address</defaultSearchField>
<field name="_version_" type="long" indexed="true" stored="true"/>
<field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
And then in
<!-- NOTE(review): the class prefix here is "Solr." with a capital S; this is the exact class name the reported startup error says could not be loaded. -->
<fieldType name="uuid" class="Solr.UUIDField" indexed="true" />
From the docs
I'm confused as to the location of the <updateRequestProcessorChain/> section. I feel it shouldn't go in the field declaration part.
The field class name is probably case sensitive; try it with a lower-case "solr" prefix, i.e. solr.UUIDField:
<!-- Same declaration with the class prefix written in lower case ("solr."). -->
<fieldType name="uuid" class="solr.UUIDField" indexed="true" />
I'm having issues generating a UUID using the DataImportHandler in Solr 4. I'm trying to import from an existing MySQL database.
My schema.xml contains:
<!-- Field declarations for the imported documents. -->
<fields>
  <!-- Required key field; documents that arrive without a value for it are rejected
       (the "missing required field: uuid" error reported in this question). -->
  <field name="uuid" type="uuid" indexed="true" stored="true" required="true" />
  <field name="id" type="string" indexed="true" stored="true" required="true"/>

  <!-- Address data. -->
  <field name="address" type="text_general" indexed="true" stored="true"/>
  <field name="city" type="text_general" indexed="true" stored="true" />
  <field name="county" type="string" indexed="true" stored="true" />
  <field name="lat" type="text_general" indexed="true" stored="true" />
  <field name="lng" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />

  <!-- Listing attributes. -->
  <field name="price" type="float" indexed="true" stored="true"/>
  <field name="bedrooms" type="float" indexed="true" stored="true" />
  <field name="image" type="string" indexed="true" stored="true"/>
  <field name="region" type="location_rpt" indexed="true" stored="true" />

  <defaultSearchField>address</defaultSearchField>
  <field name="_version_" type="long" indexed="true" stored="true"/>
  <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
</fields>

<!-- The uuid field is the unique key of the index. -->
<uniqueKey>uuid</uniqueKey>
and then in <types>
<!-- Field type referenced by the "uuid" field (type="uuid"). -->
<fieldType name="uuid" class="solr.UUIDField" indexed="true" />
My Solrconfig.xml contains:
<!-- Data import handler as configured by the asker. NOTE(review): no closing requestHandler tag appears in this excerpt. -->
<requestHandler name="/dataimport" class="org.apache.solr.handler.dataimport.DataImportHandler">
<!-- NOTE(review): the "uuid" chain is declared inside the handler element, and the defaults below contain no update.chain entry naming it. Presumably the chain is never applied, which would be consistent with the reported "missing required field: uuid" error; confirm. -->
<updateRequestProcessorChain name="uuid">
<processor class="solr.UUIDUpdateProcessorFactory">
<str name="fieldName">uuid</str>
</processor>
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
<lst name="defaults">
<str name="config">data-config.xml</str>
</lst>
Whenever I run the update, some docs are inserted OK, but many fail with:
org.apache.solr.common.SolrException: [doc=204] missing required field: uuid
Going by the example at link it should be
<!-- Import handler: its defaults name the update chain to use through "update.chain". -->
<requestHandler name="/dataimport" class="org.apache.solr.handler.dataimport.DataImportHandler">
  .........
  <lst name="defaults">
    <str name="config">data-config.xml</str>
    <!-- Refers to the chain named "uuid" declared below. -->
    <str name="update.chain">uuid</str>
  </lst>
</requestHandler>

<!-- Chain "uuid", declared as a sibling of the handler rather than inside it:
     UUIDUpdateProcessorFactory is configured with fieldName "uuid", followed by RunUpdateProcessorFactory. -->
<updateRequestProcessorChain name="uuid">
  <processor class="solr.UUIDUpdateProcessorFactory">
    <str name="fieldName">uuid</str>
  </processor>
  <processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
I have changed the schema.xml file and added few fields in that like this
<!-- Fields the asker added to schema.xml; the "string" and "text" types are defined outside this excerpt. -->
<!-- NOTE(review): no "id" field is declared in this list, yet the reported error names "id" as a required field; presumably it remains from the existing sample schema. Confirm. -->
<field name="url" type="string" indexed="true" stored="true" />
<field name="content_type" type="text" indexed="true" stored="true" />
<field name="title" type="text" indexed="true" stored="true" />
<field name="keywords" type="text" indexed="true" stored="true" multiValued="true" />
<field name="text" type="text" indexed="true" stored="true" />
<!-- NOTE(review): timestamp and modified_date use the "text" type although the sample document supplies ISO 8601 date-time values for them; confirm that a date type is not wanted. -->
<field name="timestamp" type="text" indexed="true" stored="true" />
<field name="public" type="text" indexed="true" stored="true" multiValued="true" />
<field name="groups" type="text" indexed="true" stored="true" multiValued="true" />
<field name="sitename" type="text" indexed="true" stored="true" />
<field name="context" type="text" indexed="true" stored="true" />
<field name="modified_date" type="text" indexed="true" stored="true" />
so corresponding to these fields I have created one xml file and added some dummy data into that like this.
<!-- Sample update message containing one document with dummy values for the newly added fields. -->
<!-- NOTE(review): the document carries no "id" field; the error reported in this question is
     "Document [null] missing required field: id". -->
<add>
  <doc>
    <field name="url">http://www.host.com/</field>
    <field name="content_type">text/html</field>
    <field name="title">Testing Data</field>
    <!-- Repeated field elements supply several values for the multi-valued fields keywords, public and groups. -->
    <field name="keywords">software</field>
    <field name="keywords">software_cycle</field>
    <field name="text">search</field>
    <field name="timestamp">2006-02-13T15:26:37Z</field>
    <field name="public">Optimized</field>
    <field name="public">Optimized_data</field>
    <field name="groups">Standards</field>
    <field name="groups">Standards_data</field>
    <field name="sitename">GoInfo</field>
    <field name="context">Scalability</field>
    <field name="modified_date">2010-11-13T15:26:37Z</field>
  </doc>
</add>
And when I tried to reindex the data into solr like this:-
C:\apache-solr-3.2.0\example\exampledocs>java -Durl=http://localhost:7788/solr/u
pdate -jar post.jar *.xml
SimplePostTool: version 1.3
SimplePostTool: POSTing files to http://localhost:7788/solr/update..
SimplePostTool: POSTing file 30-example.xml
SimplePostTool: POSTing file hd.xml
SimplePostTool: POSTing file other.xml
SimplePostTool: FATAL: Solr returned an error #400 Bad Request
I always get an error after the text.xml file is posted; if I remove text.xml, I don't get any error. Below is the error I get when text.xml is included. Any help will be appreciated.
SEVERE: org.apache.solr.common.SolrException: Document [null] missing required field: id
at org.apache.solr.update.DocumentBuilder.toDocument(DocumentBuilder.java:336)
at org.apache.solr.update.processor.RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:60)
at org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:147)
at org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77)
at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:67)
at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:129)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:1360)
at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:356)
at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:252)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:235)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
at org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:233)
at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:191)
at org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:127)
at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102)
at org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:109)
at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:298)
at org.apache.coyote.http11.Http11AprProcessor.process(Http11AprProcessor.java:864)
at org.apache.coyote.http11.Http11AprProtocol$Http11ConnectionHandler.process(Http11AprProtocol.java:579)
at org.apache.tomcat.util.net.AprEndpoint$Worker.run(AprEndpoint.java:1665)
at java.lang.Thread.run(Thread.java:662)
You say you added a few fields (supposedly to the sample schema), but you don't mention what happened to the fields that were already there. I'm guessing you left the preexistent fields there, which means that id is still a required field (see here in the sample schema), therefore the error you see.
Add an "id" field (the "primary key") to your document.
It really is required.