I'm running both a Nutch 1.16 crawler instance and a Solr version 8.3.0. I have been able to crawl for files on a local directory and, editing nutch-site.xml, extract some metadata from them (albeit not as much as I wished for) running bin/crawl -s urls dircrawl 2 >& dircrawl.log. The crawled data is then sent to Solr via bin/nutch index dircrawl/crawldb/ -linkdb dircrawl/linkdb/ -dir dircrawl/segments/ -filter -normalize, where the entries are then stored and managed via their tags.
Now, running Solr Admin from the UI, I'm trying to search for the data. I made sure to sign as indexed=true all the entries I am interested in. HOWEVER, running any search other than for *:* returns zero results. I have tried all possible combinations of search fields, no dice either. I'll link to the description of my config files, first for solr then for nutch...
schema.xml (becomes managed-schema when running it, for some reason)
<?xml version="1.0" encoding="UTF-8"?>
<schema name="nutch-crawler-indexing-config" version="1.6">
<uniqueKey>id</uniqueKey>
<fieldType name="_nest_path_" class="solr.NestPathField" omitTermFreqAndPositions="true" omitNorms="true" maxCharsForDocValues="-1" stored="false"/>
<fieldType name="ancestor_path" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.KeywordTokenizerFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/>
</analyzer>
</fieldType>
(all fieldTypes are the default ones)
<fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.CJKWidthFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.CJKBigramFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ReversedWildcardFilterFactory" maxPosQuestion="2" maxFractionAsterisk="0.33" maxPosAsterisk="3" withOriginal="true"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.SynonymGraphFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_gl" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_gl.txt" ignoreCase="true"/>
<filter class="solr.GalicianStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_hi" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.IndicNormalizationFilterFactory"/>
<filter class="solr.HindiNormalizationFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_hi.txt" ignoreCase="true"/>
<filter class="solr.HindiStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_hu" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_hu.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/>
</analyzer>
</fieldType>
<fieldType name="text_hy" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_hy.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Armenian"/>
</analyzer>
</fieldType>
<fieldType name="text_id" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_id.txt" ignoreCase="true"/>
<filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
</analyzer>
</fieldType>
<fieldType name="text_it" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_it.txt" ignoreCase="true"/>
<filter class="solr.ItalianLightStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_ja" class="solr.TextField" autoGeneratePhraseQueries="false" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
<filter class="solr.JapaneseBaseFormFilterFactory"/>
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt"/>
<filter class="solr.CJKWidthFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_ja.txt" ignoreCase="true"/>
<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.KoreanTokenizerFactory" outputUnknownUnigrams="false" decompoundMode="discard"/>
<filter class="solr.KoreanPartOfSpeechStopFilterFactory"/>
<filter class="solr.KoreanReadingFormFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_lv.txt" ignoreCase="true"/>
<filter class="solr.LatvianStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_nl" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_nl.txt" ignoreCase="true"/>
<filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict_nl.txt" ignoreCase="false"/>
<filter class="solr.SnowballPorterFilterFactory" language="Dutch"/>
</analyzer>
</fieldType>
<fieldType name="text_no" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_no.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>
</analyzer>
</fieldType>
<fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_pt.txt" ignoreCase="true"/>
<filter class="solr.PortugueseLightStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_ro" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_ro.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Romanian"/>
</analyzer>
</fieldType>
<fieldType name="text_ru" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_ru.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Russian"/>
</analyzer>
</fieldType>
<fieldType name="text_sv" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_sv.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Swedish"/>
</analyzer>
</fieldType>
<fieldType name="text_th" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.ThaiTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_th.txt" ignoreCase="true"/>
</analyzer>
</fieldType>
<fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.TurkishLowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_tr.txt" ignoreCase="false"/>
<filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>
</analyzer>
</fieldType>
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
</analyzer>
</fieldType>
<field name="_nest_path_" type="_nest_path_"/>
<field name="_root_" type="string" docValues="false" indexed="true" stored="false"/>
<field name="_text_" type="text_general" multiValued="true" indexed="true" stored="false"/>
<field name="_version_" type="plong" indexed="false" stored="false"/>
<field name="boost" type="pdoubles"/>
<field name="content" type="text_general"/>
<field name="digest" type="text_general"/>
<field name="id" type="string" multiValued="false" indexed="true" required="true" stored="true"/>
<field name="metatag.author" type="text_general" indexed="true"/>
<field name="metatag.channels" type="plongs"/>
<field name="metatag.creator" type="text_general" indexed="true"/>
<field name="metatag.samplerate" type="plongs"/>
<field name="metatag.version" type="text_general"/>
<field name="title" type="text_general" indexed="true"/>
<field name="tstamp" type="pdates"/>
<field name="url" type="text_general" stored="true"/>
<dynamicField name="*_txt_en_split_tight" type="text_en_splitting_tight" indexed="true" stored="true"/>
<dynamicField name="*_descendent_path" type="descendent_path" indexed="true" stored="true"/>
<dynamicField name="*_ancestor_path" type="ancestor_path" indexed="true" stored="true"/>
<dynamicField name="*_txt_en_split" type="text_en_splitting" indexed="true" stored="true"/>
<dynamicField name="*_txt_sort" type="text_gen_sort" indexed="true" stored="true"/>
<dynamicField name="ignored_*" type="ignored"/>
<dynamicField name="*_txt_rev" type="text_general_rev" indexed="true" stored="true"/>
<dynamicField name="*_phon_en" type="phonetic_en" indexed="true" stored="true"/>
<dynamicField name="*_s_lower" type="lowercase" indexed="true" stored="true"/>
<dynamicField name="*_txt_cjk" type="text_cjk" indexed="true" stored="true"/>
<dynamicField name="random_*" type="random"/>
<dynamicField name="*_t_sort" type="text_gen_sort" multiValued="false" indexed="true" stored="true"/>
<dynamicField name="*_txt_en" type="text_en" indexed="true" stored="true"/>
<dynamicField name="*_txt_ar" type="text_ar" indexed="true" stored="true"/>
<dynamicField name="*_txt_bg" type="text_bg" indexed="true" stored="true"/>
<dynamicField name="*_txt_ca" type="text_ca" indexed="true" stored="true"/>
<dynamicField name="*_txt_cz" type="text_cz" indexed="true" stored="true"/>
<dynamicField name="*_txt_da" type="text_da" indexed="true" stored="true"/>
<dynamicField name="*_txt_de" type="text_de" indexed="true" stored="true"/>
<dynamicField name="*_txt_el" type="text_el" indexed="true" stored="true"/>
<dynamicField name="*_txt_es" type="text_es" indexed="true" stored="true"/>
<dynamicField name="*_txt_et" type="text_et" indexed="true" stored="true"/>
<dynamicField name="*_txt_eu" type="text_eu" indexed="true" stored="true"/>
<dynamicField name="*_txt_fa" type="text_fa" indexed="true" stored="true"/>
<dynamicField name="*_txt_fi" type="text_fi" indexed="true" stored="true"/>
<dynamicField name="*_txt_fr" type="text_fr" indexed="true" stored="true"/>
<dynamicField name="*_txt_ga" type="text_ga" indexed="true" stored="true"/>
<dynamicField name="*_txt_gl" type="text_gl" indexed="true" stored="true"/>
<dynamicField name="*_txt_hi" type="text_hi" indexed="true" stored="true"/>
<dynamicField name="*_txt_hu" type="text_hu" indexed="true" stored="true"/>
<dynamicField name="*_txt_hy" type="text_hy" indexed="true" stored="true"/>
<dynamicField name="*_txt_id" type="text_id" indexed="true" stored="true"/>
<dynamicField name="*_txt_it" type="text_it" indexed="true" stored="true"/>
<dynamicField name="*_txt_ja" type="text_ja" indexed="true" stored="true"/>
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
<dynamicField name="*_txt_lv" type="text_lv" indexed="true" stored="true"/>
<dynamicField name="*_txt_nl" type="text_nl" indexed="true" stored="true"/>
<dynamicField name="*_txt_no" type="text_no" indexed="true" stored="true"/>
<dynamicField name="*_txt_pt" type="text_pt" indexed="true" stored="true"/>
<dynamicField name="*_txt_ro" type="text_ro" indexed="true" stored="true"/>
<dynamicField name="*_txt_ru" type="text_ru" indexed="true" stored="true"/>
<dynamicField name="*_txt_sv" type="text_sv" indexed="true" stored="true"/>
<dynamicField name="*_txt_th" type="text_th" indexed="true" stored="true"/>
<dynamicField name="*_txt_tr" type="text_tr" indexed="true" stored="true"/>
<dynamicField name="*_point" type="point" indexed="true" stored="true"/>
<dynamicField name="*_srpt" type="location_rpt" indexed="true" stored="true"/>
<dynamicField name="attr_*" type="text_general" multiValued="true" indexed="true" stored="true"/>
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true"/>
<dynamicField name="*_str" type="strings" docValues="true" indexed="false" stored="false" useDocValuesAsStored="false"/>
<dynamicField name="*_dts" type="pdate" multiValued="true" indexed="true" stored="true"/>
<dynamicField name="*_dpf" type="delimited_payloads_float" indexed="true" stored="true"/>
<dynamicField name="*_dpi" type="delimited_payloads_int" indexed="true" stored="true"/>
<dynamicField name="*_dps" type="delimited_payloads_string" indexed="true" stored="true"/>
<dynamicField name="*_is" type="pints" indexed="true" stored="true"/>
<dynamicField name="*_ss" type="strings" indexed="true" stored="true"/>
<dynamicField name="*_ls" type="plongs" indexed="true" stored="true"/>
<dynamicField name="*_bs" type="booleans" indexed="true" stored="true"/>
<dynamicField name="*_fs" type="pfloats" indexed="true" stored="true"/>
<dynamicField name="*_ds" type="pdoubles" indexed="true" stored="true"/>
<dynamicField name="*_dt" type="pdate" indexed="true" stored="true"/>
<dynamicField name="*_ws" type="text_ws" indexed="true" stored="true"/>
<dynamicField name="*_i" type="pint" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
<dynamicField name="*_l" type="plong" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text_general" multiValued="false" indexed="true" stored="true"/>
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_f" type="pfloat" indexed="true" stored="true"/>
<dynamicField name="*_d" type="pdouble" indexed="true" stored="true"/>
<dynamicField name="*_p" type="location" indexed="true" stored="true"/>
<copyField source="digest" dest="digest_str" maxChars="256"/>
<copyField source="title" dest="title_str" maxChars="256"/>
<copyField source="url" dest="url_str" maxChars="256"/>
<copyField source="content" dest="content_str" maxChars="256"/>
<copyField source="metatag.author" dest="metatag.author_str" maxChars="256"/>
<copyField source="metatag.version" dest="metatag.version_str" maxChars="256"/>
<copyField source="metatag.creator" dest="metatag.creator_str" maxChars="256"/>
</schema>
then
nutch-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>http.agent.name</name>
<value>NutchSpiderTest</value>
</property>
<property>
<name>http.robots.agents</name>
<value>NutchSpiderTest,*</value>
<description>...
</description>
</property>
<property>
<name>plugin.includes</name>
<value>protocol-file|urlfilter-regex|parse-(html|tika|metatags|text)|index-(basic|anchor|metadata)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
<description>...
</description>
</property>
<property>
<name>file.content.limit</name>
<value>-1</value>
<description> Needed to stop buffer overflow errors - Unable to read.....</description>
</property>
<property>
<name>file.crawl.parent</name>
<value>false</value>
<description>The crawler is not restricted to the directories that you specified in the
Urls file but it is jumping into the parent directories as well. For your own crawlings you can
change this behavior (set to false) the way that only directories beneath the directories that you specify get
crawled.</description>
</property>
<property>
<name>parser.skip.truncated</name>
<value>false</value>
<description>Boolean value for whether we should skip parsing for truncated documents. By default this
property is activated due to extremely high levels of CPU which parsing can sometimes take.
</description>
</property>
<!--
<value>protocol-file|protocol-http|protocol-httpclient|urlfilter-(regex|validator)|parse-(html|tika|text)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)|index-more</value>
-->
<!-- Used only if plugin parse-metatags is enabled. -->
<property>
<name>metatags.names</name>
<value>*</value>
<description> ...
</description>
</property>
<property>
<name>index.parse.md</name>
<value>metatag.description,metatag.keywords,metatag.author,metatag.channels,metatag.content_encoding,metatag.content_type,metatag.creator,metatag.dc_creator,metatag.dc_title,metatag.id,metatag.meta_author,metatag.samplerate,metatag.stream_content_type,metatag.stream_name,metatag.stream_size,metatag.stream_source_info,metatag.title,metatag.version,metatag.x_parsed_by,metatag.xmpdm_album,metatag.album,metatag.xmpdm_albumartist,metatag.albumartist,metatag.xmpdm_artist,metatag.artist,metatag.xmpdm_audiochanneltype,metatag.audiochanneltype,metatag.xmpdm_audiocompressor,metatag.audiocompressor,metatag.xmpdm_audiosamplerate,metatag.audiosamplerate,metatag.xmpdm_composer,metatag.composer,metatag.xmpdm_discnumber,metatag.discnumber,metatag.xmpdm_duration,metatag.duration,metatag.xmpdm_genre,metatag.genre,metatag.xmpdm_releasedate,metatag.releasedate,metatag.xmpdm_tracknumber,metatag.tracknumber,metatag.copyright,author,Genre</value>
<description>
Comma-separated list of keys to be taken from the parse metadata to generate fields.
Can be used e.g. for 'description' or 'keywords' provided that these values are generated
by a parser (see parse-metatags plugin)
</description>
</property>
</configuration>
Results of running a query for ":":
{
"responseHeader":{
...,
"params":{
"q":"*:*",
"_":"..."}},
"response":{"numFound":24,"start":0,"docs":[
{...
Response of running any other kind of query:
{
"responseHeader":{
...
"params":{
"q":"Bumblebee",
"_":"..."}},
"response":{"numFound":0,"start":0,"docs":[]
}}
Additionally, the data I'm trying to index is various .mp3 files from the Free Music Archive.
edit: the files I'm trying to search for look like this:
{
"metatag.author":["A Kombi",
"A Kombi"],
"metatag.samplerate":[44100,
44100],
"title":["Plight Of The Bumblebee"],
"url":["file:/c:/Users/.../fma/fma_small/009/009476.mp3"],
"content":["Plight Of The Bumblebee\nPlight Of The Bumblebee\nA Kombi\nMusic to Drive By, track 2\n2004-09-14T00:00:00\nField Recordings\n30014.912\n"],
"metatag.creator":["A Kombi",
"A Kombi"],
"tstamp":["2020-04-02T15:26:29.507Z"],
"digest":["ddd4ab2288c5799a5646592e1a63437f"],
"boost":[0.20851442],
"id":"file:/c:/Users/.../fma/fma_small/009/009476.mp3",
"metatag.version":["MPEG 3 Layer III Version 1",
"MPEG 3 Layer III Version 1"],
"metatag.channels":[2,
2],
"_version_":1662875102548590596}
You have to set which field you're expecting to search against - unless you have a default search field configured. In older versions of schema.xml this can be configured for the schema, but the recommended method is to configure it in the query itself.
However, to support free text search, it's far better to use the edismax query parser by supplying defType=edismax and then setting which fields you want to search through the qf (query fields) parameter.
q=Bumblebee&qf=title&defType=edismax
.. will search for Bumblebee in the title field. You can also give multiple field names to qf, and also adjust the weights given to each:
qf=title^10 content
.. which will search in both title and content, and give ten times more weight to any hits in the title field compared to a hit in the content field.
The fl (field list) parameter adjusts which fields are being returned in the response, which is useful if you only need a small subset of the available fields (such as just the id) to avoid a larger response and having to load all the field values from disk for each document returned.
Good evening,
when I search for the word "app" it dont show the word "apple". But if I search for "app*", it show "apple" and "app". I dont want to write "*" in the search bar. How can I do this if I only search for "app" and it shows "apple" and "app"?
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100" multiValued="true">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.SynonymFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
I tried to add <filter class="solr.ReversedWildcardFilterFactory"/>
but it didnt work.
Can someone help me?
I use Apache Solr 6.4.1
Sry for my bad english.
Use EdgeNGramFilterFactory
EdgeNGramFilterFactory :
This filter generates edge n-gram tokens of sizes within the given range.
Arguments:
minGramSize: (integer, default 1) The minimum gram size.
maxGramSize: (integer, default 1) The maximum gram size.
Example :
If we use minGramSize = 1 and maxGramSize = 4 then
In: "four score"
Tokenizer to Filter: "four", "score"
Out: "f", "fo", "fou", "four", "s", "sc", "sco", "scor"
For your case you can use the below schema :
<fieldType name="text_ngram" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="200"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.SynonymFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
And update your fieldType to text_ngram Ex.
<field name="name" type="text_ngram" indexed="true" stored="false" multiValued="true"/>
Note : Don't forget to reload the core and reindex data
This is my schema.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- multi language in single core R&D Pallav Jha -->
<schema name="Pallav" version="1.14">
<uniqueKey>SolrId</uniqueKey>
<defaultSearchField>Name</defaultSearchField>
<solrQueryParser defaultOperator="OR"/>
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true"/>
<fieldType name="date" class="solr.TrieDateField" positionIncrementGap="0" precisionStep="6"/>
<fieldType name="float" class="solr.TrieFloatField" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="int" class="solr.TrieIntField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="long" class="solr.TrieLongField" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="nGramAttributes" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<charFilter class="solr.PatternReplaceCharFilterFactory" pattern="!!.*?!!" replacement=""/>
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.EdgeNGramFilterFactory" maxGramSize="10" minGramSize="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.FrenchLightStemFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="French" />
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
<analyzer type="query">
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.SynonymFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="German2"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.FrenchLightStemFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="French" />
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="nGramtext" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<charFilter class="solr.HTMLStripCharFilterFactory"/>
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EdgeNGramFilterFactory" maxGramSize="15" minGramSize="3"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.PhoneticFilterFactory" encoder="Soundex" inject="true"/>
<filter class="solr.FrenchLightStemFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="French" />
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
<analyzer type="query">
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.SynonymFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.PhoneticFilterFactory" encoder="Soundex" inject="true"/>
<filter class="solr.FrenchLightStemFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="French" />
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="string" class="solr.StrField" omitNorms="true" sortMissingLast="true"/>
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<charFilter class="solr.HTMLStripCharFilterFactory"/>
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.PhoneticFilterFactory" encoder="Soundex" inject="true"/>
</analyzer>
<analyzer type="query">
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.SynonymFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.PhoneticFilterFactory" encoder="Soundex" inject="true"/>
</analyzer>
</fieldType>
<fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.SynonymFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<field name="SolrId" type="string" indexed="true" required="true" stored="true"/>
<field name="Name" type="string" indexed="true" required="true" stored="true"/>
<field name="en_Name" type="string" indexed="true" required="false" stored="true"/>
<field name="nl_Name" type="string" indexed="true" required="false" stored="true"/>
<field name="fr_Name" type="string" indexed="true" required="false" stored="true"/>
<field name="hi_Name" type="string" indexed="true" required="false" stored="true"/>
<field name="_version_" type="long" indexed="true" stored="true"/>
<field name="nGramContent" type="nGramtext" multiValued="true" indexed="true" required="false" stored="false"/>
<dynamicField name="CDO_*" type="int" indexed="true" required="false" stored="true"/>
<dynamicField name="MDO_*" type="int" indexed="true" required="false" stored="true"/>
<dynamicField name="pa_*" type="string" multiValued="true" indexed="true" required="false" stored="true"/>
<dynamicField name="cp_*" type="string" indexed="true" required="false" stored="true"/>
<dynamicField name="f_*" type="string" multiValued="true" indexed="true" required="false" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
<!-- <copyField source="Name" dest="SpellContent"/> -->
</schema>
I am trying to implement multi Language search for french only for testing.
but its not working i am not getting any result.Can any one help me.What i am doing wrong
This is my result for french.solr french search result
The problem is, that field fr_Name is of type string, which means that it's not analyzed or tokenized, that if you want to search something which contains spaces, like Apple Mac Book Pro, you need to use double quotes, for a complete match. So, query "fq":"fr_Name":\"Apple Mac Book Pro\" should work for you.
Some reference material from the Solr wiki:
String (UTF-8 encoded string or Unicode). Strings are intended for
small fields and are not tokenized or analyzed in any way. They have a
hard limit of slightly less than 32K.
Add this fieldtype
<fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ElisionFilterFactory" articles="lang/contractions_fr.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_fr.txt" ignoreCase="true"/>
<filter class="solr.FrenchLightStemFilterFactory"/>
</analyzer>
</fieldType>
its working for me
I would like to remove stopwords from my index during indexing and query but somehow the words within the stopwords.txt do not seem to be removed from my index (I can still use these in a query and get result hits with them).
Here is my schema.xml:
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.KStemFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.KStemFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<field name="_version_" type="long" indexed="true" stored="true"/>
<field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" />
<field name="post_content" type="text" indexed="true" stored="true"/>
<field name="post_title" type="text" indexed="true" stored="true"/>
<field name="post_date" type="date" indexed="true" stored="true"/>
<field name="_text_" type="text" indexed="true" stored="false" multiValued="true" termVectors="true" termPositions="true" termOffsets="true"/>
I am using Solr 6.0.
Thanks for any advice,
Sabine
By default the file stopwords.txt does not have any stop words in it.
You can check the same in any of the configSet given by Solr.
But if you check the conf/lang folder , you will find many stopword files.
You can use whichever is applicable for you as per your language.
For the testing purpose you can copy the stopwords from the stopwords_en.txt file and paste in the file stopward.txt in the path configsets/basic_configs/conf/. Here configset may be different for you. It depends which one you have used.
I'm new to Apache Solr, and I want to use it for indexing pdf files. I managed to get it up and running so far and I can now search for added pdf files.
However, I need to be able to retrieve the searched text from the results.
I found an xml snippet in the default solrconfig.xml concerning exactly that:
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler" startup="lazy">
<lst name="defaults">
<!-- All the main content goes into "text"... if you need to return
the extracted text or do highlighting, use a stored field. -->
<str name="fmap.content">text</str>
<str name="lowernames">true</str>
<str name="uprefix">ignored_</str>
<!-- capture link hrefs but ignore div attributes -->
<str name="captureAttr">true</str>
<str name="fmap.a">links</str>
<str name="fmap.div">ignored_</str>
</lst>
From what I get from here (http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Content-Extraction-Tika), I think I have to add a new field to schema.xml (e.g. "content") that has stored="true" and indexed="true". However, I'm not really sure how to accomplish this exactly?
any help appreciated, thx
add a schema.xml looking like this:
<?xml version="1.0" encoding="UTF-8" ?>
<schema name="whatever" version="1.2">
<types>
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<charFilter class="solr.HTMLStripCharFilterFactory"/>
<charFilter class="solr.MappingCharFilterFactory" mapping="../../mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<charFilter class="solr.HTMLStripCharFilterFactory"/>
<charFilter class="solr.MappingCharFilterFactory" mapping="../../mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
</types>
<fields>
<field name="internal_id" type="string" indexed="true" stored="true"/>
<field name="cat" type="int" indexed="true" stored="true"/>
<field name="desc" type="text" indexed="true" stored="true"/>
</fields>
<uniqueKey>internal_id</uniqueKey>
<defaultSearchField>desc</defaultSearchField>
<solrQueryParser defaultOperator="OR"/>
<similarity class="org.apache.lucene.search.DefaultSimilarity"/>
</schema>
If the "field" is "stored", it will show up in the results, by default.