-
Notifications
You must be signed in to change notification settings - Fork 11
/
solrconfig_snippets.xml
319 lines (270 loc) · 17.1 KB
/
solrconfig_snippets.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Cut down config skeleton illustrating how to configure dice solr plugins -->
<config>
<!-- Custom Dice plugins.
     Place the SolrEnhancements.jar file in a folder under your Solr core directory
     (for example ./plugins) and point the dir attribute below at that folder. -->
<lib dir="./plugins/" regex=".*\.jar"/>
<!-- select requestHandler (not representative of our actual request handler) -->
<requestHandler name="/select" class="solr.SearchHandler">
  <!-- General search endpoint (sample only).
       solr.SearchHandler is used instead of the deprecated solr.StandardRequestHandler alias,
       matching the class the /spell handler in this file is declared with. -->
  <lst name="defaults">
    <!-- Response format -->
    <str name="indent">true</str>
    <str name="wt">json</str>

    <!-- Field weights, minimum-should-match and phrase boosts.
         NOTE(review): qf, mm, pf and pf2 are dismax/edismax parameters and no defType default
         is set in this handler, so they presumably only take effect when the request selects
         such a parser (e.g. defType=edismax). Confirm that callers pass one. -->
    <str name="qf">title^10 skills^8 description^5 company^20</str>
    <str name="mm">-55%</str>
    <str name="pf">title^25 skills^20</str>
    <str name="pf2">title^15 skills^10</str>
    <str name="df">text</str>

    <!-- Fields returned to the caller -->
    <str name="fl">title,skills,latlong,jobDescription,jobPostingDate</str>

    <!-- SPELL CHECKER -->
    <!-- Spell checking is off by default; callers enable it per request with spellcheck=true. -->
    <str name="spellcheck.dictionary">default</str>
    <str name="spellcheck">false</str>
    <str name="spellcheck.extendedResults">false</str>
    <str name="spellcheck.count">5</str>
    <str name="spellcheck.alternativeTermCount">20</str>
    <!-- Max number of results to be returned for the query
         before it stops generating suggestions -->
    <!-- Don't use when doing fuzzy matching on a field
    <str name="spellcheck.maxResultsForSuggest">100</str>
    -->
    <str name="spellcheck.collate">true</str>
    <str name="spellcheck.collateExtendedResults">true</str>
    <!-- This should be false, otherwise it really hurts results for rarer words.
         If left on, this will try to correct words that are correctly spelled but rare. -->
    <str name="spellcheck.onlyMorePopular">false</str>
    <str name="spellcheck.maxCollationTries">10</str>
    <str name="spellcheck.maxCollations">5</str>
    <!-- END SPELL CHECKER -->
  </lst>

  <!-- Run the 'spellcheck' search component after the standard components. -->
  <arr name="last-components">
    <str>spellcheck</str>
  </arr>
</requestHandler>
<!-- Dice Custom MLT handler. (sample config, not the actual configuration) -->
<requestHandler name="/mlt"
                class="org.dice.solrenhancements.morelikethis.DiceMoreLikeThisHandler">
  <lst name="defaults">
    <!-- Response format -->
    <str name="omitHeader">true</str>
    <str name="wt">json</str>
    <str name="indent">true</str>

    <!-- Query parameter; override it at query time with a query that matches one or more documents.
         The matched document(s) are used to generate recommendations: similar documents based on
         matching 'interesting terms'.
         Note: this can handle More Like THESE functionality, i.e. recommendations generated from a
         set of documents rather than just one. It can be used to generate recommendations from a
         user's browsing history, for example. -->
    <str name="q">id:12345567889</str>

    <!-- Field(s) to return in the recommendations -->
    <str name="fl">id</str>

    <!-- Content stream logic: configure these parameters to extract terms from different parts of
         the content stream, using synonym filters followed by TypeFilter = synonym.
         This allows you to pass in an entire document that is NOT in the index and generate
         recommendations from it. For instance, an employer could paste in a job description from
         their own site and we can generate matching job seekers.
         To use it, do not specify a q parameter at query time; populate the stream.head and
         stream.body parameters instead. You can use either or both. There are two parameters so
         that a different analysis chain can be applied to each parameter value separately.
         Handles content streams (see the regular Solr MLT handler).
         Recommended usage: use an analysis chain to extract important keywords from your domain.
         One approach is a synonym filter to detect keywords, followed by a type filter
         (type=SYNONYM). Another is to use the Solr text tagger plugin. -->
    <str name="stream.head.fl">title</str>
    <str name="stream.body.fl">skills,title</str>
    <str name="stream.qf">skills^2.0 title^2.5</str>

    <!-- Fields to match over -->
    <str name="mlt.fl">title,skills</str>
    <!-- Number of recommendations -->
    <int name="rows">10</int>
    <!-- Max terms to match PER FIELD -->
    <int name="mlt.maxflqt">5</int>
    <!-- Boost matches (from Solr MLT). Recommended: true -->
    <bool name="mlt.boost">true</bool>

    <!-- Relative field weights for the matching fields (see mlt.fl) -->
    <str name="mlt.qf">skills^2.0 title^2.5</str>
    <!-- Normalize field weights to adhere to the mlt.qf settings above
         (e.g. so that skills have a relative weighting of 2.0 to 2.5 for titles, as in mlt.qf). -->
    <bool name="mlt.normflboosts">true</bool>
    <!-- Log the term frequency values before using them to boost matches -->
    <bool name="mlt.logtf">true</bool>

    <!-- Multiplicative boost. At query time we populate this with a geospatial boost.
         A simplistic example: div(1.0,geodist(latlong,$locn)) where $locn is the lat long of the
         target job. -->
    <str name="mlt.boostfn"></str>
  </lst>
</requestHandler>
<!-- Dice Unsupervised Feedback Handler. (sample config, not the actual configuration) -->
<requestHandler name="/ufselect"
                class="org.dice.solrenhancements.unsupervisedfeedback.DiceUnsupervisedFeedbackHandler">
  <lst name="defaults">
    <!-- Response format -->
    <str name="omitHeader">true</str>
    <str name="wt">json</str>
    <str name="indent">true</str>

    <!-- Query parameter; override it at query time with a query that matches one or more documents.
         The matched document(s) are used to generate recommendations: similar documents based on
         matching 'interesting terms'.
         Note: this can handle More Like THESE functionality, i.e. recommendations generated from a
         set of documents rather than just one. It can be used to generate recommendations from a
         user's browsing history, for example. -->
    <str name="q">id:12345567889</str>

    <!-- Fields to return -->
    <str name="fl">title,skills,state,city,location</str>

    <!-- Fields to match documents on (used for term expansion in the second query, taking the
         top most interesting terms) -->
    <str name="uf.fl">title,skills</str>

    <!-- Maximum number of documents matching the q query from which terms are extracted for
         query-time expansion. This limits the size of the result set of the FIRST query, not the
         second (use the traditional rows parameter to limit the second). -->
    <str name="uf.maxdocs">10</str>

    <!-- Include the matches from the first query in the result set, in a separate match section?
         Recommended: turn on for debugging and analysis, but leave off for production use. -->
    <str name="uf.match.include">false</str>

    <!-- Number of documents to return to the caller (from the second query) -->
    <int name="rows">10</int>
    <!-- Max terms per field to use for query expansion in the second query -->
    <int name="uf.maxflqt">5</int>
    <!-- Boost relevancy. See the regular Solr MLT handler documentation. Recommended: true -->
    <bool name="uf.boost">true</bool>

    <!-- Relative weighting to apply to each field from the uf.fl list -->
    <str name="uf.qf">skills^2.0 title^2.5</str>
    <!-- Display format for query expansion terms. See the regular Solr MLT handler documentation -->
    <str name="uf.interestingTerms">details</str>
    <!-- Normalize query expansion term boosts to comply with the uf.qf weightings.
         Recommended: true -->
    <bool name="uf.normflboosts">true</bool>
    <!-- Log the term frequency values when applying them to boost query expansion terms -->
    <bool name="uf.logtf">true</bool>
  </lst>
</requestHandler>
<requestHandler name="/titleSuggest"
                class="org.apache.solr.handler.component.SearchHandler">
  <!-- Auto-complete endpoint for titles. Only the 'titleSuggest' search component runs,
       because the components list below replaces the handler's component chain. -->
  <lst name="defaults">
    <str name="spellcheck">true</str>
    <str name="spellcheck.dictionary">titleSuggest</str>
    <!-- Sort most popular first -->
    <str name="spellcheck.onlyMorePopular">true</str>
    <!-- Maximum number of auto-complete entries to return -->
    <str name="spellcheck.count">10</str>
    <str name="spellcheck.collate">false</str>
    <str name="spellcheck.extendedResults">true</str>
  </lst>
  <arr name="components">
    <str>titleSuggest</str>
  </arr>
</requestHandler>
<searchComponent name="spellcheck"
                 class="org.dice.solrenhancements.spellchecker.DiceSpellCheckComponent">
  <!-- Analyzer field type to use at query time -->
  <str name="queryAnalyzerFieldType">dice_spellcheck</str>

  <!-- Note: multiple "spellchecker" lists can be declared and used by this component.
       This file declares a single one, named 'default'. -->
  <lst name="spellchecker">
    <str name="name">default</str>
    <!-- A spellchecker built from a field of the main index -->
    <str name="field">spellcheck</str>
    <str name="classname">org.dice.solrenhancements.spellchecker.DiceDirectSolrSpellChecker</str>

    <!-- The spellcheck distance measure used; the default is the internal levenshtein -->
    <str name="distanceMeasure">org.apache.lucene.search.spell.JaroWinklerDistance</str>
    <!-- Minimum accuracy needed to be considered a valid spellcheck suggestion -->
    <float name="accuracy">0.5</float>
    <!-- The maximum number of edits considered when enumerating terms: can be 1 or 2 -->
    <int name="maxEdits">2</int>
    <!-- The minimum shared prefix when enumerating terms -->
    <int name="minPrefix">1</int>
    <!-- Maximum number of inspections per result -->
    <int name="maxInspections">50</int>
    <!-- Minimum length of a query (chars) to be considered for correction -->
    <int name="minQueryLength">3</int>

    <!-- Maximum threshold of documents a query term can appear in to be considered for correction.
         This is a threshold on the terms in the query: they have to appear below this threshold
         to be considered erroneous / misspelled. -->
    <float name="maxQueryFrequency">50</float>
    <!-- Minimum frequency of tokens to be considered as corrections -->
    <float name="thresholdTokenFrequency">10</float>
    <!-- score | freq | custom Comparator<SuggestWord>.
         Generally you want freq, to avoid erroneous corrections. -->
    <str name="comparatorClass">freq</str>

    <!-- *** New parameter for DiceSolrSpellChecker ***
         File to store typo corrections in. Follows the synonym file format.
         This path is RELATIVE to the core's configuration directory. -->
    <str name="typosFileName">typo_corrections.txt</str>
    <str name="characterEncoding">UTF-8</str>
  </lst>
</searchComponent>
<!-- Multiple Case Suggester -->
<searchComponent name="titleSuggest" class="solr.SpellCheckComponent">
  <!-- Analyzer field type to use at query time -->
  <str name="queryAnalyzerFieldType">dice_autocomplete</str>

  <!-- Note: multiple "spellchecker" lists can be declared and used by this component -->
  <lst name="spellchecker">
    <str name="name">titleSuggest</str>
    <!-- Suggestions are built from this field of the main index -->
    <str name="field">titleAutocomplete</str>
    <!-- ** Custom Dice spell checker: tries different casings to look for a match
         (upper, lower and title casing) ** -->
    <str name="classname">org.dice.solrenhancements.spellchecker.DiceMultipleCaseSuggester</str>

    <!-- Lookup implementation used by the suggester -->
    <str name="lookupImpl">org.apache.solr.spelling.suggest.fst.WFSTLookupFactory</str>
    <!-- Alternatives to lookupImpl:
         org.apache.solr.spelling.suggest.fst.FSTLookupFactory [finite state automaton]
         org.apache.solr.spelling.suggest.fst.WFSTLookupFactory [weighted finite state automaton]
         org.apache.solr.spelling.suggest.jaspell.JaspellLookupFactory [default, jaspell-based]
         org.apache.solr.spelling.suggest.tst.TSTLookupFactory [ternary trees]
    -->
    <float name="threshold">0.00005</float>
    <str name="buildOnCommit">false</str>
  </lst>
</searchComponent>
<searchComponent name="skillSuggest" class="solr.SpellCheckComponent">
  <!-- NOTE(review): no request handler in this snippet file lists this component; it has to be
       added to a handler's components before it can be queried. -->
  <!-- Analyzer field type to use at query time -->
  <str name="queryAnalyzerFieldType">skills_autocomplete</str>

  <!-- Note: multiple "spellchecker" lists can be declared and used by this component -->
  <lst name="spellchecker">
    <str name="name">skillSuggest</str>
    <str name="field">skillAutocomplete</str>
    <str name="suggestionAnalyzerFieldTypeName">skills_from_text</str>
    <!-- Custom Dice suggester implementation -->
    <str name="classname">org.dice.solrenhancements.spellchecker.DiceSuggester</str>

    <!-- Lookup implementation used by the suggester -->
    <str name="lookupImpl">org.apache.solr.spelling.suggest.fst.WFSTLookupFactory</str>
    <!-- Alternatives to lookupImpl:
         org.apache.solr.spelling.suggest.fst.FSTLookupFactory [finite state automaton]
         org.apache.solr.spelling.suggest.fst.WFSTLookupFactory [weighted finite state automaton]
         org.apache.solr.spelling.suggest.jaspell.JaspellLookupFactory [default, jaspell-based]
         org.apache.solr.spelling.suggest.tst.TSTLookupFactory [ternary trees]
    -->
    <float name="threshold">0.00005</float>
    <str name="buildOnCommit">false</str>
    <!-- Comma-separated list of source files for the suggester -->
    <str name="sourceLocation">skill_counts_A.txt,skill_counts_B.txt,skill_counts_C.txt,skill_counts_D.txt</str>
  </lst>
</searchComponent>
<!-- Request handler for the spellchecker. It is recommended to use the spellcheck component in the
     last-components of the main select handler instead. -->
<requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
  <lst name="defaults">
    <str name="df">text_dice</str>

    <!-- Only the 'default' dictionary is requested, because it is the only spellchecker that the
         'spellcheck' search component in this file declares. Requesting a dictionary that is not
         declared makes the spellcheck request fail.
         To combine suggestions from a word-break spellchecker as well (collations can then mix
         corrections from both), first declare a spellchecker named 'wordbreak' inside the
         'spellcheck' search component, then add a second spellcheck.dictionary entry here. -->
    <str name="spellcheck.dictionary">default</str>

    <str name="spellcheck">on</str>
    <str name="spellcheck.extendedResults">true</str>
    <str name="spellcheck.count">10</str>
    <str name="spellcheck.alternativeTermCount">5</str>
    <str name="spellcheck.maxResultsForSuggest">5</str>
    <str name="spellcheck.collate">true</str>
    <str name="spellcheck.collateExtendedResults">true</str>
    <str name="spellcheck.maxCollationTries">10</str>
    <str name="spellcheck.maxCollations">5</str>
  </lst>

  <!-- Run the 'spellcheck' search component after the standard components. -->
  <arr name="last-components">
    <str>spellcheck</str>
  </arr>
</requestHandler>
<!-- Custom query parsers that extend the edismax parser -->

<!-- payloadEdismax: extension of edismax that includes payload values in the relevancy score
     calculation. Also requires a custom similarity class implementation to take advantage of this
     (e.g. Dice's PayloadAwareDefaultSimilarity). -->
<queryParser name="payloadEdismax"
             class="org.dice.solrenhancements.queryparsers.PayloadAwareExtendedDismaxQParserPlugin"/>

<!-- vector: handles expansion of terms to vector representations at query and index time.
     Handles the Solr multi-word synonym issue by replacing spaces with commas; you must ensure
     the analysis chain tokenizes on commas for this to work. -->
<queryParser name="vector"
             class="org.dice.solrenhancements.queryparsers.VectorQParserPlugin"/>

<!-- queryboost: handles expansion of terms to a list of payload-boosted query expansion terms at
     query time. Handles the Solr multi-word synonym issue by replacing spaces with commas; you
     must ensure the analysis chain tokenizes on commas for this to work. -->
<queryParser name="queryboost"
             class="org.dice.solrenhancements.queryparsers.QueryBoostingQParserPlugin"/>
</config>