Project

General

Profile

1
<?xml version="1.0" encoding="UTF-8" ?>
2
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
3
    <xsl:output omit-xml-declaration="yes" indent="yes"/>
4

    
5
    <xsl:template match="/FIELDS">
6

    
7
        <!-- Solr field type given to every FIELD that matches no explicit typing rule, and to the
             built-in __result, __fulltext, __all and cql.serverchoice fields. As a template-level
             parameter it keeps this default unless the template is invoked with xsl:with-param. -->
        <xsl:param name="textFieldType" select="'text_ancient'"/>
        <!-- Lookup tables for translate(): mapping $uppercase onto $smallcase lower-cases A-Z. -->
        <xsl:variable name="uppercase" select="'ABCDEFGHIJKLMNOPQRSTUVWXYZ'"/>
        <xsl:variable name="smallcase" select="'abcdefghijklmnopqrstuvwxyz'"/>
10

    
11
        <!--
12
        D-Net index schema template
13

    
14
        CHANGELOG
15

    
16
        0.1 : first release
17
        0.2 : added preserveOriginal="1" for text field type in the index analyzer and catenateWords="1" for the query analyzer
18
        0.3 : changed language for SnowballPorterFilterFactory to language="German2" (index/query) in the text field type
19
        0.4 : added solr.ASCIIFoldingFilterFactory filter (index/query) in the text field type
20
        0.5 : added long_keyword field type, to be used for objIdentifiers
21
        0.6 : added field types for spellchecking
22
        0.7 : added parameter for text field type
23
        0.8 : added field _version_, needed by Solr 4.0.0 for the transaction log
24
        0.9   : added type: text_en_splitting
25
        0.91  : added type: ngramtext
26
        0.92  : added schema optimizations, removing unnecessary stored fields
27
        0.93  : added attribute preserveOriginal="1" to fieldtype ngramtext (query analysis) to improve matches
28
        0.94  : updated and simplified ngramtext fieldtype
29
        0.95  : update to solr 4.4, removed attribute "compress" from field definition, ngramfield doesn't support NGramFilterFactory anymore
30
        0.96  : update to solr 4.9
31
         -->
32
        <schema name="dnet" version="0.96">
33

    
34
            <types>
35

    
36
                <!-- The StrField type is not analyzed, but indexed/stored verbatim.
37
                   It supports doc values but in that case the field needs to be
38
                   single-valued and either required or have a default value.
39
                  -->
40
                <!-- Un-analyzed string values; sortMissingLast is switched on. -->
                <fieldType name="string"
                           class="solr.StrField"
                           sortMissingLast="true"/>

                <!-- Boolean values ("true" or "false"); sortMissingLast is switched on. -->
                <fieldType name="boolean"
                           class="solr.BoolField"
                           sortMissingLast="true"/>
44

    
45
                <!-- The optional sortMissingLast and sortMissingFirst attributes are
46
                     currently supported on types that are sorted internally as strings
47
                     and on numeric types.
48
                     This includes "string","boolean", and, as of 3.5 (and 4.x),
49
                     int, float, long, date, double, including the "Trie" variants.
50
                   - If sortMissingLast="true", then a sort on this field will cause documents
51
                     without the field to come after documents with the field,
52
                     regardless of the requested sort order (asc or desc).
53
                   - If sortMissingFirst="true", then a sort on this field will cause documents
54
                     without the field to come before documents with the field,
55
                     regardless of the requested sort order.
56
                   - If sortMissingLast="false" and sortMissingFirst="false" (the default),
57
                     then default lucene sorting will be used which places docs without the
58
                     field first in an ascending sort and last in a descending sort.
59
                -->
60

    
61
                <!--
62
                  Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
63

    
64
                  These fields support doc values, but they require the field to be
65
                  single-valued and either be required or have a default value.
66
                -->
67
                <!-- Default numeric types, all declared with precisionStep="0". -->
                <fieldType name="int"
                           class="solr.TrieIntField"
                           positionIncrementGap="0"
                           precisionStep="0"/>
                <fieldType name="float"
                           class="solr.TrieFloatField"
                           positionIncrementGap="0"
                           precisionStep="0"/>
                <fieldType name="long"
                           class="solr.TrieLongField"
                           positionIncrementGap="0"
                           precisionStep="0"/>
                <fieldType name="double"
                           class="solr.TrieDoubleField"
                           positionIncrementGap="0"
                           precisionStep="0"/>
71

    
72
                <!--
73
                 Numeric field types that index each value at various levels of precision
74
                 to accelerate range queries when the number of values between the range
75
                 endpoints is large. See the javadoc for NumericRangeQuery for internal
76
                 implementation details.
77

    
78
                 Smaller precisionStep values (specified in bits) will lead to more tokens
79
                 indexed per value, slightly larger index size, and faster range queries.
80
                 A precisionStep of 0 disables indexing at different precision levels.
81
                -->
82
                <!-- Range-query oriented numeric types, all declared with precisionStep="8". -->
                <fieldType name="tint"
                           class="solr.TrieIntField"
                           positionIncrementGap="0"
                           precisionStep="8"/>
                <fieldType name="tfloat"
                           class="solr.TrieFloatField"
                           positionIncrementGap="0"
                           precisionStep="8"/>
                <fieldType name="tlong"
                           class="solr.TrieLongField"
                           positionIncrementGap="0"
                           precisionStep="8"/>
                <fieldType name="tdouble"
                           class="solr.TrieDoubleField"
                           positionIncrementGap="0"
                           precisionStep="8"/>
86

    
87
                <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
88
                     is a more restricted form of the canonical representation of dateTime
89
                     http://www.w3.org/TR/xmlschema-2/#dateTime
90
                     The trailing "Z" designates UTC time and is mandatory.
91
                     Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
92
                     All other components are mandatory.
93

    
94
                     Expressions can also be used to denote calculations that should be
95
                     performed relative to "NOW" to determine the value, ie...
96

    
97
                           NOW/HOUR
98
                              ... Round to the start of the current hour
99
                           NOW-1DAY
100
                              ... Exactly 1 day prior to now
101
                           NOW/DAY+6MONTHS+3DAYS
102
                              ... 6 months and 3 days in the future from the start of
103
                                  the current day
104

    
105
                     Consult the DateField javadocs for more information.
106

    
107
                     Note: For faster range queries, consider the tdate type
108
                  -->
109
                <!-- Default date type (precisionStep="0"). -->
                <fieldType name="date"
                           class="solr.TrieDateField"
                           positionIncrementGap="0"
                           precisionStep="0"/>

                <!-- Same class with precisionStep="6": intended for faster date range queries
                     and date faceting. -->
                <fieldType name="tdate"
                           class="solr.TrieDateField"
                           positionIncrementGap="0"
                           precisionStep="6"/>
113

    
114
                <!--
115
                  Note:
116
                  These should only be used for compatibility with existing indexes (created with older Solr versions)
117
                  or if "sortMissingFirst" or "sortMissingLast" functionality is needed. Use Trie based fields instead.
118

    
119
                  Numeric field types that manipulate the value into
120
                  a string value that isn't human-readable in its internal form,
121
                  but with a lexicographic ordering the same as the numeric ordering,
122
                  so that range queries work correctly.
123
                -->
124
                <!-- Sortable numeric types; FIELDs declared with type "int" are mapped to sint
                     by the field generation loop of this stylesheet. -->
                <fieldType name="sint"
                           class="solr.SortableIntField"
                           omitNorms="true"
                           sortMissingLast="true"/>
                <fieldType name="slong"
                           class="solr.SortableLongField"
                           omitNorms="true"
                           sortMissingLast="true"/>
                <fieldType name="sfloat"
                           class="solr.SortableFloatField"
                           omitNorms="true"
                           sortMissingLast="true"/>
                <fieldType name="sdouble"
                           class="solr.SortableDoubleField"
                           omitNorms="true"
                           sortMissingLast="true"/>
128

    
129
                <!-- A text field that uses WordDelimiterFilter to enable splitting and matching of
130
                    words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
131
                    so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
132
                    Synonyms (query time only) and stopwords are customized by external files; no stemming filter is configured.
133
                    -->
134
                <fieldType name="text_common" class="solr.TextField" positionIncrementGap="100">
                    <!-- Index side: whitespace tokens, stopword removal, word-delimiter splitting that
                         keeps the original token and also emits catenated words/numbers, then
                         lower-casing and ASCII folding. -->
                    <analyzer type="index">
                        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                        <filter class="solr.StopFilterFactory"
                                words="stopwords.txt"
                                ignoreCase="true"
                                enablePositionIncrements="true"/>
                        <filter class="solr.WordDelimiterFilterFactory"
                                preserveOriginal="1"
                                generateWordParts="1"
                                generateNumberParts="1"
                                catenateWords="1"
                                catenateNumbers="1"
                                catenateAll="0"/>
                        <filter class="solr.LowerCaseFilterFactory"/>
                        <filter class="solr.ASCIIFoldingFilterFactory"/>
                    </analyzer>
                    <!-- Query side: same chain plus synonym expansion right after tokenizing; the
                         word-delimiter filter does not catenate here. -->
                    <analyzer type="query">
                        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                        <filter class="solr.SynonymFilterFactory"
                                synonyms="synonyms.txt"
                                ignoreCase="true"
                                expand="true"/>
                        <filter class="solr.StopFilterFactory"
                                words="stopwords.txt"
                                ignoreCase="true"
                                enablePositionIncrements="true"/>
                        <filter class="solr.WordDelimiterFilterFactory"
                                preserveOriginal="1"
                                generateWordParts="1"
                                generateNumberParts="1"
                                catenateWords="0"
                                catenateNumbers="0"
                                catenateAll="0"/>
                        <filter class="solr.LowerCaseFilterFactory"/>
                        <filter class="solr.ASCIIFoldingFilterFactory"/>
                    </analyzer>
                </fieldType>
160

    
161
                <!-- A text field with defaults appropriate for English, plus
162
                 aggressive word-splitting and autophrase features enabled.
163
                 This field is just like text_en, except it adds
164
                 WordDelimiterFilter to enable splitting and matching of
165
                 words on case-change, alpha numeric boundaries, and
166
                 non-alphanumeric chars.  This means certain compound word
167
                 cases will work, for example query "wi fi" will match
168
                 document "WiFi" or "wi-fi".
169
                    -->
170
                <fieldType name="text_en_splitting"
                           class="solr.TextField"
                           positionIncrementGap="100"
                           autoGeneratePhraseQueries="true">
                    <!-- Index side: whitespace tokens, English stopwords, word-delimiter splitting
                         (with catenation and split on case change), lower-casing, then Porter
                         stemming with protwords.txt configured as the protected-words file. -->
                    <analyzer type="index">
                        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                        <!-- Synonyms are applied at query time only; index-time expansion stays disabled:
                        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
                        -->
                        <!-- Stopword removal ignores case. -->
                        <filter class="solr.StopFilterFactory" words="stopwords_en.txt" ignoreCase="true"/>
                        <filter class="solr.WordDelimiterFilterFactory"
                                generateWordParts="1"
                                generateNumberParts="1"
                                catenateWords="1"
                                catenateNumbers="1"
                                catenateAll="0"
                                splitOnCaseChange="1"/>
                        <filter class="solr.LowerCaseFilterFactory"/>
                        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
                        <filter class="solr.PorterStemFilterFactory"/>
                    </analyzer>
                    <!-- Query side: adds synonym expansion and switches catenation off. -->
                    <analyzer type="query">
                        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                        <filter class="solr.SynonymFilterFactory"
                                synonyms="synonyms.txt"
                                ignoreCase="true"
                                expand="true"/>
                        <filter class="solr.StopFilterFactory" words="stopwords_en.txt" ignoreCase="true"/>
                        <filter class="solr.WordDelimiterFilterFactory"
                                generateWordParts="1"
                                generateNumberParts="1"
                                catenateWords="0"
                                catenateNumbers="0"
                                catenateAll="0"
                                splitOnCaseChange="1"/>
                        <filter class="solr.LowerCaseFilterFactory"/>
                        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
                        <filter class="solr.PorterStemFilterFactory"/>
                    </analyzer>
                </fieldType>
204

    
205
                <fieldType name="ngramtext" class="solr.TextField" omitNorms="true">
                    <!-- Index side: the whole value is one lower-cased token, expanded into n-grams
                         of 3 to 25 characters, trimmed and de-duplicated. -->
                    <analyzer type="index">
                        <tokenizer class="solr.KeywordTokenizerFactory"/>
                        <filter class="solr.LowerCaseFilterFactory"/>
                        <filter class="solr.NGramFilterFactory"
                                minGramSize="3"
                                maxGramSize="25"/>
                        <filter class="solr.TrimFilterFactory"/>
                        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
                    </analyzer>
                    <!-- Query side: the query string is kept as a single lower-cased token. -->
                    <analyzer type="query">
                        <tokenizer class="solr.KeywordTokenizerFactory"/>
                        <filter class="solr.LowerCaseFilterFactory"/>
                    </analyzer>
                </fieldType>
218

    
219
                <fieldType name="text_german" class="solr.TextField" positionIncrementGap="100">
                    <!-- Index side: stopwords, word-delimiter splitting (original token kept, words
                         and numbers catenated), lower-casing, German2 Snowball stemming, ASCII folding. -->
                    <analyzer type="index">
                        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                        <filter class="solr.StopFilterFactory"
                                words="stopwords.txt"
                                ignoreCase="true"
                                enablePositionIncrements="true"/>
                        <filter class="solr.WordDelimiterFilterFactory"
                                preserveOriginal="1"
                                generateWordParts="1"
                                generateNumberParts="1"
                                catenateWords="1"
                                catenateNumbers="1"
                                catenateAll="0"
                                splitOnCaseChange="1"/>
                        <filter class="solr.LowerCaseFilterFactory"/>
                        <filter class="solr.SnowballPorterFilterFactory"
                                language="German2"
                                protected="protwords.txt"/>
                        <filter class="solr.ASCIIFoldingFilterFactory"/>
                    </analyzer>
                    <!-- Query side: synonyms are expanded first; the word-delimiter filter emits
                         catenated words and number parts but no individual word parts. -->
                    <analyzer type="query">
                        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                        <filter class="solr.SynonymFilterFactory"
                                synonyms="synonyms.txt"
                                ignoreCase="true"
                                expand="true"/>
                        <filter class="solr.StopFilterFactory"
                                words="stopwords.txt"
                                ignoreCase="true"
                                enablePositionIncrements="true"/>
                        <filter class="solr.WordDelimiterFilterFactory"
                                generateWordParts="0"
                                generateNumberParts="1"
                                catenateWords="1"
                                catenateNumbers="0"
                                catenateAll="0"
                                splitOnCaseChange="1"/>
                        <filter class="solr.LowerCaseFilterFactory"/>
                        <filter class="solr.SnowballPorterFilterFactory"
                                language="German2"
                                protected="protwords.txt"/>
                        <filter class="solr.ASCIIFoldingFilterFactory"/>
                    </analyzer>
                </fieldType>
248

    
249
                <fieldType name="text_ancient" class="solr.TextField" positionIncrementGap="100">
                    <!-- Index side: whitespace tokens, ICU folding, then word-delimiter splitting that
                         keeps the original token and catenates words and numbers. The commented-out
                         elements are disabled alternatives kept for reference. -->
                    <analyzer type="index">
                        <!-- <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt"/> -->
                        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                        <filter class="solr.ICUFoldingFilterFactory"/>
                        <!-- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> -->
                        <filter class="solr.WordDelimiterFilterFactory"
                                preserveOriginal="1"
                                generateWordParts="1"
                                generateNumberParts="1"
                                catenateWords="1"
                                catenateNumbers="1"
                                catenateAll="0"/>
                        <!-- <filter class="solr.LowerCaseFilterFactory"/> -->
                        <!-- <filter class="solr.ASCIIFoldingFilterFactory"/> -->
                    </analyzer>
                    <!-- Query side: identical chain except that catenation is switched off. -->
                    <analyzer type="query">
                        <!-- <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-FoldToASCII.txt"/> -->
                        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                        <filter class="solr.ICUFoldingFilterFactory"/>
                        <!-- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> -->
                        <!-- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> -->
                        <filter class="solr.WordDelimiterFilterFactory"
                                preserveOriginal="1"
                                generateWordParts="1"
                                generateNumberParts="1"
                                catenateWords="0"
                                catenateNumbers="0"
                                catenateAll="0"/>
                        <!-- <filter class="solr.LowerCaseFilterFactory"/> -->
                        <!-- <filter class="solr.ASCIIFoldingFilterFactory"/> -->
                    </analyzer>
                </fieldType>
272

    
273
                <!-- A general unstemmed text field that indexes tokens normally and also
274
                     reversed (via ReversedWildcardFilterFactory), to enable more efficient
275
                 leading wildcard queries. -->
276
                <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
                    <!-- Index side: stopwords, word-delimiter splitting with catenation (no split on
                         case change), lower-casing, then ReversedWildcardFilterFactory with
                         withOriginal="true". -->
                    <analyzer type="index">
                        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                        <filter class="solr.StopFilterFactory"
                                words="stopwords.txt"
                                ignoreCase="true"
                                enablePositionIncrements="true"/>
                        <filter class="solr.WordDelimiterFilterFactory"
                                generateWordParts="1"
                                generateNumberParts="1"
                                catenateWords="1"
                                catenateNumbers="1"
                                catenateAll="0"
                                splitOnCaseChange="0"/>
                        <filter class="solr.LowerCaseFilterFactory"/>
                        <filter class="solr.ReversedWildcardFilterFactory"
                                withOriginal="true"
                                maxPosAsterisk="3"
                                maxPosQuestion="2"
                                maxFractionAsterisk="0.33"/>
                    </analyzer>
                    <!-- Query side: synonym expansion, stopwords, word-delimiter splitting without
                         catenation, lower-casing; no reversal filter is configured here. -->
                    <analyzer type="query">
                        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                        <filter class="solr.SynonymFilterFactory"
                                synonyms="synonyms.txt"
                                ignoreCase="true"
                                expand="true"/>
                        <filter class="solr.StopFilterFactory"
                                words="stopwords.txt"
                                ignoreCase="true"
                                enablePositionIncrements="true"/>
                        <filter class="solr.WordDelimiterFilterFactory"
                                generateWordParts="1"
                                generateNumberParts="1"
                                catenateWords="0"
                                catenateNumbers="0"
                                catenateAll="0"
                                splitOnCaseChange="0"/>
                        <filter class="solr.LowerCaseFilterFactory"/>
                    </analyzer>
                </fieldType>
304

    
305
                <!-- Type of the __spell field. -->
                <fieldType name="spelltext" class="solr.TextField" positionIncrementGap="100">
                    <!-- Index side: standard tokenizer, stopword removal, standard filter and
                         duplicate removal; no lower-casing filter is configured. -->
                    <analyzer type="index">
                        <tokenizer class="solr.StandardTokenizerFactory"/>
                        <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
                        <filter class="solr.StandardFilterFactory"/>
                        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
                    </analyzer>
                    <!-- Query side: same chain with synonym expansion inserted after the tokenizer. -->
                    <analyzer type="query">
                        <tokenizer class="solr.StandardTokenizerFactory"/>
                        <filter class="solr.SynonymFilterFactory"
                                synonyms="synonyms.txt"
                                ignoreCase="true"
                                expand="true"/>
                        <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
                        <filter class="solr.StandardFilterFactory"/>
                        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
                    </analyzer>
                </fieldType>
321

    
322
                <!-- used for objIdentifiers -->
323
                <!-- A TextField whose single analyzer (index and query alike) uses only the keyword
                     tokenizer, with no further filters. -->
                <fieldType name="long_keyword" class="solr.TextField">
                    <analyzer>
                        <tokenizer class="solr.KeywordTokenizerFactory"/>
                    </analyzer>
                </fieldType>
328

    
329
            </types>
330

    
331
            <fields>
332

    
333
                <!-- Emit one Solr <field> declaration per FIELD child of the context element. -->
                <xsl:for-each select="FIELD">
                    <!-- Solr type selection, first matching rule wins:
                           1. @type "int" is mapped to sint;
                           2. date, ngramtext, long_keyword and text_ancient pass through unchanged;
                           3. @tokenizable="false" yields string;
                           4. anything else gets $textFieldType. -->
                    <xsl:variable name="fieldtype">
                        <xsl:choose>
                            <xsl:when test="@type = 'int'">sint</xsl:when>
                            <xsl:when test="@type = 'date' or @type = 'ngramtext' or @type = 'long_keyword' or @type = 'text_ancient'">
                                <xsl:value-of select="@type"/>
                            </xsl:when>
                            <xsl:when test="@tokenizable = 'false'">string</xsl:when>
                            <xsl:otherwise>
                                <xsl:value-of select="$textFieldType"/>
                            </xsl:otherwise>
                        </xsl:choose>
                    </xsl:variable>

                    <!-- name:        @name with A-Z lower-cased.
                         indexed:     @indexable copied as is (empty when the attribute is absent).
                         stored:      "true" only when @stored is exactly 'true', otherwise "false".
                         multiValued: "false" only when @multivalued is exactly 'false', otherwise "true". -->
                    <field name="{translate(@name, $uppercase, $smallcase)}"
                           type="{$fieldtype}"
                           indexed="{@indexable}"
                           stored="{@stored = 'true'}"
                           multiValued="{not(@multivalued = 'false')}"/>
                </xsl:for-each>
364

    
365
                <!-- Built-in fields, emitted regardless of the input FIELD list. -->

                <!-- Record identifier; referenced by the <uniqueKey> element of this schema. -->
                <field name="__indexrecordidentifier" type="string"
                       indexed="true" stored="true" multiValued="false" required="true"/>

                <field name="__deleted" type="boolean"
                       indexed="true" stored="false" default="false"
                       omitNorms="true" omitTermFreqAndPositions="true"/>

                <field name="__dsid" type="string"
                       indexed="true" stored="true"
                       omitNorms="true" omitTermFreqAndPositions="true"/>

                <field name="__dsversion" type="tdate"
                       indexed="true" stored="true"
                       omitNorms="true" omitTermFreqAndPositions="true"/>

                <!-- Stored-only fields (not indexed themselves). -->
                <field name="__result" type="{$textFieldType}" indexed="false" stored="true"/>

                <field name="__fulltext" type="{$textFieldType}" indexed="false" stored="true" default=""/>

                <!-- Catch-all search field: target of the copyField rules and of <defaultSearchField>. -->
                <field name="__all" type="{$textFieldType}"
                       indexed="true" stored="false" multiValued="true"/>

                <!-- Target of the copyField rules generated for FIELDs with spellcheck="true". -->
                <field name="__spell" type="spelltext"
                       indexed="true" stored="false"
                       omitNorms="true" omitTermFreqAndPositions="true"/>

                <field name="cql.serverchoice" type="{$textFieldType}"
                       indexed="true" stored="false" multiValued="true"
                       omitNorms="true" omitTermFreqAndPositions="true"/>

                <!-- Needed by Solr 4.0.0 for the transaction log (see changelog entry 0.8). -->
                <field name="_version_" type="long"
                       indexed="true" stored="true" multiValued="false"/>

                <!-- Catch-all text field indexed both normally and reversed, for efficient
                     leading wildcard queries. -->
                <field name="text_rev" type="text_rev"
                       indexed="true" stored="false" multiValued="true"/>

                <!-- Field for ping. -->
                <field name="text" type="text_common" indexed="false" stored="false"/>
397

    
398

    
399
            </fields>
400

    
401
            <!-- Name of the field that determines and enforces document uniqueness. The field is
                 required unless it is explicitly declared with required="false". -->
            <uniqueKey>__indexrecordidentifier</uniqueKey>

            <!-- Field the QueryParser falls back to when a query clause names no field. -->
            <defaultSearchField>__all</defaultSearchField>

            <!-- SolrQueryParser configuration; defaultOperator may be "AND" or "OR". -->
            <solrQueryParser defaultOperator="AND"/>
411

    
412
            <!-- Copy into __all every FIELD that is not marked tokenizable="false" and whose @type
                 is ngramtext, text_ancient or absent. The source name is lower-cased the same way
                 as the generated <field> names. -->
            <xsl:for-each select="FIELD[not(@tokenizable = 'false')][not(@type) or @type = 'ngramtext' or @type = 'text_ancient']">
                <copyField source="{translate(@name, $uppercase, $smallcase)}" dest="__all"/>
            </xsl:for-each>
416

    
417
            <!-- Copy into __spell every FIELD flagged with spellcheck="true"; the source name is
                 the lower-cased @name. -->
            <xsl:for-each select="FIELD[@spellcheck = 'true']">
                <copyField source="{translate(@name, $uppercase, $smallcase)}" dest="__spell"/>
            </xsl:for-each>
421

    
422
            <!-- Fixed copy rules: both built-in fields also feed the catch-all __all field. -->
            <copyField dest="__all" source="cql.serverchoice"/>
            <copyField dest="__all" source="__fulltext"/>
424

    
425

    
426
            <!-- Similarity is the scoring routine for each document vs. a query.
427
                 A custom similarity may be specified here, but the default is fine
428
                 for most applications.  -->
429
            <!-- <similarity class="org.apache.lucene.search.DefaultSimilarity"/> -->
430
            <!-- ... OR ...
431
                 Specify a SimilarityFactory class name implementation
432
                 allowing parameters to be used.
433
            -->
434
            <!--
435
            <similarity class="com.example.solr.CustomSimilarityFactory">
436
              <str name="paramkey">param value</str>
437
            </similarity>
438
            -->
439

    
440

    
441
        </schema>
442
    </xsl:template>
443
</xsl:stylesheet>
    (1-1/1)