1<?xml version="1.0" encoding="UTF-8" ?>
<!--
A minimal schema.xml for indexing DokuWiki pages.
This file is a heavily culled version of the example schema.xml that ships
with the Solr distribution.
-->
7<schema name="dokuwiki" version="1.4">
8
9  <types>
10    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
11    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
12
13    <!-- A Trie based date field for faster date range queries and date faceting. -->
14    <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
15
16    <!-- A text field that only splits on whitespace for exact matching of words -->
17    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
18      <analyzer>
19        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
20      </analyzer>
21    </fieldType>
22
23    <!-- A general text field that has reasonable, generic
24         cross-language defaults: it tokenizes with StandardTokenizer,
25	 removes stop words from case-insensitive "stopwords.txt"
26	 (empty by default), and down cases.  At query time only, it
27	 also applies synonyms. -->
28    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
29      <analyzer type="index">
30        <charFilter class="solr.HTMLStripCharFilterFactory"/>
31        <tokenizer class="solr.StandardTokenizerFactory"/>
32        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
33        <!-- in this example, we will only use synonyms at query time
34        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
35        -->
36        <filter class="solr.LowerCaseFilterFactory"/>
37      </analyzer>
38      <analyzer type="query">
39        <tokenizer class="solr.StandardTokenizerFactory"/>
40        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
41        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
42        <filter class="solr.LowerCaseFilterFactory"/>
43      </analyzer>
44    </fieldType>
45
46    <!-- A text field with defaults appropriate for English: it
47         tokenizes with StandardTokenizer, removes English stop words
48         (stopwords_en.txt), down cases, protects words from protwords.txt, and
49         finally applies Porter's stemming.  The query time analyzer
50         also applies synonyms from synonyms.txt. -->
51    <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
52      <analyzer type="index">
53        <charFilter class="solr.HTMLStripCharFilterFactory"/>
54        <tokenizer class="solr.StandardTokenizerFactory"/>
55        <!-- in this example, we will only use synonyms at query time
56        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
57        -->
58        <!-- Case insensitive stop word removal.
59          add enablePositionIncrements=true in both the index and query
60          analyzers to leave a 'gap' for more accurate phrase queries.
61        -->
62        <filter class="solr.StopFilterFactory"
63                ignoreCase="true"
64                words="stopwords_en.txt"
65                enablePositionIncrements="true"
66                />
67        <filter class="solr.LowerCaseFilterFactory"/>
68	<filter class="solr.EnglishPossessiveFilterFactory"/>
69        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
70	<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
71        <filter class="solr.EnglishMinimalStemFilterFactory"/>
72	-->
73        <filter class="solr.PorterStemFilterFactory"/>
74      </analyzer>
75      <analyzer type="query">
76        <tokenizer class="solr.StandardTokenizerFactory"/>
77        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
78        <filter class="solr.StopFilterFactory"
79                ignoreCase="true"
80                words="stopwords_en.txt"
81                enablePositionIncrements="true"
82                />
83        <filter class="solr.LowerCaseFilterFactory"/>
84	<filter class="solr.EnglishPossessiveFilterFactory"/>
85        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
86	<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
87        <filter class="solr.EnglishMinimalStemFilterFactory"/>
88	-->
89        <filter class="solr.PorterStemFilterFactory"/>
90      </analyzer>
91    </fieldType>
92
93    <!-- A text field with defaults appropriate for English, plus
94	 aggressive word-splitting and autophrase features enabled.
95	 This field is just like text_en, except it adds
96	 WordDelimiterFilter to enable splitting and matching of
97	 words on case-change, alpha numeric boundaries, and
98	 non-alphanumeric chars.  This means certain compound word
99	 cases will work, for example query "wi fi" will match
100	 document "WiFi" or "wi-fi".  However, other cases will still
101	 not match, for example if the query is "wifi" and the
102	 document is "wi fi" or if the query is "wi-fi" and the
103	 document is "wifi".
104        -->
105    <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
106      <analyzer type="index">
107        <charFilter class="solr.HTMLStripCharFilterFactory"/>
108        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
109        <!-- in this example, we will only use synonyms at query time
110        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
111        -->
112        <!-- Case insensitive stop word removal.
113          add enablePositionIncrements=true in both the index and query
114          analyzers to leave a 'gap' for more accurate phrase queries.
115        -->
116        <filter class="solr.StopFilterFactory"
117                ignoreCase="true"
118                words="stopwords_en.txt"
119                enablePositionIncrements="true"
120                />
121        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
122        <filter class="solr.LowerCaseFilterFactory"/>
123        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
124        <filter class="solr.PorterStemFilterFactory"/>
125      </analyzer>
126      <analyzer type="query">
127        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
128        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
129        <filter class="solr.StopFilterFactory"
130                ignoreCase="true"
131                words="stopwords_en.txt"
132                enablePositionIncrements="true"
133                />
134        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
135        <filter class="solr.LowerCaseFilterFactory"/>
136        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
137        <filter class="solr.PorterStemFilterFactory"/>
138      </analyzer>
139    </fieldType>
140
141    <!-- Less flexible matching, but less false matches.  Probably not ideal for product names,
142         but may be good for SKUs.  Can insert dashes in the wrong place and still match. -->
143    <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
144      <analyzer>
145        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
146        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
147        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_en.txt"/>
148        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
149        <filter class="solr.LowerCaseFilterFactory"/>
150        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
151        <filter class="solr.EnglishMinimalStemFilterFactory"/>
152        <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
153             possible with WordDelimiterFilter in conjuncton with stemming. -->
154        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
155      </analyzer>
156    </fieldType>
157
158    <!-- Just like text_general except it reverses the characters of
159	 each token, to enable more efficient leading wildcard queries. -->
160    <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
161      <analyzer type="index">
162        <charFilter class="solr.HTMLStripCharFilterFactory"/>
163        <tokenizer class="solr.StandardTokenizerFactory"/>
164        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
165        <filter class="solr.LowerCaseFilterFactory"/>
166        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
167           maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
168      </analyzer>
169      <analyzer type="query">
170        <tokenizer class="solr.StandardTokenizerFactory"/>
171        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
172        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
173        <filter class="solr.LowerCaseFilterFactory"/>
174      </analyzer>
175    </fieldType>
176
177    <!-- lowercases the entire field value, keeping it as a single token.  -->
178    <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
179      <analyzer>
180        <tokenizer class="solr.KeywordTokenizerFactory"/>
181        <filter class="solr.LowerCaseFilterFactory" />
182      </analyzer>
183    </fieldType>
184
185    <!-- Dokuwiki page id, hierarchical, with name spaces -->
186    <fieldType name="page_id" class="solr.TextField" positionIncrementGap="100">
187      <analyzer>
188        <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter=":" replace="/"/>
189      </analyzer>
190    </fieldType>
191
192    <!-- since fields of this type are by default not stored or indexed,
193         any data added to them will be ignored outright.  -->
194    <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
195
196 </types>
197
198 <fields>
199   <field name="id" type="string" indexed="true" stored="true" required="true" />
200   <field name="idpath" type="page_id" indexed="true" stored="true" required="true" />
201   <field name="title" type="text_general" indexed="true" stored="true"/>
202   <field name="content" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true"/>
203   <field name="abstract" type="text_general" indexed="true" stored="true"/>
204   <field name="created" type="tdate" indexed="true" stored="true"/>
205   <field name="modified" type="tdate" indexed="true" stored="true"/>
206   <field name="creator" type="lowercase" indexed="true" stored="true"/>
207   <field name="contributor" type="lowercase" indexed="true" stored="true" multiValued="true"/>
208   <field name="keywords" type="text_ws" indexed="true" stored="true"/>
209   <field name="references" type="text_general" indexed="true" stored="true" multiValued="true"/>
210
211
212   <!-- catchall field, containing all other searchable text fields (implemented
213        via copyField further on in this schema  -->
214   <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
215
216 </fields>
217
218 <!-- Field to use to determine and enforce document uniqueness.
219      Unless this field is marked with required="false", it will be a required field
220   -->
221 <uniqueKey>id</uniqueKey>
222
223 <!-- field for the QueryParser to use when an explicit fieldname is absent -->
224 <defaultSearchField>text</defaultSearchField>
225
226 <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
227 <solrQueryParser defaultOperator="OR"/>
228
229  <!-- copyField commands copy one field to another at the time a document
230        is added to the index.  It's used either to index the same field differently,
231        or to add multiple fields to the same field for easier/faster searching.  -->
232
233   <copyField source="title" dest="text"/>
234   <copyField source="content" dest="text"/>
235   <copyField source="id" dest="text"/>
236   <copyField source="creator" dest="text"/>
237   <copyField source="references" dest="text"/>
238   <copyField source="id" dest="idpath"/>
239
240</schema>
241