<?xml version="1.0" encoding="UTF-8"?>
<!--
A minimal schema.xml file for dokuwiki pages.
This file is a heavily culled version of the example schema.xml from the Solr
distribution.
-->
<schema name="dokuwiki" version="1.4">

  <types>
    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>

    <!-- A Trie based date field for faster date range queries and date faceting. -->
    <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>

    <!-- A text field that only splits on whitespace for exact matching of words. -->
    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      </analyzer>
    </fieldType>

    <!-- A general text field that has reasonable, generic
         cross-language defaults: it tokenizes with StandardTokenizer,
         removes stop words from case-insensitive "stopwords.txt"
         (empty by default), and down cases. At query time only, it
         also applies synonyms.
         The index analyzer additionally strips HTML markup before
         tokenizing (HTMLStripCharFilterFactory); the query analyzer does not.
    -->
    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <charFilter class="solr.HTMLStripCharFilterFactory"/>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"/>
        <!-- in this example, we will only use synonyms at query time
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- A text field with defaults appropriate for English: it
         tokenizes with StandardTokenizer, removes English stop words
         (stopwords_en.txt), down cases, protects words from protwords.txt, and
         finally applies Porter's stemming. The query time analyzer
         also applies synonyms from synonyms.txt.
         NOTE(review): no field in this schema currently uses this type. -->
    <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <charFilter class="solr.HTMLStripCharFilterFactory"/>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <!-- in this example, we will only use synonyms at query time
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <!-- Case insensitive stop word removal.
             add enablePositionIncrements=true in both the index and query
             analyzers to leave a 'gap' for more accurate phrase queries.
        -->
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords_en.txt"
                enablePositionIncrements="true"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPossessiveFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
        <filter class="solr.EnglishMinimalStemFilterFactory"/>
        -->
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords_en.txt"
                enablePositionIncrements="true"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPossessiveFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
        <filter class="solr.EnglishMinimalStemFilterFactory"/>
        -->
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- A text field with defaults appropriate for English, plus
         aggressive word-splitting and autophrase features enabled.
         This field is just like text_en, except it adds
         WordDelimiterFilter to enable splitting and matching of
         words on case-change, alpha numeric boundaries, and
         non-alphanumeric chars. This means certain compound word
         cases will work, for example query "wi fi" will match
         document "WiFi" or "wi-fi". However, other cases will still
         not match, for example if the query is "wifi" and the
         document is "wi fi" or if the query is "wi-fi" and the
         document is "wifi".
         NOTE(review): no field in this schema currently uses this type.
    -->
    <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
      <analyzer type="index">
        <charFilter class="solr.HTMLStripCharFilterFactory"/>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <!-- in this example, we will only use synonyms at query time
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <!-- Case insensitive stop word removal.
             add enablePositionIncrements=true in both the index and query
             analyzers to leave a 'gap' for more accurate phrase queries.
        -->
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords_en.txt"
                enablePositionIncrements="true"/>
        <!-- Index side catenates word and number parts (catenateWords/catenateNumbers = 1). -->
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1"
                generateNumberParts="1"
                catenateWords="1"
                catenateNumbers="1"
                catenateAll="0"
                splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords_en.txt"
                enablePositionIncrements="true"/>
        <!-- Query side does not catenate (catenateWords/catenateNumbers = 0). -->
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1"
                generateNumberParts="1"
                catenateWords="0"
                catenateNumbers="0"
                catenateAll="0"
                splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- Less flexible matching, but fewer false matches. Probably not ideal for product names,
         but may be good for SKUs. Can insert dashes in the wrong place and still match.
         NOTE(review): no field in this schema currently uses this type. -->
    <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_en.txt"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="0"
                generateNumberParts="0"
                catenateWords="1"
                catenateNumbers="1"
                catenateAll="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.EnglishMinimalStemFilterFactory"/>
        <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
             possible with WordDelimiterFilter in conjunction with stemming. -->
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- Just like text_general except it reverses the characters of
         each token, to enable more efficient leading wildcard queries.
         NOTE(review): no field in this schema currently uses this type. -->
    <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <charFilter class="solr.HTMLStripCharFilterFactory"/>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ReversedWildcardFilterFactory"
                withOriginal="true"
                maxPosAsterisk="3"
                maxPosQuestion="2"
                maxFractionAsterisk="0.33"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- lowercases the entire field value, keeping it as a single token. -->
    <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- Dokuwiki page id, hierarchical, with name spaces.
         The tokenizer splits on ":" and is configured to replace it with "/"
         in the emitted tokens. Only one analyzer is declared, so the same
         hierarchical tokenization applies at both index and query time.
         NOTE(review): confirm that query-time hierarchy expansion is the
         intended matching behaviour for idpath searches. -->
    <fieldType name="page_id" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter=":" replace="/"/>
      </analyzer>
    </fieldType>

    <!-- since fields of this type are by default not stored or indexed,
         any data added to them will be ignored outright.
         NOTE(review): no field in this schema currently uses this type. -->
    <fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField"/>

  </types>

  <fields>
    <!-- Document key (see uniqueKey below); stored verbatim as a string. -->
    <field name="id" type="string" indexed="true" stored="true" required="true"/>
    <!-- Hierarchical form of the id; also receives a copy of "id" via copyField below. -->
    <field name="idpath" type="page_id" indexed="true" stored="true" required="true"/>
    <field name="title" type="text_general" indexed="true" stored="true"/>
    <!-- Term vectors (with positions and offsets) are kept for this field. -->
    <field name="content" type="text_general" indexed="true" stored="true"
           termVectors="true" termPositions="true" termOffsets="true"/>
    <field name="abstract" type="text_general" indexed="true" stored="true"/>
    <field name="created" type="tdate" indexed="true" stored="true"/>
    <field name="modified" type="tdate" indexed="true" stored="true"/>
    <field name="creator" type="lowercase" indexed="true" stored="true"/>
    <field name="contributor" type="lowercase" indexed="true" stored="true" multiValued="true"/>
    <field name="keywords" type="text_ws" indexed="true" stored="true"/>
    <field name="references" type="text_general" indexed="true" stored="true" multiValued="true"/>

    <!-- catchall field, containing the searchable text fields listed in the
         copyField section further on in this schema. Indexed only (not
         stored); multiValued because several source fields are copied in. -->
    <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>

  </fields>

  <!-- Field to use to determine and enforce document uniqueness.
       Unless this field is marked with required="false", it will be a required field
  -->
  <uniqueKey>id</uniqueKey>

  <!-- field for the QueryParser to use when an explicit fieldname is absent -->
  <defaultSearchField>text</defaultSearchField>

  <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
  <solrQueryParser defaultOperator="OR"/>

  <!-- copyField commands copy one field to another at the time a document
       is added to the index. It's used either to index the same field differently,
       or to add multiple fields to the same field for easier/faster searching. -->

  <!-- Sources aggregated into the catchall "text" field. -->
  <copyField source="title" dest="text"/>
  <copyField source="content" dest="text"/>
  <copyField source="id" dest="text"/>
  <copyField source="creator" dest="text"/>
  <copyField source="references" dest="text"/>
  <!-- Index the page id a second time with hierarchical (namespace) tokenization. -->
  <copyField source="id" dest="idpath"/>

</schema>