xref: /dokuwiki/inc/indexer.php (revision bc07ee8434d3d60eac5095a7d146d1a5d7f151ad)
1<?php
2/**
3 * Common DokuWiki functions
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_CONF.'dokuwiki.php');
11  require_once(DOKU_INC.'inc/io.php');
12  require_once(DOKU_INC.'inc/utf8.php');
13  require_once(DOKU_INC.'inc/parserutils.php');
14
/**
 * Build a word => occurrence-count list for a wiki page.
 *
 * The raw page text is fetched with rawWiki(), passed through
 * utf8_stripspecials() and utf8_strtolower(), trimmed and split on single
 * spaces. Empty tokens are skipped. Non-numeric tokens whose strlen() is
 * below 3 are ignored; numeric tokens are kept regardless of length.
 *
 * based upon class.search_indexer_phpcms.php::index_entry
 *
 * @param  string $id  page id, handed unchanged to rawWiki()
 * @return array       word => number of occurrences, keys in the order
 *                     produced by sort() on the token list (PHP stores
 *                     integer-like words such as "42" as integer keys)
 */
function idx_getPageWords($id){
    $body  = rawWiki($id);
    $body  = utf8_stripspecials($body,' ','._\-:');
    $body  = utf8_strtolower($body);
    $body  = trim($body);
    $words = explode(' ',$body);
    sort($words);

    $index = array(); //resulting index

    //compact wordlist FIXME check for stopwords

    foreach($words as $word){
        // consecutive spaces in the body yield empty tokens
        if(strlen($word) == 0) continue;

        // checking minimum word-size (excepting numbers)
        if(!is_numeric($word) && strlen($word) < 3){  #FIXME add config option for max wordsize
            continue;
        }

        //FIXME add stopword check

        // Count per exact word. This deliberately does not rely on equal
        // words being adjacent after sort() nor on a loose == comparison
        // with the previous word: numeric strings such as "1000" and "1e3"
        // compare equal with == although they are different words, and the
        // default sort order is not guaranteed to group duplicates when
        // numeric and non-numeric tokens are mixed.
        if(isset($index[$word])){
            $index[$word]++;
        }else{
            $index[$word] = 1;
        }
    }

    return $index;
}
67
68
69
70//Setup VIM: ex: et ts=4 enc=utf-8 :
71