xref: /dokuwiki/inc/indexer.php (revision b4ce25e9a449e7a6a78476bf94bca31cbc4259ce)
1*b4ce25e9SAndreas Gohr<?php
2*b4ce25e9SAndreas Gohr/**
 * Functions for indexing the words of wiki pages
4*b4ce25e9SAndreas Gohr *
5*b4ce25e9SAndreas Gohr * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6*b4ce25e9SAndreas Gohr * @author     Andreas Gohr <andi@splitbrain.org>
7*b4ce25e9SAndreas Gohr */
8*b4ce25e9SAndreas Gohr
// Use the parent of this file's directory (with a trailing slash) as the base
// path when the including script has not defined DOKU_INC itself.
if (!defined('DOKU_INC')) {
    define('DOKU_INC', realpath(dirname(__FILE__) . '/../') . '/');
}

// NOTE(review): DOKU_CONF is not defined anywhere in this file - presumably
// the including script sets it up before this point; confirm.
require_once DOKU_CONF . 'dokuwiki.php';

// Helper libraries loaded relative to the base path.
require_once DOKU_INC . 'inc/io.php';
require_once DOKU_INC . 'inc/utf8.php';
require_once DOKU_INC . 'inc/parserutils.php';
14*b4ce25e9SAndreas Gohr
/**
 * Builds a word => occurrence-count list for a wiki page
 *
 * The page text is fetched with rawWiki(), passed through
 * utf8_stripspecials() and utf8_strtolower(), trimmed and split on single
 * spaces. Empty fragments are ignored and non-numeric words shorter than
 * three bytes are dropped; numeric words are kept regardless of length.
 *
 * based upon class.search_indexer_phpcms.php::index_entry
 *
 * @param  string $id  page id, handed to rawWiki() unchanged
 * @return array  word => number of occurrences, in bytewise word order
 */
function idx_getPageWords($id){
    $body  = rawWiki($id);
    // presumably replaces special characters by spaces, with '._\-:' as
    // additional characters to treat - see utf8_stripspecials() for details
    $body  = utf8_stripspecials($body,' ','._\-:');
    $body  = utf8_strtolower($body);
    $body  = trim($body);
    $words = explode(' ',$body);

    // Sort bytewise. The default SORT_REGULAR compares numeric strings by
    // value and everything else as strings, which gives an inconsistent
    // order for mixed input (e.g. "9" < "1e1" < "1fx" < "9").
    sort($words, SORT_STRING);

    $index = array(); //resulting index

    //compact wordlist FIXME check for stopwords

    foreach($words as $word){
        if($word === '') continue;

        // Seen before: just increase the counter. Counting by array key
        // keeps words like "7" and "007" apart (a loose == would not) and
        // does not depend on equal words being adjacent.
        if(isset($index[$word])){
            $index[$word]++;
            continue;
        }

        // checking minimum word-size (excepting numbers)
        // strlen() counts bytes, so a multibyte character counts more than once
        // FIXME add config option for min wordsize
        if(!is_numeric($word) && strlen($word) < 3) continue;

        //FIXME add stopword check

        // add to index
        $index[$word] = 1;
    }

    return $index;
}
67*b4ce25e9SAndreas Gohr
68*b4ce25e9SAndreas Gohr
69*b4ce25e9SAndreas Gohr
70*b4ce25e9SAndreas Gohr//Setup VIM: ex: et ts=4 enc=utf-8 :
71