xref: /plugin/sphinxsearch-was/xmlall.php (revision 4:c8a70a2936eb)
1<?php
2/**
3 * XML feed export
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9
10/* Initialization */
11
12if(!defined('DOKU_INC')) define('DOKU_INC',dirname(__FILE__).'/../../../');
13if(!defined('DOKU_PLUGIN')) define('DOKU_PLUGIN',DOKU_INC.'lib/plugins/');
14
15require_once(DOKU_INC.'inc/init.php');
16require_once(DOKU_INC.'inc/common.php');
17require_once(DOKU_INC.'inc/events.php');
18require_once(DOKU_INC.'inc/parserutils.php');
19require_once(DOKU_INC.'inc/feedcreator.class.php');
20require_once(DOKU_INC.'inc/auth.php');
21require_once(DOKU_INC.'inc/pageutils.php');
22require_once(DOKU_INC.'inc/search.php');
23require_once(DOKU_INC.'inc/parser/parser.php');
24
25
26require_once(DOKU_PLUGIN.'sphinxsearch/PageMapper.php');
27
// Make sure the plugin's working directory exists below the DokuWiki
// save directory before any index data is produced.
$sphinxSearchDir = DOKU_INC.$conf['savedir']."/sphinxsearch/";
if (file_exists($sphinxSearchDir) === false) {
    mkdir($sphinxSearchDir);
}
31
// ---------------------------------------------------------------------------
// Main export: print a Sphinx xmlpipe2 document set to standard output.
//
// Every wiki page is exported either as one document per heading (when the
// page metadata contains a table of contents) or as one document for the
// whole page. Each exported document is also registered with the PageMapper.
// ---------------------------------------------------------------------------

$pagesList = getPagesList();

echo '<?xml version="1.0" encoding="utf-8"?>
<sphinx:docset>

<sphinx:schema>
<sphinx:field name="title"/>
<sphinx:field name="body"/>
<sphinx:field name="categories"/>
<sphinx:field name="level"/>
<sphinx:field name="modified"/>
<sphinx:field name="creator"/>
<sphinx:attr name="level" type="int" bits="8" default="1"/>
</sphinx:schema>
';

$pageMapper = new PageMapper();

foreach ($pagesList as $row) {
    $dokuPageId = $row['id'];

    // get meta data
    $metadata = p_get_metadata($dokuPageId);

    // Values shared by every document generated for this page. The isset()
    // guards keep PHP notices for pages with incomplete metadata out of the
    // XML stream; a missing value stays null, exactly as before.
    $modified   = isset($metadata['date']['modified']) ? $metadata['date']['modified'] : null;
    $creator    = isset($metadata['creator']) ? $metadata['creator'] : null;
    $pageTitle  = isset($metadata['title']) ? $metadata['title'] : null;
    // Depends only on the page id, so compute it once per page.
    $categories = getCategories($dokuPageId);

    $sections = getDocumentsByHeadings($dokuPageId, $metadata);
    if (!empty($sections)) {
        // One document per heading of the page.
        foreach ($sections as $hid => $section) {
            // NOTE(review): crc32() may return a negative number on 32-bit PHP
            // builds; confirm the indexer and PageMapper agree on the id format.
            $data = array(
                'id'         => crc32($dokuPageId.$hid),
                'categories' => $categories . '#' . $hid,
                'level'      => $section['level'],
                'modified'   => $modified,
                'creator'    => $creator,
                'title'      => strip_tags($section['title']),
                'body'       => strip_tags(p_render('xhtml', p_get_instructions($section['section']), $info)),
            );

            echo formatXml($data)."\n";
            $pageMapper->add($dokuPageId, $section['title'], $hid);
        }
    } else {
        // No table of contents: export the page as a single level-1 document.
        // NOTE(review): the modification date is passed as the second argument
        // of p_wiki_xhtml(); presumably that parameter is a revision - confirm
        // this renders the current page text.
        $data = array(
            'id'         => crc32($dokuPageId),
            'categories' => $categories,
            'level'      => 1,
            'modified'   => $modified,
            'creator'    => $creator,
            'title'      => strip_tags((string) $pageTitle),
            'body'       => strip_tags(p_wiki_xhtml($dokuPageId, $modified, false)),
        );

        echo formatXml($data)."\n";
        $pageMapper->add($dokuPageId, $pageTitle);
    }
}

echo '</sphinx:docset>';
87
88
89
/**
 * Render one document as a Sphinx xmlpipe2 <sphinx:document> element.
 *
 * title, body and categories are wrapped in CDATA sections; a "]]>" sequence
 * inside such a value is split across two CDATA sections so it cannot end the
 * section early. id, level, modified and creator are emitted as escaped XML
 * text. Missing keys are rendered as empty strings.
 *
 * @param array $data document fields: id, title, body, categories, level,
 *                    modified, creator
 * @return string the XML fragment, surrounded by blank lines
 */
function formatXml($data)
{
    $xmlFormat = '
<sphinx:document id="{id}">
<title><![CDATA[{title}]]></title>
<body><![CDATA[{body}]]></body>
<categories><![CDATA[{categories}]]></categories>
<level>{level}</level>
<modified>{modified}</modified>
<creator>{creator}</creator>
</sphinx:document>

';

    $replacements = array();

    // Fields placed inside CDATA sections.
    foreach (array('title', 'body', 'categories') as $field) {
        $value = isset($data[$field]) ? (string) $data[$field] : '';
        $replacements['{'.$field.'}'] = str_replace(']]>', ']]]]><![CDATA[>', $value);
    }

    // Fields placed directly into element or attribute content.
    $special = array('&', '<', '>', '"');
    $escaped = array('&amp;', '&lt;', '&gt;', '&quot;');
    foreach (array('id', 'level', 'modified', 'creator') as $field) {
        $value = isset($data[$field]) ? (string) $data[$field] : '';
        $replacements['{'.$field.'}'] = str_replace($special, $escaped, $value);
    }

    // strtr() substitutes in a single pass, so placeholder-like text inside
    // one value (e.g. a title containing "{body}") is never expanded again.
    return strtr($xmlFormat, $replacements);
}
110
/**
 * Build one entry per heading listed in a page's table of contents.
 *
 * @param string $id       wiki page id
 * @param array  $metadata page metadata; read from ['description']['tableofcontents']
 * @return array|false map of heading id (hid) => array('section', 'title', 'level'),
 *                     or false when there is no metadata or no table of contents
 */
function getDocumentsByHeadings($id, $metadata)
{
    $hasToc = !empty($metadata) && !empty($metadata['description']['tableofcontents']);
    if (!$hasToc) {
        return false;
    }

    $documents = array();
    foreach ($metadata['description']['tableofcontents'] as $entry) {
        $hid = $entry['hid'];
        // The raw text of the section is looked up by its heading title.
        $documents[$hid] = array(
            'section' => getSection($id, $entry['title']),
            'title'   => $entry['title'],
            'level'   => $entry['level'],
        );
    }

    return $documents;
}
125
/**
 * Extract the raw wiki text of the section introduced by a given heading.
 *
 * The page is parsed with a reduced set of parser modes: the header mode plus
 * the modes whose markup could otherwise be mistaken for a header. The
 * resulting instruction list is scanned for the first header whose trimmed
 * text equals $header, and the text from that header up to the following
 * section_close instruction is returned.
 *
 * @param string $id     wiki page id
 * @param string $header heading text to look for
 * @return string raw wiki text of the section; '' when the heading is not
 *                found; the rest of the page when no section_close follows
 */
function getSection($id, $header)
{
    // Create the parser and attach the handler. Plain assignment is used here:
    // "= & new" is deprecated since PHP 5.3 and a parse error since PHP 7.
    $Parser = new Doku_Parser();
    $Parser->Handler = new Doku_Handler();

    // The header mode finds the headings; the remaining modes cover markup
    // that might be mistaken for a header. Registration order is kept.
    $modes = array(
        'header'           => 'Doku_Parser_Mode_Header',
        'listblock'        => 'Doku_Parser_Mode_ListBlock',
        'preformatted'     => 'Doku_Parser_Mode_Preformatted',
        'table'            => 'Doku_Parser_Mode_Table',
        'unformatted'      => 'Doku_Parser_Mode_Unformatted',
        'php'              => 'Doku_Parser_Mode_PHP',
        'html'             => 'Doku_Parser_Mode_HTML',
        'code'             => 'Doku_Parser_Mode_Code',
        'file'             => 'Doku_Parser_Mode_File',
        'quote'            => 'Doku_Parser_Mode_Quote',
        'footnote'         => 'Doku_Parser_Mode_Footnote',
        'internallink'     => 'Doku_Parser_Mode_InternalLink',
        'media'            => 'Doku_Parser_Mode_Media',
        'externallink'     => 'Doku_Parser_Mode_ExternalLink',
        'windowssharelink' => 'Doku_Parser_Mode_WindowsShareLink',
        'filelink'         => 'Doku_Parser_Mode_FileLink',
    );
    foreach ($modes as $modeName => $modeClass) {
        $Parser->addMode($modeName, new $modeClass());
    }

    // Load the raw wiki document
    $doc = io_readFile(wikiFN($id));

    // Get a list of instructions
    $instructions = $Parser->parse($doc);

    unset($Parser);

    // Track whether we are inside the wanted section and where it begins/ends
    $inSection = false;
    $foundEnd  = false;
    $startPos  = 0;
    $endPos    = 0;
    $wanted    = (string) $header;

    foreach ($instructions as $instruction) {
        if (!$inSection) {
            // Look for the requested heading. Strict comparison prevents
            // numeric-looking headings ("1.0" vs "1") from matching each other.
            if ($instruction[0] == 'header' && trim($instruction[1][0]) === $wanted) {
                $startPos  = $instruction[2];
                $inSection = true;
            }
        } elseif ($instruction[0] == 'section_close') {
            // The first section_close after the heading ends the section
            $endPos   = $instruction[2];
            $foundEnd = true;
            break;
        }
    }

    if (!$inSection) {
        // Heading not present on this page
        return '';
    }

    // Normalize and pad the document the same way the parser does, so that
    // the byte positions taken from the instructions match.
    $doc = "\n".str_replace("\r\n", "\n", $doc)."\n";

    if (!$foundEnd) {
        // No closing instruction followed the heading: take everything up to
        // the end of the page instead of calling substr() with a negative length.
        return (string) substr($doc, $startPos);
    }

    return (string) substr($doc, $startPos, $endPos - $startPos);
}
197
/**
 * Build the "categories" string for a page id.
 *
 * An id without a namespace separator is returned unchanged. Otherwise the id
 * is expanded into itself followed by each shorter namespace prefix, every
 * entry terminated by a space; a final empty entry adds one more space, so
 * "a:b:c" becomes "a:b:c a:b a  ".
 *
 * @param string $id wiki page id
 * @return string '' for an empty id
 */
function getCategories($id)
{
    if (empty($id)) {
        return '';
    }

    if (strpos($id, ':') === false) {
        return $id;
    }

    $parts = explode(':', $id);

    // Walk from the full id down to the empty prefix (length 0).
    $categories = '';
    for ($length = count($parts); $length >= 0; $length--) {
        $categories .= implode(':', array_slice($parts, 0, $length)) . ' ';
    }

    return $categories;
}
222
223
224 /**
225  * Method return all wiki page names
226  * @global array $conf
227  * @return array
228  */
229 function getPagesList()
230 {
231    global $conf;
232
233    $data = array();
234    sort($data);
235    search($data,$conf['datadir'],'search_allpages','','');
236
237    return $data;
238}
239
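/**
 * Example of the metadata array of a single page (apparently a print_r() dump
 * of what p_get_metadata() returns), kept as a reference for the keys read above:
 *
 * Array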
242(
243    [date] => Array
244        (
245            [created] => 1239181434
246            [modified] => 1239202933
247        )
248
249    [creator] => Sergey Nikolaev
250    [last_change] => Array
251        (
252            [date] => 1239202933
253            [ip] => 85.118.229.162
254            [type] => E
255            [id] => cal:minutes:boardreader:200904:20090408
256            [user] => snikolaev
257            [sum] =>
258            [extra] =>
259        )
260
261    [contributor] => Array
262        (
263            [snikolaev] => Sergey Nikolaev
264        )
265
266    [title] => BoardReader call of Apr 8 2009
267    [description] => Array
268        (
269            [tableofcontents] => Array
270                (
271                    [0] => Array
272                        (
273                            [hid] => boardreader_call_of_apr_8_2009
274                            [title] => BoardReader call of Apr 8 2009
275                            [type] => ul
276                            [level] => 1
277                        )
278
279                    [1] => Array
280                        (
281                            [hid] => sergey
282                            [title] => Sergey
283                            [type] => ul
284                            [level] => 2
285                        )
286
287                    [2] => Array
288                        (
289                            [hid] => slava
290                            [title] => Slava
291                            [type] => ul
292                            [level] => 2
293                        )
294
295                    [3] => Array
296                        (
297                            [hid] => roman
298                            [title] => Roman
299                            [type] => ul
300                            [level] => 2
301                        )
302
303                    [4] => Array
304                        (
305                            [hid] => nikita
306                            [title] => Nikita
307                            [type] => ul
308                            [level] => 2
309                        )
310
311                    [5] => Array
312                        (
313                            [hid] => discussion
314                            [title] => Discussion
315                            [type] => ul
316                            [level] => 2
317                        )
318
319                )
320
321            [abstract] => Participants: Mindaugas, Sergey, Slava, Roman, Nikita
322
323Duration: 23 min
324
325Sergey
326
327Status:
328
329	*  published Roman's changes
330	*  started reviewing Slava's changes
331
332
333Plans:
334
335	*  start altering (singature field)
336	*  select server error handling
337	*  publish Slava's and Roman's changes
338        )
339
340    [internal] => Array
341        (
342            [cache] => 1
343            [toc] => 1
344        )
345
346)
347
348 */
349
350