xref: /plugin/statistics/Logger.php (revision 69fb56a24f2453b927cc4a932234a024a80bedd0)
1<?php
2
3namespace dokuwiki\plugin\statistics;
4
/**
 * Signals that the current request must not be logged
 *
 * The class carries no state of its own; the reason for skipping is passed as the
 * exception message.
 */
class IgnoreException extends \RuntimeException
{
}
11
12use DeviceDetector\DeviceDetector;
13use DeviceDetector\Parser\Client\Browser;
14use DeviceDetector\Parser\Device\AbstractDeviceParser;
15use DeviceDetector\Parser\OperatingSystem;
16use dokuwiki\HTTP\DokuHTTPClient;
17use dokuwiki\Input\Input;
18use dokuwiki\plugin\sqlite\SQLiteDB;
19use helper_plugin_popularity;
20use helper_plugin_statistics;
21
22class Logger
23{
24    /** @var helper_plugin_statistics The statistics helper plugin instance */
25    protected helper_plugin_statistics $hlp;
26
27    /** @var SQLiteDB The SQLite database instance */
28    protected SQLiteDB $db;
29
30    /** @var string The full user agent string */
31    protected string $uaAgent;
32
33    /** @var string The type of user agent (browser, robot, feedreader) */
34    protected string $uaType = 'browser';
35
36    /** @var string The browser/client name */
37    protected string $uaName;
38
39    /** @var string The browser/client version */
40    protected string $uaVersion;
41
42    /** @var string The operating system/platform */
43    protected string $uaPlatform;
44
45    /** @var string|null The user name, if available */
46    protected ?string $user = null;
47
48    /** @var string The unique user identifier */
49    protected string $uid;
50
51    /** @var string The session identifier */
52    protected string $session;
53
54    /** @var int|null The ID of the main access log entry if any */
55    protected ?int $hit = null;
56
57    /** @var DokuHTTPClient|null The HTTP client instance for testing */
58    protected ?DokuHTTPClient $httpClient = null;
59
60    // region lifecycle
61
62    /**
63     * Constructor
64     *
65     * Parses browser info and set internal vars
66     */
67    public function __construct(helper_plugin_statistics $hlp, ?DokuHTTPClient $httpClient = null)
68    {
69        /** @var Input $INPUT */
70        global $INPUT;
71
72        $this->hlp = $hlp;
73        $this->db = $this->hlp->getDB();
74        $this->httpClient = $httpClient;
75
76        // FIXME if we already have a session, we should not re-parse the user agent
77
78        $ua = trim($INPUT->server->str('HTTP_USER_AGENT'));
79        AbstractDeviceParser::setVersionTruncation(AbstractDeviceParser::VERSION_TRUNCATION_MAJOR);
80        $dd = new DeviceDetector($ua); // FIXME we could use client hints, but need to add headers
81        $dd->discardBotInformation();
82        $dd->parse();
83
84        if ($dd->isFeedReader()) {
85            $this->uaType = 'feedreader';
86        } elseif ($dd->isBot()) {
87            $this->uaType = 'robot';
88            // for now ignore bots
89            throw new IgnoreException('Bot detected, not logging');
90        }
91
92        $this->uaAgent = $ua;
93        $this->uaName = Browser::getBrowserFamily($dd->getClient('name')) ?: 'Unknown';
94        $this->uaVersion = $dd->getClient('version') ?: '0';
95        $this->uaPlatform = OperatingSystem::getOsFamily($dd->getOs('name')) ?: 'Unknown';
96        $this->uid = $this->getUID();
97        $this->session = $this->getSession();
98        $this->user = $INPUT->server->str('REMOTE_USER', null, true);
99    }
100
101    /**
102     * Should be called before logging
103     *
104     * This starts a transaction, so all logging is done in one go. It also logs the user and session data.
105     */
106    public function begin(): void
107    {
108        $this->hlp->getDB()->getPdo()->beginTransaction();
109
110        $this->logUser();
111        $this->logGroups();
112        $this->logDomain();
113        $this->logSession();
114    }
115
116    /**
117     * Should be called after logging
118     *
119     * This commits the transaction started in begin()
120     */
121    public function end(): void
122    {
123        $this->hlp->getDB()->getPdo()->commit();
124    }
125
126    // endregion
127    // region data gathering
128
129    /**
130     * Get the unique user ID
131     *
132     * The user ID is stored in the user preferences and should stay there forever.
133     * @return string The unique user identifier
134     */
135    protected function getUID(): string
136    {
137        if (!isset($_SESSION[DOKU_COOKIE]['statistics']['uid'])) {
138            // when there is no session UID set, we assume this was deliberate and we simply abort all logging
139            // @todo we may later make UID generation optional
140            throw new IgnoreException('No user ID found');
141        }
142
143        return $_SESSION[DOKU_COOKIE]['statistics']['uid'];
144    }
145
146    /**
147     * Return the user's session ID
148     *
149     * @return string The session identifier
150     */
151    protected function getSession(): string
152    {
153        if (!isset($_SESSION[DOKU_COOKIE]['statistics']['id'])) {
154            // when there is no session ID set, we assume this was deliberate and we simply abort all logging
155            throw new IgnoreException('No session ID found');
156        }
157
158        return $_SESSION[DOKU_COOKIE]['statistics']['id'];
159    }
160
161    // endregion
162    // region automatic logging
163
164    /**
165     * Log the user was seen
166     */
167    protected function logUser(): void
168    {
169        if (!$this->user) return;
170
171        $this->db->exec(
172            'INSERT INTO users (user, dt)
173                  VALUES (?, CURRENT_TIMESTAMP)
174            ON CONFLICT (user) DO UPDATE SET
175                         dt = CURRENT_TIMESTAMP
176                   WHERE excluded.user = users.user
177            ',
178            $this->user
179        );
180
181    }
182
183    /**
184     * Log the session and user agent information
185     */
186    protected function logSession(): void
187    {
188        $this->db->exec(
189            'INSERT INTO sessions (session, dt, end, uid, user, ua, ua_info, ua_type, ua_ver, os)
190                  VALUES (?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, ?, ?, ?, ?, ?, ?, ?)
191             ON CONFLICT (session) DO UPDATE SET
192                         end = CURRENT_TIMESTAMP,
193                         user = excluded.user,
194                         uid = excluded.uid
195                   WHERE excluded.session = sessions.session
196             ',
197            $this->session,
198            $this->uid,
199            $this->user,
200            $this->uaAgent,
201            $this->uaName,
202            $this->uaType,
203            $this->uaVersion,
204            $this->uaPlatform
205        );
206    }
207
208    /**
209     * Log all groups for the user
210     *
211     * @todo maybe this should be done only once per session?
212     */
213    protected function logGroups(): void
214    {
215        global $USERINFO;
216
217        if (!$this->user) return;
218        if (!isset($USERINFO['grps'])) return;
219        if (!is_array($USERINFO['grps'])) return;
220        $groups = $USERINFO['grps'];
221
222        $this->db->exec('DELETE FROM groups WHERE user = ?', $this->user);
223
224        if (empty($groups)) {
225            return;
226        }
227
228        $placeholders = implode(',', array_fill(0, count($groups), '(?, ?)'));
229        $params = [];
230        $sql = "INSERT INTO groups (`user`, `group`) VALUES $placeholders";
231        foreach ($groups as $group) {
232            $params[] = $this->user;
233            $params[] = $group;
234        }
235        $this->db->exec($sql, $params);
236    }
237
238    /**
239     * Log email domain
240     *
241     * @todo maybe this should be done only once per session?
242     */
243    protected function logDomain(): void
244    {
245        global $USERINFO;
246        if (!$this->user) return;
247        if (!isset($USERINFO['mail'])) return;
248        $mail = $USERINFO['mail'];
249
250        $pos = strrpos($mail, '@');
251        if (!$pos) return;
252        $domain = substr($mail, $pos + 1);
253        if (empty($domain)) return;
254
255        $sql = 'UPDATE users SET domain = ? WHERE user = ?';
256        $this->db->exec($sql, [$domain, $this->user]);
257    }
258
259    // endregion
260    // region internal loggers called by the dispatchers
261
262    /**
263     * Log the given referer URL
264     *
265     * Note: we DO log empty referers. These are external accesses that did not provide a referer URL.
266     * We do not log referers that are our own pages though.
267     *
268     * engine set -> a search engine referer
269     * no engine set, url empty -> a direct access (bookmark, direct link, etc.)
270     * no engine set, url not empty -> a referer from another page (not a wiki page)
271     * null returned -> referer was a wiki page
272     *
273     * @param $referer
274     * @return int|null The referer ID or null if no referer was logged
275     * @todo we could check against a blacklist here
276     */
277    public function logReferer($referer): ?int
278    {
279        $referer = trim($referer);
280
281        // do not log our own pages as referers (empty referer is OK though)
282        if (!empty($referer)) {
283            $selfre = '^' . preg_quote(DOKU_URL, '/');
284            if (preg_match("/$selfre/", $referer)) {
285                return null;
286            }
287        }
288
289        // is it a search engine?
290        $se = new SearchEngines($referer);
291        $engine = $se->getEngine();
292
293        $sql = 'INSERT OR IGNORE INTO referers (url, engine, dt) VALUES (?, ?, CURRENT_TIMESTAMP)';
294        $this->db->exec($sql, [$referer, $engine]);
295        return (int)$this->db->queryValue('SELECT id FROM referers WHERE url = ?', $referer);
296    }
297
298    /**
299     * Resolve IP to country/city and store in database
300     *
301     * @return string The IP address as stored
302     */
303    public function logIp(): string
304    {
305        $ip = clientIP(true);
306
307        // anonymize the IP address for storage?
308        if ($this->hlp->getConf('anonips')) {
309            $hash = md5($ip . strrev($ip)); // we use the reversed IP as salt to avoid common rainbow tables
310            $host = '';
311        } else {
312            $hash = $ip;
313            $host = gethostbyaddr($ip);
314        }
315
316        // check if IP already known and up-to-date
317        $result = $this->db->queryValue(
318            "SELECT ip
319             FROM   iplocation
320             WHERE  ip = ?
321               AND  lastupd > date('now', '-30 days')",
322            $hash
323        );
324        if ($result) return $hash; // already known and up-to-date
325
326        $http = $this->httpClient ?: new DokuHTTPClient();
327        $http->timeout = 7;
328        $json = $http->get('http://ip-api.com/json/' . $ip); // yes, it's HTTP only
329
330        if (!$json) {
331            \dokuwiki\Logger::error('Statistics Plugin - Failed talk to ip-api.com.');
332            return $hash;
333        }
334        try {
335            $data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
336        } catch (\JsonException $e) {
337            \dokuwiki\Logger::error('Statistics Plugin - Failed to decode JSON from ip-api.com.', $e);
338            return $hash;
339        }
340        if (!isset($data['status'])) {
341            \dokuwiki\Logger::error('Statistics Plugin - Invalid ip-api.com result' . $ip, $data);
342            return $hash;
343        }
344
345        // we do not check for 'success' status here. when the API can't resolve the IP we still log it
346        // without location data, so we won't re-query it in the next 30 days.
347
348        $this->db->exec(
349            'INSERT OR REPLACE INTO iplocation (
350                    ip, country, code, city, host, lastupd
351                 ) VALUES (
352                    ?, ?, ?, ?, ?, CURRENT_TIMESTAMP
353                 )',
354            $hash,
355            $data['country'] ?? '',
356            $data['countryCode'] ?? '',
357            $data['city'] ?? '',
358            $host
359        );
360
361        return $hash;
362    }
363
364    // endregion
365    // region log dispatchers
366
367    public function logPageView(): void
368    {
369        global $INPUT;
370
371        if (!$INPUT->str('p')) return;
372
373
374        $referer = $INPUT->filter('trim')->str('r');
375        $ip = $this->logIp(); // resolve the IP address
376
377        $data = [
378            'page' => $INPUT->filter('cleanID')->str('p'),
379            'ip' => $ip,
380            'ref_id' => $this->logReferer($referer),
381            'sx' => $INPUT->int('sx'),
382            'sy' => $INPUT->int('sy'),
383            'vx' => $INPUT->int('vx'),
384            'vy' => $INPUT->int('vy'),
385            'session' => $this->session,
386        ];
387
388        $this->db->exec('
389        INSERT INTO pageviews (
390            dt, page, ip, ref_id, screen_x, screen_y, view_x, view_y, session
391        ) VALUES (
392            CURRENT_TIMESTAMP, :page, :ip, :ref_id, :sx, :sy, :vx, :vy, :session
393        )
394        ',
395            $data
396        );
397    }
398
399    /**
400     * Log a click on an external link
401     *
402     * Called from log.php
403     */
404    public function logOutgoing(): void
405    {
406        global $INPUT;
407
408        if (!$INPUT->str('ol')) return;
409
410        $link = $INPUT->filter('trim')->str('ol');
411        $session = $this->session;
412        $page = $INPUT->filter('cleanID')->str('p');
413
414        $this->db->exec(
415            'INSERT INTO outlinks (
416                dt, session, page, link
417             ) VALUES (
418                CURRENT_TIMESTAMP, ?, ?, ?
419             )',
420            $session,
421            $page,
422            $link
423        );
424    }
425
426    /**
427     * Log access to a media file
428     *
429     * Called from action.php
430     *
431     * @param string $media The media ID
432     * @param string $mime The media's mime type
433     * @param bool $inline Is this displayed inline?
434     * @param int $size Size of the media file
435     */
436    public function logMedia(string $media, string $mime, bool $inline, int $size): void
437    {
438        [$mime1, $mime2] = explode('/', strtolower($mime));
439        $inline = $inline ? 1 : 0;
440
441
442        $data = [
443            'media' => cleanID($media),
444            'ip' => $this->logIp(), // resolve the IP address
445            'session' => $this->session,
446            'size' => $size,
447            'mime1' => $mime1,
448            'mime2' => $mime2,
449            'inline' => $inline,
450        ];
451
452        $this->db->exec('
453                INSERT INTO media ( dt, media, ip, session, size, mime1, mime2, inline )
454                     VALUES (CURRENT_TIMESTAMP, :media, :ip, :session, :size, :mime1, :mime2, :inline)
455            ',
456            $data
457        );
458    }
459
460    /**
461     * Log page edits
462     *
463     * called from action.php
464     *
465     * @param string $page The page that was edited
466     * @param string $type The type of edit (create, edit, etc.)
467     */
468    public function logEdit(string $page, string $type): void
469    {
470        $data = [
471            'page' => cleanID($page),
472            'type' => $type,
473            'ip' => $this->logIp(), // resolve the IP address
474            'session' => $this->session
475        ];
476
477        $this->db->exec(
478            'INSERT INTO edits (
479                dt, page, type, ip, session
480             ) VALUES (
481                CURRENT_TIMESTAMP, :page, :type, :ip, :session
482             )',
483            $data
484        );
485    }
486
487    /**
488     * Log login/logoffs and user creations
489     *
490     * @param string $type The type of login event (login, logout, create, failed)
491     * @param string $user The username
492     */
493    public function logLogin(string $type, string $user = ''): void
494    {
495        global $INPUT;
496
497        if (!$user) $user = $INPUT->server->str('REMOTE_USER');
498
499        $ip = clientIP(true);
500
501        $this->db->exec(
502            'INSERT INTO logins (
503                dt, ip, user, type
504             ) VALUES (
505                CURRENT_TIMESTAMP, ?, ?, ?
506             )',
507            $ip,
508            $user,
509            $type
510        );
511    }
512
513    /**
514     * Log search data to the search related tables
515     *
516     * @param string $query The search query
517     * @param string[] $words The query split into words
518     */
519    public function logSearch(string $query, array $words): void
520    {
521        if (!$query) return;
522
523        $sid = $this->db->exec(
524            'INSERT INTO search (dt, ip, session, query) VALUES (CURRENT_TIMESTAMP, ?, ? , ?)',
525            $this->logIp(), // resolve the IP address
526            $this->session,
527            $query,
528        );
529
530        foreach ($words as $word) {
531            if (!$word) continue;
532            $this->db->exec(
533                'INSERT INTO searchwords (sid, word) VALUES (?, ?)',
534                $sid,
535                $word
536            );
537        }
538    }
539
540    /**
541     * Log the current page count and size as today's history entry
542     */
543    public function logHistoryPages(): void
544    {
545        global $conf;
546
547        // use the popularity plugin's search method to find the wanted data
548        /** @var helper_plugin_popularity $pop */
549        $pop = plugin_load('helper', 'popularity');
550        $list = $this->initEmptySearchList();
551        search($list, $conf['datadir'], [$pop, 'searchCountCallback'], ['all' => false], '');
552        $page_count = $list['file_count'];
553        $page_size = $list['file_size'];
554
555        $this->db->exec(
556            'INSERT OR REPLACE INTO history (
557                info, value, dt
558             ) VALUES (
559                ?, ?, CURRENT_TIMESTAMP
560             )',
561            'page_count',
562            $page_count
563        );
564        $this->db->exec(
565            'INSERT OR REPLACE INTO history (
566                info, value, dt
567             ) VALUES (
568                ?, ?, CURRENT_TIMESTAMP
569             )',
570            'page_size',
571            $page_size
572        );
573    }
574
575    /**
576     * Log the current media count and size as today's history entry
577     */
578    public function logHistoryMedia(): void
579    {
580        global $conf;
581
582        // use the popularity plugin's search method to find the wanted data
583        /** @var helper_plugin_popularity $pop */
584        $pop = plugin_load('helper', 'popularity');
585        $list = $this->initEmptySearchList();
586        search($list, $conf['mediadir'], [$pop, 'searchCountCallback'], ['all' => true], '');
587        $media_count = $list['file_count'];
588        $media_size = $list['file_size'];
589
590        $this->db->exec(
591            'INSERT OR REPLACE INTO history (
592                info, value, dt
593             ) VALUES (
594                ?, ?, CURRENT_TIMESTAMP
595             )',
596            'media_count',
597            $media_count
598        );
599        $this->db->exec(
600            'INSERT OR REPLACE INTO history (
601                info, value, dt
602             ) VALUES (
603                ?, ?, CURRENT_TIMESTAMP
604             )',
605            'media_size',
606            $media_size
607        );
608    }
609
610    // endregion
611
612    /**
613     * @todo can be dropped in favor of helper_plugin_popularity::initEmptySearchList() once it's public
614     * @return array
615     */
616    protected function initEmptySearchList()
617    {
618        return array_fill_keys([
619            'file_count',
620            'file_size',
621            'file_max',
622            'file_min',
623            'dir_count',
624            'dir_nest',
625            'file_oldest'
626        ], 0);
627    }
628}
629