xref: /plugin/statistics/SearchEngines.php (revision 06bd4382454fe99f88f03cf234b281e15118ab82)
<?php

namespace dokuwiki\plugin\statistics;

/**
 * Defines regular expressions for the most common search engines
 *
 * Holds a table of known search engines (host regex plus the query parameter
 * names they use) and analyzes HTTP referrer URLs against that table to find
 * out which engine a visitor came from and what they searched for.
 */
class SearchEngines
{
    /** @var string[] Query parameter names tried when the host matches no known engine */
    protected const GENERIC_PARAMS = ['search', 'query', 'q', 'keywords', 'keyword'];

    /**
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     *      Search engine definitions with regex patterns and metadata. The regex is
     *      matched against the (lowercased) referrer host, without delimiters.
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string|null The referrer URL being analyzed */
    protected ?string $referrer = null;

    /** @var array|null Cached analysis result, null when the referrer is not a search engine */
    protected ?array $analysisResult = null;

    /**
     * @var bool Whether $analysisResult belongs to the current referrer
     *
     * A separate flag is needed because null is a valid analysis result; without
     * it every getter call would re-run the analysis for non-search referrers.
     */
    protected bool $analyzed = false;

    /**
     * Registers the wiki's own search as an additional engine and optionally
     * sets the referrer to analyze.
     *
     * @param string|null $referrer The HTTP referrer URL, may also be set later via setReferrer()
     */
    public function __construct(?string $referrer = null)
    {
        // Add the internal DokuWiki search engine. It has no host regex and is
        // therefore never matched by analyzeReferrer(), only available by key.
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q'],
        ];

        if ($referrer !== null) {
            $this->setReferrer($referrer);
        }
    }

    /**
     * Set the referrer URL to analyze
     *
     * @param string $referrer The HTTP referrer URL
     */
    public function setReferrer(string $referrer): void
    {
        $this->referrer = $referrer;

        // drop whatever was cached for the previous referrer
        $this->analysisResult = null;
        $this->analyzed = false;
    }

    /**
     * Check if the referrer is from a search engine
     *
     * @return bool True if the referrer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        $this->analyze();
        return $this->analysisResult !== null;
    }

    /**
     * Get the search engine name
     *
     * @return string|null The search engine name or null if not a search engine
     */
    public function getName(): ?string
    {
        $this->analyze();
        return $this->analysisResult['name'] ?? null;
    }

    /**
     * Get the search engine URL
     *
     * Only engines from the definition table have a URL; engines that were
     * detected generically (key prefixed with "generic_") yield null.
     *
     * @return string|null The search engine URL or null if not a (known) search engine
     */
    public function getUrl(): ?string
    {
        $this->analyze();
        if (!$this->analysisResult) {
            return null;
        }

        return $this->searchEngines[$this->analysisResult['engine']]['url'] ?? null;
    }

    /**
     * Get the search query
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        $this->analyze();
        return $this->analysisResult['query'] ?? null;
    }

    /**
     * Analyze the current referrer once and cache the outcome
     *
     * Does nothing when no referrer has been set or the current referrer has
     * already been analyzed (regardless of whether it was a search engine).
     */
    protected function analyze(): void
    {
        if ($this->analyzed || $this->referrer === null) {
            return;
        }

        $this->analysisResult = $this->analyzeReferrer($this->referrer);
        $this->analyzed = true;
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The whole URL is lowercased before parsing, so the returned query is
     * lowercase as well. A host matching a known engine is only reported when
     * one of that engine's parameters carries a query; unknown hosts are
     * reported as "generic_<name>" when a common search parameter is present.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    public function analyzeReferrer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!is_array($urlparts) || !isset($urlparts['host'])) {
            return null; // malformed URL or no host to match against
        }

        $domain = $urlparts['host'];
        $params = $this->extractParams($urlparts);

        $engineKey = $this->findEngineKey($domain);
        if ($engineKey !== null) {
            // known engine: only its own parameters count, no generic fallback
            $engineName = $this->searchEngines[$engineKey]['name'];
            $query = $this->pickQuery($params, $this->searchEngines[$engineKey]['params']);
        } else {
            // try some generic search engine parameters if no specific engine matched
            $engineName = $this->genericEngineName($domain);
            $engineKey = 'generic_' . $engineName;
            $query = $this->pickQuery($params, self::GENERIC_PARAMS);
        }

        // no usable query? not a search engine
        if ($query === '') {
            return null;
        }

        $query = $this->cleanQuery($query);
        if (!$query) {
            return null;
        }

        return [
            'engine' => $engineKey,
            'name' => $engineName,
            'query' => $query,
        ];
    }

    /**
     * Parse the request parameters out of the parts returned by parse_url()
     *
     * @param array $urlparts Result of parse_url()
     * @return array Decoded parameters as produced by parse_str(); values may be strings or arrays
     */
    protected function extractParams(array $urlparts): array
    {
        $qpart = $urlparts['query'] ?? '';
        if (!$qpart && isset($urlparts['fragment'])) {
            $qpart = $urlparts['fragment']; // google does this
        }

        $params = [];
        if ($qpart) {
            parse_str($qpart, $params);
        }
        return $params;
    }

    /**
     * Find the first known engine whose host regex matches the given domain
     *
     * @param string $domain Lowercased host name
     * @return string|null Key into the engine table or null when nothing matched
     */
    protected function findEngineKey(string $domain): ?string
    {
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) {
                continue; // skip engines without regex (like dokuwiki)
            }

            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                return (string) $key;
            }
        }
        return null;
    }

    /**
     * Return the first non-empty string value among the named parameters
     *
     * Array values (e.g. from "q[]=foo") are skipped on purpose: the referrer is
     * untrusted input and an array would break the string handling further on.
     *
     * @param array $params Decoded request parameters
     * @param string[] $names Parameter names in order of preference
     * @return string The raw query or an empty string when none was found
     */
    protected function pickQuery(array $params, array $names): string
    {
        foreach ($names as $name) {
            $value = $params[$name] ?? null;
            if (is_string($value) && !empty($value)) {
                return $value;
            }
        }
        return '';
    }

    /**
     * Derive an engine name from a host name
     *
     * Strips the TLD (optionally preceded by ".co") and returns the last
     * remaining label, e.g. "search.example.org" becomes "example".
     *
     * @param string $domain Lowercased host name
     * @return string The derived name
     */
    protected function genericEngineName(string $domain): string
    {
        $stripped = (string) preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain); // strip tld
        $labels = explode('.', $stripped);
        return (string) array_pop($labels);
    }

    /**
     * Normalize a raw query string
     *
     * @param string $query The decoded query as found in the referrer
     * @return string The cleaned query, possibly empty
     */
    protected function cleanQuery(string $query): string
    {
        // non-search queries
        // NOTE(review): parse_str() has already decoded '+' to spaces at this point, so
        // [^\+]+ consumes everything up to a literal '+', which usually is the whole
        // query. Confirm whether only the first term was meant to be removed.
        $query = (string) preg_replace('/^(cache|related):[^\+]+/', '', $query);
        $query = (string) preg_replace('/ +/', ' ', $query); // ws compact
        return trim($query);
    }

    /**
     * Get search engine information by key
     *
     * @param string $key The search engine key
     * @return array|null The search engine data or null if not found
     */
    public function getSearchEngine(string $key): ?array
    {
        return $this->searchEngines[$key] ?? null;
    }

    /**
     * Get all search engines
     *
     * @return array All search engine definitions
     */
    public function getAllSearchEngines(): array
    {
        return $this->searchEngines;
    }
}
287