1<?php
2
3declare(strict_types = 1);
4
5namespace LanguageDetection;
6
7use LanguageDetection\Tokenizer\TokenizerInterface;
8use LanguageDetection\Tokenizer\WhitespaceTokenizer;
9
10/**
11 * Class NgramParser
12 *
13 * @copyright Patrick Schur
14 * @license https://opensource.org/licenses/mit-license.html MIT
15 * @author Patrick Schur <patrick_schur@outlook.de>
16 * @package LanguageDetection
17 */
abstract class NgramParser
{
    /**
     * Smallest n-gram size (in characters) that is extracted.
     *
     * @var int
     */
    protected $minLength = 1;

    /**
     * Largest n-gram size (in characters) that is extracted.
     *
     * @var int
     */
    protected $maxLength = 3;

    /**
     * Upper bound on the number of n-grams returned by getNgrams().
     *
     * @var int
     */
    protected $maxNgrams = 310;

    /**
     * Tokenizer used to split the input into words. Stays null until one is
     * injected via setTokenizer() or a WhitespaceTokenizer is created lazily
     * on first use.
     *
     * @var TokenizerInterface|null
     */
    protected $tokenizer = null;

    /**
     * Sets the smallest n-gram size.
     *
     * @param int $minLength must be greater than zero and strictly less than the current maximum length
     * @throws \LengthException if $minLength is out of range
     */
    public function setMinLength(int $minLength)
    {
        if ($minLength <= 0 || $minLength >= $this->maxLength)
        {
            throw new \LengthException('$minLength must be greater than zero and less than $this->maxLength.');
        }

        $this->minLength = $minLength;
    }

    /**
     * Sets the largest n-gram size.
     *
     * @param int $maxLength must be strictly greater than the current minimum length
     * @throws \LengthException if $maxLength is out of range
     */
    public function setMaxLength(int $maxLength)
    {
        if ($maxLength <= $this->minLength)
        {
            throw new \LengthException('$maxLength must be greater than $this->minLength.');
        }

        $this->maxLength = $maxLength;
    }

    /**
     * Sets the maximum number of n-grams returned by getNgrams().
     *
     * @param int $maxNgrams must be greater than zero
     * @throws \LengthException if $maxNgrams is not positive
     */
    public function setMaxNgrams(int $maxNgrams)
    {
        if ($maxNgrams <= 0)
        {
            throw new \LengthException('$maxNgrams must be greater than zero.');
        }

        $this->maxNgrams = $maxNgrams;
    }

    /**
     * Sets the tokenizer
     *
     * @param TokenizerInterface $tokenizer
     */
    public function setTokenizer(TokenizerInterface $tokenizer)
    {
        $this->tokenizer = $tokenizer;
    }

    /**
     * Splits the input into words using the configured tokenizer, falling
     * back to a WhitespaceTokenizer when none has been set.
     *
     * @param string $str
     * @return array
     */
    private function tokenize(string $str)
    {
        if (null === $this->tokenizer)
        {
            $this->tokenizer = new WhitespaceTokenizer();
        }

        return $this->tokenizer->tokenize($str);
    }

    /**
     * Extracts the most frequent n-grams of the given text.
     *
     * Every word produced by the tokenizer is cut into all n-grams whose size
     * lies between $this->minLength and $this->maxLength. Each n-gram is
     * weighted by its relative frequency among the n-grams of the same size,
     * the n-grams are sorted by that weight in descending order and at most
     * $this->maxNgrams of them are returned.
     *
     * @param string $str
     * @return string[] n-grams, most frequent first
     */
    protected function getNgrams(string $str): array
    {
        // $counts[<n-gram size>][<n-gram>] = number of occurrences
        $counts = [];

        foreach ($this->tokenize($str) as $word)
        {
            $length = \mb_strlen($word);

            for ($size = $this->minLength; $size <= $this->maxLength; ++$size)
            {
                for ($offset = 0; $offset + $size <= $length; ++$offset)
                {
                    $ngram = \mb_substr($word, $offset, $size);
                    $counts[$size][$ngram] = ($counts[$size][$ngram] ?? 0) + 1;
                }
            }
        }

        if (!\count($counts))
        {
            return [];
        }

        // Flatten into a single map by plain assignment instead of array_merge():
        // PHP stores numeric-string keys such as "1" as integers and array_merge()
        // renumbers integer keys, which would replace those n-grams by arbitrary
        // indexes. N-grams of different sizes can never share a key, so nothing
        // is overwritten here.
        $frequencies = [];

        foreach ($counts as $group)
        {
            $total = \array_sum($group);

            foreach ($group as $ngram => $count)
            {
                $frequencies[$ngram] = $count / $total;
            }
        }

        // A lone underscore is never reported as an n-gram (presumably it is the
        // word-boundary marker added by the tokenizer - confirm against the
        // tokenizer implementation).
        unset($frequencies['_']);

        \arsort($frequencies, SORT_NUMERIC);

        // Integer-like keys come back from array_keys() as int; cast so that the
        // result is a uniform list of strings.
        return \array_map(
            'strval',
            \array_slice(
                \array_keys($frequencies),
                0,
                $this->maxNgrams
            )
        );
    }
}
154