<?php

declare(strict_types = 1);

namespace LanguageDetection;

use LanguageDetection\Tokenizer\TokenizerInterface;
use LanguageDetection\Tokenizer\WhitespaceTokenizer;

/**
 * Class NgramParser
 *
 * Base class that splits a string into tokens and extracts the most
 * frequent character n-grams from them. The n-gram lengths, the number of
 * n-grams returned and the tokenizer are all configurable.
 *
 * @copyright Patrick Schur
 * @license https://opensource.org/licenses/mit-license.html MIT
 * @author Patrick Schur <patrick_schur@outlook.de>
 * @package LanguageDetection
 */
abstract class NgramParser
{
    /**
     * Shortest n-gram length (in characters) that is extracted.
     *
     * @var int
     */
    protected $minLength = 1;

    /**
     * Longest n-gram length (in characters) that is extracted.
     *
     * @var int
     */
    protected $maxLength = 3;

    /**
     * Upper bound on the number of n-grams returned by getNgrams().
     *
     * @var int
     */
    protected $maxNgrams = 310;

    /**
     * Tokenizer used to split the input; created lazily when none was set.
     *
     * @var TokenizerInterface|null
     */
    protected $tokenizer = null;

    /**
     * Sets the shortest n-gram length.
     *
     * @param int $minLength must be greater than zero and strictly less than the current maximum length
     * @throws \LengthException if $minLength is outside that range
     */
    public function setMinLength(int $minLength)
    {
        if ($minLength <= 0 || $minLength >= $this->maxLength)
        {
            throw new \LengthException('$minLength must be greater than zero and less than $this->maxLength.');
        }

        $this->minLength = $minLength;
    }

    /**
     * Sets the longest n-gram length.
     *
     * @param int $maxLength must be strictly greater than the current minimum length
     * @throws \LengthException if $maxLength is not greater than the minimum length
     */
    public function setMaxLength(int $maxLength)
    {
        if ($maxLength <= $this->minLength)
        {
            throw new \LengthException('$maxLength must be greater than $this->minLength.');
        }

        $this->maxLength = $maxLength;
    }

    /**
     * Sets how many n-grams getNgrams() returns at most.
     *
     * @param int $maxNgrams must be greater than zero
     * @throws \LengthException if $maxNgrams is zero or negative
     */
    public function setMaxNgrams(int $maxNgrams)
    {
        if ($maxNgrams <= 0)
        {
            throw new \LengthException('$maxNgrams must be greater than zero.');
        }

        $this->maxNgrams = $maxNgrams;
    }

    /**
     * Sets the tokenizer
     *
     * @param TokenizerInterface $tokenizer
     */
    public function setTokenizer(TokenizerInterface $tokenizer)
    {
        $this->tokenizer = $tokenizer;
    }

    /**
     * Splits the string into tokens, falling back to a WhitespaceTokenizer
     * when no tokenizer has been configured.
     *
     * @param string $str
     * @return array
     */
    private function tokenize(string $str): array
    {
        if (null === $this->tokenizer)
        {
            $this->tokenizer = new WhitespaceTokenizer();
        }

        return $this->tokenizer->tokenize($str);
    }

    /**
     * Extracts the n-grams of the string, ordered from most to least frequent.
     *
     * Frequencies are relative to the total number of n-grams of the same
     * length, so n-grams of different lengths are ranked on a common scale.
     * At most $this->maxNgrams entries are returned.
     *
     * @param string $str
     * @return string[] n-grams ordered by descending relative frequency
     */
    protected function getNgrams(string $str): array
    {
        // Occurrence counts, grouped by n-gram length: [length => [ngram => count]].
        $counts = [];

        foreach ($this->tokenize($str) as $word)
        {
            $length = \mb_strlen($word);

            for ($size = $this->minLength; $size <= $this->maxLength; ++$size)
            {
                for ($offset = 0; $offset + $size <= $length; ++$offset)
                {
                    $ngram = \mb_substr($word, $offset, $size);
                    $counts[$size][$ngram] = ($counts[$size][$ngram] ?? 0) + 1;
                }
            }
        }

        // Normalise each length group and flatten it into one map. N-grams of
        // different lengths can never collide, and assigning key by key keeps
        // numeric n-grams (which PHP stores as integer keys) intact, whereas
        // array_merge() would renumber them from zero.
        $frequencies = [];

        foreach ($counts as $group)
        {
            $total = \array_sum($group);

            foreach ($group as $ngram => $count)
            {
                $frequencies[$ngram] = $count / $total;
            }
        }

        // The bare "_" unigram is never reported (presumably the word-boundary
        // marker added by the tokenizer; the tokenizer is not visible here).
        unset($frequencies['_']);

        \arsort($frequencies, \SORT_NUMERIC);

        $ranked = \array_slice(\array_keys($frequencies), 0, $this->maxNgrams);

        // Numeric n-grams come back from array_keys() as integers; callers
        // always receive strings.
        return \array_map(
            static function ($ngram): string
            {
                return (string) $ngram;
            },
            $ranked
        );
    }
}