1<!DOCTYPE html> 2<html> 3<head> 4 <meta charset="utf-8"> 5 <meta name="robots" content="noindex"> 6 7 <title>File classes/Sentence.php | phpSentence</title> 8 9 <link rel="stylesheet" href="resources/style.css?e99947befd7bf673c6b43ff75e9e0f170c88a60e"> 10 11</head> 12 13<body> 14<div id="left"> 15 <div id="menu"> 16 <a href="index.html" title="Overview"><span>Overview</span></a> 17 18 19 <div id="groups"> 20 </div> 21 22 23 24 <div id="elements"> 25 <h3>Classes</h3> 26 <ul> 27 <li><a href="class-Sentence.html">Sentence</a></li> 28 <li><a href="class-SentenceTest.html">SentenceTest</a></li> 29 </ul> 30 31 32 33 34 35 <h3>Functions</h3> 36 <ul> 37 <li><a href="function-Sentence_autoloader.html">Sentence_autoloader</a></li> 38 </ul> 39 </div> 40 </div> 41</div> 42 43<div id="splitter"></div> 44 45<div id="right"> 46<div id="rightInner"> 47 <form id="search"> 48 <input type="hidden" name="cx" value=""> 49 <input type="hidden" name="ie" value="UTF-8"> 50 <input type="text" name="q" class="text" placeholder="Search"> 51 </form> 52 53 <div id="navigation"> 54 <ul> 55 <li> 56 <a href="index.html" title="Overview"><span>Overview</span></a> 57 </li> 58 <li> 59<span>Class</span> </li> 60 </ul> 61 <ul> 62 </ul> 63 <ul> 64 </ul> 65 </div> 66 67<pre><code><span id="1" class="l"><a href="#1"> 1: </a><span class="xlang"><?php</span> 68</span><span id="2" class="l"><a href="#2"> 2: </a> 69</span><span id="3" class="l"><a href="#3"> 3: </a><span class="php-comment">/** 70</span></span><span id="4" class="l"><a href="#4"> 4: </a><span class="php-comment"> * Segments sentences. 71</span></span><span id="5" class="l"><a href="#5"> 5: </a><span class="php-comment"> * Clipping may not be perfect. 72</span></span><span id="6" class="l"><a href="#6"> 6: </a><span class="php-comment"> * Sentence count should be VERY close to the truth. 
73</span></span><span id="7" class="l"><a href="#7"> 7: </a><span class="php-comment"> * 74</span></span><span id="8" class="l"><a href="#8"> 8: </a><span class="php-comment"> * Multibyte safe (at least for UTF-8), but rules based on Germanic 75</span></span><span id="9" class="l"><a href="#9"> 9: </a><span class="php-comment"> * language structure (English, Dutch, German). Should work for most 76</span></span><span id="10" class="l"><a href="#10"> 10: </a><span class="php-comment"> * Latin-alphabet languages. 77</span></span><span id="11" class="l"><a href="#11"> 11: </a><span class="php-comment"> */</span> 78</span><span id="12" class="l"><a href="#12"> 12: </a><span class="php-keyword1">class</span> Sentence { 79</span><span id="13" class="l"><a href="#13"> 13: </a> <span class="php-comment">/** 80</span></span><span id="14" class="l"><a href="#14"> 14: </a><span class="php-comment"> * Specify this flag with the split method to trim whitespace. 81</span></span><span id="15" class="l"><a href="#15"> 15: </a><span class="php-comment"> */</span> 82</span><span id="16" class="l"><a href="#16"> 16: </a> <span class="php-keyword1">const</span> SPLIT_TRIM = <span class="php-num">0x1</span>; 83</span><span id="17" class="l"><a href="#17"> 17: </a> 84</span><span id="18" class="l"><a href="#18"> 18: </a> <span class="php-comment">/** 85</span></span><span id="19" class="l"><a href="#19"> 19: </a><span class="php-comment"> * List of characters used to terminate sentences. 
86</span></span><span id="20" class="l"><a href="#20"> 20: </a><span class="php-comment"> * @var array 87</span></span><span id="21" class="l"><a href="#21"> 21: </a><span class="php-comment"> */</span> 88</span><span id="22" class="l"><a href="#22"> 22: </a> <span class="php-keyword1">private</span> <span class="php-var">$terminals</span> = <span class="php-keyword1">array</span>(<span class="php-quote">'.'</span>, <span class="php-quote">'!'</span>, <span class="php-quote">'?'</span>); 89</span><span id="23" class="l"><a href="#23"> 23: </a> 90</span><span id="24" class="l"><a href="#24"> 24: </a> <span class="php-comment">/** 91</span></span><span id="25" class="l"><a href="#25"> 25: </a><span class="php-comment"> * List of characters used for abbreviations. 92</span></span><span id="26" class="l"><a href="#26"> 26: </a><span class="php-comment"> * @var array 93</span></span><span id="27" class="l"><a href="#27"> 27: </a><span class="php-comment"> */</span> 94</span><span id="28" class="l"><a href="#28"> 28: </a> <span class="php-keyword1">private</span> <span class="php-var">$abbreviators</span> = <span class="php-keyword1">array</span>(<span class="php-quote">'.'</span>); 95</span><span id="29" class="l"><a href="#29"> 29: </a> 96</span><span id="30" class="l"><a href="#30"> 30: </a> <span class="php-comment">/** 97</span></span><span id="31" class="l"><a href="#31"> 31: </a><span class="php-comment"> * Multibyte safe version of standard trim() function. 
/**
 * Strips leading and trailing whitespace with the multibyte regex engine.
 *
 * The pattern lazily captures everything between the outer whitespace runs
 * and the replacement keeps only that capture group.
 *
 * @param string $string Text to trim.
 * @return string Whatever mb_ereg_replace() produces for the given text.
 */
private static function mbTrim($string) {
    $outer_whitespace = '^\s*([\s\S]*?)\s*$';
    $inner_text = '\1';

    return mb_ereg_replace($outer_whitespace, $inner_text, $string);
}

/**
 * A cross between mb_split and preg_split, adding the preg_split flags
 * to mb_split.
 */
/**
 * Splits a string on a multibyte regular expression while honouring the
 * preg_split() flags.
 *
 * Works in two passes: the first records the string as alternating runs of
 * text and delimiter, the second cuts those runs out of the string and
 * applies $limit and $flags. All positions are byte offsets, because they
 * come from strlen() / mb_ereg_search_pos() and are fed to mb_strcut().
 *
 * @param string $pattern Multibyte regular expression matching the delimiter.
 * @param string $string  Text to split.
 * @param int    $limit   When greater than zero, at most this many text
 *                        pieces are returned and the last one holds the
 *                        unsplit remainder; otherwise there is no limit.
 * @param int    $flags   Bitmask of PREG_SPLIT_NO_EMPTY,
 *                        PREG_SPLIT_DELIM_CAPTURE and PREG_SPLIT_OFFSET_CAPTURE.
 * @return array Pieces as strings, or as array(piece, byte offset) pairs when
 *               PREG_SPLIT_OFFSET_CAPTURE is set.
 */
private static function mbSplit($pattern, $string, $limit = -1, $flags = 0) {
    $byte_total = strlen($string); // bytes, not characters
    mb_ereg_search_init($string);

    // Pass 1: describe the string as array(byte length, is delimiter, delimiter captured) runs.
    $segments = array();
    $cursor = 0;
    while (true) {
        $match = mb_ereg_search_pos($pattern, '');
        if ($match === false) {
            break;
        }
        list($match_start, $match_length) = $match;

        // Text between the previous delimiter and this one.
        $segments[] = array($match_start - $cursor, false, null);
        $cursor = $match_start + $match_length;

        // The delimiter itself; it counts as captured only when group 1 is set and truthy.
        $groups = mb_ereg_search_getregs();
        $segments[] = array($match_length, true, isset($groups[1]) && $groups[1]);

        // Nothing left to search once the delimiter reaches the end of the string.
        if ($cursor >= $byte_total) {
            break;
        }
    }

    // Whatever follows the last delimiter (possibly zero bytes).
    $segments[] = array($byte_total - $cursor, false, null);

    // Pass 2: cut the runs out of the string.
    $want_offsets = $flags & PREG_SPLIT_OFFSET_CAPTURE;
    $keep_delimiters = $flags & PREG_SPLIT_DELIM_CAPTURE;
    $keep_empty = ~$flags & PREG_SPLIT_NO_EMPTY;

    $pieces = array();
    $cursor = 0;
    $emitted = 1;
    foreach ($segments as $segment) {
        list($size, $is_delimiter, $is_captured) = $segment;
        $emittable = $size || $keep_empty;

        // The counter only advances for text runs that would be emitted; once it
        // passes the limit the rest of the string becomes the final piece.
        if ($limit > 0 && !$is_delimiter && $emittable && ++$emitted > $limit) {
            if ($size > 0 || $keep_empty) {
                $remainder = mb_strcut($string, $cursor);
                $pieces[] = $want_offsets ? array($remainder, $cursor) : $remainder;
            }
            break;
        }

        if ((!$is_delimiter || ($keep_delimiters && $is_captured)) && $emittable) {
            $piece = mb_strcut($string, $cursor, $size);
            $pieces[] = $want_offsets ? array($piece, $cursor) : $piece;
        }

        $cursor += $size;
    }

    return $pieces;
}

/**
 * Breaks a piece of text into lines by linebreak.
 * Eats up any linebreak characters as if one.
 */
/**
 * Breaks a piece of text into lines, treating any run of \r / \n
 * characters as a single linebreak.
 *
 * The text is split with the linebreak runs captured, and every part is
 * appended to the pending line. The pending line is stored as soon as a
 * part trims to the empty string, so each stored line keeps the linebreak
 * run that ended it. Whatever is still pending after the last part is
 * always stored as well, even when it is empty.
 *
 * @param string $text
 * @return array
 */
private static function linebreakSplit($text) {
    $chunks = self::mbSplit('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

    $completed = array();
    $pending = '';
    foreach ($chunks as $chunk) {
        $pending .= $chunk;

        // Only a chunk that trims to nothing closes the pending line.
        if (self::mbTrim($chunk) !== '') {
            continue;
        }

        array_push($completed, $pending);
        $pending = '';
    }
    array_push($completed, $pending);

    return $completed;
}

/**
 * Splits an array of lines by (consecutive sequences of)
 * terminals, keeping terminals.
 *
 * Multibyte safe (at least for UTF-8)
 *
 * For example:
 *  "There ... is. More!"
 *      ... becomes ...
 *  [ "There ", "...", " is", ".", " More", "!" ]
 */
/**
 * Splits one line into alternating runs of non-terminal and terminal
 * characters. Consecutive terminals stay together in one part and the
 * terminals themselves are kept.
 *
 * For example:
 *  "There ... is. More!"
 *      ... becomes ...
 *  [ "There ", "...", " is", ".", " More", "!" ]
 *
 * @param string $line Text to split; it is broken into characters with the
 *                     /u (UTF-8) modifier.
 * @return array Parts in their original order; an empty array when the line
 *               is empty or cannot be split into UTF-8 characters.
 */
private function punctuationSplit($line) {
    $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // one entry per UTF-8 character

    // preg_split() yields an empty array for '' and false when the pattern
    // cannot be applied (e.g. invalid UTF-8); either way there is no first
    // character to inspect, so there is nothing to split.
    if (empty($chars)) {
        return array();
    }

    $parts = array();
    $part = '';
    $is_terminal = in_array($chars[0], $this->terminals);

    foreach ($chars as $char) {
        // A switch between terminal and non-terminal characters closes the current part.
        if (in_array($char, $this->terminals) !== $is_terminal) {
            $parts[] = $part;
            $part = '';
            $is_terminal = !$is_terminal;
        }
        $part .= $char;
    }

    // Compare against '' instead of using empty(): a trailing part of "0"
    // is real text, but empty("0") is true and would silently drop it.
    if ($part !== '') {
        $parts[] = $part;
    }

    return $parts;
}

/**
 * Appends each terminal item after its preceding
 * non-terminals.
 */
/**
 * Glues the parts produced by punctuation splitting back together, closing
 * a merged item after each part that ends a sentence.
 *
 * A part closes the current item when it is a single terminal character,
 * or when it contains a terminal that is not also an abbreviator. Any other
 * part (plain text, or a run made only of abbreviators such as "...") is
 * just appended. Empty parts are skipped and whitespace is left untouched.
 *
 * For example:
 *  [ "There ", "...", " is", ".", " More", "!" ]
 *      ... becomes ...
 *  [ "There ... is.", " More!" ]
 *
 * @param array $punctuations
 * @return array
 */
private function punctuationMerge($punctuations) {
    $definite_terminals = array_diff($this->terminals, $this->abbreviators);

    $merges = array();
    $merge = '';

    foreach ($punctuations as $punctuation) {
        if ($punctuation === '') {
            continue;
        }

        $merge .= $punctuation;

        if (mb_strlen($punctuation) === 1 && in_array($punctuation, $this->terminals)) {
            // A lone terminal always ends the current item.
            $merges[] = $merge;
            $merge = '';
            continue;
        }

        // Longer runs end the item only if they hold an unambiguous terminal.
        foreach ($definite_terminals as $terminal) {
            if (mb_strpos($punctuation, $terminal) !== false) {
                $merges[] = $merge;
                $merge = '';
                break;
            }
        }
    }

    // Compare against '' instead of using empty(): a leftover of "0" is real
    // text, but empty("0") is true and would silently drop it.
    if ($merge !== '') {
        $merges[] = $merge;
    }

    return $merges;
}

/**
 * Merges any one-word items with its preceding items.
 */
/**
 * Joins fragments that look like pieces of an abbreviation and keeps the
 * others apart.
 *
 * A new item is started in front of a fragment when either
 *  - the previous fragment ended in a terminal that is not an abbreviator, or
 *  - it is not the first fragment, it begins with whitespace, and it and the
 *    previous fragment are not both exactly one word long.
 * In every other case the fragment is appended to the current item.
 * Words are counted by splitting the trimmed fragment on whitespace.
 *
 * @param array $fragments
 * @return array
 */
private function abbreviationMerge($fragments) {
    $hard_stops = array_diff($this->terminals, $this->abbreviators);

    $merged = array();
    $buffer = '';
    $last_words = null;
    $last_char = null;

    foreach ($fragments as $fragment) {
        $words = count(mb_split('\s+', self::mbTrim($fragment)));
        $leading_space = mb_ereg_match('^\s+', $fragment);

        if (in_array($last_char, $hard_stops)) {
            $boundary = true;
        } elseif ($last_words === null) {
            // First fragment: there is nothing to separate it from.
            $boundary = false;
        } else {
            $both_single_words = ($last_words === 1 && $words === 1);
            $boundary = !$both_single_words && $leading_space;
        }

        if ($boundary) {
            $merged[] = $buffer;
            $buffer = '';
        }

        $buffer .= $fragment;
        $last_words = $words;
        $last_char = mb_substr($fragment, -1);
    }

    if ($buffer !== '') {
        $merged[] = $buffer;
    }

    return $merged;
}

/**
 * Merges items into larger sentences.
 */
319</span></span><span id="253" class="l"><a href="#253">253: </a><span class="php-comment"> * 320</span></span><span id="254" class="l"><a href="#254">254: </a><span class="php-comment"> * Multibyte safe 321</span></span><span id="255" class="l"><a href="#255">255: </a><span class="php-comment"> * 322</span></span><span id="256" class="l"><a href="#256">256: </a><span class="php-comment"> * @param array $shorts 323</span></span><span id="257" class="l"><a href="#257">257: </a><span class="php-comment"> * @return array 324</span></span><span id="258" class="l"><a href="#258">258: </a><span class="php-comment"> */</span> 325</span><span id="259" class="l"><a href="#259">259: </a> <span class="php-keyword1">private</span> <span class="php-keyword1">function</span> sentenceMerge(<span class="php-var">$shorts</span>) { 326</span><span id="260" class="l"><a href="#260">260: </a> <span class="php-var">$non_abbreviating_terminals</span> = <span class="php-keyword2">array_diff</span>(<span class="php-var">$this</span>->terminals, <span class="php-var">$this</span>->abbreviators); 327</span><span id="261" class="l"><a href="#261">261: </a> 328</span><span id="262" class="l"><a href="#262">262: </a> <span class="php-var">$sentences</span> = <span class="php-keyword1">array</span>(); 329</span><span id="263" class="l"><a href="#263">263: </a> 330</span><span id="264" class="l"><a href="#264">264: </a> <span class="php-var">$sentence</span> = <span class="php-quote">''</span>; 331</span><span id="265" class="l"><a href="#265">265: </a> <span class="php-var">$has_words</span> = <span class="php-keyword1">false</span>; 332</span><span id="266" class="l"><a href="#266">266: </a> <span class="php-var">$previous_word_ending</span> = <span class="php-keyword1">null</span>; 333</span><span id="267" class="l"><a href="#267">267: </a> <span class="php-keyword1">foreach</span> (<span class="php-var">$shorts</span> <span class="php-keyword1">as</span> <span class="php-var">$short</span>) 
{ 334</span><span id="268" class="l"><a href="#268">268: </a> <span class="php-var">$word_count</span> = <span class="php-keyword2">count</span>(<span class="php-keyword2">mb_split</span>(<span class="php-quote">'\s+'</span>, self::mbTrim(<span class="php-var">$short</span>))); 335</span><span id="269" class="l"><a href="#269">269: </a> <span class="php-var">$after_non_abbreviating_terminal</span> = <span class="php-keyword2">in_array</span>(<span class="php-var">$previous_word_ending</span>, <span class="php-var">$non_abbreviating_terminals</span>); 336</span><span id="270" class="l"><a href="#270">270: </a> 337</span><span id="271" class="l"><a href="#271">271: </a> <span class="php-keyword1">if</span> (<span class="php-var">$after_non_abbreviating_terminal</span> || (<span class="php-var">$has_words</span> && <span class="php-var">$word_count</span> > <span class="php-num">1</span>)) { 338</span><span id="272" class="l"><a href="#272">272: </a> <span class="php-var">$sentences</span>[] = <span class="php-var">$sentence</span>; 339</span><span id="273" class="l"><a href="#273">273: </a> <span class="php-var">$sentence</span> = <span class="php-quote">''</span>; 340</span><span id="274" class="l"><a href="#274">274: </a> <span class="php-var">$has_words</span> = <span class="php-var">$word_count</span> > <span class="php-num">1</span>; 341</span><span id="275" class="l"><a href="#275">275: </a> } <span class="php-keyword1">else</span> { 342</span><span id="276" class="l"><a href="#276">276: </a> <span class="php-var">$has_words</span> = <span class="php-var">$has_words</span> || <span class="php-var">$word_count</span> > <span class="php-num">1</span>; 343</span><span id="277" class="l"><a href="#277">277: </a> } 344</span><span id="278" class="l"><a href="#278">278: </a> 345</span><span id="279" class="l"><a href="#279">279: </a> <span class="php-var">$sentence</span>.= <span class="php-var">$short</span>; 346</span><span id="280" class="l"><a href="#280">280: 
</a> <span class="php-var">$previous_word_ending</span> = <span class="php-keyword2">mb_substr</span>(<span class="php-var">$short</span>, -<span class="php-num">1</span>); 347</span><span id="281" class="l"><a href="#281">281: </a> } 348</span><span id="282" class="l"><a href="#282">282: </a> <span class="php-keyword1">if</span> (!<span class="php-keyword1">empty</span>(<span class="php-var">$sentence</span>)) { 349</span><span id="283" class="l"><a href="#283">283: </a> <span class="php-var">$sentences</span>[] = <span class="php-var">$sentence</span>; 350</span><span id="284" class="l"><a href="#284">284: </a> } 351</span><span id="285" class="l"><a href="#285">285: </a> 352</span><span id="286" class="l"><a href="#286">286: </a> <span class="php-keyword1">return</span> <span class="php-var">$sentences</span>; 353</span><span id="287" class="l"><a href="#287">287: </a> } 354</span><span id="288" class="l"><a href="#288">288: </a> 355</span><span id="289" class="l"><a href="#289">289: </a> <span class="php-comment">/** 356</span></span><span id="290" class="l"><a href="#290">290: </a><span class="php-comment"> * Return the sentences detected in the provided text. 357</span></span><span id="291" class="l"><a href="#291">291: </a><span class="php-comment"> * Set the Sentence::SPLIT_TRIM flag to trim whitespace. 
358</span></span><span id="292" class="l"><a href="#292">292: </a><span class="php-comment"> * @param string $text 359</span></span><span id="293" class="l"><a href="#293">293: </a><span class="php-comment"> * @param integer $flags 360</span></span><span id="294" class="l"><a href="#294">294: </a><span class="php-comment"> * @return array 361</span></span><span id="295" class="l"><a href="#295">295: </a><span class="php-comment"> */</span> 362</span><span id="296" class="l"><a href="#296">296: </a> <span class="php-keyword1">public</span> <span class="php-keyword1">function</span> <span class="php-keyword2">split</span>(<span class="php-var">$text</span>, <span class="php-var">$flags</span> = <span class="php-num">0</span>) { 363</span><span id="297" class="l"><a href="#297">297: </a> <span class="php-var">$sentences</span> = <span class="php-keyword1">array</span>(); 364</span><span id="298" class="l"><a href="#298">298: </a> 365</span><span id="299" class="l"><a href="#299">299: </a> <span class="php-comment">// Split</span> 366</span><span id="300" class="l"><a href="#300">300: </a> <span class="php-keyword1">foreach</span> (self::linebreakSplit(<span class="php-var">$text</span>) <span class="php-keyword1">as</span> <span class="php-var">$line</span>) { 367</span><span id="301" class="l"><a href="#301">301: </a> <span class="php-keyword1">if</span> (self::mbTrim(<span class="php-var">$line</span>) !== <span class="php-quote">''</span>) { 368</span><span id="302" class="l"><a href="#302">302: </a> <span class="php-var">$punctuations</span> = <span class="php-var">$this</span>->punctuationSplit(<span class="php-var">$line</span>); 369</span><span id="303" class="l"><a href="#303">303: </a> <span class="php-var">$merges</span> = <span class="php-var">$this</span>->punctuationMerge(<span class="php-var">$punctuations</span>); 370</span><span id="304" class="l"><a href="#304">304: </a> <span class="php-var">$shorts</span> = <span 
class="php-var">$this</span>->abbreviationMerge(<span class="php-var">$merges</span>); 371</span><span id="305" class="l"><a href="#305">305: </a> <span class="php-var">$sentences</span> = <span class="php-keyword2">array_merge</span>(<span class="php-var">$sentences</span>, <span class="php-var">$this</span>->sentenceMerge(<span class="php-var">$shorts</span>)); 372</span><span id="306" class="l"><a href="#306">306: </a> } 373</span><span id="307" class="l"><a href="#307">307: </a> } 374</span><span id="308" class="l"><a href="#308">308: </a> 375</span><span id="309" class="l"><a href="#309">309: </a> <span class="php-comment">// Post process</span> 376</span><span id="310" class="l"><a href="#310">310: </a> <span class="php-keyword1">if</span> (<span class="php-var">$flags</span> & self::SPLIT_TRIM) { 377</span><span id="311" class="l"><a href="#311">311: </a> <span class="php-keyword1">foreach</span> (<span class="php-var">$sentences</span> <span class="php-keyword1">as</span> &<span class="php-var">$sentence</span>) { 378</span><span id="312" class="l"><a href="#312">312: </a> <span class="php-var">$sentence</span> = self::mbTrim(<span class="php-var">$sentence</span>); 379</span><span id="313" class="l"><a href="#313">313: </a> } 380</span><span id="314" class="l"><a href="#314">314: </a> <span class="php-keyword1">unset</span>(<span class="php-var">$sentence</span>); 381</span><span id="315" class="l"><a href="#315">315: </a> } 382</span><span id="316" class="l"><a href="#316">316: </a> 383</span><span id="317" class="l"><a href="#317">317: </a> <span class="php-keyword1">return</span> <span class="php-var">$sentences</span>; 384</span><span id="318" class="l"><a href="#318">318: </a> } 385</span><span id="319" class="l"><a href="#319">319: </a> 386</span><span id="320" class="l"><a href="#320">320: </a> <span class="php-comment">/** 387</span></span><span id="321" class="l"><a href="#321">321: </a><span class="php-comment"> * Return the number of sentences 
detected in the provided text. 388</span></span><span id="322" class="l"><a href="#322">322: </a><span class="php-comment"> * @param string $text 389</span></span><span id="323" class="l"><a href="#323">323: </a><span class="php-comment"> * @return integer 390</span></span><span id="324" class="l"><a href="#324">324: </a><span class="php-comment"> */</span> 391</span><span id="325" class="l"><a href="#325">325: </a> <span class="php-keyword1">public</span> <span class="php-keyword1">function</span> <span class="php-keyword2">count</span>(<span class="php-var">$text</span>) { 392</span><span id="326" class="l"><a href="#326">326: </a> <span class="php-keyword1">return</span> <span class="php-keyword2">count</span>(<span class="php-var">$this</span>-><span class="php-keyword2">split</span>(<span class="php-var">$text</span>)); 393</span><span id="327" class="l"><a href="#327">327: </a> } 394</span><span id="328" class="l"><a href="#328">328: </a>}</span></code></pre> 395 396 <div id="footer"> 397 phpSentence API documentation generated by <a href="http://apigen.org">ApiGen</a> 398 </div> 399</div> 400</div> 401<script src="resources/combined.js?dc3592a696e654c132a2cb2ca318def0ec6c3f17"></script> 402<script src="elementlist.js?94082770cba9dfa8d9d0c03634ee64ddac29c138"></script> 403</body> 404</html> 405