<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="utf-8">
	<!-- Asks crawlers not to index this generated page -->
	<meta name="robots" content="noindex">

	<title>File classes/Sentence.php | phpSentence</title>

	<!-- NOTE(review): the query string looks like a cache-busting hash; confirm before editing it by hand -->
	<link rel="stylesheet" href="resources/style.css?e99947befd7bf673c6b43ff75e9e0f170c88a60e">

</head>
12
13<body>
<!-- Sidebar: link to the overview page plus links to each documented class and function -->
<div id="left">
	<!-- role/aria-label expose the menu as a navigation landmark while keeping the div and its id for existing selectors -->
	<div id="menu" role="navigation" aria-label="API elements">
		<a href="index.html" title="Overview"><span>Overview</span></a>

		<!-- Has no entries on this page; the element itself is kept -->
		<div id="groups">
		</div>

		<div id="elements">
			<h3>Classes</h3>
			<ul>
				<li><a href="class-Sentence.html">Sentence</a></li>
				<li><a href="class-SentenceTest.html">SentenceTest</a></li>
			</ul>

			<h3>Functions</h3>
			<ul>
				<li><a href="function-Sentence_autoloader.html">Sentence_autoloader</a></li>
			</ul>
		</div>
	</div>
</div>

<!-- Empty element placed between the sidebar and the content column. NOTE(review): presumably a resize handle driven by the page's CSS/JS; confirm -->
<div id="splitter"></div>
44
45<div id="right">
46<div id="rightInner">
47	<form id="search">
48		<input type="hidden" name="cx" value="">
49		<input type="hidden" name="ie" value="UTF-8">
50		<input type="text" name="q" class="text" placeholder="Search">
51	</form>
52
53	<div id="navigation">
54		<ul>
55			<li>
56				<a href="index.html" title="Overview"><span>Overview</span></a>
57			</li>
58			<li>
59<span>Class</span>			</li>
60		</ul>
61		<ul>
62		</ul>
63		<ul>
64		</ul>
65	</div>
66
67<pre><code><span id="1" class="l"><a href="#1">  1: </a><span class="xlang">&lt;?php</span>
68</span><span id="2" class="l"><a href="#2">  2: </a>
69</span><span id="3" class="l"><a href="#3">  3: </a><span class="php-comment">/**
70</span></span><span id="4" class="l"><a href="#4">  4: </a><span class="php-comment"> * Segments sentences.
71</span></span><span id="5" class="l"><a href="#5">  5: </a><span class="php-comment"> * Clipping may not be perfect.
72</span></span><span id="6" class="l"><a href="#6">  6: </a><span class="php-comment"> * Sentence count should be VERY close to the truth.
73</span></span><span id="7" class="l"><a href="#7">  7: </a><span class="php-comment"> *
74</span></span><span id="8" class="l"><a href="#8">  8: </a><span class="php-comment"> * Multibyte safe (at least for UTF-8), but rules based on Germanic
75</span></span><span id="9" class="l"><a href="#9">  9: </a><span class="php-comment"> * language structure (English, Dutch, German). Should work for most
76</span></span><span id="10" class="l"><a href="#10"> 10: </a><span class="php-comment"> * latin-alphabet languages.
77</span></span><span id="11" class="l"><a href="#11"> 11: </a><span class="php-comment"> */</span>
78</span><span id="12" class="l"><a href="#12"> 12: </a><span class="php-keyword1">class</span> Sentence {
79</span><span id="13" class="l"><a href="#13"> 13: </a>    <span class="php-comment">/**
80</span></span><span id="14" class="l"><a href="#14"> 14: </a><span class="php-comment">     * Specify this flag with the split method to trim whitespace.
81</span></span><span id="15" class="l"><a href="#15"> 15: </a><span class="php-comment">     */</span>
82</span><span id="16" class="l"><a href="#16"> 16: </a>    <span class="php-keyword1">const</span> SPLIT_TRIM        = <span class="php-num">0x1</span>;
83</span><span id="17" class="l"><a href="#17"> 17: </a>
84</span><span id="18" class="l"><a href="#18"> 18: </a>    <span class="php-comment">/**
85</span></span><span id="19" class="l"><a href="#19"> 19: </a><span class="php-comment">     * List of characters used to terminate sentences.
86</span></span><span id="20" class="l"><a href="#20"> 20: </a><span class="php-comment">     * @var array
87</span></span><span id="21" class="l"><a href="#21"> 21: </a><span class="php-comment">     */</span>
88</span><span id="22" class="l"><a href="#22"> 22: </a>    <span class="php-keyword1">private</span> <span class="php-var">$terminals</span>      = <span class="php-keyword1">array</span>(<span class="php-quote">'.'</span>, <span class="php-quote">'!'</span>, <span class="php-quote">'?'</span>);
89</span><span id="23" class="l"><a href="#23"> 23: </a>
90</span><span id="24" class="l"><a href="#24"> 24: </a>    <span class="php-comment">/**
91</span></span><span id="25" class="l"><a href="#25"> 25: </a><span class="php-comment">     * List of characters used for abbreviations.
92</span></span><span id="26" class="l"><a href="#26"> 26: </a><span class="php-comment">     * @var array
93</span></span><span id="27" class="l"><a href="#27"> 27: </a><span class="php-comment">     */</span>
94</span><span id="28" class="l"><a href="#28"> 28: </a>    <span class="php-keyword1">private</span> <span class="php-var">$abbreviators</span>   = <span class="php-keyword1">array</span>(<span class="php-quote">'.'</span>);
95</span><span id="29" class="l"><a href="#29"> 29: </a>
96</span><span id="30" class="l"><a href="#30"> 30: </a>    <span class="php-comment">/**
97</span></span><span id="31" class="l"><a href="#31"> 31: </a><span class="php-comment">     * Multibyte safe version of standard trim() function.
98</span></span><span id="32" class="l"><a href="#32"> 32: </a><span class="php-comment">     * @param string $string
99</span></span><span id="33" class="l"><a href="#33"> 33: </a><span class="php-comment">     * @return string
100</span></span><span id="34" class="l"><a href="#34"> 34: </a><span class="php-comment">     */</span>
101</span><span id="35" class="l"><a href="#35"> 35: </a>    <span class="php-keyword1">private</span> <span class="php-keyword1">static</span> <span class="php-keyword1">function</span> mbTrim(<span class="php-var">$string</span>) {
102</span><span id="36" class="l"><a href="#36"> 36: </a>        <span class="php-keyword1">return</span> <span class="php-keyword2">mb_ereg_replace</span>(<span class="php-quote">'^\s*([\s\S]*?)\s*$'</span>, <span class="php-quote">'\1'</span>, <span class="php-var">$string</span>);
103</span><span id="37" class="l"><a href="#37"> 37: </a>    }
104</span><span id="38" class="l"><a href="#38"> 38: </a>
105</span><span id="39" class="l"><a href="#39"> 39: </a>    <span class="php-comment">/**
106</span></span><span id="40" class="l"><a href="#40"> 40: </a><span class="php-comment">     * A cross between mb_split and preg_split, adding the preg_split flags
107</span></span><span id="41" class="l"><a href="#41"> 41: </a><span class="php-comment">     * to mb_split.
108</span></span><span id="42" class="l"><a href="#42"> 42: </a><span class="php-comment">     * @param string $pattern
109</span></span><span id="43" class="l"><a href="#43"> 43: </a><span class="php-comment">     * @param string $string
110</span></span><span id="44" class="l"><a href="#44"> 44: </a><span class="php-comment">     * @param int $limit
111</span></span><span id="45" class="l"><a href="#45"> 45: </a><span class="php-comment">     * @param int $flags
112</span></span><span id="46" class="l"><a href="#46"> 46: </a><span class="php-comment">     * @return array
113</span></span><span id="47" class="l"><a href="#47"> 47: </a><span class="php-comment">     */</span>
114</span><span id="48" class="l"><a href="#48"> 48: </a>    <span class="php-keyword1">private</span> <span class="php-keyword1">static</span> <span class="php-keyword1">function</span> <span class="php-keyword2">mbSplit</span>(<span class="php-var">$pattern</span>, <span class="php-var">$string</span>, <span class="php-var">$limit</span> = -<span class="php-num">1</span>, <span class="php-var">$flags</span> = <span class="php-num">0</span>) {
115</span><span id="49" class="l"><a href="#49"> 49: </a>        <span class="php-var">$strlen</span> = <span class="php-keyword2">strlen</span>(<span class="php-var">$string</span>);      <span class="php-comment">// bytes!   </span>
116</span><span id="50" class="l"><a href="#50"> 50: </a>        <span class="php-keyword2">mb_ereg_search_init</span>(<span class="php-var">$string</span>);
117</span><span id="51" class="l"><a href="#51"> 51: </a>
118</span><span id="52" class="l"><a href="#52"> 52: </a>        <span class="php-var">$lengths</span> = <span class="php-keyword1">array</span>();
119</span><span id="53" class="l"><a href="#53"> 53: </a>        <span class="php-var">$position</span> = <span class="php-num">0</span>;
120</span><span id="54" class="l"><a href="#54"> 54: </a>        <span class="php-keyword1">while</span> ((<span class="php-var">$array</span> = <span class="php-keyword2">mb_ereg_search_pos</span>(<span class="php-var">$pattern</span>, <span class="php-quote">''</span>)) !== <span class="php-keyword1">false</span>) {
121</span><span id="55" class="l"><a href="#55"> 55: </a>            <span class="php-comment">// capture split</span>
122</span><span id="56" class="l"><a href="#56"> 56: </a>            <span class="php-var">$lengths</span>[] = <span class="php-keyword1">array</span>(<span class="php-var">$array</span>[<span class="php-num">0</span>] - <span class="php-var">$position</span>, <span class="php-keyword1">false</span>, <span class="php-keyword1">null</span>);
123</span><span id="57" class="l"><a href="#57"> 57: </a>
124</span><span id="58" class="l"><a href="#58"> 58: </a>            <span class="php-comment">// move position</span>
125</span><span id="59" class="l"><a href="#59"> 59: </a>            <span class="php-var">$position</span> = <span class="php-var">$array</span>[<span class="php-num">0</span>] + <span class="php-var">$array</span>[<span class="php-num">1</span>];
126</span><span id="60" class="l"><a href="#60"> 60: </a>
127</span><span id="61" class="l"><a href="#61"> 61: </a>            <span class="php-comment">// capture delimiter</span>
128</span><span id="62" class="l"><a href="#62"> 62: </a>            <span class="php-var">$regs</span> = <span class="php-keyword2">mb_ereg_search_getregs</span>();
129</span><span id="63" class="l"><a href="#63"> 63: </a>            <span class="php-var">$lengths</span>[] = <span class="php-keyword1">array</span>(<span class="php-var">$array</span>[<span class="php-num">1</span>], <span class="php-keyword1">true</span>, <span class="php-keyword1">isset</span>(<span class="php-var">$regs</span>[<span class="php-num">1</span>]) &amp;&amp; <span class="php-var">$regs</span>[<span class="php-num">1</span>]);
130</span><span id="64" class="l"><a href="#64"> 64: </a>
131</span><span id="65" class="l"><a href="#65"> 65: </a>            <span class="php-comment">// Continue on?</span>
132</span><span id="66" class="l"><a href="#66"> 66: </a>            <span class="php-keyword1">if</span> (<span class="php-var">$position</span> &gt;= <span class="php-var">$strlen</span>) {
133</span><span id="67" class="l"><a href="#67"> 67: </a>                <span class="php-keyword1">break</span>;
134</span><span id="68" class="l"><a href="#68"> 68: </a>            }
135</span><span id="69" class="l"><a href="#69"> 69: </a>        }
136</span><span id="70" class="l"><a href="#70"> 70: </a>
137</span><span id="71" class="l"><a href="#71"> 71: </a>        <span class="php-comment">// Add last bit, if not ending with split</span>
138</span><span id="72" class="l"><a href="#72"> 72: </a>        <span class="php-var">$lengths</span>[] = <span class="php-keyword1">array</span>(<span class="php-var">$strlen</span> - <span class="php-var">$position</span>, <span class="php-keyword1">false</span>, <span class="php-keyword1">null</span>);
139</span><span id="73" class="l"><a href="#73"> 73: </a>
140</span><span id="74" class="l"><a href="#74"> 74: </a>        <span class="php-comment">// Substrings</span>
141</span><span id="75" class="l"><a href="#75"> 75: </a>        <span class="php-var">$parts</span> = <span class="php-keyword1">array</span>();
142</span><span id="76" class="l"><a href="#76"> 76: </a>        <span class="php-var">$position</span> = <span class="php-num">0</span>;
143</span><span id="77" class="l"><a href="#77"> 77: </a>        <span class="php-var">$count</span> = <span class="php-num">1</span>;
144</span><span id="78" class="l"><a href="#78"> 78: </a>        <span class="php-keyword1">foreach</span> (<span class="php-var">$lengths</span> <span class="php-keyword1">as</span> <span class="php-var">$length</span>) {
145</span><span id="79" class="l"><a href="#79"> 79: </a>            <span class="php-var">$is_delimiter</span>   = <span class="php-var">$length</span>[<span class="php-num">1</span>];
146</span><span id="80" class="l"><a href="#80"> 80: </a>            <span class="php-var">$is_captured</span>    = <span class="php-var">$length</span>[<span class="php-num">2</span>];
147</span><span id="81" class="l"><a href="#81"> 81: </a>
148</span><span id="82" class="l"><a href="#82"> 82: </a>            <span class="php-keyword1">if</span> (<span class="php-var">$limit</span> &gt; <span class="php-num">0</span> &amp;&amp; !<span class="php-var">$is_delimiter</span> &amp;&amp; (<span class="php-var">$length</span>[<span class="php-num">0</span>] || ~<span class="php-var">$flags</span> &amp; PREG_SPLIT_NO_EMPTY) &amp;&amp; ++<span class="php-var">$count</span> &gt; <span class="php-var">$limit</span>) {
149</span><span id="83" class="l"><a href="#83"> 83: </a>                <span class="php-keyword1">if</span> (<span class="php-var">$length</span>[<span class="php-num">0</span>] &gt; <span class="php-num">0</span> || ~<span class="php-var">$flags</span> &amp; PREG_SPLIT_NO_EMPTY) {
150</span><span id="84" class="l"><a href="#84"> 84: </a>                    <span class="php-var">$parts</span>[]    = <span class="php-var">$flags</span> &amp; PREG_SPLIT_OFFSET_CAPTURE
151</span><span id="85" class="l"><a href="#85"> 85: </a>                                ? <span class="php-keyword1">array</span>(<span class="php-keyword2">mb_strcut</span>(<span class="php-var">$string</span>, <span class="php-var">$position</span>), <span class="php-var">$position</span>)
152</span><span id="86" class="l"><a href="#86"> 86: </a>                                : <span class="php-keyword2">mb_strcut</span>(<span class="php-var">$string</span>, <span class="php-var">$position</span>);
153</span><span id="87" class="l"><a href="#87"> 87: </a>                }
154</span><span id="88" class="l"><a href="#88"> 88: </a>                <span class="php-keyword1">break</span>;
155</span><span id="89" class="l"><a href="#89"> 89: </a>            } <span class="php-keyword1">elseif</span> ((!<span class="php-var">$is_delimiter</span> || (<span class="php-var">$flags</span> &amp; PREG_SPLIT_DELIM_CAPTURE &amp;&amp; <span class="php-var">$is_captured</span>))
156</span><span id="90" class="l"><a href="#90"> 90: </a>                   &amp;&amp; (<span class="php-var">$length</span>[<span class="php-num">0</span>] || ~<span class="php-var">$flags</span> &amp; PREG_SPLIT_NO_EMPTY)) {
157</span><span id="91" class="l"><a href="#91"> 91: </a>                <span class="php-var">$parts</span>[]    = <span class="php-var">$flags</span> &amp; PREG_SPLIT_OFFSET_CAPTURE
158</span><span id="92" class="l"><a href="#92"> 92: </a>                            ? <span class="php-keyword1">array</span>(<span class="php-keyword2">mb_strcut</span>(<span class="php-var">$string</span>, <span class="php-var">$position</span>, <span class="php-var">$length</span>[<span class="php-num">0</span>]), <span class="php-var">$position</span>)
159</span><span id="93" class="l"><a href="#93"> 93: </a>                            : <span class="php-keyword2">mb_strcut</span>(<span class="php-var">$string</span>, <span class="php-var">$position</span>, <span class="php-var">$length</span>[<span class="php-num">0</span>]);
160</span><span id="94" class="l"><a href="#94"> 94: </a>            }
161</span><span id="95" class="l"><a href="#95"> 95: </a>
162</span><span id="96" class="l"><a href="#96"> 96: </a>            <span class="php-var">$position</span> += <span class="php-var">$length</span>[<span class="php-num">0</span>];
163</span><span id="97" class="l"><a href="#97"> 97: </a>        }
164</span><span id="98" class="l"><a href="#98"> 98: </a>
165</span><span id="99" class="l"><a href="#99"> 99: </a>        <span class="php-keyword1">return</span> <span class="php-var">$parts</span>;
166</span><span id="100" class="l"><a href="#100">100: </a>    }
167</span><span id="101" class="l"><a href="#101">101: </a>
168</span><span id="102" class="l"><a href="#102">102: </a>    <span class="php-comment">/**
169</span></span><span id="103" class="l"><a href="#103">103: </a><span class="php-comment">     * Breaks a piece of text into lines by linebreak.
170</span></span><span id="104" class="l"><a href="#104">104: </a><span class="php-comment">     * Eats up any linebreak characters as if one.
171</span></span><span id="105" class="l"><a href="#105">105: </a><span class="php-comment">     *
172</span></span><span id="106" class="l"><a href="#106">106: </a><span class="php-comment">     * Multibyte safe
173</span></span><span id="107" class="l"><a href="#107">107: </a><span class="php-comment">     *
174</span></span><span id="108" class="l"><a href="#108">108: </a><span class="php-comment">     * @param string $text
175</span></span><span id="109" class="l"><a href="#109">109: </a><span class="php-comment">     * @return array
176</span></span><span id="110" class="l"><a href="#110">110: </a><span class="php-comment">     */</span>
177</span><span id="111" class="l"><a href="#111">111: </a>    <span class="php-keyword1">private</span> <span class="php-keyword1">static</span> <span class="php-keyword1">function</span> linebreakSplit(<span class="php-var">$text</span>) {
178</span><span id="112" class="l"><a href="#112">112: </a>        <span class="php-var">$lines</span> = <span class="php-keyword1">array</span>();
179</span><span id="113" class="l"><a href="#113">113: </a>        <span class="php-var">$line</span> = <span class="php-quote">''</span>;
180</span><span id="114" class="l"><a href="#114">114: </a>
181</span><span id="115" class="l"><a href="#115">115: </a>        <span class="php-keyword1">foreach</span> (self::<span class="php-keyword2">mbSplit</span>(<span class="php-quote">'([\r\n]+)'</span>, <span class="php-var">$text</span>, -<span class="php-num">1</span>, PREG_SPLIT_DELIM_CAPTURE) <span class="php-keyword1">as</span> <span class="php-var">$part</span>) {
182</span><span id="116" class="l"><a href="#116">116: </a>            <span class="php-var">$line</span> .= <span class="php-var">$part</span>;
183</span><span id="117" class="l"><a href="#117">117: </a>            <span class="php-keyword1">if</span> (self::mbTrim(<span class="php-var">$part</span>) === <span class="php-quote">''</span>) {
184</span><span id="118" class="l"><a href="#118">118: </a>                <span class="php-var">$lines</span>[] = <span class="php-var">$line</span>;
185</span><span id="119" class="l"><a href="#119">119: </a>                <span class="php-var">$line</span> = <span class="php-quote">''</span>;
186</span><span id="120" class="l"><a href="#120">120: </a>            }
187</span><span id="121" class="l"><a href="#121">121: </a>        }
188</span><span id="122" class="l"><a href="#122">122: </a>        <span class="php-var">$lines</span>[] = <span class="php-var">$line</span>;
189</span><span id="123" class="l"><a href="#123">123: </a>
190</span><span id="124" class="l"><a href="#124">124: </a>        <span class="php-keyword1">return</span> <span class="php-var">$lines</span>;
191</span><span id="125" class="l"><a href="#125">125: </a>    }
192</span><span id="126" class="l"><a href="#126">126: </a>
193</span><span id="127" class="l"><a href="#127">127: </a>    <span class="php-comment">/**
194</span></span><span id="128" class="l"><a href="#128">128: </a><span class="php-comment">     * Splits a line by (consecutive sequences of)
195</span></span><span id="129" class="l"><a href="#129">129: </a><span class="php-comment">     * terminals, keeping terminals.
196</span></span><span id="130" class="l"><a href="#130">130: </a><span class="php-comment">     *
197</span></span><span id="131" class="l"><a href="#131">131: </a><span class="php-comment">     * Multibyte safe (at least for UTF-8)
198</span></span><span id="132" class="l"><a href="#132">132: </a><span class="php-comment">     *
199</span></span><span id="133" class="l"><a href="#133">133: </a><span class="php-comment">     * For example:
200</span></span><span id="134" class="l"><a href="#134">134: </a><span class="php-comment">     *  &quot;There ... is. More!&quot;
201</span></span><span id="135" class="l"><a href="#135">135: </a><span class="php-comment">     *      ... becomes ...
202</span></span><span id="136" class="l"><a href="#136">136: </a><span class="php-comment">     *  [ &quot;There &quot;, &quot;...&quot;, &quot; is&quot;, &quot;.&quot;, &quot; More&quot;, &quot;!&quot; ]
203</span></span><span id="137" class="l"><a href="#137">137: </a><span class="php-comment">     *
204</span></span><span id="138" class="l"><a href="#138">138: </a><span class="php-comment">     * @param string $line
205</span></span><span id="139" class="l"><a href="#139">139: </a><span class="php-comment">     * @return array
206</span></span><span id="140" class="l"><a href="#140">140: </a><span class="php-comment">     */</span>
207</span><span id="141" class="l"><a href="#141">141: </a>    <span class="php-keyword1">private</span> <span class="php-keyword1">function</span> punctuationSplit(<span class="php-var">$line</span>) {
208</span><span id="142" class="l"><a href="#142">142: </a>        <span class="php-var">$parts</span> = <span class="php-keyword1">array</span>();
209</span><span id="143" class="l"><a href="#143">143: </a>
210</span><span id="144" class="l"><a href="#144">144: </a>        <span class="php-var">$chars</span> = <span class="php-keyword2">preg_split</span>(<span class="php-quote">'//u'</span>, <span class="php-var">$line</span>, -<span class="php-num">1</span>, PREG_SPLIT_NO_EMPTY); <span class="php-comment">// This is UTF8 multibyte safe!</span>
211</span><span id="145" class="l"><a href="#145">145: </a>        <span class="php-var">$is_terminal</span> = <span class="php-keyword2">in_array</span>(<span class="php-var">$chars</span>[<span class="php-num">0</span>], <span class="php-var">$this</span>-&gt;terminals);
212</span><span id="146" class="l"><a href="#146">146: </a>
213</span><span id="147" class="l"><a href="#147">147: </a>        <span class="php-var">$part</span> = <span class="php-quote">''</span>;
214</span><span id="148" class="l"><a href="#148">148: </a>        <span class="php-keyword1">foreach</span> (<span class="php-var">$chars</span> <span class="php-keyword1">as</span> <span class="php-var">$index</span> =&gt; <span class="php-var">$char</span>) {
215</span><span id="149" class="l"><a href="#149">149: </a>            <span class="php-keyword1">if</span> (<span class="php-keyword2">in_array</span>(<span class="php-var">$char</span>, <span class="php-var">$this</span>-&gt;terminals) !== <span class="php-var">$is_terminal</span>) {
216</span><span id="150" class="l"><a href="#150">150: </a>                <span class="php-var">$parts</span>[] = <span class="php-var">$part</span>;
217</span><span id="151" class="l"><a href="#151">151: </a>                <span class="php-var">$part</span> = <span class="php-quote">''</span>;
218</span><span id="152" class="l"><a href="#152">152: </a>                <span class="php-var">$is_terminal</span> = !<span class="php-var">$is_terminal</span>;
219</span><span id="153" class="l"><a href="#153">153: </a>            }
220</span><span id="154" class="l"><a href="#154">154: </a>            <span class="php-var">$part</span> .= <span class="php-var">$char</span>;
221</span><span id="155" class="l"><a href="#155">155: </a>        }
222</span><span id="156" class="l"><a href="#156">156: </a>
223</span><span id="157" class="l"><a href="#157">157: </a>        <span class="php-keyword1">if</span> (!<span class="php-keyword1">empty</span>(<span class="php-var">$part</span>)) {
224</span><span id="158" class="l"><a href="#158">158: </a>            <span class="php-var">$parts</span>[] = <span class="php-var">$part</span>;
225</span><span id="159" class="l"><a href="#159">159: </a>        }
226</span><span id="160" class="l"><a href="#160">160: </a>
227</span><span id="161" class="l"><a href="#161">161: </a>        <span class="php-keyword1">return</span> <span class="php-var">$parts</span>;
228</span><span id="162" class="l"><a href="#162">162: </a>    }
229</span><span id="163" class="l"><a href="#163">163: </a>
230</span><span id="164" class="l"><a href="#164">164: </a>    <span class="php-comment">/**
231</span></span><span id="165" class="l"><a href="#165">165: </a><span class="php-comment">     * Appends each terminal item after its preceding
232</span></span><span id="166" class="l"><a href="#166">166: </a><span class="php-comment">     * non-terminals.
233</span></span><span id="167" class="l"><a href="#167">167: </a><span class="php-comment">     *
234</span></span><span id="168" class="l"><a href="#168">168: </a><span class="php-comment">     * Multibyte safe (at least for UTF-8)
235</span></span><span id="169" class="l"><a href="#169">169: </a><span class="php-comment">     *
236</span></span><span id="170" class="l"><a href="#170">170: </a><span class="php-comment">     * For example:
237</span></span><span id="171" class="l"><a href="#171">171: </a><span class="php-comment">     *  [ &quot;There &quot;, &quot;...&quot;, &quot; is&quot;, &quot;.&quot;, &quot; More&quot;, &quot;!&quot; ]
238</span></span><span id="172" class="l"><a href="#172">172: </a><span class="php-comment">     *      ... becomes ...
239</span></span><span id="173" class="l"><a href="#173">173: </a><span class="php-comment">     *  [ &quot;There ... is.&quot;, &quot;More!&quot; ]
240</span></span><span id="174" class="l"><a href="#174">174: </a><span class="php-comment">     *
241</span></span><span id="175" class="l"><a href="#175">175: </a><span class="php-comment">     * @param array $punctuations
242</span></span><span id="176" class="l"><a href="#176">176: </a><span class="php-comment">     * @return array
243</span></span><span id="177" class="l"><a href="#177">177: </a><span class="php-comment">     */</span>
244</span><span id="178" class="l"><a href="#178">178: </a>    <span class="php-keyword1">private</span> <span class="php-keyword1">function</span> punctuationMerge(<span class="php-var">$punctuations</span>) {
245</span><span id="179" class="l"><a href="#179">179: </a>        <span class="php-var">$definite_terminals</span> = <span class="php-keyword2">array_diff</span>(<span class="php-var">$this</span>-&gt;terminals, <span class="php-var">$this</span>-&gt;abbreviators);
246</span><span id="180" class="l"><a href="#180">180: </a>
247</span><span id="181" class="l"><a href="#181">181: </a>        <span class="php-var">$merges</span> = <span class="php-keyword1">array</span>();
248</span><span id="182" class="l"><a href="#182">182: </a>        <span class="php-var">$merge</span> = <span class="php-quote">''</span>;
249</span><span id="183" class="l"><a href="#183">183: </a>
250</span><span id="184" class="l"><a href="#184">184: </a>        <span class="php-keyword1">foreach</span> (<span class="php-var">$punctuations</span> <span class="php-keyword1">as</span> <span class="php-var">$punctuation</span>) {
251</span><span id="185" class="l"><a href="#185">185: </a>            <span class="php-keyword1">if</span> (<span class="php-var">$punctuation</span> !== <span class="php-quote">''</span>) {
252</span><span id="186" class="l"><a href="#186">186: </a>                <span class="php-var">$merge</span>.= <span class="php-var">$punctuation</span>;
253</span><span id="187" class="l"><a href="#187">187: </a>                <span class="php-keyword1">if</span> (<span class="php-keyword2">mb_strlen</span>(<span class="php-var">$punctuation</span>) === <span class="php-num">1</span> &amp;&amp; <span class="php-keyword2">in_array</span>(<span class="php-var">$punctuation</span>, <span class="php-var">$this</span>-&gt;terminals)) {
254</span><span id="188" class="l"><a href="#188">188: </a>                    <span class="php-var">$merges</span>[] = <span class="php-var">$merge</span>;
255</span><span id="189" class="l"><a href="#189">189: </a>                    <span class="php-var">$merge</span> = <span class="php-quote">''</span>;
256</span><span id="190" class="l"><a href="#190">190: </a>                } <span class="php-keyword1">else</span> {
257</span><span id="191" class="l"><a href="#191">191: </a>                    <span class="php-keyword1">foreach</span> (<span class="php-var">$definite_terminals</span> <span class="php-keyword1">as</span> <span class="php-var">$terminal</span>) {
258</span><span id="192" class="l"><a href="#192">192: </a>                        <span class="php-keyword1">if</span> (<span class="php-keyword2">mb_strpos</span>(<span class="php-var">$punctuation</span>, <span class="php-var">$terminal</span>) !== <span class="php-keyword1">false</span>) {
259</span><span id="193" class="l"><a href="#193">193: </a>                            <span class="php-var">$merges</span>[] = <span class="php-var">$merge</span>;
260</span><span id="194" class="l"><a href="#194">194: </a>                            <span class="php-var">$merge</span> = <span class="php-quote">''</span>;
261</span><span id="195" class="l"><a href="#195">195: </a>                            <span class="php-keyword1">break</span>;
262</span><span id="196" class="l"><a href="#196">196: </a>                        }
263</span><span id="197" class="l"><a href="#197">197: </a>                    }
264</span><span id="198" class="l"><a href="#198">198: </a>                }
265</span><span id="199" class="l"><a href="#199">199: </a>            }
266</span><span id="200" class="l"><a href="#200">200: </a>        }
267</span><span id="201" class="l"><a href="#201">201: </a>        <span class="php-keyword1">if</span> (!<span class="php-keyword1">empty</span>(<span class="php-var">$merge</span>)) {
268</span><span id="202" class="l"><a href="#202">202: </a>            <span class="php-var">$merges</span>[] = <span class="php-var">$merge</span>;
269</span><span id="203" class="l"><a href="#203">203: </a>        }
270</span><span id="204" class="l"><a href="#204">204: </a>
271</span><span id="205" class="l"><a href="#205">205: </a>        <span class="php-keyword1">return</span> <span class="php-var">$merges</span>;
272</span><span id="206" class="l"><a href="#206">206: </a>    }
273</span><span id="207" class="l"><a href="#207">207: </a>
274</span><span id="208" class="l"><a href="#208">208: </a>    <span class="php-comment">/**
275</span></span><span id="209" class="l"><a href="#209">209: </a><span class="php-comment">     * Merges any one-word items with their preceding items.
276</span></span><span id="210" class="l"><a href="#210">210: </a><span class="php-comment">     *
277</span></span><span id="211" class="l"><a href="#211">211: </a><span class="php-comment">     * Multibyte safe
278</span></span><span id="212" class="l"><a href="#212">212: </a><span class="php-comment">     *
279</span></span><span id="213" class="l"><a href="#213">213: </a><span class="php-comment">     * For example:
280</span></span><span id="214" class="l"><a href="#214">214: </a><span class="php-comment">     *  [ &quot;There ... is.&quot;, &quot;More!&quot; ]
281</span></span><span id="215" class="l"><a href="#215">215: </a><span class="php-comment">     *      ... becomes ...
282</span></span><span id="216" class="l"><a href="#216">216: </a><span class="php-comment">     *  [ &quot;There ... is. More!&quot; ]
283</span></span><span id="217" class="l"><a href="#217">217: </a><span class="php-comment">     *
284</span></span><span id="218" class="l"><a href="#218">218: </a><span class="php-comment">     * @param array $fragments
285</span></span><span id="219" class="l"><a href="#219">219: </a><span class="php-comment">     * @return array
286</span></span><span id="220" class="l"><a href="#220">220: </a><span class="php-comment">     */</span>
287</span><span id="221" class="l"><a href="#221">221: </a>    <span class="php-keyword1">private</span> <span class="php-keyword1">function</span> abbreviationMerge(<span class="php-var">$fragments</span>) {
288</span><span id="222" class="l"><a href="#222">222: </a>        <span class="php-var">$non_abbreviating_terminals</span> = <span class="php-keyword2">array_diff</span>(<span class="php-var">$this</span>-&gt;terminals, <span class="php-var">$this</span>-&gt;abbreviators);
289</span><span id="223" class="l"><a href="#223">223: </a>
290</span><span id="224" class="l"><a href="#224">224: </a>        <span class="php-var">$abbreviations</span> = <span class="php-keyword1">array</span>();
291</span><span id="225" class="l"><a href="#225">225: </a>
292</span><span id="226" class="l"><a href="#226">226: </a>        <span class="php-var">$abbreviation</span> = <span class="php-quote">''</span>;
293</span><span id="227" class="l"><a href="#227">227: </a>
294</span><span id="228" class="l"><a href="#228">228: </a>        <span class="php-var">$previous_word_count</span> = <span class="php-keyword1">null</span>;
295</span><span id="229" class="l"><a href="#229">229: </a>        <span class="php-var">$previous_word_ending</span> = <span class="php-keyword1">null</span>;
296</span><span id="230" class="l"><a href="#230">230: </a>        <span class="php-keyword1">foreach</span> (<span class="php-var">$fragments</span> <span class="php-keyword1">as</span> <span class="php-var">$fragment</span>) {
297</span><span id="231" class="l"><a href="#231">231: </a>            <span class="php-var">$word_count</span> = <span class="php-keyword2">count</span>(<span class="php-keyword2">mb_split</span>(<span class="php-quote">'\s+'</span>, self::mbTrim(<span class="php-var">$fragment</span>)));
298</span><span id="232" class="l"><a href="#232">232: </a>            <span class="php-var">$starts_with_space</span> = <span class="php-keyword2">mb_ereg_match</span>(<span class="php-quote">'^\s+'</span>, <span class="php-var">$fragment</span>);
299</span><span id="233" class="l"><a href="#233">233: </a>            <span class="php-var">$after_non_abbreviating_terminal</span> = <span class="php-keyword2">in_array</span>(<span class="php-var">$previous_word_ending</span>, <span class="php-var">$non_abbreviating_terminals</span>);
300</span><span id="234" class="l"><a href="#234">234: </a>
301</span><span id="235" class="l"><a href="#235">235: </a>            <span class="php-keyword1">if</span> (<span class="php-var">$after_non_abbreviating_terminal</span> || (<span class="php-var">$previous_word_count</span> !== <span class="php-keyword1">null</span> &amp;&amp; (<span class="php-var">$previous_word_count</span> !== <span class="php-num">1</span> || <span class="php-var">$word_count</span> !== <span class="php-num">1</span>) &amp;&amp; <span class="php-var">$starts_with_space</span>)) {
302</span><span id="236" class="l"><a href="#236">236: </a>                <span class="php-var">$abbreviations</span>[] = <span class="php-var">$abbreviation</span>;
303</span><span id="237" class="l"><a href="#237">237: </a>                <span class="php-var">$abbreviation</span> = <span class="php-quote">''</span>;
304</span><span id="238" class="l"><a href="#238">238: </a>            }
305</span><span id="239" class="l"><a href="#239">239: </a>
306</span><span id="240" class="l"><a href="#240">240: </a>            <span class="php-var">$abbreviation</span>           .= <span class="php-var">$fragment</span>;
307</span><span id="241" class="l"><a href="#241">241: </a>            <span class="php-var">$previous_word_count</span>    = <span class="php-var">$word_count</span>;
308</span><span id="242" class="l"><a href="#242">242: </a>            <span class="php-var">$previous_word_ending</span>   = <span class="php-keyword2">mb_substr</span>(<span class="php-var">$fragment</span>, -<span class="php-num">1</span>);
309</span><span id="243" class="l"><a href="#243">243: </a>        }
310</span><span id="244" class="l"><a href="#244">244: </a>        <span class="php-keyword1">if</span> (<span class="php-var">$abbreviation</span> !== <span class="php-quote">''</span>) {
311</span><span id="245" class="l"><a href="#245">245: </a>            <span class="php-var">$abbreviations</span>[] = <span class="php-var">$abbreviation</span>;
312</span><span id="246" class="l"><a href="#246">246: </a>        }
313</span><span id="247" class="l"><a href="#247">247: </a>
314</span><span id="248" class="l"><a href="#248">248: </a>        <span class="php-keyword1">return</span> <span class="php-var">$abbreviations</span>;
315</span><span id="249" class="l"><a href="#249">249: </a>    }
316</span><span id="250" class="l"><a href="#250">250: </a>
317</span><span id="251" class="l"><a href="#251">251: </a>    <span class="php-comment">/**
318</span></span><span id="252" class="l"><a href="#252">252: </a><span class="php-comment">     * Merges items into larger sentences.
319</span></span><span id="253" class="l"><a href="#253">253: </a><span class="php-comment">     *
320</span></span><span id="254" class="l"><a href="#254">254: </a><span class="php-comment">     * Multibyte safe
321</span></span><span id="255" class="l"><a href="#255">255: </a><span class="php-comment">     *
322</span></span><span id="256" class="l"><a href="#256">256: </a><span class="php-comment">     * @param array $shorts
323</span></span><span id="257" class="l"><a href="#257">257: </a><span class="php-comment">     * @return array
324</span></span><span id="258" class="l"><a href="#258">258: </a><span class="php-comment">     */</span>
325</span><span id="259" class="l"><a href="#259">259: </a>    <span class="php-keyword1">private</span> <span class="php-keyword1">function</span> sentenceMerge(<span class="php-var">$shorts</span>) {
326</span><span id="260" class="l"><a href="#260">260: </a>        <span class="php-var">$non_abbreviating_terminals</span> = <span class="php-keyword2">array_diff</span>(<span class="php-var">$this</span>-&gt;terminals, <span class="php-var">$this</span>-&gt;abbreviators);
327</span><span id="261" class="l"><a href="#261">261: </a>
328</span><span id="262" class="l"><a href="#262">262: </a>        <span class="php-var">$sentences</span> = <span class="php-keyword1">array</span>();
329</span><span id="263" class="l"><a href="#263">263: </a>
330</span><span id="264" class="l"><a href="#264">264: </a>        <span class="php-var">$sentence</span> = <span class="php-quote">''</span>;
331</span><span id="265" class="l"><a href="#265">265: </a>        <span class="php-var">$has_words</span> = <span class="php-keyword1">false</span>;
332</span><span id="266" class="l"><a href="#266">266: </a>        <span class="php-var">$previous_word_ending</span> = <span class="php-keyword1">null</span>;
333</span><span id="267" class="l"><a href="#267">267: </a>        <span class="php-keyword1">foreach</span> (<span class="php-var">$shorts</span> <span class="php-keyword1">as</span> <span class="php-var">$short</span>) {
334</span><span id="268" class="l"><a href="#268">268: </a>            <span class="php-var">$word_count</span> = <span class="php-keyword2">count</span>(<span class="php-keyword2">mb_split</span>(<span class="php-quote">'\s+'</span>, self::mbTrim(<span class="php-var">$short</span>)));
335</span><span id="269" class="l"><a href="#269">269: </a>            <span class="php-var">$after_non_abbreviating_terminal</span> = <span class="php-keyword2">in_array</span>(<span class="php-var">$previous_word_ending</span>, <span class="php-var">$non_abbreviating_terminals</span>);
336</span><span id="270" class="l"><a href="#270">270: </a>
337</span><span id="271" class="l"><a href="#271">271: </a>            <span class="php-keyword1">if</span> (<span class="php-var">$after_non_abbreviating_terminal</span> || (<span class="php-var">$has_words</span> &amp;&amp; <span class="php-var">$word_count</span> &gt; <span class="php-num">1</span>)) {
338</span><span id="272" class="l"><a href="#272">272: </a>                <span class="php-var">$sentences</span>[] = <span class="php-var">$sentence</span>;
339</span><span id="273" class="l"><a href="#273">273: </a>                <span class="php-var">$sentence</span> = <span class="php-quote">''</span>;
340</span><span id="274" class="l"><a href="#274">274: </a>                <span class="php-var">$has_words</span> = <span class="php-var">$word_count</span> &gt; <span class="php-num">1</span>;
341</span><span id="275" class="l"><a href="#275">275: </a>            } <span class="php-keyword1">else</span> {
342</span><span id="276" class="l"><a href="#276">276: </a>                <span class="php-var">$has_words</span> = <span class="php-var">$has_words</span> || <span class="php-var">$word_count</span> &gt; <span class="php-num">1</span>;
343</span><span id="277" class="l"><a href="#277">277: </a>            }
344</span><span id="278" class="l"><a href="#278">278: </a>
345</span><span id="279" class="l"><a href="#279">279: </a>            <span class="php-var">$sentence</span>.= <span class="php-var">$short</span>;
346</span><span id="280" class="l"><a href="#280">280: </a>            <span class="php-var">$previous_word_ending</span> = <span class="php-keyword2">mb_substr</span>(<span class="php-var">$short</span>, -<span class="php-num">1</span>);
347</span><span id="281" class="l"><a href="#281">281: </a>        }
348</span><span id="282" class="l"><a href="#282">282: </a>        <span class="php-keyword1">if</span> (!<span class="php-keyword1">empty</span>(<span class="php-var">$sentence</span>)) {
349</span><span id="283" class="l"><a href="#283">283: </a>            <span class="php-var">$sentences</span>[] = <span class="php-var">$sentence</span>;
350</span><span id="284" class="l"><a href="#284">284: </a>        }
351</span><span id="285" class="l"><a href="#285">285: </a>
352</span><span id="286" class="l"><a href="#286">286: </a>        <span class="php-keyword1">return</span> <span class="php-var">$sentences</span>;
353</span><span id="287" class="l"><a href="#287">287: </a>    }
354</span><span id="288" class="l"><a href="#288">288: </a>
355</span><span id="289" class="l"><a href="#289">289: </a>    <span class="php-comment">/**
356</span></span><span id="290" class="l"><a href="#290">290: </a><span class="php-comment">     * Return the sentences detected in the provided text.
357</span></span><span id="291" class="l"><a href="#291">291: </a><span class="php-comment">     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
358</span></span><span id="292" class="l"><a href="#292">292: </a><span class="php-comment">     * @param string $text
359</span></span><span id="293" class="l"><a href="#293">293: </a><span class="php-comment">     * @param integer $flags
360</span></span><span id="294" class="l"><a href="#294">294: </a><span class="php-comment">     * @return array
361</span></span><span id="295" class="l"><a href="#295">295: </a><span class="php-comment">     */</span>
362</span><span id="296" class="l"><a href="#296">296: </a>    <span class="php-keyword1">public</span> <span class="php-keyword1">function</span> <span class="php-keyword2">split</span>(<span class="php-var">$text</span>, <span class="php-var">$flags</span> = <span class="php-num">0</span>) {
363</span><span id="297" class="l"><a href="#297">297: </a>        <span class="php-var">$sentences</span> = <span class="php-keyword1">array</span>();
364</span><span id="298" class="l"><a href="#298">298: </a>
365</span><span id="299" class="l"><a href="#299">299: </a>        <span class="php-comment">// Split</span>
366</span><span id="300" class="l"><a href="#300">300: </a>        <span class="php-keyword1">foreach</span> (self::linebreakSplit(<span class="php-var">$text</span>) <span class="php-keyword1">as</span> <span class="php-var">$line</span>) {
367</span><span id="301" class="l"><a href="#301">301: </a>            <span class="php-keyword1">if</span> (self::mbTrim(<span class="php-var">$line</span>) !== <span class="php-quote">''</span>) {
368</span><span id="302" class="l"><a href="#302">302: </a>                <span class="php-var">$punctuations</span>   = <span class="php-var">$this</span>-&gt;punctuationSplit(<span class="php-var">$line</span>);
369</span><span id="303" class="l"><a href="#303">303: </a>                <span class="php-var">$merges</span>         = <span class="php-var">$this</span>-&gt;punctuationMerge(<span class="php-var">$punctuations</span>);
370</span><span id="304" class="l"><a href="#304">304: </a>                <span class="php-var">$shorts</span>         = <span class="php-var">$this</span>-&gt;abbreviationMerge(<span class="php-var">$merges</span>);
371</span><span id="305" class="l"><a href="#305">305: </a>                <span class="php-var">$sentences</span>      = <span class="php-keyword2">array_merge</span>(<span class="php-var">$sentences</span>, <span class="php-var">$this</span>-&gt;sentenceMerge(<span class="php-var">$shorts</span>));
372</span><span id="306" class="l"><a href="#306">306: </a>            }
373</span><span id="307" class="l"><a href="#307">307: </a>        }
374</span><span id="308" class="l"><a href="#308">308: </a>
375</span><span id="309" class="l"><a href="#309">309: </a>        <span class="php-comment">// Post process</span>
376</span><span id="310" class="l"><a href="#310">310: </a>        <span class="php-keyword1">if</span> (<span class="php-var">$flags</span> &amp; self::SPLIT_TRIM) {
377</span><span id="311" class="l"><a href="#311">311: </a>            <span class="php-keyword1">foreach</span> (<span class="php-var">$sentences</span> <span class="php-keyword1">as</span> &amp;<span class="php-var">$sentence</span>) {
378</span><span id="312" class="l"><a href="#312">312: </a>                <span class="php-var">$sentence</span> = self::mbTrim(<span class="php-var">$sentence</span>);
379</span><span id="313" class="l"><a href="#313">313: </a>            }
380</span><span id="314" class="l"><a href="#314">314: </a>            <span class="php-keyword1">unset</span>(<span class="php-var">$sentence</span>);
381</span><span id="315" class="l"><a href="#315">315: </a>        }
382</span><span id="316" class="l"><a href="#316">316: </a>
383</span><span id="317" class="l"><a href="#317">317: </a>        <span class="php-keyword1">return</span> <span class="php-var">$sentences</span>;
384</span><span id="318" class="l"><a href="#318">318: </a>    }
385</span><span id="319" class="l"><a href="#319">319: </a>
386</span><span id="320" class="l"><a href="#320">320: </a>    <span class="php-comment">/**
387</span></span><span id="321" class="l"><a href="#321">321: </a><span class="php-comment">     * Return the number of sentences detected in the provided text.
388</span></span><span id="322" class="l"><a href="#322">322: </a><span class="php-comment">     * @param string $text
389</span></span><span id="323" class="l"><a href="#323">323: </a><span class="php-comment">     * @return integer
390</span></span><span id="324" class="l"><a href="#324">324: </a><span class="php-comment">     */</span>
391</span><span id="325" class="l"><a href="#325">325: </a>    <span class="php-keyword1">public</span> <span class="php-keyword1">function</span> <span class="php-keyword2">count</span>(<span class="php-var">$text</span>) {
392</span><span id="326" class="l"><a href="#326">326: </a>        <span class="php-keyword1">return</span> <span class="php-keyword2">count</span>(<span class="php-var">$this</span>-&gt;<span class="php-keyword2">split</span>(<span class="php-var">$text</span>));
393</span><span id="327" class="l"><a href="#327">327: </a>    }
394</span><span id="328" class="l"><a href="#328">328: </a>}</span></code></pre>
395
396	<div id="footer">
397		phpSentence API documentation generated by <a href="http://apigen.org">ApiGen</a>
398	</div>
399</div>
400</div>
401<script src="resources/combined.js?dc3592a696e654c132a2cb2ca318def0ec6c3f17"></script>
402<script src="elementlist.js?94082770cba9dfa8d9d0c03634ee64ddac29c138"></script>
403</body>
404</html>
405