1<?php 2 3/** 4 * This file is part of PHP K-Means 5 * 6 * Copyright (c) 2014 Benjamin Delespierre 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a copy 9 * of this software and associated documentation files (the "Software"), to deal 10 * in the Software without restriction, including without limitation the rights 11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 * copies of the Software, and to permit persons to whom the Software is furnished 13 * to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice shall be included in all 16 * copies or substantial portions of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 * THE SOFTWARE. 25 */ 26 27namespace KMeans; 28 29class Space extends \SplObjectStorage 30{ 31 protected $dimention; 32 33 protected static $rng = 'mt_rand'; 34 35 public function __construct($dimention) 36 { 37 if ($dimention < 1) { 38 throw new \LogicException("a space dimention cannot be null or negative"); 39 } 40 41 $this->dimention = $dimention; 42 } 43 44 public static function setRng(callable $fn): void 45 { 46 static::$rng = $fn; 47 } 48 49 public function toArray(): array 50 { 51 $points = []; 52 foreach ($this as $point) { 53 $points[] = $point->toArray(); 54 } 55 56 return ['points' => $points]; 57 } 58 59 public function newPoint(array $coordinates): Point 60 { 61 if (count($coordinates) != $this->dimention) { 62 throw new \LogicException("(" . implode(',', $coordinates) . ") is not a point of this space"); 63 } 64 65 return new Point($this, $coordinates); 66 } 67 68 public function addPoint(array $coordinates, $data = null): Point 69 { 70 $this->attach($point = $this->newPoint($coordinates), $data); 71 72 return $point; 73 } 74 75 public function attach($point, $data = null): void 76 { 77 if (!$point instanceof Point) { 78 throw new \InvalidArgumentException("can only attach points to spaces"); 79 } 80 81 parent::attach($point, $data); 82 } 83 84 public function getDimention(): int 85 { 86 return $this->dimention; 87 } 88 89 public function getBoundaries(): array 90 { 91 if (!count($this)) { 92 return []; 93 } 94 95 $min = $this->newPoint(array_fill(0, $this->dimention, null)); 96 $max = $this->newPoint(array_fill(0, $this->dimention, null)); 97 98 foreach ($this as $point) { 99 for ($n = 0; $n < $this->dimention; $n++) { 100 if ($min[$n] === null || $min[$n] > $point[$n]) { 101 $min[$n] = $point[$n]; 102 } 103 104 if ($max[$n] === null || $max[$n] < $point[$n]) { 105 $max[$n] = $point[$n]; 106 } 107 } 108 } 109 110 return [$min, $max]; 111 } 112 113 public function getRandomPoint(Point $min, Point $max): Point 114 { 115 $point = $this->newPoint(array_fill(0, $this->dimention, null)); 116 $rng = static::$rng; 117 118 for ($n = 0; $n < $this->dimention; $n++) { 119 $point[$n] = $rng($min[$n], $max[$n]); 120 } 121 122 return $point; 123 } 124 125 public function solve(int $nbClusters, callable $iterationCallback = null, $initMethod = Cluster::INIT_RANDOM): array 126 { 127 // initialize K clusters 128 $clusters = $this->initializeClusters($nbClusters, $initMethod); 129 130 // there's only one cluster, clusterization has no meaning 131 if (count($clusters) == 1) { 132 return $clusters; 133 } 134 135 // until convergence is reached 136 do { 137 if ($iterationCallback) { 138 $iterationCallback($this, $clusters); 139 } 140 } while ($this->iterate($clusters)); 141 142 // clustering is done. 143 return $clusters; 144 } 145 146 protected function initializeClusters(int $nbClusters, int $initMethod): array 147 { 148 if ($nbClusters <= 0) { 149 throw new \InvalidArgumentException("invalid clusters number"); 150 } 151 152 switch ($initMethod) { 153 case Cluster::INIT_RANDOM: 154 $clusters = $this->initializeRandomClusters($nbClusters); 155 156 break; 157 158 case Cluster::INIT_KMEANS_PLUS_PLUS: 159 $clusters = $this->initializeKmeansPlusPlusClusters($nbClusters); 160 161 break; 162 163 default: 164 return []; 165 } 166 167 // assign all points to the first cluster 168 $clusters[0]->attachAll($this); 169 170 return $clusters; 171 } 172 173 protected function initializeKmeansPlusPlusClusters(int $nbClusters): array 174 { 175 $clusters = []; 176 $clusters[] = new Cluster($this, $this->current()->getCoordinates()); 177 178 for ($i = 1; $i < $nbClusters; ++$i) { 179 $sum = 0; 180 $distances = []; 181 foreach ($this as $point) { 182 $distance = $point->getDistanceWith($point->getClosest($clusters), false); 183 $distances[] = $distance; 184 $sum += $distance; 185 } 186 187 $probabilities = []; 188 foreach ($distances as $distance) { 189 $probabilities[] = $distance / $sum; 190 } 191 192 $cumulativeProbabilities = array_reduce($probabilities, function ($c, $i) { 193 $c[] = end($c) + $i; 194 return $c; 195 }, []); 196 197 $rng = static::$rng; 198 $rand = $rng() / mt_getrandmax(); 199 foreach ($cumulativeProbabilities as $j => $cumulativeProbability) { 200 if ($rand < $cumulativeProbability) { 201 foreach ($this as $key => $value) { 202 if ($j == $key) { 203 $clusters[] = new Cluster($this, $value->getCoordinates()); 204 break; 205 } 206 } 207 break; 208 } 209 } 210 } 211 212 return $clusters; 213 } 214 215 protected function initializeRandomClusters(int $nbClusters): array 216 { 217 $clusters = []; 218 219 // get the space boundaries to avoid placing clusters centroid too far from points 220 list($min, $max) = $this->getBoundaries(); 221 222 // initialize N clusters with a random point within space boundaries 223 for ($n = 0; $n < $nbClusters; $n++) { 224 $clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates()); 225 } 226 return $clusters; 227 } 228 229 protected function iterate(array $clusters): bool 230 { 231 $continue = false; 232 233 // migration storages 234 $attach = new \SplObjectStorage(); 235 $detach = new \SplObjectStorage(); 236 237 // calculate proximity amongst points and clusters 238 foreach ($clusters as $cluster) { 239 foreach ($cluster as $point) { 240 // find the closest cluster 241 $closest = $point->getClosest($clusters); 242 243 // move the point from its old cluster to its closest 244 if ($closest !== $cluster) { 245 if (! isset($attach[$closest])) { 246 $attach[$closest] = new \SplObjectStorage(); 247 } 248 249 if (! isset($detach[$cluster])) { 250 $detach[$cluster] = new \SplObjectStorage(); 251 } 252 253 $attach[$closest]->attach($point); 254 $detach[$cluster]->attach($point); 255 256 $continue = true; 257 } 258 } 259 } 260 261 // perform points migrations 262 foreach ($attach as $cluster) { 263 $cluster->attachAll($attach[$cluster]); 264 } 265 266 foreach ($detach as $cluster) { 267 $cluster->detachAll($detach[$cluster]); 268 } 269 270 // update all cluster's centroids 271 foreach ($clusters as $cluster) { 272 $cluster->updateCentroid(); 273 } 274 275 return $continue; 276 } 277} 278