<?php

namespace dokuwiki\plugin\cachestats;

use InvalidArgumentException;
use RecursiveDirectoryIterator;
use RecursiveIteratorIterator;
use RuntimeException;
use SplFileInfo;

/**
 * Class FileStatistics
 *
 * Recursively scans a directory and collects, per (lower-cased) file extension:
 * - number of files
 * - summed file size in bytes
 * - number of duplicate files (based on MD5 checksum of the file contents)
 * - number of files grouped by age of the last modification (see BUCKETS)
 *
 * Files without an extension are reported under the key 'no_extension'.
 */
class FileStatistics
{
    /** Directory to scan, without trailing directory separator (except for the root directory) */
    private string $path;

    /** Age bucket keys present in every per-extension result row, youngest first */
    private const BUCKETS = ['<1d', '<1w', '<1m', '<3m', '<6m', '<1y', '>1y'];

    /** @var array<int|string, array<string, int>> extension => ['count', 'size', 'dups', ...BUCKETS] */
    private array $result = [];

    /** @var array<string, array{ext: string, count: int}> md5 => extension of first file seen + number of files */
    private array $hashMap = [];

    /**
     * @param string $path Directory to scan
     * @throws InvalidArgumentException when $path is not a directory
     */
    public function __construct(string $path)
    {
        if (!is_dir($path)) {
            throw new InvalidArgumentException("Path '$path' is not a valid directory.");
        }

        // a path consisting only of separators (the root directory) must not be trimmed to ''
        $trimmed = rtrim($path, DIRECTORY_SEPARATOR);
        $this->path = ($trimmed === '') ? $path : $trimmed;
    }

    /**
     * Scan the directory and build the statistics
     *
     * Each call starts from scratch, so it is safe to call this more than once on the same instance.
     *
     * Duplicates are detected across the whole tree by checksum only; the surplus copies are
     * attributed to the extension of the first file that was seen with that checksum.
     *
     * @return array<int|string, array<string, int>> extension => ['count', 'size', 'dups', ...BUCKETS]
     */
    public function collect(): array
    {
        // reset state, otherwise a second call would add to the previous run's numbers
        $this->result = [];
        $this->hashMap = [];

        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($this->path, RecursiveDirectoryIterator::SKIP_DOTS)
        );

        $now = time();

        foreach ($iterator as $fileInfo) {
            /** @var SplFileInfo $fileInfo */
            if (!$fileInfo->isFile()) {
                continue;
            }

            // the file may vanish between isFile() and the stat calls; skip it instead of aborting
            try {
                $size = $fileInfo->getSize();
                $mtime = $fileInfo->getMTime();
            } catch (RuntimeException $e) {
                continue;
            }
            if ($size === false || $mtime === false) {
                continue;
            }

            // compare against '' explicitly: the extension "0" is falsy but still a real extension
            $ext = strtolower($fileInfo->getExtension());
            if ($ext === '') {
                $ext = 'no_extension';
            }

            $this->initExtension($ext);

            $this->result[$ext]['count']++;
            $this->result[$ext]['size'] += $size;

            // group by modified time
            $group = $this->getModifiedGroup($now - $mtime);
            $this->result[$ext][$group]++;

            // handle duplicates by checksum
            $this->trackChecksum($fileInfo->getPathname(), $ext);
        }

        // summarize duplicates: every file beyond the first one with the same checksum is a duplicate
        foreach ($this->hashMap as $info) {
            if ($info['count'] > 1) {
                $ext = $info['ext'];
                $this->initExtension($ext);
                $this->result[$ext]['dups'] += $info['count'] - 1;
            }
        }

        return $this->result;
    }

    /**
     * Remember the checksum of a file for the duplicate detection
     *
     * Files that cannot be read are ignored here. Without this check the boolean false returned by
     * md5_file() would be used as array key and all unreadable files would count as duplicates
     * of each other.
     *
     * @param string $file Full path of the file
     * @param string $ext Extension the file has been counted under
     */
    private function trackChecksum(string $file, string $ext): void
    {
        $md5 = is_readable($file) ? md5_file($file) : false;
        if ($md5 === false) {
            return;
        }

        if (isset($this->hashMap[$md5])) {
            $this->hashMap[$md5]['count']++;
        } else {
            $this->hashMap[$md5] = ['ext' => $ext, 'count' => 1];
        }
    }

    /**
     * Map a file age to one of the BUCKETS keys
     *
     * Negative ages (modification time in the future) end up in the youngest bucket.
     *
     * @param int $ageSeconds Seconds since the last modification
     * @return string One of self::BUCKETS
     */
    private function getModifiedGroup(int $ageSeconds): string
    {
        $day = 86400;
        return match (true) {
            $ageSeconds < $day => '<1d',
            $ageSeconds < 7 * $day => '<1w',
            $ageSeconds < 30 * $day => '<1m',
            $ageSeconds < 90 * $day => '<3m',
            $ageSeconds < 180 * $day => '<6m',
            $ageSeconds < 365 * $day => '<1y',
            default => '>1y',
        };
    }

    /**
     * Make sure a zeroed result row exists for the given extension
     *
     * @param string $ext Lower-cased extension or 'no_extension'
     */
    private function initExtension(string $ext): void
    {
        if (isset($this->result[$ext])) {
            return;
        }

        $this->result[$ext] = [
            'count' => 0,
            'size' => 0,
            'dups' => 0,
        ];
        foreach (self::BUCKETS as $bucket) {
            $this->result[$ext][$bucket] = 0;
        }
    }
}