<?php

namespace dokuwiki\plugin\cachestats;

use InvalidArgumentException;
use RecursiveDirectoryIterator;
use RecursiveIteratorIterator;
use SplFileInfo;

/**
 * Class FileStatistics
 *
 * Recursively scans a directory and collects:
 *  - number of files per file extension
 *  - duplicate files (based on MD5 checksum) per file extension
 *  - size of files summed up per extension
 *  - number of files per extension grouped by last modified date
 *  - total number of files
 *  - total size of all files
 *
 * Extensions are lower-cased; files without an extension are reported under
 * the key 'no_extension'.
 */
class FileStatistics
{
    /** Bucket name used for files that have no extension at all. */
    private const NO_EXTENSION = 'no_extension';

    /** Number of seconds in one day, the unit of all age thresholds. */
    private const SECONDS_PER_DAY = 86400;

    /** Pristine result structure; every call to collect() starts from this. */
    private const EMPTY_STATS = [
        'extensions' => [],
        'duplicates' => [],
        'sizes' => [],
        'modified_groups' => [],
        'total_files' => 0,
        'total_size' => 0,
    ];

    /** @var string directory to scan, without a trailing separator (except for the root) */
    private string $path;

    /** @var array statistics gathered by the most recent collect() run */
    private array $stats = self::EMPTY_STATS;

    /** @var array md5 => ['ext' => extension of the first file seen, 'count' => occurrences] */
    private array $hashMap = [];

    /**
     * @param string $path directory to scan recursively
     * @throws InvalidArgumentException when $path is not a directory
     */
    public function __construct(string $path)
    {
        if (!is_dir($path)) {
            throw new InvalidArgumentException("Path '$path' is not a valid directory.");
        }

        $trimmed = rtrim($path, DIRECTORY_SEPARATOR);

        // A path made only of separators is the filesystem root; trimming it
        // would leave an empty string that the directory iterator rejects.
        $this->path = $trimmed === '' ? DIRECTORY_SEPARATOR : $trimmed;
    }

    /**
     * Scan the directory tree and return the gathered statistics.
     *
     * State is reset on every call, so repeated calls report a fresh scan
     * instead of adding to the previous one.
     *
     * @return array with the keys 'extensions', 'duplicates', 'sizes',
     *               'modified_groups', 'total_files' and 'total_size'
     */
    public function collect(): array
    {
        $this->stats = self::EMPTY_STATS;
        $this->hashMap = [];

        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($this->path, RecursiveDirectoryIterator::SKIP_DOTS)
        );

        $now = time();

        foreach ($iterator as $fileInfo) {
            /** @var SplFileInfo $fileInfo */
            if ($fileInfo->isFile()) {
                $this->recordFile($fileInfo, $now);
            }
        }

        $this->summarizeDuplicates();

        return $this->stats;
    }

    /**
     * Add a single file to all running counters.
     *
     * @param SplFileInfo $fileInfo the file to account for
     * @param int $now reference timestamp used to compute the file's age
     */
    private function recordFile(SplFileInfo $fileInfo, int $now): void
    {
        $ext = $this->extensionKey($fileInfo);
        $size = $fileInfo->getSize();
        $group = $this->getModifiedGroup($now - $fileInfo->getMTime());

        $this->stats['total_files']++;
        $this->stats['total_size'] += $size;

        // size and count aggregated per extension
        $this->stats['sizes'][$ext] = ($this->stats['sizes'][$ext] ?? 0) + $size;
        $this->stats['extensions'][$ext] = ($this->stats['extensions'][$ext] ?? 0) + 1;

        // count per extension and age group
        $this->stats['modified_groups'][$ext][$group] =
            ($this->stats['modified_groups'][$ext][$group] ?? 0) + 1;

        $this->recordChecksum($fileInfo->getPathname(), $ext);
    }

    /**
     * Determine the statistics key for a file's extension.
     *
     * Only a truly empty extension maps to 'no_extension'; a plain truthiness
     * test would also swallow the valid extension "0".
     *
     * @param SplFileInfo $fileInfo
     * @return string lower-cased extension or 'no_extension'
     */
    private function extensionKey(SplFileInfo $fileInfo): string
    {
        $ext = strtolower($fileInfo->getExtension());

        return $ext === '' ? self::NO_EXTENSION : $ext;
    }

    /**
     * Remember a file's checksum for the duplicate detection.
     *
     * Files that cannot be hashed are left out: md5_file() returns false for
     * them, and using that as an array key would merge all such files into
     * one bogus group of "duplicates".
     *
     * @param string $path full path of the file
     * @param string $ext extension key of the file
     */
    private function recordChecksum(string $path, string $ext): void
    {
        $md5 = is_readable($path) ? md5_file($path) : false;
        if ($md5 === false) {
            return;
        }

        if (isset($this->hashMap[$md5])) {
            $this->hashMap[$md5]['count']++;
        } else {
            // duplicates are attributed to the extension of the first file seen
            $this->hashMap[$md5] = ['ext' => $ext, 'count' => 1];
        }
    }

    /**
     * Turn the checksum map into per-extension duplicate counts.
     *
     * For every checksum seen more than once, all occurrences but the first
     * count as duplicates.
     */
    private function summarizeDuplicates(): void
    {
        foreach ($this->hashMap as $info) {
            if ($info['count'] > 1) {
                $this->stats['duplicates'][$info['ext']] =
                    ($this->stats['duplicates'][$info['ext']] ?? 0) + ($info['count'] - 1);
            }
        }
    }

    /**
     * Map a file age to its modification group label.
     *
     * @param int $ageSeconds seconds elapsed since the last modification
     * @return string one of '<1d', '<1w', '<1m', '<3m', '<6m', '<1y', '>1y'
     */
    private function getModifiedGroup(int $ageSeconds): string
    {
        $day = self::SECONDS_PER_DAY;

        return match (true) {
            $ageSeconds < $day => '<1d',
            $ageSeconds < 7 * $day => '<1w',
            $ageSeconds < 30 * $day => '<1m',
            $ageSeconds < 90 * $day => '<3m',
            $ageSeconds < 180 * $day => '<6m',
            $ageSeconds < 365 * $day => '<1y',
            default => '>1y',
        };
    }
}