xref: /plugin/cachestats/FileStatistics.php (revision c180bf5ffae7ba7408a7d74178df97480b848636)
1<?php
2
3namespace dokuwiki\plugin\cachestats;
4
5use InvalidArgumentException;
6use RecursiveDirectoryIterator;
7use RecursiveIteratorIterator;
8use SplFileInfo;
9
/**
 * Recursively scans a directory and builds cache statistics keyed by extension.
 *
 * For every (lowercased) extension the result holds the file count, the summed
 * file size, the number of duplicate files (detected by MD5 checksum) and one
 * counter per age bucket.
 */
class FileStatistics
{
    /** Directory that gets scanned, stored without a trailing directory separator. */
    private string $path;

    /** Age bucket labels, ordered from most recently modified to oldest. */
    private const BUCKETS = ['<1d', '<1w', '<1m', '<3m', '<6m', '<1y', '>1y'];

    /** Result key used for files whose name has no extension. */
    private const NO_EXTENSION = 'no_extension';

    /** @var array<int|string, array<string, int>> statistics of the current scan, keyed by extension */
    private array $result = [];

    /** @var array<string, array{ext: string, count: int}> md5 => extension of the first file seen and occurrence count */
    private array $hashMap = [];

    /**
     * @param string $path Absolute path to the cache directory
     *
     * @throws InvalidArgumentException if $path is not a directory
     */
    public function __construct(string $path)
    {
        if (!is_dir($path)) {
            throw new InvalidArgumentException("Path '$path' is not a valid directory.");
        }

        // Trimming the filesystem root ("/") would leave an empty string, which is
        // not a usable path anymore; keep the original value in that case.
        $trimmed = rtrim($path, DIRECTORY_SEPARATOR);
        $this->path = $trimmed === '' ? $path : $trimmed;
    }

    /**
     * Walk the directory tree and return statistics keyed by extension.
     *
     * Every call performs a fresh scan; results of earlier calls are discarded.
     * Each entry contains the keys 'count', 'size', 'dups' and one key per
     * label in self::BUCKETS. Duplicates are attributed to the extension of
     * the first file encountered with a given checksum.
     *
     * @return array<int|string, array<string, int>>
     */
    public function collect(): array
    {
        // Start from a clean slate so repeated calls do not add up earlier scans
        // (which would also make every file look like a duplicate of itself).
        $this->result = [];
        $this->hashMap = [];

        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($this->path, RecursiveDirectoryIterator::SKIP_DOTS)
        );

        $now = time();

        foreach ($iterator as $fileInfo) {
            /** @var SplFileInfo $fileInfo */
            if (!$fileInfo->isFile()) {
                continue;
            }

            // Compare against '' explicitly: a truthiness test would misfile the
            // perfectly valid extension "0" as "no extension".
            $ext = strtolower($fileInfo->getExtension());
            if ($ext === '') {
                $ext = self::NO_EXTENSION;
            }

            $path = $fileInfo->getPathname();

            $this->initExtension($ext);

            $this->result[$ext]['count']++;
            $this->result[$ext]['size'] += $fileInfo->getSize();

            // group by modified time
            $group = $this->getModifiedGroup($now - $fileInfo->getMTime());
            $this->result[$ext][$group]++;

            // handle duplicates by checksum
            $this->recordChecksum($path, $ext);
        }

        // summarize duplicates: every occurrence beyond the first one is a duplicate
        foreach ($this->hashMap as $info) {
            if ($info['count'] > 1) {
                $this->result[$info['ext']]['dups'] += $info['count'] - 1;
            }
        }

        return $this->result;
    }

    /**
     * Register the MD5 checksum of a file for duplicate detection.
     *
     * Files whose checksum cannot be computed are left out of duplicate
     * detection; otherwise the failure value would be used as a shared array
     * key and all such files would be reported as duplicates of each other.
     *
     * @param string $path Full path of the file to hash
     * @param string $ext Extension key the file was counted under
     */
    private function recordChecksum(string $path, string $ext): void
    {
        $md5 = is_readable($path) ? md5_file($path) : false;
        if ($md5 === false) {
            return;
        }

        if (isset($this->hashMap[$md5])) {
            $this->hashMap[$md5]['count']++;
        } else {
            $this->hashMap[$md5] = ['ext' => $ext, 'count' => 1];
        }
    }

    /**
     * Map file age to a human-friendly bucket label.
     *
     * Negative ages (modification time in the future) end up in '<1d'.
     *
     * @param int $ageSeconds Age in seconds since last modification
     * @return string One of the labels in self::BUCKETS
     */
    private function getModifiedGroup(int $ageSeconds): string
    {
        $day = 86400;
        return match (true) {
            $ageSeconds < $day => '<1d',
            $ageSeconds < 7 * $day => '<1w',
            $ageSeconds < 30 * $day => '<1m',
            $ageSeconds < 90 * $day => '<3m',
            $ageSeconds < 180 * $day => '<6m',
            $ageSeconds < 365 * $day => '<1y',
            default => '>1y',
        };
    }

    /**
     * Ensure an extension has all expected keys initialized.
     *
     * Does nothing when the extension already has an entry, so existing
     * counters are never reset.
     *
     * @param string $ext Lowercased file extension (or 'no_extension')
     */
    private function initExtension(string $ext): void
    {
        if (isset($this->result[$ext])) {
            return;
        }

        $this->result[$ext] = [
            'count' => 0,
            'size' => 0,
            'dups' => 0,
        ];
        foreach (self::BUCKETS as $bucket) {
            $this->result[$ext][$bucket] = 0;
        }
    }
}
129}
130