xref: /plugin/cachestats/FileStatistics.php (revision c180bf5ffae7ba7408a7d74178df97480b848636)
1a3092f6cSAndreas Gohr<?php
2a3092f6cSAndreas Gohr
3a3092f6cSAndreas Gohrnamespace dokuwiki\plugin\cachestats;
4a3092f6cSAndreas Gohr
5a3092f6cSAndreas Gohruse InvalidArgumentException;
6a3092f6cSAndreas Gohruse RecursiveDirectoryIterator;
7a3092f6cSAndreas Gohruse RecursiveIteratorIterator;
8a3092f6cSAndreas Gohruse SplFileInfo;
9a3092f6cSAndreas Gohr
/**
 * Recursively scans a directory and builds cache statistics keyed by extension.
 *
 * For every extension the result holds the number of files ('count'), their
 * summed size in bytes ('size'), the number of redundant copies detected by
 * MD5 checksum ('dups') and one counter per age bucket listed in BUCKETS.
 */
class FileStatistics
{
    /** Directory to scan, stored without a trailing directory separator */
    private string $path;

    /** Age bucket labels, ordered from most recently modified to oldest */
    private const BUCKETS = ['<1d', '<1w', '<1m', '<3m', '<6m', '<1y', '>1y'];

    /** @var array<string, array<string, int>> Statistics of the current scan, keyed by extension */
    private array $result = [];

    /** @var array<string, array{ext: string, count: int}> Files seen per MD5 checksum */
    private array $hashMap = [];

    /**
     * @param string $path Absolute path to the cache directory
     * @throws InvalidArgumentException if $path is not a directory
     */
    public function __construct(string $path)
    {
        if (!is_dir($path)) {
            throw new InvalidArgumentException("Path '$path' is not a valid directory.");
        }

        // Stripping the separator from the filesystem root would leave an empty
        // string, which the directory iterator rejects; keep such a path as given.
        $trimmed = rtrim($path, DIRECTORY_SEPARATOR);
        $this->path = $trimmed === '' ? $path : $trimmed;
    }

    /**
     * Walk the directory tree and return statistics keyed by extension.
     *
     * Every call performs a fresh scan; results of earlier calls are discarded.
     * Files without an extension are reported under 'no_extension'. Note that
     * PHP converts purely numeric extensions (e.g. "0") into integer array keys.
     *
     * Files that vanish while the scan is running are skipped. Files whose
     * content cannot be read are still counted, but do not take part in
     * duplicate detection.
     *
     * @return array<string, array<string, int>>
     */
    public function collect(): array
    {
        // Start from a clean slate so repeated calls do not add up.
        $this->result = [];
        $this->hashMap = [];

        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($this->path, RecursiveDirectoryIterator::SKIP_DOTS)
        );

        $now = time();

        foreach ($iterator as $fileInfo) {
            /** @var SplFileInfo $fileInfo */
            if (!$fileInfo->isFile()) {
                continue;
            }

            try {
                $size = $fileInfo->getSize();
                $mtime = $fileInfo->getMTime();
            } catch (\RuntimeException $e) {
                // The file was removed between listing and stat; ignore it.
                continue;
            }

            // Compare against '' explicitly: the extension "0" is falsy in PHP
            // and must not be mistaken for a missing extension.
            $ext = strtolower($fileInfo->getExtension());
            if ($ext === '') {
                $ext = 'no_extension';
            }

            $this->initExtension($ext);

            $this->result[$ext]['count']++;
            $this->result[$ext]['size'] += $size;

            // group by modified time
            $group = $this->getModifiedGroup($now - $mtime);
            $this->result[$ext][$group]++;

            // handle duplicates by checksum; md5_file() returns false on failure
            // and false must never be used as a key of the hash map
            $md5 = $fileInfo->isReadable() ? md5_file($fileInfo->getPathname()) : false;
            if ($md5 === false) {
                continue;
            }

            if (isset($this->hashMap[$md5])) {
                $this->hashMap[$md5]['count']++;
            } else {
                $this->hashMap[$md5] = ['ext' => $ext, 'count' => 1];
            }
        }

        // summarize duplicates: every file beyond the first one sharing a checksum
        // is a duplicate, attributed to the extension of the first file seen
        foreach ($this->hashMap as $info) {
            if ($info['count'] > 1) {
                $ext = $info['ext'];
                $this->initExtension($ext);
                $this->result[$ext]['dups'] += $info['count'] - 1;
            }
        }

        return $this->result;
    }

    /**
     * Map file age to a human-friendly bucket label.
     *
     * Months are approximated as 30 days and a year as 365 days. Negative ages
     * (modification time in the future) end up in the '<1d' bucket.
     *
     * @param int $ageSeconds Age in seconds since last modification
     * @return string One of the labels in BUCKETS
     */
    private function getModifiedGroup(int $ageSeconds): string
    {
        $day = 86400;
        return match (true) {
            $ageSeconds < $day => '<1d',
            $ageSeconds < 7 * $day => '<1w',
            $ageSeconds < 30 * $day => '<1m',
            $ageSeconds < 90 * $day => '<3m',
            $ageSeconds < 180 * $day => '<6m',
            $ageSeconds < 365 * $day => '<1y',
            default => '>1y',
        };
    }

    /**
     * Ensure an extension has all expected keys initialized.
     *
     * Does nothing when the extension already has an entry, so existing
     * counters are never reset.
     *
     * @param string $ext Lowercased file extension (or 'no_extension')
     */
    private function initExtension(string $ext): void
    {
        if (isset($this->result[$ext])) {
            return;
        }

        $this->result[$ext] = [
            'count' => 0,
            'size' => 0,
            'dups' => 0,
        ];
        foreach (self::BUCKETS as $bucket) {
            $this->result[$ext][$bucket] = 0;
        }
    }
}
130