xref: /plugin/cachestats/FileStatistics.php (revision c25debc6bd808a17b10cab0771c0153188c1e6cf)
1<?php
2
3namespace dokuwiki\plugin\cachestats;
4
5use InvalidArgumentException;
6use RecursiveDirectoryIterator;
7use RecursiveIteratorIterator;
8use SplFileInfo;
9
/**
 * Class FileStatistics
 *
 * Recursively scans a directory and collects, per lower-cased file extension:
 *  - number of files
 *  - duplicate files (based on MD5 checksum)
 *  - summed size of the files
 *  - number of files grouped by the age of their last modification
 *
 * The total number of files and the total size of all files are tracked
 * internally as well, but are not part of the array returned by collect().
 */
class FileStatistics
{
    /** Result key used for files that have no extension at all */
    private const NO_EXTENSION = 'no_extension';

    /** Pristine accumulator state; collect() resets to this before every scan */
    private const EMPTY_STATS = [
        'extensions' => [],
        'duplicates' => [],
        'sizes' => [],
        'modified_groups' => [],
        'total_files' => 0,
        'total_size' => 0,
    ];

    /** Directory that is scanned by collect() */
    private string $path;

    /** @var string[] age bucket labels, in the order they appear in each result row */
    private array $buckets = ['<1d', '<1w', '<1m', '<3m', '<6m', '<1y', '>1y'];

    /** @var array accumulators filled while scanning, see EMPTY_STATS for the layout */
    private array $stats = self::EMPTY_STATS;

    /** @var array<string, array{ext: string, count: int}> md5 => first seen extension and number of files */
    private array $hashMap = [];

    /**
     * @param string $path directory to scan
     * @throws InvalidArgumentException when $path is not a directory
     */
    public function __construct(string $path)
    {
        if (!is_dir($path)) {
            throw new InvalidArgumentException("Path '$path' is not a valid directory.");
        }

        // Stripping the trailing separator must not turn the root directory into ''
        $trimmed = rtrim($path, DIRECTORY_SEPARATOR);
        $this->path = $trimmed === '' ? $path : $trimmed;
    }

    /**
     * Scan the directory recursively and return the statistics per extension
     *
     * Every call starts from scratch, so calling it repeatedly on the same
     * instance yields fresh numbers instead of accumulating earlier scans.
     *
     * @return array[] one row per extension with the keys 'count', 'size', 'dups'
     *                 and one key per age bucket ('<1d', '<1w', ... '>1y')
     */
    public function collect(): array
    {
        // reset the accumulators, otherwise a second call would double all numbers
        $this->stats = self::EMPTY_STATS;
        $this->hashMap = [];

        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($this->path, RecursiveDirectoryIterator::SKIP_DOTS)
        );

        $now = time();

        foreach ($iterator as $fileInfo) {
            /** @var SplFileInfo $fileInfo */
            if (!$fileInfo->isFile()) {
                continue;
            }

            // Compare against '' explicitly: the extension "0" is falsy in PHP and
            // would otherwise be mistaken for a missing extension
            $ext = strtolower($fileInfo->getExtension());
            if ($ext === '') {
                $ext = self::NO_EXTENSION;
            }
            $size = $fileInfo->getSize();
            $mtime = $fileInfo->getMTime();

            $this->stats['total_files']++;
            $this->stats['total_size'] += $size;

            // size aggregated per extension
            $this->stats['sizes'][$ext] = ($this->stats['sizes'][$ext] ?? 0) + $size;

            // count per extension
            $this->stats['extensions'][$ext] = ($this->stats['extensions'][$ext] ?? 0) + 1;

            // group by modified time
            $group = $this->getModifiedGroup($now - $mtime);
            $this->stats['modified_groups'][$ext][$group] =
                ($this->stats['modified_groups'][$ext][$group] ?? 0) + 1;

            // handle duplicates by checksum
            // md5_file() returns false on failure; using that as an array key would lump
            // all unreadable files together and report them as duplicates of each other
            $md5 = $fileInfo->isReadable() ? md5_file($fileInfo->getPathname()) : false;
            if ($md5 === false) {
                continue;
            }
            if (isset($this->hashMap[$md5])) {
                $this->hashMap[$md5]['count']++;
            } else {
                $this->hashMap[$md5] = ['ext' => $ext, 'count' => 1];
            }
        }

        // summarize duplicates: every file beyond the first one sharing a checksum counts as
        // a duplicate and is attributed to the extension of the first file seen with it
        foreach ($this->hashMap as $info) {
            if ($info['count'] > 1) {
                $this->stats['duplicates'][$info['ext']] =
                    ($this->stats['duplicates'][$info['ext']] ?? 0) + ($info['count'] - 1);
            }
        }

        return $this->buildResult();
    }

    /**
     * Map the age of a file to the label of the matching age bucket
     *
     * A month is counted as 30 days and a year as 365 days. Negative ages
     * (modification time in the future) end up in the '<1d' bucket.
     *
     * @param int $ageSeconds seconds passed since the last modification
     * @return string one of the labels listed in $buckets
     */
    private function getModifiedGroup(int $ageSeconds): string
    {
        $day = 86400;
        return match (true) {
            $ageSeconds < $day => '<1d',
            $ageSeconds < 7 * $day => '<1w',
            $ageSeconds < 30 * $day => '<1m',
            $ageSeconds < 90 * $day => '<3m',
            $ageSeconds < 180 * $day => '<6m',
            $ageSeconds < 365 * $day => '<1y',
            default => '>1y',
        };
    }

    /**
     * Combine collected sub statistics into a single result array keyed by extension
     *
     * Missing values are filled with 0 so that every row has the same set of keys.
     *
     * @return array[] extension => ['count', 'size', 'dups', '<1d', ..., '>1y']
     */
    private function buildResult(): array
    {
        $keys = array_unique(
            array_merge(
                array_keys($this->stats['extensions']),
                array_keys($this->stats['sizes']),
                array_keys($this->stats['duplicates']),
                array_keys($this->stats['modified_groups'])
            )
        );

        $result = [];
        foreach ($keys as $key) {
            $result[$key] = [
                'count' => $this->stats['extensions'][$key] ?? 0,
                'size' => $this->stats['sizes'][$key] ?? 0,
                'dups' => $this->stats['duplicates'][$key] ?? 0,
            ];
            foreach ($this->buckets as $bucket) {
                $result[$key][$bucket] = $this->stats['modified_groups'][$key][$bucket] ?? 0;
            }
        }

        return $result;
    }
}
143