xref: /plugin/cachestats/FileStatistics.php (revision 537711eb520f98f3ac06cff3d3b9e73326a97f47)
1<?php
2
3namespace dokuwiki\plugin\cachestats;
4
5use InvalidArgumentException;
6use RecursiveDirectoryIterator;
7use RecursiveIteratorIterator;
8use SplFileInfo;
9
/**
 * Class FileStatistics
 *
 * Recursively scans a directory and collects, per file extension:
 *  - number of files
 *  - duplicate files (based on MD5 checksum)
 *  - size of files summed up
 *  - number of files grouped by last modified date
 */
class FileStatistics
{
    /** Result key used for files that have no extension at all. */
    private const NO_EXTENSION = 'no_extension';

    /** Age bucket labels, ordered from most to least recently modified. */
    private const BUCKETS = ['<1d', '<1w', '<1m', '<3m', '<6m', '<1y', '>1y'];

    /** Directory that is scanned by collect(). */
    private string $path;

    /** @var array<int|string, array<string, int>> statistics of the current scan, keyed by extension */
    private array $result = [];

    /** @var array<string, array{ext: string, count: int}> md5 => first extension seen and number of files */
    private array $hashMap = [];

    /**
     * @param string $path Directory to scan
     * @throws InvalidArgumentException if $path is not a directory
     */
    public function __construct(string $path)
    {
        if (!is_dir($path)) {
            throw new InvalidArgumentException("Path '$path' is not a valid directory.");
        }

        // A path made only of separators (the filesystem root) would trim down to an
        // empty string, which the directory iterator rejects; keep it untouched then.
        $trimmed = rtrim($path, DIRECTORY_SEPARATOR);
        $this->path = $trimmed === '' ? $path : $trimmed;
    }

    /**
     * Scan the directory tree and build the statistics.
     *
     * Every call performs a fresh scan; results of earlier calls are discarded.
     *
     * The returned array is keyed by lower-cased file extension ('no_extension' for
     * files without one). PHP stores purely numeric extensions such as "0" as integer
     * keys. Each entry holds the keys 'count', 'size' (bytes), 'dups' and one counter
     * per age bucket ('<1d', '<1w', '<1m', '<3m', '<6m', '<1y', '>1y').
     *
     * Duplicates are attributed to the extension of the first file seen with a given
     * checksum. Files that cannot be read are counted but excluded from duplicate
     * detection.
     *
     * @return array<int|string, array<string, int>>
     */
    public function collect(): array
    {
        // start from a clean slate so repeated calls do not pile up on earlier scans
        $this->result = [];
        $this->hashMap = [];

        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($this->path, RecursiveDirectoryIterator::SKIP_DOTS)
        );

        $now = time();

        foreach ($iterator as $fileInfo) {
            /** @var SplFileInfo $fileInfo */
            if (!$fileInfo->isFile()) {
                continue;
            }

            try {
                $size = $fileInfo->getSize();
                $mtime = $fileInfo->getMTime();
            } catch (\RuntimeException) {
                // the file vanished or became inaccessible while scanning; skip it
                continue;
            }

            // compare against '' explicitly: the extension "0" is falsy but perfectly valid
            $ext = strtolower($fileInfo->getExtension());
            if ($ext === '') {
                $ext = self::NO_EXTENSION;
            }

            $this->initExtension($ext);

            $this->result[$ext]['count']++;
            $this->result[$ext]['size'] += $size;

            // group by modified time
            $group = $this->getModifiedGroup($now - $mtime);
            $this->result[$ext][$group]++;

            // handle duplicates by checksum
            $this->trackChecksum($fileInfo, $ext);
        }

        $this->summarizeDuplicates();

        return $this->result;
    }

    /**
     * Register the MD5 checksum of a file for duplicate detection.
     *
     * Files whose checksum cannot be computed are ignored here, so they are never
     * reported as duplicates of each other.
     */
    private function trackChecksum(SplFileInfo $fileInfo, string $ext): void
    {
        if (!$fileInfo->isReadable()) {
            return;
        }

        $md5 = md5_file($fileInfo->getPathname());
        if ($md5 === false) {
            return;
        }

        if (isset($this->hashMap[$md5])) {
            $this->hashMap[$md5]['count']++;
        } else {
            $this->hashMap[$md5] = ['ext' => $ext, 'count' => 1];
        }
    }

    /**
     * Add the number of surplus copies of each checksum to the 'dups' counter of the
     * extension under which that checksum was first seen.
     */
    private function summarizeDuplicates(): void
    {
        foreach ($this->hashMap as $info) {
            if ($info['count'] > 1) {
                $ext = $info['ext'];
                $this->initExtension($ext);
                $this->result[$ext]['dups'] += $info['count'] - 1;
            }
        }
    }

    /**
     * Map a file age to one of the BUCKETS labels.
     *
     * A month is counted as 30 days and a year as 365 days. Negative ages
     * (modification time in the future) end up in '<1d'.
     *
     * @param int $ageSeconds Seconds elapsed since the last modification
     */
    private function getModifiedGroup(int $ageSeconds): string
    {
        $day = 86400;
        return match (true) {
            $ageSeconds < $day => '<1d',
            $ageSeconds < 7 * $day => '<1w',
            $ageSeconds < 30 * $day => '<1m',
            $ageSeconds < 90 * $day => '<3m',
            $ageSeconds < 180 * $day => '<6m',
            $ageSeconds < 365 * $day => '<1y',
            default => '>1y',
        };
    }

    /**
     * Make sure a zeroed result entry exists for the given extension.
     */
    private function initExtension(string $ext): void
    {
        if (isset($this->result[$ext])) {
            return;
        }

        $this->result[$ext] = [
            'count' => 0,
            'size' => 0,
            'dups' => 0,
        ];
        foreach (self::BUCKETS as $bucket) {
            $this->result[$ext][$bucket] = 0;
        }
    }
}
117