xref: /plugin/cachestats/FileStatistics.php (revision a3092f6c78abc16c6ef67e3c51fd72844d492d6b)
1<?php
2
3namespace dokuwiki\plugin\cachestats;
4
5use InvalidArgumentException;
6use RecursiveDirectoryIterator;
7use RecursiveIteratorIterator;
8use SplFileInfo;
9
/**
 * Class FileStatistics
 *
 * Recursively scans a directory and collects:
 *  - number of files per file extension
 *  - duplicate files (based on MD5 checksum) per file extension
 *  - size of files summed up per extension
 *  - number of files per extension grouped by last modified date
 *  - total number of files
 *  - total size of all files
 *
 * Extensions are compared case-insensitively (they are lower-cased); files
 * without any extension are reported under the key 'no_extension'.
 */
class FileStatistics
{
    /** Key used for files whose name has no extension. */
    private const NO_EXTENSION = 'no_extension';

    /** Shape of an empty result; every call to collect() starts from this. */
    private const EMPTY_STATS = [
        'extensions' => [],
        'duplicates' => [],
        'sizes' => [],
        'modified_groups' => [],
        'total_files' => 0,
        'total_size' => 0,
    ];

    /** Directory that is scanned by collect(). */
    private string $path;

    /** Result of the most recent collect() run (see collect() for the shape). */
    private array $stats = self::EMPTY_STATS;

    /** Checksums seen during the current run: md5 => ['ext' => string, 'count' => int]. */
    private array $hashMap = [];

    /**
     * @param string $path directory to scan
     *
     * @throws InvalidArgumentException if $path is not an existing directory
     */
    public function __construct(string $path)
    {
        if (!is_dir($path)) {
            throw new InvalidArgumentException("Path '$path' is not a valid directory.");
        }

        // Strip trailing separators, but keep the path untouched when it consists
        // of separators only (the filesystem root) - trimming would leave an empty
        // string, which the directory iterator rejects.
        $trimmed = rtrim($path, DIRECTORY_SEPARATOR);
        $this->path = $trimmed === '' ? $path : $trimmed;
    }

    /**
     * Scan the directory recursively and return the collected statistics.
     *
     * Every call performs a fresh scan; results of earlier calls are discarded.
     *
     * Extensions consisting only of digits (e.g. "0") end up as integer array
     * keys because PHP casts numeric string keys.
     *
     * @return array{
     *     extensions: array<array-key, int>,
     *     duplicates: array<array-key, int>,
     *     sizes: array<array-key, int>,
     *     modified_groups: array<array-key, array<string, int>>,
     *     total_files: int,
     *     total_size: int
     * }
     */
    public function collect(): array
    {
        // Start from a clean slate so repeated calls do not accumulate counts
        // or report every file as a duplicate of its previous sighting.
        $this->stats = self::EMPTY_STATS;
        $this->hashMap = [];

        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($this->path, RecursiveDirectoryIterator::SKIP_DOTS)
        );

        $now = time();

        foreach ($iterator as $fileInfo) {
            /** @var SplFileInfo $fileInfo */
            if (!$fileInfo->isFile()) {
                continue;
            }

            $this->stats['total_files']++;

            // Strict comparison on purpose: the extension "0" is falsy but valid.
            $ext = strtolower($fileInfo->getExtension());
            if ($ext === '') {
                $ext = self::NO_EXTENSION;
            }

            $size = $fileInfo->getSize();
            $mtime = $fileInfo->getMTime();

            // size aggregated per extension
            $this->stats['sizes'][$ext] = ($this->stats['sizes'][$ext] ?? 0) + $size;
            $this->stats['total_size'] += $size;

            // count per extension
            $this->stats['extensions'][$ext] = ($this->stats['extensions'][$ext] ?? 0) + 1;

            // group by modified time
            $group = $this->getModifiedGroup($now - $mtime);
            $this->stats['modified_groups'][$ext][$group] =
                ($this->stats['modified_groups'][$ext][$group] ?? 0) + 1;

            // handle duplicates by checksum
            $this->recordChecksum($fileInfo, $ext);
        }

        $this->summarizeDuplicates();

        return $this->stats;
    }

    /**
     * Remember the MD5 checksum of a file for duplicate detection.
     *
     * Files that cannot be read are left out of duplicate detection: md5_file()
     * returns false for them, and using that as an array key would lump all
     * unreadable files together as duplicates of each other.
     *
     * @param SplFileInfo $fileInfo file to hash
     * @param string      $ext      normalized extension of the file
     */
    private function recordChecksum(SplFileInfo $fileInfo, string $ext): void
    {
        $md5 = $fileInfo->isReadable() ? md5_file($fileInfo->getPathname()) : false;
        if ($md5 === false) {
            return;
        }

        if (isset($this->hashMap[$md5])) {
            $this->hashMap[$md5]['count']++;
        } else {
            // The first file seen with this checksum determines the extension
            // under which all of its duplicates are reported.
            $this->hashMap[$md5] = ['ext' => $ext, 'count' => 1];
        }
    }

    /**
     * Turn the checksum map into per-extension duplicate counts.
     *
     * For a checksum shared by n files, n - 1 duplicates are counted.
     */
    private function summarizeDuplicates(): void
    {
        foreach ($this->hashMap as $info) {
            if ($info['count'] > 1) {
                $this->stats['duplicates'][$info['ext']] =
                    ($this->stats['duplicates'][$info['ext']] ?? 0) + ($info['count'] - 1);
            }
        }
    }

    /**
     * Map a file age to the label of its "last modified" bucket.
     *
     * Buckets are checked from youngest to oldest and the first match wins.
     * A negative age (modification time in the future) falls into '<1d';
     * anything of 365 days or more is reported as '>1y'.
     *
     * @param int $ageSeconds seconds elapsed since the last modification
     *
     * @return string one of '<1d', '<1w', '<1m', '<3m', '<6m', '<1y', '>1y'
     */
    private function getModifiedGroup(int $ageSeconds): string
    {
        $day = 86400;
        return match (true) {
            $ageSeconds < $day => '<1d',
            $ageSeconds < 7 * $day => '<1w',
            $ageSeconds < 30 * $day => '<1m',
            $ageSeconds < 90 * $day => '<3m',
            $ageSeconds < 180 * $day => '<6m',
            $ageSeconds < 365 * $day => '<1y',
            default => '>1y',
        };
    }
}
111
112