xref: /plugin/cachestats/FileStatistics.php (revision c25debc6bd808a17b10cab0771c0153188c1e6cf)
1a3092f6cSAndreas Gohr<?php
2a3092f6cSAndreas Gohr
3a3092f6cSAndreas Gohrnamespace dokuwiki\plugin\cachestats;
4a3092f6cSAndreas Gohr
5a3092f6cSAndreas Gohruse InvalidArgumentException;
6a3092f6cSAndreas Gohruse RecursiveDirectoryIterator;
7a3092f6cSAndreas Gohruse RecursiveIteratorIterator;
8a3092f6cSAndreas Gohruse SplFileInfo;
9a3092f6cSAndreas Gohr
/**
 * Class FileStatistics
 *
 * Recursively scans a directory and aggregates, per lower-cased file extension:
 *  - count: number of files
 *  - size:  summed file size in bytes
 *  - dups:  number of redundant copies (files whose MD5 checksum was already seen)
 *  - one counter per age bucket ('<1d' ... '>1y'), based on the last modified time
 *
 * Files without an extension are grouped under 'no_extension'.
 *
 * The overall number of files and their total size are tracked internally but are
 * not part of the array returned by collect().
 */
class FileStatistics
{
    /** Group name used for files that have no extension */
    private const NO_EXTENSION = 'no_extension';

    /** Number of seconds in one day, the unit the age buckets are built from */
    private const SECONDS_PER_DAY = 86400;

    /** @var string directory to scan */
    private string $path;

    /** @var string[] age bucket labels, in the order they appear in the result */
    private array $buckets = ['<1d', '<1w', '<1m', '<3m', '<6m', '<1y', '>1y'];

    /** @var array intermediate counters, re-initialised on every collect() call */
    private array $stats = [];

    /** @var array<string, true> MD5 checksums seen during the current scan */
    private array $seenHashes = [];

    /**
     * @param string $path directory to scan
     * @throws InvalidArgumentException when $path is not an existing directory
     */
    public function __construct(string $path)
    {
        if (!is_dir($path)) {
            throw new InvalidArgumentException("Path '$path' is not a valid directory.");
        }

        // Strip trailing separators, but leave a bare root directory untouched:
        // trimming it would produce an empty string, which cannot be iterated.
        $trimmed = rtrim($path, DIRECTORY_SEPARATOR);
        $this->path = $trimmed === '' ? $path : $trimmed;
    }

    /**
     * Scan the directory and return the statistics keyed by extension
     *
     * Every call starts from a clean state, so calling this method repeatedly
     * yields fresh results instead of accumulating the previous scan.
     *
     * @return array<int|string, array<string, int>> extension => ['count', 'size', 'dups', <bucket>...]
     */
    public function collect(): array
    {
        $this->reset();

        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($this->path, RecursiveDirectoryIterator::SKIP_DOTS)
        );

        // one reference time for the whole scan keeps the age buckets consistent
        $now = time();

        foreach ($iterator as $fileInfo) {
            /** @var SplFileInfo $fileInfo */
            if (!$fileInfo->isFile()) {
                continue;
            }

            $this->addFile($fileInfo, $now);
        }

        return $this->buildResult();
    }

    /**
     * Drop everything collected by a previous scan
     */
    private function reset(): void
    {
        $this->stats = [
            'extensions' => [],
            'duplicates' => [],
            'sizes' => [],
            'modified_groups' => [],
            'total_files' => 0,
            'total_size' => 0,
        ];
        $this->seenHashes = [];
    }

    /**
     * Add a single file to the counters
     *
     * @param SplFileInfo $fileInfo the file to account for
     * @param int $now reference timestamp used to calculate the file's age
     */
    private function addFile(SplFileInfo $fileInfo, int $now): void
    {
        // Compare against '' explicitly: an extension of "0" is falsy but perfectly valid.
        $ext = strtolower($fileInfo->getExtension());
        if ($ext === '') {
            $ext = self::NO_EXTENSION;
        }
        $size = $fileInfo->getSize();

        // overall totals
        $this->stats['total_files']++;
        $this->stats['total_size'] += $size;

        // count and size per extension
        $this->stats['extensions'][$ext] = ($this->stats['extensions'][$ext] ?? 0) + 1;
        $this->stats['sizes'][$ext] = ($this->stats['sizes'][$ext] ?? 0) + $size;

        // group by modified time
        $group = $this->getModifiedGroup($now - $fileInfo->getMTime());
        $this->stats['modified_groups'][$ext][$group] =
            ($this->stats['modified_groups'][$ext][$group] ?? 0) + 1;

        // handle duplicates by checksum
        $md5 = md5_file($fileInfo->getPathname());
        if ($md5 === false) {
            // The file could not be read; without a checksum it cannot be compared,
            // and using `false` as a key would lump all unreadable files together.
            return;
        }

        if (isset($this->seenHashes[$md5])) {
            // Credit the redundant copy to its own extension, so an extension's
            // duplicate count can never exceed its file count.
            $this->stats['duplicates'][$ext] = ($this->stats['duplicates'][$ext] ?? 0) + 1;
        } else {
            $this->seenHashes[$md5] = true;
        }
    }

    /**
     * Map a file age to the label of its age bucket
     *
     * Months and years are approximated as 30 and 365 days. Negative ages
     * (modification time in the future) end up in the youngest bucket.
     *
     * @param int $ageSeconds seconds since the file was last modified
     * @return string one of the labels in $this->buckets
     */
    private function getModifiedGroup(int $ageSeconds): string
    {
        $day = self::SECONDS_PER_DAY;
        return match (true) {
            $ageSeconds < $day => '<1d',
            $ageSeconds < 7 * $day => '<1w',
            $ageSeconds < 30 * $day => '<1m',
            $ageSeconds < 90 * $day => '<3m',
            $ageSeconds < 180 * $day => '<6m',
            $ageSeconds < 365 * $day => '<1y',
            default => '>1y',
        };
    }

    /**
     * Combine collected sub statistics into a single result array keyed by extension
     *
     * Every row contains 'count', 'size' and 'dups' followed by one entry per age
     * bucket; counters that were never incremented are reported as 0.
     */
    private function buildResult(): array
    {
        $result = [];

        // Every extension that shows up in any sub statistic was also counted in
        // 'extensions', so its keys are the complete list.
        foreach ($this->stats['extensions'] as $ext => $count) {
            $row = [
                'count' => $count,
                'size' => $this->stats['sizes'][$ext] ?? 0,
                'dups' => $this->stats['duplicates'][$ext] ?? 0,
            ];
            foreach ($this->buckets as $bucket) {
                $row[$bucket] = $this->stats['modified_groups'][$ext][$bucket] ?? 0;
            }
            $result[$ext] = $row;
        }

        return $result;
    }
}
143