xref: /plugin/cachestats/FileStatistics.php (revision 537711eb520f98f3ac06cff3d3b9e73326a97f47)
1a3092f6cSAndreas Gohr<?php
2a3092f6cSAndreas Gohr
3a3092f6cSAndreas Gohrnamespace dokuwiki\plugin\cachestats;
4a3092f6cSAndreas Gohr
5a3092f6cSAndreas Gohruse InvalidArgumentException;
6a3092f6cSAndreas Gohruse RecursiveDirectoryIterator;
7a3092f6cSAndreas Gohruse RecursiveIteratorIterator;
8a3092f6cSAndreas Gohruse SplFileInfo;
9a3092f6cSAndreas Gohr
/**
 * Class FileStatistics
 *
 * Recursively scans a directory and collects, per lower-cased file extension:
 *  - 'count': number of files
 *  - 'size':  size of files summed up (bytes)
 *  - 'dups':  number of redundant copies, i.e. files whose MD5 checksum was
 *             already seen earlier in the scan
 *  - one counter per age bucket (see self::BUCKETS) based on last modified date
 *
 * Files without an extension are reported under the key 'no_extension'.
 *
 * Duplicates are detected across the whole tree by checksum only and are
 * attributed to the extension of the first file seen with that checksum.
 */
class FileStatistics
{
    /** Age bucket labels, ordered from youngest to oldest. */
    private const BUCKETS = ['<1d', '<1w', '<1m', '<3m', '<6m', '<1y', '>1y'];

    /** Result key used for files that have no extension. */
    private const NO_EXTENSION = 'no_extension';

    /** Directory to scan. */
    private string $path;

    /** @var array<int|string, array<string, int>> extension => counters */
    private array $result = [];

    /** @var array<string, array{ext: string, count: int}> md5 => first extension seen and number of files */
    private array $hashMap = [];

    /**
     * @param string $path directory to scan
     * @throws InvalidArgumentException when $path is not a directory
     */
    public function __construct(string $path)
    {
        if (!is_dir($path)) {
            throw new InvalidArgumentException("Path '$path' is not a valid directory.");
        }

        // Trimming a filesystem root ("/") would leave an empty string, which
        // the directory iterator rejects - keep the original path in that case.
        $trimmed = rtrim($path, DIRECTORY_SEPARATOR);
        $this->path = $trimmed === '' ? $path : $trimmed;
    }

    /**
     * Scan the directory and return the statistics.
     *
     * Every call starts from scratch, so calling this method repeatedly
     * returns fresh numbers instead of accumulating earlier scans.
     *
     * @return array<int|string, array<string, int>> extension => ['count', 'size', 'dups', <bucket>...]
     */
    public function collect(): array
    {
        $this->result = [];
        $this->hashMap = [];

        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($this->path, RecursiveDirectoryIterator::SKIP_DOTS)
        );

        $now = time();

        foreach ($iterator as $fileInfo) {
            /** @var SplFileInfo $fileInfo */
            if (!$fileInfo->isFile()) {
                continue;
            }

            $this->recordFile($fileInfo, $now);
        }

        // summarize duplicates: every file beyond the first one per checksum is a copy
        foreach ($this->hashMap as $info) {
            if ($info['count'] > 1) {
                $this->result[$info['ext']]['dups'] += $info['count'] - 1;
            }
        }

        return $this->result;
    }

    /**
     * Add a single file to the running statistics.
     *
     * @param SplFileInfo $fileInfo the file to record
     * @param int $now reference timestamp used to compute the file's age
     */
    private function recordFile(SplFileInfo $fileInfo, int $now): void
    {
        try {
            $size = $fileInfo->getSize();
            $mtime = $fileInfo->getMTime();
        } catch (\RuntimeException) {
            // the file vanished or cannot be stat'ed anymore - ignore it
            return;
        }

        // Explicit comparison: a truthiness check would misfile the valid extension "0"
        $ext = strtolower($fileInfo->getExtension());
        if ($ext === '') {
            $ext = self::NO_EXTENSION;
        }

        $this->initExtension($ext);

        $this->result[$ext]['count']++;
        $this->result[$ext]['size'] += $size;

        // group by modified time
        $group = $this->getModifiedGroup($now - $mtime);
        $this->result[$ext][$group]++;

        // handle duplicates by checksum; files that cannot be hashed are left
        // out, otherwise they would all share the key derived from `false`
        // and be reported as duplicates of each other
        $md5 = $fileInfo->isReadable() ? md5_file($fileInfo->getPathname()) : false;
        if ($md5 === false) {
            return;
        }

        if (isset($this->hashMap[$md5])) {
            $this->hashMap[$md5]['count']++;
        } else {
            $this->hashMap[$md5] = ['ext' => $ext, 'count' => 1];
        }
    }

    /**
     * Map a file age to its bucket label.
     *
     * Negative ages (modification time in the future) fall into the youngest bucket.
     *
     * @param int $ageSeconds age of the file in seconds
     * @return string one of self::BUCKETS
     */
    private function getModifiedGroup(int $ageSeconds): string
    {
        $day = 86400;
        return match (true) {
            $ageSeconds < $day => '<1d',
            $ageSeconds < 7 * $day => '<1w',
            $ageSeconds < 30 * $day => '<1m',
            $ageSeconds < 90 * $day => '<3m',
            $ageSeconds < 180 * $day => '<6m',
            $ageSeconds < 365 * $day => '<1y',
            default => '>1y',
        };
    }

    /**
     * Make sure all counters exist for the given extension.
     *
     * @param string $ext lower-cased extension or self::NO_EXTENSION
     */
    private function initExtension(string $ext): void
    {
        if (isset($this->result[$ext])) {
            return;
        }

        $this->result[$ext] = [
            'count' => 0,
            'size' => 0,
            'dups' => 0,
        ];
        foreach (self::BUCKETS as $bucket) {
            $this->result[$ext][$bucket] = 0;
        }
    }
}
117