xref: /plugin/cachestats/FileStatistics.php (revision a3092f6c78abc16c6ef67e3c51fd72844d492d6b)
1*a3092f6cSAndreas Gohr<?php
2*a3092f6cSAndreas Gohr
3*a3092f6cSAndreas Gohrnamespace dokuwiki\plugin\cachestats;
4*a3092f6cSAndreas Gohr
5*a3092f6cSAndreas Gohruse InvalidArgumentException;
6*a3092f6cSAndreas Gohruse RecursiveDirectoryIterator;
7*a3092f6cSAndreas Gohruse RecursiveIteratorIterator;
8*a3092f6cSAndreas Gohruse SplFileInfo;
9*a3092f6cSAndreas Gohr
/**
 * Class FileStatistics
 *
 * Recursively scans a directory and collects:
 *  - number of files per file extension
 *  - duplicate files (based on MD5 checksum) per file extension
 *  - size of files summed up per extension
 *  - number of files per extension grouped by last modified date
 *  - total number of files
 *  - total size of all files
 *
 * Extensions are lower-cased; files without an extension are reported as 'no_extension'.
 * Duplicates are attributed to the extension of the first file seen with a given checksum.
 */
class FileStatistics
{
    /** Shape of an empty result; every collect() run starts from a copy of this. */
    private const EMPTY_STATS = [
        'extensions' => [],
        'duplicates' => [],
        'sizes' => [],
        'modified_groups' => [],
        'total_files' => 0,
        'total_size' => 0,
    ];

    /** Directory to scan, without trailing directory separator (except for the root itself). */
    private string $path;

    /** Statistics of the most recent collect() run, see EMPTY_STATS for the keys. */
    private array $stats = self::EMPTY_STATS;

    /** Checksums seen during the current run: md5 => ['ext' => string, 'count' => int]. */
    private array $hashMap = [];

    /**
     * @param string $path Directory to scan
     * @throws InvalidArgumentException when $path is not a directory
     */
    public function __construct(string $path)
    {
        if (!is_dir($path)) {
            throw new InvalidArgumentException("Path '$path' is not a valid directory.");
        }

        $trimmed = rtrim($path, DIRECTORY_SEPARATOR);

        // rtrim() reduces a path consisting only of separators (the filesystem root) to an
        // empty string, which the directory iterator rejects - keep a single separator then
        $this->path = $trimmed === '' ? DIRECTORY_SEPARATOR : $trimmed;
    }

    /**
     * Scan the directory tree and return the gathered statistics.
     *
     * Each call performs a fresh scan; results of earlier calls are discarded. Files whose
     * size or modification time cannot be read (e.g. removed while the scan is running) are
     * skipped completely. Files that cannot be hashed are counted but take no part in the
     * duplicate detection.
     *
     * @return array Statistics with the keys 'extensions', 'duplicates', 'sizes',
     *               'modified_groups', 'total_files' and 'total_size'
     */
    public function collect(): array
    {
        // start from scratch so repeated calls do not pile up on top of earlier results
        $this->stats = self::EMPTY_STATS;
        $this->hashMap = [];

        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($this->path, RecursiveDirectoryIterator::SKIP_DOTS)
        );

        $now = time();

        foreach ($iterator as $fileInfo) {
            /** @var SplFileInfo $fileInfo */
            if (!$fileInfo->isFile()) {
                continue;
            }

            // read everything that may fail before touching any counter, so a file that
            // disappears mid-scan is left out entirely instead of being half-counted
            try {
                $size = $fileInfo->getSize();
                $mtime = $fileInfo->getMTime();
            } catch (\RuntimeException $e) {
                continue;
            }

            $ext = strtolower($fileInfo->getExtension()) ?: 'no_extension';

            $this->stats['total_files']++;

            // size aggregated per extension
            $this->stats['sizes'][$ext] = ($this->stats['sizes'][$ext] ?? 0) + $size;
            $this->stats['total_size'] += $size;

            // count per extension
            $this->stats['extensions'][$ext] = ($this->stats['extensions'][$ext] ?? 0) + 1;

            // group by modified time
            $group = $this->getModifiedGroup($now - $mtime);
            $this->stats['modified_groups'][$ext][$group] =
                ($this->stats['modified_groups'][$ext][$group] ?? 0) + 1;

            // handle duplicates by checksum
            $this->registerChecksum($fileInfo, $ext);
        }

        $this->summarizeDuplicates();

        return $this->stats;
    }

    /**
     * Remember the MD5 checksum of a file for the duplicate detection.
     *
     * @param SplFileInfo $fileInfo File to hash
     * @param string $ext Normalized extension of the file
     */
    private function registerChecksum(SplFileInfo $fileInfo, string $ext): void
    {
        $md5 = $fileInfo->isReadable() ? md5_file($fileInfo->getPathname()) : false;

        // a failed hash must not become an array key: false would be coerced to 0 and all
        // unreadable files would then look like copies of each other
        if ($md5 === false) {
            return;
        }

        if (isset($this->hashMap[$md5])) {
            $this->hashMap[$md5]['count']++;
        } else {
            $this->hashMap[$md5] = ['ext' => $ext, 'count' => 1];
        }
    }

    /**
     * Turn the collected checksums into the per extension duplicate counts.
     *
     * For every checksum seen more than once, all occurrences but the first count as duplicates.
     */
    private function summarizeDuplicates(): void
    {
        foreach ($this->hashMap as $info) {
            if ($info['count'] > 1) {
                $this->stats['duplicates'][$info['ext']] =
                    ($this->stats['duplicates'][$info['ext']] ?? 0) + ($info['count'] - 1);
            }
        }
    }

    /**
     * Map a file age to the label of its age bucket.
     *
     * Negative ages (modification time in the future) end up in the youngest bucket.
     *
     * @param int $ageSeconds Seconds since the last modification
     * @return string One of '<1d', '<1w', '<1m', '<3m', '<6m', '<1y', '>1y'
     */
    private function getModifiedGroup(int $ageSeconds): string
    {
        $day = 86400;
        return match (true) {
            $ageSeconds < $day => '<1d',
            $ageSeconds < 7 * $day => '<1w',
            $ageSeconds < 30 * $day => '<1m',
            $ageSeconds < 90 * $day => '<3m',
            $ageSeconds < 180 * $day => '<6m',
            $ageSeconds < 365 * $day => '<1y',
            default => '>1y',
        };
    }
}
111*a3092f6cSAndreas Gohr
112