xref: /plugin/mizarverifiabledocs/script/miz_parser.py (revision 1a91113b0094ce6ce10091527b530187a60781d5)
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0-or-later
3# SPDX-FileCopyrightText: 2024-2025 Yamada, M.
4# --------------------------------------------------------------
5#  miz_parser.py ――  Mizar 依存グラフ可視化
6# --------------------------------------------------------------
7#  theorem             : yellow   (Th…)
8#  auto-theorem        : lavender (Th_auto…)
9#  definition / auto   : teal     (Def… / Def_auto…)
10#  lemma               : coral    (Lm…)
11#  external article    : grey     (environ 節に列挙された語)
12# --------------------------------------------------------------
13from __future__ import annotations
14import re
15import sys
16from pathlib import Path
17from typing import Iterable
18
19import networkx as nx
20# from pyvis.network import Network
21
22
23# ─── 色定義 ──────────────────────────────────────────
24COL_TH   = "#ffc107"
25COL_AUTO = "#b39ddb"
26COL_DEF  = "#17a2b8"
27COL_LM   = "#ff7f50"
28COL_EXT  = "#9e9e9e"
29
30# ─── 基本正規表現 ──────────────────────────────────
31COMMENT_RE   = re.compile(r"::.*?$", re.M)
32THEOREM_RE   = re.compile(r"\btheorem\b(?:\s+([A-Za-z0-9_]+)\s*:)?", re.I)
33DEF_HDR_RE   = re.compile(r"\bdefinition\b(?:\s+([A-Za-z0-9_]+)\s*:)?", re.I)
34
35# 行頭にある Def/Lm ラベル (  :Def1:  や  Def1:  )
36INLINE_HEAD_RE = re.compile(r"^[ \t]*:?\s*((?:Def|Lm)\d+)\s*:", re.I | re.M)
37
38# 行中に埋め込まれた :Def1: / :Lm3: など
39INLINE_INLINE_RE = re.compile(r":\s*((?:Def|Lm)\d+)\s*:", re.I)
40
41BY_RE   = re.compile(r"\bby\s+([^;]+);",   re.I | re.S)
42FROM_RE = re.compile(r"\bfrom\s+([^;]+);", re.I | re.S)
43
44# 〈記事:キーワード+番号〉も 1 トークン扱い
45LABEL_RE = re.compile(
46    r"[A-Z][A-Za-z0-9_]*:[A-Za-z0-9_]+\s*\d+"   #  TARSKI:def 1 など
47    r"|[A-Z][A-Za-z0-9_]*"                      #  単体記事名・内部ラベル
48)
49
50FOLD_RE     = re.compile(r"^[A-Z](\d+)?$")          # A, A1 …(吸収対象)
51INTERNAL_RE = re.compile(r"^(?:Th\d+|Th_auto\d+|Def\d+|Def_auto\d+|Lm\d+)$")
52EXT_ART_RE  = re.compile(r"^[A-Z][A-Z0-9_]*$")
53
54
55# ─── environ から外部アーティクル抽出 ────────────────
56def externals(txt: str) -> set[str]:
57    m = re.search(r"\benviron\b(.*?)(?:\bbegin\b)", txt, re.I | re.S)
58    return set() if not m else set(re.findall(r"[A-Z][A-Z0-9_]*", m[1]))
59
60
61# ─── 1 つの .miz を依存グラフに変換 ───────────────────
62def parse_mizar(path: Path) -> nx.DiGraph:
63    src = COMMENT_RE.sub("", path.read_text(encoding="utf-8", errors="ignore"))
64    ext_art = externals(src)
65
66    #―― theorem:明示番号を把握して auto 番衝突を防ぐ ―――――――――――
67    explicit_nums, th_matches = set(), []
68    for m in THEOREM_RE.finditer(src):
69        th_matches.append(m)
70        if m.group(1) and m.group(1).startswith("Th") and m.group(1)[2:].isdigit():
71            explicit_nums.add(int(m.group(1)[2:]))
72
73    entries, inline_taken = [], set()
74    th_auto_i = def_auto_i = 1
75
76    #── theorem ブロック登録 ──────────────────────────
77    for m in th_matches:
78        if m.group(1):                              # 明示ラベル
79            lbl, kind = m.group(1), "th"
80        else:                                       # auto 採番
81            while th_auto_i in explicit_nums:
82                th_auto_i += 1
83            lbl, kind = f"Th_auto{th_auto_i}", "th_auto"
84            th_auto_i += 1
85        entries.append(dict(start=m.start(), end=m.end(), label=lbl, kind=kind))
86
87    #── definition ヘッダ処理 ─────────────────────────
88    for m in DEF_HDR_RE.finditer(src):
89        hdr_s, hdr_e = m.span()
90        # ブロック終端=次 the/def 手前
91        nxt_th  = THEOREM_RE.search(src, hdr_e)
92        nxt_def = DEF_HDR_RE.search(src, hdr_e + 1)
93        blk_end = min(nxt_th.start()  if nxt_th  else len(src),
94                      nxt_def.start() if nxt_def else len(src))
95
96        # --- 行頭ラベル ---
97        for im in INLINE_HEAD_RE.finditer(src[hdr_e:blk_end]):
98            abs_pos = hdr_e + im.start()
99            lbl     = im.group(1)
100            inline_taken.add(abs_pos)
101            kind = "lemma" if lbl.lower().startswith("lm") else "def"
102            entries.append(dict(start=abs_pos, end=abs_pos, label=lbl, kind=kind))
103
104        # --- 行中ラベル ---
105        for jm in INLINE_INLINE_RE.finditer(src[hdr_e:blk_end]):
106            abs_pos = hdr_e + jm.start()
107            if abs_pos in inline_taken:             # 行頭と重複しないように
108                continue
109            lbl  = jm.group(1)
110            inline_taken.add(abs_pos)
111            kind = "lemma" if lbl.lower().startswith("lm") else "def"
112            entries.append(dict(start=abs_pos, end=abs_pos, label=lbl, kind=kind))
113
114        # ヘッダに固有ラベルが無ければ auto 付与
115        if not any(hdr_s <= e["start"] < hdr_e for e in entries):
116            lbl = f"Def_auto{def_auto_i}"
117            def_auto_i += 1
118            entries.append(dict(start=hdr_s, end=hdr_e, label=lbl, kind="def"))
119
120    #── ファイル行頭の Def/Lm (未登録) ─────────────────
121    for m in INLINE_HEAD_RE.finditer(src):
122        if m.start() in inline_taken:
123            continue
124        lbl  = m.group(1)
125        kind = "lemma" if lbl.lower().startswith("lm") else "def"
126        entries.append(dict(start=m.start(), end=m.end(), label=lbl, kind=kind))
127        inline_taken.add(m.start())
128
129    # 行中 :DefN: / :LmN: (未登録)
130    for m in INLINE_INLINE_RE.finditer(src):
131        if m.start() in inline_taken:
132            continue
133        lbl  = m.group(1)
134        kind = "lemma" if lbl.lower().startswith("lm") else "def"
135        entries.append(dict(start=m.start(), end=m.end(), label=lbl, kind=kind))
136
137    #―― 時系列順 & ラベル集合 ―――――――――――――――――――――――
138    entries.sort(key=lambda d: d["start"])
139    internal_labels = {e["label"] for e in entries}
140
141    #―― グラフ (ノード) ―――――――――――――――――――――――――――
142    G = nx.DiGraph()
143    color_of = {"th": COL_TH, "th_auto": COL_AUTO, "def": COL_DEF, "lemma": COL_LM}
144    for e in entries:
145        G.add_node(e["label"], color=color_of[e["kind"]], label=e["label"])
146
147    #―― 参照エッジ解析 ―――――――――――――――――――――――――
148    for idx, blk in enumerate(entries):
149        parent = blk["label"]
150        body = src[
151            blk["end"]:
152            entries[idx + 1]["start"] if idx + 1 < len(entries) else len(src)
153        ]
154
155        refs: set[str] = set()
156        for seg in BY_RE.findall(body) + FROM_RE.findall(body):
157            for tok in LABEL_RE.findall(seg):
158                tok = tok.rstrip()
159                if FOLD_RE.fullmatch(tok):
160                    continue
161
162                # ------------ 内部ラベル ------------
163                if tok in internal_labels and INTERNAL_RE.fullmatch(tok):
164                    refs.add(tok)
165                    continue
166
167                # ------------ 外部参照 ------------
168                art = tok.split(":")[0]              # “XBOOLE_0:7” → “XBOOLE_0”
169                if art in ext_art:
170                    refs.add(art)                    # 番号無視で記事名だけ追加
171                    continue
172
173        refs.discard(parent)
174        for r in refs:
175            if r not in G:
176                col = COL_EXT if r in ext_art else COL_TH
177                G.add_node(r, color=col, label=r)
178            G.add_edge(r, parent)
179
180    return G
181
182
183# ─── 可視化 (HTML) ─────────────────────────────────
184def visualize(G: nx.DiGraph, outfile: Path) -> None:
185    net = Network(height="800px", width="100%", directed=True, notebook=False)
186    net.from_nx(G)
187    net.barnes_hut(gravity=-8000, central_gravity=0.5)
188
189    for n in net.nodes:
190        n.update({
191            "value": max(nx.degree(G, n["id"]), 1),
192            "shape": "circle",
193            "font": {"vadjust": 0},
194        })
195
196    net.show(str(outfile))
197    print(f"★ 生成完了: {outfile}")
198
199
200# ─── CLI ────────────────────────────────────────────
201def main(files: Iterable[str]) -> None:
202    paths = [Path(p) for p in files]
203    if not paths:
204        print("usage: python miz_parser.py <file1.miz> [file2.miz ...]")
205        sys.exit(1)
206
207    for p in paths:
208        if not p.exists():
209            print(f"※ file not found: {p}")
210            continue
211        graph = parse_mizar(p)
212        # visualize(graph, p.with_stem(p.stem + "_graph").with_suffix(".html"))
213
214
215if __name__ == "__main__":
216    main(sys.argv[1:])
217