# QuantEngineByItz/tools/build_yaml_code_coverage_v1.py

"""build_yaml_code_coverage_v1.py — YAML_TO_CODE_COVERAGE_V1

spec/13_formula_registry.yaml 의 active=true formula_id를 authoritative denominator로 삼고
tools/*.py / *.gs 구현 여부를 매핑해 yaml-to-code 커버리지 보고서를 산출한다.

산출물: Temp/yaml_code_coverage_v1.json
  - yaml_formula_count: spec에 등록된 공식 수
  - implemented_count: 코드에서 확인된 공식 수
  - golden_test_count: tests/*.yaml / spec/formula_golden_cases_v2.yaml에 테스트가 있는 공식 수
  - unimplemented_rules: 코드 미구현 공식 목록
  - orphan_code_rules: 코드에는 있으나 spec에 없는 식별자 (샘플)
  - coverage_ratio: implemented / total
  - golden_coverage_ratio: golden_test / total
"""
from __future__ import annotations

import argparse
import json
import re
from pathlib import Path
from typing import Any

import yaml

# Repository root: the parent of the directory that contains this file.
ROOT = Path(__file__).resolve().parents[1]
SPEC_DIR = ROOT.joinpath("spec")
TOOLS_DIR = ROOT.joinpath("tools")
# Report destination used when --out is not given.
DEFAULT_OUT = ROOT.joinpath("Temp", "yaml_code_coverage_v1.json")

# Registry file(s) whose formula ids form the coverage denominator.
FORMULA_YAML_FILES = [SPEC_DIR.joinpath("13_formula_registry.yaml")]
# Files searched for golden-test references to a formula id.
GOLDEN_YAML_FILES = [
    SPEC_DIR.joinpath("formula_golden_cases_v2.yaml"),
    SPEC_DIR.joinpath("formula_golden_cases_v3.yaml"),
    SPEC_DIR.joinpath("formula_golden_cases_v4.yaml"),
    ROOT.joinpath("tests", "strategy_tests.yaml"),
]
# Source files searched for implementations: top-level *.gs plus tools/*.py.
# Both globs are evaluated once, at import time.
GS_FILES = [*ROOT.glob("*.gs")]
PY_FILES = [*TOOLS_DIR.glob("*.py")]
ALL_CODE_FILES = [*GS_FILES, *PY_FILES]


def _load_yaml(path: Path) -> Any:
    """Parse the YAML file at *path*, falling back to ``{}``.

    Loading is best-effort so that a report can still be produced: a missing
    file, an unreadable file, malformed YAML and an empty document all yield
    ``{}``. A missing file is skipped quietly; read and parse failures are
    reported on stderr so that a broken file is not mistaken for an empty one.

    Args:
        path: Location of the YAML document.

    Returns:
        Whatever ``yaml.safe_load`` produced for the document, or ``{}`` when
        the document is empty/falsy or could not be loaded.
    """
    import sys  # local import: only the warning paths need it

    # EAFP instead of exists()-then-read, which could race with file removal.
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return {}
    except (OSError, UnicodeError) as exc:
        print(f"[WARN] cannot read {path}: {exc}", file=sys.stderr)
        return {}

    try:
        return yaml.safe_load(text) or {}
    except Exception as exc:  # deliberately broad: loading stays best-effort
        print(f"[WARN] cannot parse {path}: {exc}", file=sys.stderr)
        return {}


def _extract_formula_ids(registry: Any) -> list[str]:
    """Return the keys of ``registry["formula_registry"]["formulas"]``.

    The ids are returned in mapping order and are not filtered in any way.
    Any level that is missing or is not a mapping (for example a list or a
    string where a dict is expected) yields an empty list instead of raising.

    Args:
        registry: Parsed registry document; may be any YAML value.

    Returns:
        The formula ids, or ``[]`` when the expected structure is absent.
    """
    if not isinstance(registry, dict):
        return []
    section = registry.get("formula_registry")
    if not isinstance(section, dict):
        return []
    formulas = section.get("formulas")
    if not isinstance(formulas, dict):
        return []
    return list(formulas)


def _read_code_text() -> str:
    """Concatenate the contents of every file in ``ALL_CODE_FILES``.

    Files are decoded as UTF-8 with undecodable bytes dropped, so a single
    stray byte no longer removes the whole file from the scan. Files that
    cannot be read at all are skipped with a warning on stderr.

    Returns:
        The file contents joined with newlines, in ``ALL_CODE_FILES`` order.
    """
    import sys  # local import: only the warning path needs it

    chunks: list[str] = []
    for code_file in ALL_CODE_FILES:
        try:
            chunks.append(code_file.read_text(encoding="utf-8", errors="ignore"))
        except OSError as exc:
            print(f"[WARN] skipping unreadable code file {code_file}: {exc}", file=sys.stderr)
    return "\n".join(chunks)


def _read_golden_text() -> str:
    """Concatenate the contents of the files in ``GOLDEN_YAML_FILES``.

    Files that do not exist are skipped quietly. The remaining files are
    decoded as UTF-8 with undecodable bytes dropped, so one stray byte does
    not hide every golden case in that file. Files that exist but cannot be
    read are skipped with a warning on stderr.

    Returns:
        The file contents joined with newlines, in ``GOLDEN_YAML_FILES`` order.
    """
    import sys  # local import: only the warning path needs it

    chunks: list[str] = []
    for golden_file in GOLDEN_YAML_FILES:
        try:
            chunks.append(golden_file.read_text(encoding="utf-8", errors="ignore"))
        except FileNotFoundError:
            continue
        except OSError as exc:
            print(f"[WARN] skipping unreadable golden file {golden_file}: {exc}", file=sys.stderr)
    return "\n".join(chunks)


# Patterns that mark a formula id declared in code; group 1 captures the id.
_ORPHAN_ID_PATTERNS = (
    # FORMULA_ID = "X"   /   FORMULA_ID = 'X'
    re.compile(r"\bFORMULA_ID\s*=\s*[\"']([A-Z0-9_]+)[\"']"),
    # "formula_id": "X"  /   'formula_id': 'X'
    re.compile(r"[\"']formula_id[\"']\s*:\s*[\"']([A-Z0-9_]+)[\"']"),
)


def _find_orphan_formula_ids(code_text: str, spec_ids: set[str], max_sample: int = 20) -> list[str]:
    """Return a sample of formula ids declared in code but absent from the spec.

    An id counts as declared when it appears in a ``FORMULA_ID = "..."``
    assignment or as the value of a quoted ``formula_id`` key. Both forms
    accept single or double quotes.

    Args:
        code_text: Source text to scan.
        spec_ids: Formula ids known to the spec.
        max_sample: Maximum number of ids to return; values below zero are
            treated as zero.

    Returns:
        The orphan ids in sorted order, truncated to ``max_sample`` entries.
    """
    declared: set[str] = set()
    for pattern in _ORPHAN_ID_PATTERNS:
        declared.update(pattern.findall(code_text))
    orphans = sorted(declared - spec_ids)
    # Clamp so that a negative value cannot slice from the end of the list.
    return orphans[: max(max_sample, 0)]


def main() -> int:
    """Build the YAML-to-code coverage report and write it as JSON.

    Command line:
        --out PATH  Report destination. A relative path is resolved against
                    ROOT. Defaults to DEFAULT_OUT.

    Every active formula id from FORMULA_YAML_FILES is looked up in the code
    text and the golden-test text, then a summary plus per-formula rows is
    written to the output file and a one-line summary is printed.

    Returns:
        Always 0; the outcome is conveyed through the report's ``gate`` field.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", default=str(DEFAULT_OUT))
    args = ap.parse_args()
    # Joining keeps an absolute right-hand operand unchanged, so this resolves
    # relative paths against ROOT and leaves absolute ones alone.
    out_path = ROOT / args.out

    # 1) Collect the active formulas declared in the spec.
    # A dict keeps first-insertion order, which also de-duplicates ids.
    spec_sources: dict[str, str] = {}
    for yf in FORMULA_YAML_FILES:
        reg = _load_yaml(yf)
        ids = _extract_formula_ids(reg)
        # A non-empty id list implies the nested mapping exists.
        formulas = reg["formula_registry"]["formulas"] if ids else {}
        for fid in ids:
            formula_def = formulas.get(fid)
            if not isinstance(formula_def, dict):
                # An entry with no body (null) or a scalar value carries no
                # fields; treat it as an active formula with no declared source.
                formula_def = {}
            if not bool(formula_def.get("active", True)):
                continue
            py_tool = str(formula_def.get("python_tool") or "")
            gas_impl = str(formula_def.get("gas_function") or formula_def.get("gas_name") or "")
            spec_sources[fid] = py_tool or gas_impl or "unknown"

    all_spec_ids = list(spec_sources)
    spec_id_set = set(all_spec_ids)

    # 2) Look each formula up in the code and golden-test text.
    code_text = _read_code_text()
    golden_text = _read_golden_text()
    runtime_gas_text = "\n".join(
        p.read_text(encoding="utf-8", errors="ignore")
        for p in sorted(ROOT.glob("gas_*.gs"))
        if p.is_file()  # skip directories and dangling links matching the glob
    )

    rows: list[dict[str, Any]] = []
    for fid in all_spec_ids:
        # Plain substring match: an id that is a prefix of a longer identifier
        # in the text also counts as found.
        needle = str(fid)
        declared_source = spec_sources[fid]
        # Only sources declared under tools/ are checked on disk; anything
        # else is reported as "N/A".
        source_exists: bool | str = "N/A"
        if declared_source.startswith("tools/"):
            source_exists = (ROOT / declared_source).exists()
        rows.append({
            "formula_id": fid,
            "in_code": needle in code_text,
            "in_golden_test": needle in golden_text,
            "declared_source": declared_source,
            "source_file_exists": source_exists,
        })

    implemented = [r for r in rows if r["in_code"]]
    unimplemented = [r for r in rows if not r["in_code"]]
    golden_covered = [r for r in rows if r["in_golden_test"]]
    missing_source_file = [r for r in rows if r["source_file_exists"] is False]

    # NOTE: the helper returns at most its max_sample ids, so the orphan count
    # below is capped at that sample size.
    orphan_ids = _find_orphan_formula_ids(runtime_gas_text, spec_id_set)

    total = len(all_spec_ids)
    result = {
        "formula_id": "YAML_TO_CODE_COVERAGE_V1",
        "yaml_formula_count": total,
        "implemented_count": len(implemented),
        "unimplemented_count": len(unimplemented),
        "golden_test_count": len(golden_covered),
        "missing_source_file_count": len(missing_source_file),
        "orphan_code_formula_count": len(orphan_ids),
        "coverage_ratio": round(len(implemented) / total, 4) if total else 0.0,
        "golden_coverage_ratio": round(len(golden_covered) / total, 4) if total else 0.0,
        # An empty registry (missing or unparsable spec) must not pass the gate.
        "gate": "PASS" if total > 0 and not unimplemented else "WARN",
        "unimplemented_rules": [r["formula_id"] for r in unimplemented],
        "missing_source_file_rules": [r["formula_id"] for r in missing_source_file],
        "orphan_code_formulas": orphan_ids,
        "golden_uncovered_rules": [r["formula_id"] for r in rows if not r["in_golden_test"]],
        "rows": rows,
    }

    # The output directory may not exist yet (e.g. a fresh checkout).
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(result, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    print(
        f"[YAML_TO_CODE_COVERAGE_V1] total={len(all_spec_ids)} "
        f"implemented={len(implemented)} ({result['coverage_ratio']*100:.1f}%) "
        f"golden={len(golden_covered)} ({result['golden_coverage_ratio']*100:.1f}%) "
        f"unimplemented={len(unimplemented)} orphan={len(orphan_ids)} -> {out_path}"
    )
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())