# QuantEngineByItz/tools/build_llm_narrative_template_lock_v1.py

"""LLM_NARRATIVE_TEMPLATE_LOCK_V1 — LLM 서술 어휘 잠금 도구.

operational_report.json 각 section.markdown에서 두 종류의 위반을 스캔한다.

(1) INVALID_NARRATIVE — 금지 어휘 블랙리스트:
    한국어: 같다, 약간, 괜찮다, 이번엔, 곧, 조만간, 강한 모멘텀
    영어: "seems like", "might be", "probably", "soon", "strong momentum", "pretty good"

(2) INVALID_SOFTENING — verdict 완화 패턴 (P3 확장):
    BLOCK/SELL/CRITICAL verdict 근방에서 아래 완화 어휘가 동시 등장하면 차단.
    완화 어휘: "그래도", "유연하게", "장기 관점", "재진입 고려", "고려 가능",
               "상황에 따라", "아직 괜찮", "지켜볼 만"

    감지 조건: 동일 섹션 내에 verdict_keyword + softening_keyword 동시 존재.

허용:
  공식 ID (FORMULA_ID_V1 형식), 산출 라벨, 산출 숫자만.

게이트 CHECK_71 + CHECK_72(SOFTENING): 총 위반 0건.
"""
from __future__ import annotations

import argparse
import json
import re
from pathlib import Path
from typing import Any

# Two directory levels above this file (the parent of the directory that
# contains it). Relative --report / --out arguments are resolved against it.
ROOT = Path(__file__).resolve().parents[1]
# Default input report and default output file, both under ROOT/Temp.
DEFAULT_REPORT = ROOT / "Temp" / "operational_report.json"
DEFAULT_OUT = ROOT / "Temp" / "llm_narrative_template_lock_v1.json"

# Forbidden-vocabulary patterns (regular expressions).
# Each entry pairs a regex source string with the label reported for its hits.
# - Several Korean entries carry (?<![A-Z_]) / (?![A-Z_]) lookarounds, so a
#   match is rejected when it directly touches an ASCII uppercase letter or an
#   underscore (presumably to keep identifier-like tokens out — TODO confirm).
# - English entries are case-insensitive ((?i)) and word-bounded (\b).
_FORBIDDEN_PATTERNS: list[tuple[str, str]] = [
    # (pattern, label)
    (r"(?<![A-Z_])같다(?![A-Z_])",             "AMBIGUOUS_PREDICATE:같다"),
    (r"(?<![A-Z_])약간(?![A-Z_])",              "VAGUE_QUALIFIER:약간"),
    (r"(?<![A-Z_])괜찮다(?![A-Z_])",            "VAGUE_QUALIFIER:괜찮다"),
    (r"이번엔",                                  "INFORMAL_TEMPORAL:이번엔"),
    (r"(?<![A-Z_])곧(?![A-Z_])",                "VAGUE_TEMPORAL:곧"),
    (r"조만간",                                  "VAGUE_TEMPORAL:조만간"),
    (r"강한\s*모멘텀",                           "VAGUE_SIGNAL:강한모멘텀"),
    (r"(?i)\bseems?\s+like\b",                   "VAGUE_ENGLISH:seems_like"),
    (r"(?i)\bmight\s+be\b",                      "VAGUE_ENGLISH:might_be"),
    (r"(?i)\bprobably\b",                        "VAGUE_ENGLISH:probably"),
    (r"(?i)\bsoon\b",                            "VAGUE_ENGLISH:soon"),
    (r"(?i)\bstrong\s+momentum\b",               "VAGUE_ENGLISH:strong_momentum"),
    (r"(?i)\bpretty\s+good\b",                   "VAGUE_ENGLISH:pretty_good"),
    (r"(?i)\blooks?\s+good\b",                   "VAGUE_ENGLISH:looks_good"),
]

# Compiled once at import time; _scan_text iterates over this list.
_COMPILED = [(re.compile(p), label) for p, label in _FORBIDDEN_PATTERNS]

# ── INVALID_SOFTENING detection (P3 extension) ──────────────────────────────
# Verdict keywords: if any of them occurs in a section, the softening scan is
# triggered for that section. The entries are regex fragments joined with "|"
# (two of them contain ".*") and are matched case-sensitively. "BLOCKED" is
# already covered by "BLOCK"; it is kept so the list stays as authored.
_VERDICT_KEYWORDS = [
    "강제 차단", "BLOCK", "CRITICAL", "매도", "손절", "SELL", "BREACH",
    "신규 매수.*금지", "매수.*차단", "BLOCKED",
]
_VERDICT_RE = re.compile("|".join(_VERDICT_KEYWORDS))

# Softening phrases: reported as INVALID_SOFTENING when they appear in a
# section that also contains a verdict keyword. Each entry is (pattern, label).
_SOFTENING_PATTERNS: list[tuple[str, str]] = [
    (r"그래도\s*(?:고려|참고|볼\s*만|매수)", "SOFTENING:그래도_고려"),
    (r"유연하게",                              "SOFTENING:유연하게"),
    (r"장기\s*관점\s*(?:재진입|매수|고려)",   "SOFTENING:장기관점_재진입"),
    (r"재진입\s*(?:고려|기회)",               "SOFTENING:재진입_고려"),
    (r"고려\s*가능",                          "SOFTENING:고려_가능"),
    (r"상황에\s*따라\s*(?:유연|조정|판단)",   "SOFTENING:상황에따라"),
    (r"아직\s*괜찮",                          "SOFTENING:아직_괜찮"),
    (r"지켜볼\s*만",                          "SOFTENING:지켜볼만"),
    (r"(?i)still\s+consider",                 "SOFTENING:still_consider"),
    # Matches "flexible" and the adverb "flexibly" (which is spelled without
    # the "e", so "flexible(?:ly)?" never caught it). The lookbehind rejects a
    # preceding ASCII letter so that "inflexible" is not reported as softening;
    # no trailing boundary is required, so "flexible하게" still matches.
    (r"(?i)(?<![A-Za-z])flexibl(?:e|y)",      "SOFTENING:flexible"),
]
_SOFTENING_RE_LIST = [(re.compile(p), label) for p, label in _SOFTENING_PATTERNS]


def _scan_softening(text: str) -> list[dict[str, Any]]:
    """Report softening phrases in *text* when it also contains a verdict keyword.

    The co-occurrence check is section-wide: the verdict keyword may be
    anywhere in *text*; no distance to the softening phrase is measured.

    Args:
        text: The markdown of one report section.

    Returns:
        One dict per softening match with the keys ``pattern_label``,
        ``matched_text``, ``context`` (up to 60 characters on each side of the
        match, newlines replaced by spaces, stripped), ``position`` (start
        offset of the match) and ``violation_type`` (always
        ``"INVALID_SOFTENING"``). Hits are grouped in ``_SOFTENING_PATTERNS``
        order, not sorted by position. Returns an empty list when *text*
        contains no verdict keyword.
    """
    # Without a verdict keyword there is nothing to soften: skip the scan.
    if not _VERDICT_RE.search(text):
        return []
    hits = []
    for pattern, label in _SOFTENING_RE_LIST:
        for m in pattern.finditer(text):
            # Context window of ±60 characters, clamped to the text bounds.
            start = max(0, m.start() - 60)
            end = min(len(text), m.end() + 60)
            context = text[start:end].replace("\n", " ").strip()
            hits.append({
                "pattern_label": label,
                "matched_text": m.group(0),
                "context": context,
                "position": m.start(),
                "violation_type": "INVALID_SOFTENING",
            })
    return hits


def _load(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 JSON and return it only if it is a JSON object.

    This is a best-effort loader: a missing file, a read failure, invalid JSON,
    or a top-level value that is not a dict all yield an empty dict instead of
    an exception.
    """
    if path.exists():
        try:
            parsed = json.loads(path.read_text(encoding="utf-8"))
        except Exception:
            # Deliberately broad: any read/parse failure means "no report".
            parsed = None
        if isinstance(parsed, dict):
            return parsed
    return {}


def _scan_text(text: str) -> list[dict[str, Any]]:
    """Collect every forbidden-vocabulary match found in *text*.

    Each hit is a dict with ``pattern_label``, ``matched_text``, ``context``
    (up to 30 characters on each side of the match, newlines replaced by
    spaces, stripped) and ``position`` (start offset of the match). Hits are
    grouped in ``_COMPILED`` order rather than sorted by position.
    """
    window = 30
    size = len(text)
    found: list[dict[str, Any]] = []
    for regex, tag in _COMPILED:
        found.extend(
            {
                "pattern_label": tag,
                "matched_text": match.group(0),
                # Slice bounds are clamped to the text, then flattened to one line.
                "context": text[
                    max(0, match.start() - window):min(size, match.end() + window)
                ].replace("\n", " ").strip(),
                "position": match.start(),
            }
            for match in regex.finditer(text)
        )
    return found


def _resolve_under_root(raw: str) -> Path:
    """Return *raw* as a Path: unchanged if absolute, otherwise under ROOT."""
    candidate = Path(raw)
    return candidate if candidate.is_absolute() else ROOT / raw


def main(argv: list[str] | None = None) -> int:
    """Scan the report's sections and write the gate result as JSON.

    Args:
        argv: Command-line arguments (``--report``, ``--out``). ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is the
            behavior of the original zero-argument call.

    Returns:
        Always 0. The gate outcome (``PASS`` / ``FAIL``) is conveyed through
        the written JSON file and the printed summary line, not the exit code.

    Notes:
        A missing or unparsable report yields no sections, hence zero
        violations and ``gate == "PASS"`` with ``sections_checked == 0``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--report", default=str(DEFAULT_REPORT))
    parser.add_argument("--out", default=str(DEFAULT_OUT))
    args = parser.parse_args(argv)

    report_path = _resolve_under_root(args.report)
    out_path = _resolve_under_root(args.out)

    report = _load(report_path)
    raw_sections = report.get("sections")
    sections = raw_sections if isinstance(raw_sections, list) else []

    section_results: list[dict[str, Any]] = []
    for section in sections:
        # Non-dict entries and sections without markdown are not counted.
        if not isinstance(section, dict):
            continue
        name = str(section.get("name") or "")
        markdown = str(section.get("markdown") or "")
        if not markdown:
            continue

        hits = _scan_text(markdown)
        softening_hits = _scan_softening(markdown)
        all_hits = hits + softening_hits

        # Softening takes precedence over plain narrative violations.
        if softening_hits:
            sec_status = "INVALID_SOFTENING"
        elif hits:
            sec_status = "INVALID_NARRATIVE"
        else:
            sec_status = "OK"

        section_results.append({
            "section_name": name,
            "violation_count": len(all_hits),
            "narrative_violations": len(hits),
            "softening_violations": len(softening_hits),
            "violations": all_hits,
            "status": sec_status,
        })

    total_softening = sum(s["softening_violations"] for s in section_results)
    total_narrative = sum(s["narrative_violations"] for s in section_results)
    total_violations = total_narrative + total_softening
    gate = "PASS" if total_violations == 0 else "FAIL"

    # Summary
    failed_sections = [s for s in section_results if s["status"] != "OK"]

    result = {
        "formula_id": "LLM_NARRATIVE_TEMPLATE_LOCK_V1",
        "gate": gate,
        "total_violations": total_violations,
        "narrative_violations": total_narrative,
        "softening_violations": total_softening,
        "sections_checked": len(section_results),
        "sections_failed": len(failed_sections),
        "forbidden_pattern_count": len(_FORBIDDEN_PATTERNS),
        "softening_pattern_count": len(_SOFTENING_PATTERNS),
        "forbidden_patterns": [label for _, label in _FORBIDDEN_PATTERNS],
        "softening_patterns": [label for _, label in _SOFTENING_PATTERNS],
        "section_results": section_results,
    }
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")
    print(
        f"LLM_NARRATIVE_TEMPLATE_LOCK_V1 gate={gate} "
        f"total_violations={total_violations} "
        f"(narrative={total_narrative} softening={total_softening}) "
        f"sections_checked={len(section_results)} sections_failed={len(failed_sections)}"
    )
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())