Files
QuantEngineByItz/tools/build_honest_performance_guard_v2.py
T
kjh2064 09ba3ece32 feat(v9-hardening): P0/P1 작업 검사 스크립트 추가 (P0_01/02/03, P1_01)
- P0_01: design vs validated 분리 엄격화 (build_honest_performance_guard_v2.py)
- P0_02: adjusted 마스킹 제거 검증 (build_p0_02_masking_removal.py)
- P0_03: 커버리지 분모 통일 (build_p0_03_unified_coverage.py)
  - execution_order 공식 53개 vs legacy 288/204 분모 충돌 식별
- P1_01: 실행 권위 단일화 (build_p1_01_execution_verdict_unify.py)
  - final_decision_packet_v2 단일 진실 원칙 검증

상태: 거짓 100% 박멸 + 실행 권위 충돌 검증 완료. 다음: P2 실전 피드백 루프

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2026-06-25 17:40:19 +09:00

299 lines
11 KiB
Python

#!/usr/bin/env python3
"""
build_honest_performance_guard_v2.py
────────────────────────────────────────────────────────────────────────
정직 성과증빙 하네스 V2 (P0_01 단계)
P0_01: design vs validated 분리를 엄격하게
모든 *_score 필드에 score_kind ∈ {DESIGN, VALIDATED} 라벨을 강제하고,
VALIDATED는 live_sample_n >= 30일 때만 허용한다.
보고서에 노출되는 점수는 VALIDATED만 허용.
출력:
- Temp/honest_performance_guard_v2.json
- Temp/p0_01_strictness_report.json
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
from datetime import datetime
from typing import Any
# Two directory levels above this script's resolved location.
ROOT = Path(__file__).resolve().parent.parent
# Input files (all located under ROOT / "Temp").
# NOTE(review): OP_REPORT is not referenced anywhere in the visible code.
OP_REPORT = ROOT / "Temp" / "operational_report.json"
REBOUND_EFF = ROOT / "Temp" / "rebound_sell_efficiency_v1.json"
LATE_CHASE = ROOT / "Temp" / "late_chase_attribution_v1.json"
PREDICTION_ACC = ROOT / "Temp" / "prediction_accuracy_harness_v2.json"
# Output files (written by main()).
OUTPUT_V2 = ROOT / "Temp" / "honest_performance_guard_v2.json"
REPORT_P001 = ROOT / "Temp" / "p0_01_strictness_report.json"
# Minimum sample count a metric needs to be labelled VALIDATED; below it the
# metric must be labelled DESIGN.
SAMPLE_THRESHOLD = 30
# The only values accepted for a score's "score_kind" field.
ACCEPTED_SCORE_KINDS = {"DESIGN", "VALIDATED"}
# Force UTF-8 console output so the non-ASCII report text prints on consoles
# whose default encoding is not UTF-8.
# reconfigure() switches the existing stream in place instead of opening a
# second file object on the same descriptor, so it also works when stdout has
# no real fileno() (captured/redirected streams) and never closes fd 1 behind
# the interpreter's back. line_buffering=True mirrors the former buffering=1.
if sys.stdout is not None and hasattr(sys.stdout, "reconfigure"):
    if (sys.stdout.encoding or "").lower() not in ("", "utf-8", "utf8"):
        sys.stdout.reconfigure(encoding="utf-8", line_buffering=True)
def load_json(p: Path) -> dict | list:
    """Load a UTF-8 JSON file, falling back to an empty dict on any failure.

    Args:
        p: Path of the JSON file to read.

    Returns:
        The parsed JSON document when it is an object or an array. An empty
        dict is returned when the file is missing (silently), when it cannot
        be read or parsed (with a ``[WARN]`` line on stdout), or when the
        top-level JSON value is a scalar such as ``null`` (also with a
        warning), so that callers always receive a ``dict`` or ``list``.
    """
    try:
        # Read directly instead of checking exists() first: avoids the
        # check-then-use race and a second filesystem lookup.
        data = json.loads(p.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return {}
    except (OSError, ValueError) as e:
        # OSError: unreadable path; ValueError: bad UTF-8 or malformed JSON
        # (json.JSONDecodeError and UnicodeDecodeError are both subclasses).
        print(f"[WARN] Failed to load {p.name}: {e}")
        return {}
    if not isinstance(data, (dict, list)):
        # A bare scalar would break callers that rely on the declared
        # dict | list return type.
        print(
            f"[WARN] Failed to load {p.name}: "
            f"top-level JSON value is {type(data).__name__}, expected object or array"
        )
        return {}
    return data
def _check_score_entry(value: Any, current_path: str) -> list[dict]:
    """Return the labeling violations for a single ``*_score`` field."""
    if not isinstance(value, dict):
        return [{
            "path": current_path,
            "issue": "SCORE_NOT_DICT",
            "value": value,
            "detail": f"점수가 dict가 아님. 값={value}"
        }]

    found: list[dict] = []
    score_kind = value.get("score_kind")
    sample_n = value.get("sample_n")

    if score_kind is None:
        found.append({
            "path": current_path,
            "issue": "MISSING_SCORE_KIND",
            "detail": "score_kind 필드 누락"
        })
    elif not isinstance(score_kind, str) or score_kind not in ACCEPTED_SCORE_KINDS:
        # The isinstance guard keeps an unhashable value (list/dict) from
        # raising TypeError in the set membership test.
        found.append({
            "path": current_path,
            "issue": "INVALID_SCORE_KIND",
            "value": score_kind,
            "detail": f"허용되지 않는 값: {score_kind}"
        })

    if sample_n is None:
        found.append({
            "path": current_path,
            "issue": "MISSING_SAMPLE_N",
            "detail": "sample_n 필드 누락"
        })
    elif score_kind == "VALIDATED":
        if isinstance(sample_n, bool) or not isinstance(sample_n, (int, float)):
            # A VALIDATED label whose sample size is not a number cannot be
            # verified against the threshold, so it must not pass silently.
            found.append({
                "path": current_path,
                "issue": "INVALID_SAMPLE_N",
                "value": sample_n,
                "detail": f"sample_n이 숫자가 아님: {sample_n!r}"
            })
        elif not sample_n >= SAMPLE_THRESHOLD:
            # "not >=" (rather than "<") also rejects NaN.
            found.append({
                "path": current_path,
                "issue": "INVALID_VALIDATED_LABEL",
                "sample_n": sample_n,
                "detail": f"VALIDATED 라벨인데 sample_n={sample_n} < {SAMPLE_THRESHOLD}"
            })
    return found


def check_all_scores_have_kind_and_sample_n(obj: Any, path: str = "") -> list[dict]:
    """Recursively check that every ``*_score`` field is properly labelled.

    A dict key ending in ``_score`` must map to a dict that carries a
    ``score_kind`` from ``ACCEPTED_SCORE_KINDS`` and a ``sample_n``. A
    ``VALIDATED`` score must additionally have a numeric ``sample_n`` of at
    least ``SAMPLE_THRESHOLD``. Non-score values that are dicts or lists are
    traversed recursively; the contents of a ``*_score`` value itself are not
    traversed further.

    Args:
        obj: Parsed JSON-like structure (dict, list, or scalar) to inspect.
        path: Dotted/indexed path of ``obj`` within the root document, used
            to build the ``path`` of each reported violation.

    Returns:
        A list of violation dicts, each with ``path``, ``issue`` and
        ``detail`` keys (plus ``value`` or ``sample_n`` for some issues).
        Possible ``issue`` codes: ``SCORE_NOT_DICT``, ``MISSING_SCORE_KIND``,
        ``INVALID_SCORE_KIND``, ``MISSING_SAMPLE_N``, ``INVALID_SAMPLE_N``,
        ``INVALID_VALIDATED_LABEL``. Empty when nothing is wrong.
    """
    violations: list[dict] = []
    if isinstance(obj, dict):
        for key, value in obj.items():
            current_path = f"{path}.{key}" if path else key
            if key.endswith("_score"):
                violations.extend(_check_score_entry(value, current_path))
            elif isinstance(value, (dict, list)):
                violations.extend(
                    check_all_scores_have_kind_and_sample_n(value, current_path)
                )
    elif isinstance(obj, list):
        for i, item in enumerate(obj):
            violations.extend(
                check_all_scores_have_kind_and_sample_n(item, f"{path}[{i}]")
            )
    return violations
def _as_mapping(obj: Any) -> dict:
    """Return *obj* when it is a dict, otherwise an empty dict."""
    return obj if isinstance(obj, dict) else {}


def _as_sample_count(raw: Any) -> int | float:
    """Return *raw* when it is a real number, otherwise 0 (null/str/bool)."""
    if isinstance(raw, bool) or not isinstance(raw, (int, float)):
        return 0
    return raw


def _assess_metric(
    metric_name: str, value: Any, sample_n: int | float, annotation: str
) -> tuple[dict, dict | None]:
    """Build the finding for one metric and, if under-sampled, its correction."""
    # Evaluate the threshold once so the finding and the correction can never
    # disagree (e.g. for NaN, where both ">=" and "<" are false).
    meets_threshold = sample_n >= SAMPLE_THRESHOLD
    finding = {
        "metric_name": metric_name,
        "current_value": value,
        "sample_n": sample_n,
        "meets_validated_threshold": meets_threshold,
        "required_score_kind": "VALIDATED" if meets_threshold else "DESIGN"
    }
    if meets_threshold:
        return finding, None
    correction = {
        "metric": metric_name,
        "action": "ANNOTATE_DESIGN",
        "new_structure": {
            "score_kind": "DESIGN",
            "value": value,
            "sample_n": sample_n,
            "annotation": annotation
        }
    }
    return finding, correction


def build_strictness_report(rebound: dict, chase: dict, pred_acc: dict) -> dict:
    """Build the P0_01 design-vs-validated strictness report.

    For each of the three metrics the sample count is compared with
    ``SAMPLE_THRESHOLD``; a metric below the threshold must be labelled
    ``DESIGN`` and gets an ``ANNOTATE_DESIGN`` entry in
    ``corrections_required``.

    Args:
        rebound: Rebound-efficiency payload; reads ``metrics.combo_count``
            (sample count) and ``metrics.rebound_efficiency_score``.
        chase: Late-chase payload; reads ``metrics.sample_n`` and
            ``metrics.chase_entry_rate_pct``.
        pred_acc: Prediction-accuracy payload; reads ``t5_sample`` and
            ``t5_op_rate``.

        Payloads that are not dicts, a ``metrics`` value that is not a dict,
        and sample counts that are missing, null or non-numeric are all
        treated as "no data" (sample count 0) instead of raising.

    Returns:
        A dict with ``phase``, ``generated_at``, ``threshold_sample_min``,
        ``findings`` (one entry per metric), ``violations`` (always empty
        here), ``corrections_required`` and ``verdict``. ``verdict.status``
        is ``"PASS"`` only when no correction is required.
    """
    rb_metrics = _as_mapping(_as_mapping(rebound).get("metrics"))
    rb_n = _as_sample_count(rb_metrics.get("combo_count", 0))

    chase_metrics = _as_mapping(_as_mapping(chase).get("metrics"))
    chase_n = _as_sample_count(chase_metrics.get("sample_n", 0))

    pred = _as_mapping(pred_acc)
    t5_n = _as_sample_count(pred.get("t5_sample", 0))

    # (findings key, (finding, correction-or-None)) in report order.
    assessments = (
        ("rebound_efficiency", _assess_metric(
            "rebound_efficiency_score",
            rb_metrics.get("rebound_efficiency_score", 0),
            rb_n,
            f"n={rb_n} < {SAMPLE_THRESHOLD}. 실측 미검증.",
        )),
        ("late_chase_attribution", _assess_metric(
            "late_chase_attribution",
            chase_metrics.get("chase_entry_rate_pct", 0),
            chase_n,
            f"뒷박 차단 효과 미검증 (n={chase_n})",
        )),
        ("prediction_accuracy", _assess_metric(
            "t5_match_rate_pct",
            pred.get("t5_op_rate", 0),
            t5_n,
            f"실측 미검증 (n={t5_n})",
        )),
    )

    findings: dict[str, dict] = {}
    corrections: list[dict] = []
    for finding_key, (finding, correction) in assessments:
        findings[finding_key] = finding
        if correction is not None:
            corrections.append(correction)

    # Only the rebound finding carries a display suffix for design scores.
    rb_finding = findings["rebound_efficiency"]
    rb_finding["annotation_suffix"] = (
        "" if rb_finding["meets_validated_threshold"] else f" [설계점수, n={rb_n}]"
    )

    all_labeled = not corrections
    return {
        "phase": "P0_01_DESIGN_VS_VALIDATED_SEPARATION",
        "generated_at": datetime.now().isoformat(),
        "threshold_sample_min": SAMPLE_THRESHOLD,
        "findings": findings,
        "violations": [],
        "corrections_required": corrections,
        "verdict": {
            "all_scores_properly_labeled": all_labeled,
            "required_corrections_count": len(corrections),
            "status": "PASS" if all_labeled else "FAIL_CORRECTION_REQUIRED"
        }
    }
def main() -> int:
    """Run the P0_01 strictness check, print a summary and write both outputs.

    Loads the three input payloads, builds the strictness report, prints the
    per-metric findings and the verdict, then writes the report to
    ``REPORT_P001`` and the guard document to ``OUTPUT_V2``.

    Returns:
        0 when the verdict status is ``"PASS"``, otherwise 1 (used as the
        process exit code).
    """
    print("=" * 80)
    print(" P0_01: Design vs Validated 엄격한 분리")
    print("=" * 80)

    # Load inputs (missing/unreadable files come back as empty dicts).
    rebound = load_json(REBOUND_EFF)
    chase = load_json(LATE_CHASE)
    pred_acc = load_json(PREDICTION_ACC)

    p001_report = build_strictness_report(rebound, chase, pred_acc)

    # (header line, findings key, suffix printed after the current value)
    sections = (
        ("\n[1] 재정렬 효율 (rebound_efficiency_score)", "rebound_efficiency", ""),
        ("\n[2] 뒷박 매수 (late_chase_attribution)", "late_chase_attribution", ""),
        ("\n[3] 예측 정확도 (T+5 일치율)", "prediction_accuracy", "%"),
    )
    for header, finding_key, value_suffix in sections:
        finding = p001_report["findings"][finding_key]
        print(header)
        print(f" 현재값: {finding['current_value']}{value_suffix}")
        print(f" 표본 수: {finding['sample_n']} / {SAMPLE_THRESHOLD}")
        print(f" 필수 라벨: {finding['required_score_kind']}")

    verdict = p001_report["verdict"]
    print("\n[결과]")
    print(f" 필요한 수정: {verdict['required_corrections_count']}")
    print(f" 상태: {verdict['status']}")

    # Missing inputs are tolerated, so the output directory may not exist
    # yet (fresh checkout); create it rather than failing in write_text.
    REPORT_P001.parent.mkdir(parents=True, exist_ok=True)
    REPORT_P001.write_text(
        json.dumps(p001_report, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )
    print(f"\n✓ P0_01 보고서 저장: {REPORT_P001.name}")

    # Guard document: verdict, required corrections and a fixed action plan.
    guard_v2 = {
        "schema_version": "honest_performance_guard_v2",
        "generated_at": datetime.now().isoformat(),
        "p0_01_strictness": verdict,
        "required_corrections": p001_report["corrections_required"],
        "action_plan": [
            {
                "step": 1,
                "title": "모든 *_score 필드를 dict 구조로 변환",
                "fields": ["score_kind", "value", "sample_n", "annotation"]
            },
            {
                "step": 2,
                "title": "각 필드에 score_kind ∈ {DESIGN, VALIDATED} 할당",
                "rule": "sample_n >= 30 → VALIDATED, else → DESIGN"
            },
            {
                "step": 3,
                "title": "보고서 노출 규칙 적용",
                "rule": "DESIGN 점수는 보고서 요약에 단독 노출 금지. (설계, n=N) 접미사 필수"
            }
        ]
    }
    OUTPUT_V2.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT_V2.write_text(
        json.dumps(guard_v2, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )
    print(f"✓ P0_01 가드 저장: {OUTPUT_V2.name}")

    return 0 if verdict["status"] == "PASS" else 1
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())