09ba3ece32
- P0_01: design vs validated 분리 엄격화 (build_honest_performance_guard_v2.py) - P0_02: adjusted 마스킹 제거 검증 (build_p0_02_masking_removal.py) - P0_03: 커버리지 분모 통일 (build_p0_03_unified_coverage.py) - execution_order 공식 53개 vs legacy 288/204 분모 충돌 식별 - P1_01: 실행 권위 단일화 (build_p1_01_execution_verdict_unify.py) - final_decision_packet_v2 단일 진실 원칙 검증 상태: 거짓 100% 박멸 + 실행 권위 충돌 검증 완료. 다음: P2 실전 피드백 루프 Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
299 lines
11 KiB
Python
299 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
build_honest_performance_guard_v2.py
|
|
────────────────────────────────────────────────────────────────────────
|
|
정직 성과증빙 하네스 V2 (P0_01 단계)
|
|
|
|
P0_01: design vs validated 분리를 엄격하게
|
|
|
|
모든 *_score 필드에 score_kind ∈ {DESIGN, VALIDATED} 라벨을 강제하고,
|
|
VALIDATED는 sample_n(실측 표본 수) >= 30일 때만 허용한다.
|
|
보고서에 노출되는 점수는 VALIDATED만 허용.
|
|
|
|
출력:
|
|
- Temp/honest_performance_guard_v2.json
|
|
- Temp/p0_01_strictness_report.json
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Any
|
|
|
|
# Project root: the directory two levels above this script file.
ROOT = Path(__file__).resolve().parent.parent

# Input files, all located under <ROOT>/Temp.
OP_REPORT = ROOT.joinpath("Temp", "operational_report.json")
REBOUND_EFF = ROOT.joinpath("Temp", "rebound_sell_efficiency_v1.json")
LATE_CHASE = ROOT.joinpath("Temp", "late_chase_attribution_v1.json")
PREDICTION_ACC = ROOT.joinpath("Temp", "prediction_accuracy_harness_v2.json")

# Output files, also located under <ROOT>/Temp.
OUTPUT_V2 = ROOT.joinpath("Temp", "honest_performance_guard_v2.json")
REPORT_P001 = ROOT.joinpath("Temp", "p0_01_strictness_report.json")

# Minimum sample count a score needs before it may carry the VALIDATED label.
SAMPLE_THRESHOLD = 30
# The only values accepted for a score's "score_kind" field.
ACCEPTED_SCORE_KINDS = {"DESIGN", "VALIDATED"}
|
|
|
|
# Switch stdout to UTF-8 when the console reports a different encoding, so the
# non-ASCII status messages this script prints can be encoded.
if sys.stdout.encoding and sys.stdout.encoding.lower() not in ("utf-8", "utf8"):
    if hasattr(sys.stdout, "reconfigure"):
        # Re-encode the existing stream in place. Opening a second file object
        # on the same descriptor (with the default closefd=True) would close
        # fd 1 as soon as that object is closed or garbage-collected.
        sys.stdout.reconfigure(encoding="utf-8", line_buffering=True)
    else:
        # Stream without reconfigure(): wrap its descriptor in a line-buffered
        # UTF-8 writer, but do not take ownership of the descriptor.
        sys.stdout = open(
            sys.stdout.fileno(),
            mode="w",
            encoding="utf-8",
            buffering=1,
            closefd=False,
        )
|
|
|
|
|
|
def load_json(p: Path) -> dict | list:
|
|
if not p.exists():
|
|
return {}
|
|
try:
|
|
return json.loads(p.read_text(encoding="utf-8"))
|
|
except Exception as e:
|
|
print(f"[WARN] Failed to load {p.name}: {e}")
|
|
return {}
|
|
|
|
|
|
def _score_entry_violations(value: Any, current_path: str) -> list[dict]:
    """Return the violations found on one ``*_score`` value at *current_path*."""
    if not isinstance(value, dict):
        return [{
            "path": current_path,
            "issue": "SCORE_NOT_DICT",
            "value": value,
            "detail": f"점수가 dict가 아님. 값={value}",
        }]

    found: list[dict] = []
    score_kind = value.get("score_kind")
    sample_n = value.get("sample_n")

    if score_kind is None:
        found.append({
            "path": current_path,
            "issue": "MISSING_SCORE_KIND",
            "detail": "score_kind 필드 누락",
        })
    elif not isinstance(score_kind, str) or score_kind not in ACCEPTED_SCORE_KINDS:
        # The isinstance test runs first so an unhashable score_kind (a list
        # or dict) is reported instead of raising TypeError on set membership.
        found.append({
            "path": current_path,
            "issue": "INVALID_SCORE_KIND",
            "value": score_kind,
            "detail": f"허용되지 않는 값: {score_kind}",
        })

    if sample_n is None:
        found.append({
            "path": current_path,
            "issue": "MISSING_SAMPLE_N",
            "detail": "sample_n 필드 누락",
        })
    elif score_kind == "VALIDATED":
        # bool is a subclass of int, so it is excluded explicitly.
        is_number = isinstance(sample_n, (int, float)) and not isinstance(sample_n, bool)
        if not is_number:
            # A VALIDATED label cannot be backed by a non-numeric sample count.
            found.append({
                "path": current_path,
                "issue": "INVALID_VALIDATED_LABEL",
                "sample_n": sample_n,
                "detail": f"VALIDATED 라벨인데 sample_n이 숫자가 아님. 값={sample_n!r}",
            })
        elif not sample_n >= SAMPLE_THRESHOLD:
            # Written as "not >=" so that NaN also fails the check.
            found.append({
                "path": current_path,
                "issue": "INVALID_VALIDATED_LABEL",
                "sample_n": sample_n,
                "detail": f"VALIDATED 라벨인데 sample_n={sample_n} < {SAMPLE_THRESHOLD}",
            })

    return found


def check_all_scores_have_kind_and_sample_n(obj: Any, path: str = "") -> list[dict]:
    """Check every ``*_score`` field for a valid ``score_kind`` and ``sample_n``.

    Walks nested dicts and lists. Each dict key ending in ``_score`` must map
    to a dict whose ``score_kind`` is one of ``ACCEPTED_SCORE_KINDS`` and
    which has a ``sample_n``. A ``VALIDATED`` score additionally needs a
    numeric ``sample_n`` of at least ``SAMPLE_THRESHOLD``. The contents of a
    ``*_score`` value are not searched for further ``*_score`` keys.

    Args:
        obj: Decoded JSON-like value to inspect. Anything that is neither a
            dict nor a list produces no violations.
        path: Location of *obj* inside the document, used as the prefix of
            every reported ``path`` (``a.b`` for keys, ``a[0]`` for indexes).

    Returns:
        A list of violation dicts. Each has ``path``, ``issue`` and
        ``detail``; some also carry ``value`` or ``sample_n``. ``issue`` is
        one of ``SCORE_NOT_DICT``, ``MISSING_SCORE_KIND``,
        ``INVALID_SCORE_KIND``, ``MISSING_SAMPLE_N`` or
        ``INVALID_VALIDATED_LABEL``.
    """
    violations: list[dict] = []

    if isinstance(obj, dict):
        for key, value in obj.items():
            current_path = f"{path}.{key}" if path else key

            if isinstance(key, str) and key.endswith("_score"):
                violations.extend(_score_entry_violations(value, current_path))
            elif isinstance(value, (dict, list)):
                # Only non-score containers are descended into.
                violations.extend(
                    check_all_scores_have_kind_and_sample_n(value, current_path)
                )
    elif isinstance(obj, list):
        for i, item in enumerate(obj):
            violations.extend(
                check_all_scores_have_kind_and_sample_n(item, f"{path}[{i}]")
            )

    return violations
|
|
|
|
|
|
def _as_mapping(obj: Any) -> dict:
    """Return *obj* when it is a dict, otherwise an empty dict."""
    return obj if isinstance(obj, dict) else {}


def _as_sample_count(raw: Any) -> float:
    """Return *raw* when it is an int or float (bool excluded), otherwise 0."""
    if isinstance(raw, bool) or not isinstance(raw, (int, float)):
        return 0
    return raw


def _metric_finding(metric_name: str, value: Any, sample_n: float) -> dict:
    """Build the findings entry for one metric from its value and sample count."""
    meets_threshold = sample_n >= SAMPLE_THRESHOLD
    return {
        "metric_name": metric_name,
        "current_value": value,
        "sample_n": sample_n,
        "meets_validated_threshold": meets_threshold,
        "required_score_kind": "VALIDATED" if meets_threshold else "DESIGN",
    }


def _design_correction(metric: str, value: Any, sample_n: float, annotation: str) -> dict:
    """Build the ANNOTATE_DESIGN correction entry for an under-sampled metric."""
    return {
        "metric": metric,
        "action": "ANNOTATE_DESIGN",
        "new_structure": {
            "score_kind": "DESIGN",
            "value": value,
            "sample_n": sample_n,
            "annotation": annotation,
        },
    }


def build_strictness_report(rebound: dict, chase: dict, pred_acc: dict) -> dict:
    """Build the P0_01 strictness report for three metrics.

    For each metric the sample count is compared with ``SAMPLE_THRESHOLD``.
    A metric at or above the threshold requires the ``VALIDATED`` label; one
    below it requires ``DESIGN`` and adds an ``ANNOTATE_DESIGN`` entry to
    ``corrections_required``.

    Args:
        rebound: Payload read for ``metrics.combo_count`` (sample count) and
            ``metrics.rebound_efficiency_score`` (value).
        chase: Payload read for ``metrics.sample_n`` (sample count) and
            ``metrics.chase_entry_rate_pct`` (value).
        pred_acc: Payload read for ``t5_sample`` (sample count) and
            ``t5_op_rate`` (value).

        A payload or ``metrics`` entry that is not a dict is treated as
        empty. A sample count that is missing, null or not a number is
        treated as 0, so the metric falls back to ``DESIGN``.

    Returns:
        A dict with ``phase``, ``generated_at``, ``threshold_sample_min``,
        ``findings``, ``violations``, ``corrections_required`` and
        ``verdict``. ``violations`` is always an empty list here. The verdict
        status is ``PASS`` only when no correction is required, otherwise
        ``FAIL_CORRECTION_REQUIRED``.
    """
    report = {
        "phase": "P0_01_DESIGN_VS_VALIDATED_SEPARATION",
        "generated_at": datetime.now().isoformat(),
        "threshold_sample_min": SAMPLE_THRESHOLD,
        "findings": {
            "rebound_efficiency": {},
            "late_chase_attribution": {},
            "prediction_accuracy": {},
        },
        "violations": [],
        "corrections_required": [],
    }
    findings = report["findings"]
    corrections = report["corrections_required"]

    # 1. Rebound efficiency.
    rb_metrics = _as_mapping(_as_mapping(rebound).get("metrics"))
    rb_combo = _as_sample_count(rb_metrics.get("combo_count", 0))
    rb_score = rb_metrics.get("rebound_efficiency_score", 0)

    rb_finding = _metric_finding("rebound_efficiency_score", rb_score, rb_combo)
    rb_validated = rb_finding["meets_validated_threshold"]
    # Only this metric carries a display suffix for its design-stage score.
    rb_finding["annotation_suffix"] = "" if rb_validated else f" [설계점수, n={rb_combo}]"
    findings["rebound_efficiency"] = rb_finding
    if not rb_validated:
        corrections.append(_design_correction(
            "rebound_efficiency_score",
            rb_score,
            rb_combo,
            f"n={rb_combo} < {SAMPLE_THRESHOLD}. 실측 미검증.",
        ))

    # 2. Late-chase attribution.
    chase_metrics = _as_mapping(_as_mapping(chase).get("metrics"))
    chase_sample = _as_sample_count(chase_metrics.get("sample_n", 0))
    chase_rate = chase_metrics.get("chase_entry_rate_pct", 0)

    chase_finding = _metric_finding("late_chase_attribution", chase_rate, chase_sample)
    findings["late_chase_attribution"] = chase_finding
    if not chase_finding["meets_validated_threshold"]:
        corrections.append(_design_correction(
            "late_chase_attribution",
            chase_rate,
            chase_sample,
            f"뒷박 차단 효과 미검증 (n={chase_sample})",
        ))

    # 3. Prediction accuracy (T+5).
    pred = _as_mapping(pred_acc)
    t5_sample = _as_sample_count(pred.get("t5_sample", 0))
    t5_rate = pred.get("t5_op_rate", 0)

    pred_finding = _metric_finding("t5_match_rate_pct", t5_rate, t5_sample)
    findings["prediction_accuracy"] = pred_finding
    if not pred_finding["meets_validated_threshold"]:
        corrections.append(_design_correction(
            "t5_match_rate_pct",
            t5_rate,
            t5_sample,
            f"실측 미검증 (n={t5_sample})",
        ))

    # Final verdict: passes only when nothing needs re-labelling.
    correction_count = len(corrections)
    report["verdict"] = {
        "all_scores_properly_labeled": correction_count == 0,
        "required_corrections_count": correction_count,
        "status": "PASS" if correction_count == 0 else "FAIL_CORRECTION_REQUIRED",
    }

    return report
|
|
|
|
|
|
def main() -> int:
    """Run the P0_01 check, print a summary and write both output files.

    Loads the rebound, late-chase and prediction-accuracy inputs, builds the
    strictness report, prints each finding, then writes the report to
    ``REPORT_P001`` and the V2 guard to ``OUTPUT_V2``.

    Returns:
        0 when the report verdict status is ``PASS``, otherwise 1.
    """
    print("=" * 80)
    print(" P0_01: Design vs Validated 엄격한 분리")
    print("=" * 80)

    # Load inputs.
    rebound = load_json(REBOUND_EFF)
    chase = load_json(LATE_CHASE)
    pred_acc = load_json(PREDICTION_ACC)

    # Build the P0_01 report.
    p001_report = build_strictness_report(rebound, chase, pred_acc)
    verdict = p001_report["verdict"]

    # (header line, findings key, unit appended to the current value)
    sections = (
        ("\n[1] 재정렬 효율 (rebound_efficiency_score)", "rebound_efficiency", ""),
        ("\n[2] 뒷박 매수 (late_chase_attribution)", "late_chase_attribution", ""),
        ("\n[3] 예측 정확도 (T+5 일치율)", "prediction_accuracy", "%"),
    )
    for header, finding_key, unit in sections:
        finding = p001_report["findings"][finding_key]
        print(header)
        print(f" 현재값: {finding['current_value']}{unit}")
        print(f" 표본 수: {finding['sample_n']} / {SAMPLE_THRESHOLD}")
        print(f" 필수 라벨: {finding['required_score_kind']}")

    print("\n[결과]")
    print(f" 필요한 수정: {verdict['required_corrections_count']}")
    print(f" 상태: {verdict['status']}")

    # Save the report. The output directory is created first because
    # write_text() raises FileNotFoundError when it does not exist.
    REPORT_P001.parent.mkdir(parents=True, exist_ok=True)
    REPORT_P001.write_text(
        json.dumps(p001_report, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"\n✓ P0_01 보고서 저장: {REPORT_P001.name}")

    # Build the V2 guard.
    guard_v2 = {
        "schema_version": "honest_performance_guard_v2",
        "generated_at": datetime.now().isoformat(),
        "p0_01_strictness": verdict,
        "required_corrections": p001_report["corrections_required"],
        "action_plan": [
            {
                "step": 1,
                "title": "모든 *_score 필드를 dict 구조로 변환",
                "fields": ["score_kind", "value", "sample_n", "annotation"],
            },
            {
                "step": 2,
                "title": "각 필드에 score_kind ∈ {DESIGN, VALIDATED} 할당",
                # Derived from the constant so the text cannot drift from the check.
                "rule": f"sample_n >= {SAMPLE_THRESHOLD} → VALIDATED, else → DESIGN",
            },
            {
                "step": 3,
                "title": "보고서 노출 규칙 적용",
                "rule": "DESIGN 점수는 보고서 요약에 단독 노출 금지. (설계, n=N) 접미사 필수",
            },
        ],
    }

    OUTPUT_V2.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT_V2.write_text(
        json.dumps(guard_v2, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"✓ P0_01 가드 저장: {OUTPUT_V2.name}")

    return 0 if verdict["status"] == "PASS" else 1
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|