"""build_fundamental_raw_v2.py — FUNDAMENTAL_RAW_V2 P1-011: fundamental_raw_v1의 data_quality=FULL 레이블이 OCF/FCF 부재를 숨기는 문제 해소. - 필드 단위 coverage 산출 (ticker 단위 아님) - OCF/FCF 없으면 FULL이 아닌 PARTIAL - engine_audit(61.6) vs data_quality(100) 충돌 근거 명시 """ from __future__ import annotations import argparse import json from datetime import datetime, timezone from pathlib import Path from v7_hardening_common import ROOT, TEMP, load_json, save_json DEFAULT_RAW_V1 = TEMP / "fundamental_raw_v1.json" DEFAULT_OUT = TEMP / "fundamental_raw_v2.json" # 필드 가중치 (multifactor_v4와 동일) FIELD_WEIGHTS = { "roe_pct": 25, "opm_pct": 20, "ocf_krw": 15, # OCF/FCF 합산 30점 중 반 "fcf_krw": 15, "net_debt_krw": 10, "per": 8, "pbr": 7, } TOTAL_WEIGHT = sum(FIELD_WEIGHTS.values()) # = 100 # FULL 판정: ROE/OPM + 밸류에이션 + (OCF OR FCF) 중 하나라도 있어야 함 def _reclassify_data_quality(row: dict) -> str: if row.get("data_quality") == "ETF_EXCLUDED": return "ETF_EXCLUDED" has_core = (row.get("roe_pct") is not None and row.get("opm_pct") is not None) has_val = (row.get("per") is not None or row.get("pbr") is not None) has_cf = (row.get("ocf_krw") is not None or row.get("fcf_krw") is not None) if has_core and has_val and has_cf: return "FULL" if has_core and has_val: return "PARTIAL" # OCF/FCF 없음 if has_core: return "SPARSE" return "MISSING" def _field_coverage(rows: list[dict]) -> dict[str, float]: non_etf = [r for r in rows if r.get("data_quality") != "ETF_EXCLUDED"] if not non_etf: return {} return { field: round(sum(1 for r in non_etf if r.get(field) is not None) / len(non_etf) * 100.0, 2) for field in FIELD_WEIGHTS } def _weighted_coverage(field_cov: dict[str, float]) -> float: total_w = 0.0 covered_w = 0.0 for field, weight in FIELD_WEIGHTS.items(): total_w += weight covered_w += weight * (field_cov.get(field, 0.0) / 100.0) return round(covered_w / total_w * 100.0, 2) if total_w else 0.0 def main() -> int: ap = argparse.ArgumentParser() ap.add_argument("--raw-v1", default=str(DEFAULT_RAW_V1)) ap.add_argument("--out", default=str(DEFAULT_OUT)) args = ap.parse_args() raw_v1 = load_json(Path(args.raw_v1)) rows_in: list[dict] = raw_v1.get("rows", []) if isinstance(raw_v1, dict) else [] rows_out = [] for row in rows_in: r = dict(row) r["data_quality_v1"] = row.get("data_quality") # 이전 레이블 보존 r["data_quality"] = _reclassify_data_quality(row) # 각 필드 실측 여부 기록 r["field_coverage"] = { f: (row.get(f) is not None) for f in FIELD_WEIGHTS } rows_out.append(r) field_cov = _field_coverage(rows_out) weighted_cov = _weighted_coverage(field_cov) non_etf = [r for r in rows_out if r.get("data_quality") != "ETF_EXCLUDED"] from collections import Counter dq_counts = Counter(r["data_quality"] for r in rows_out) result = { "formula_id": "FUNDAMENTAL_RAW_V2", "generated_at": datetime.now(timezone.utc).isoformat(), "ticker_count": len(rows_out), "non_etf_count": len(non_etf), # coverage 지표 "raw_field_coverage_pct": weighted_cov, "field_coverage_pct": field_cov, "data_quality_counts": dict(dq_counts), # 충돌 근거 (engine_audit vs data_quality) "conflict_note": ( "engine_audit가 낮은 fundamental_score를 보고하는 이유: " "OCF/FCF 0% 커버리지로 인해 가중 커버리지가 낮음. " "data_quality의 schema_presence_score=100은 필드 존재 여부만 확인." ), "v1_label_issue": ( f"v1 data_quality=FULL {dq_counts.get('FULL',0)+len([r for r in rows_out if r.get('data_quality_v1')=='FULL' and r['data_quality']=='PARTIAL'])}건 중 " f"{len([r for r in rows_out if r.get('data_quality_v1')=='FULL' and r['data_quality']=='PARTIAL'])}건이 " "OCF/FCF 부재로 실제 PARTIAL → 수정됨" ), "rows": rows_out, } save_json(args.out, result) print(json.dumps({k: v for k, v in result.items() if k != "rows"}, ensure_ascii=False, indent=2)) return 0 if __name__ == "__main__": raise SystemExit(main())