# QuantEngineByItz/src/quant_engine/sector_universe_refresh.py
# (Python source; 297 lines, 12 KiB)

from __future__ import annotations

import datetime as dt
import math
from typing import Any

# Maximum age, in days, of a sector's newest Source_AsOf date for the sector
# to be graded CURRENT; also echoed as "max_age_days" in the audit result.
DEFAULT_MAX_AGE_DAYS = 31
def _txt(value: Any, default: str = "") -> str:
if value is None:
return default
if isinstance(value, str):
return value.strip() or default
return str(value).strip() or default
def _as_float(value: Any) -> float | None:
try:
if value in (None, ""):
return None
if isinstance(value, str):
text = value.strip().replace("%", "").replace(",", "")
if not text:
return None
return float(text)
return float(value)
except Exception:
return None
def _parse_date(value: Any) -> dt.date | None:
if value in (None, ""):
return None
if isinstance(value, dt.date):
return value
text = _txt(value)
if not text:
return None
for fmt in ("%Y-%m-%d", "%Y.%m.%d", "%Y/%m/%d"):
try:
return dt.datetime.strptime(text[:10], fmt).date()
except Exception:
pass
try:
return dt.date.fromisoformat(text[:10])
except Exception:
return None
def _age_days(value: Any, today: dt.date | None = None) -> int | None:
    """Return how many days before *today* the date in *value* lies.

    *value* is interpreted with ``_parse_date``; ``None`` is returned when it
    cannot be parsed.  When *today* is not supplied, the current date at a
    fixed UTC+09:00 offset is used.  The result is negative for dates that
    lie after *today*.
    """
    as_of = _parse_date(value)
    if as_of is None:
        return None
    if not today:
        utc_plus_nine = dt.timezone(dt.timedelta(hours=9))
        today = dt.datetime.now(utc_plus_nine).date()
    delta = today - as_of
    return delta.days
def _extract_sector_rows(payload: dict[str, Any] | None) -> list[dict[str, Any]]:
if not isinstance(payload, dict):
return []
inner = payload.get("data")
if isinstance(inner, dict) and isinstance(inner.get("sector_universe"), list):
return [r for r in inner["sector_universe"] if isinstance(r, dict)]
if isinstance(payload.get("sector_universe"), list):
return [r for r in payload["sector_universe"] if isinstance(r, dict)]
return []
# Precedence used when a sector's rows carry different ``Source`` values: the
# first entry found among the sector's values becomes its source kind.
_SOURCE_KIND_PRIORITY = (
    "NAVER_ETF_PAGE_FAIL_LAYOUT_CHANGED",
    "NAVER_ETF_PAGE_FAIL",
    "NAVER_ETF_PAGE",
    "REPRESENTATIVE_STOCK_PROXY",
    "DEFAULT_TEMPLATE",
    "SHEET_INPUT",
)

# Source kinds graded by URL presence and as-of age, mapped to the transport
# mode reported for them.
_AGE_GRADED_TRANSPORT = {
    "REPRESENTATIVE_STOCK_PROXY": "HTML_SERVER_RENDERED",
    "SHEET_INPUT": "MANUAL_OR_TEMPLATE",
    "NAVER_ETF_PAGE": "HTML_SERVER_RENDERED",
}

# Upper age bound (days) of the DUE band; anything older is OVERDUE.
_DUE_MAX_AGE_DAYS = 45

# ``Is_ETF`` cell values (compared stripped and upper-cased) that mark an ETF.
_ETF_FLAG_VALUES = frozenset({"Y", "YES", "TRUE", "1"})


def _resolve_source_kind(source_values: set[str]) -> str:
    """Return the highest-precedence known source kind, or ``"OTHER"``."""
    for kind in _SOURCE_KIND_PRIORITY:
        if kind in source_values:
            return kind
    return "OTHER"


def _classify_sector(
    source_kind: str,
    source_url: str,
    age_days: int | None,
) -> tuple[str, str, list[str], bool, bool]:
    """Grade one sector.

    Returns:
        ``(status, transport_mode, reasons, url_missing, stale)`` where
        *reasons* is an ordered, duplicate-free list of reason tokens,
        *url_missing* says whether the sector counts towards
        ``missing_source_url_count`` and *stale* whether it counts towards
        ``stale_sector_count``.
    """
    has_url = bool(source_url)
    reasons: list[str] = []
    url_missing = False
    stale = False

    if source_kind == "DEFAULT_TEMPLATE":
        # Templates are only annotated when the URL is absent; they are not
        # added to the missing-URL counter.
        status, transport_mode = "TEMPLATE", "MANUAL_OR_TEMPLATE"
        reasons.append("DEFAULT_TEMPLATE")
    elif source_kind == "NAVER_ETF_PAGE_FAIL_LAYOUT_CHANGED":
        status, transport_mode = "LAYOUT_CHANGED", "LAYOUT_CHANGED"
        reasons.append("LAYOUT_CHANGED")
        if not has_url:
            url_missing = True
            reasons.append("Source_URL_MISSING")
        if age_days is None:
            reasons.append("Source_AsOf_MISSING")
        else:
            # Any dated layout-changed sector is counted as stale, whatever
            # its age.
            stale = True
            reasons.append(f"AgeDays={age_days}")
    elif source_kind == "NAVER_ETF_PAGE_FAIL":
        status, transport_mode = "INVALID", "UNKNOWN"
        reasons.append("NAVER_ETF_PAGE_FAIL")
        url_missing = not has_url
    elif source_kind in _AGE_GRADED_TRANSPORT:
        transport_mode = _AGE_GRADED_TRANSPORT[source_kind]
        if not has_url:
            status = "MISSING"
            url_missing = True
            reasons.append("Source_URL_MISSING")
        elif age_days is None:
            status = "MISSING"
            reasons.append("Source_AsOf_MISSING")
        elif age_days <= DEFAULT_MAX_AGE_DAYS:
            status = "CURRENT"
        else:
            status = "DUE" if age_days <= _DUE_MAX_AGE_DAYS else "OVERDUE"
            stale = True
            reasons.append(f"AgeDays={age_days}")
    else:
        status, transport_mode = "INVALID", "UNKNOWN"
        reasons.append("SOURCE_KIND_UNKNOWN")
        url_missing = not has_url

    # Cross-cutting annotations that apply to every source kind.
    if not has_url:
        if source_kind == "NAVER_ETF_PAGE":
            reasons.append("NAVER_URL_MISSING")
        reasons.append("Source_URL_MISSING")
    if age_days is not None and age_days < 0:
        reasons.append("FUTURE_DATE")

    # Branch-level and cross-cutting steps can add the same token twice;
    # ``dict.fromkeys`` removes repeats while keeping first-seen order.
    return status, transport_mode, list(dict.fromkeys(reasons)), url_missing, stale


def build_sector_universe_refresh_audit(
    payload: dict[str, Any] | None,
    *,
    today: dt.date | None = None,
) -> dict[str, Any]:
    """Audit how fresh and well-sourced each sector's universe rows are.

    Rows are taken from *payload* via ``_extract_sector_rows`` and grouped by
    their ``Sector`` value (rows without one are ignored).  Each sector gets
    one detail row with its resolved source kind, first non-empty
    ``Source_URL``, newest ``Source_AsOf`` date, age in days, constituent
    counts, weight sum, a status and a ``;``-joined ``refresh_reason``
    (``"OK"`` when there is nothing to report).

    Args:
        payload: Mapping that holds the ``sector_universe`` rows, or ``None``.
        today: Reference date for age calculations.  Defaults to the current
            date at a fixed UTC+09:00 offset; pass it explicitly for
            reproducible results.

    Returns:
        A dict with ``formula_id``, ``gate``, ``max_age_days``, ``summary``,
        ``rows`` (CURRENT sectors first, then by status and sector name) and
        ``source`` (input row and group counts).

        ``gate`` is ``"FAIL"`` when any sector is a template, lacks a counted
        source URL, is stale, has a changed layout, or ends up with status
        ``MISSING`` or ``INVALID``; ``"WARN"`` when none of those hold but at
        least one sector is ``SHEET_INPUT``; otherwise ``"PASS"``.
    """
    rows = _extract_sector_rows(payload)
    if today is None:
        today = dt.datetime.now(dt.timezone(dt.timedelta(hours=9))).date()

    grouped: dict[str, list[dict[str, Any]]] = {}
    for row in rows:
        sector = _txt(row.get("Sector"))
        if sector:
            grouped.setdefault(sector, []).append(row)

    source_kind_counts = {
        "NAVER_ETF_PAGE": 0,
        "NAVER_ETF_PAGE_FAIL_LAYOUT_CHANGED": 0,
        "NAVER_ETF_PAGE_FAIL": 0,
        "REPRESENTATIVE_STOCK_PROXY": 0,
        "SHEET_INPUT": 0,
        "DEFAULT_TEMPLATE": 0,
        "OTHER": 0,
    }
    transport_mode_counts = {
        "HTML_SERVER_RENDERED": 0,
        "MANUAL_OR_TEMPLATE": 0,
        "LAYOUT_CHANGED": 0,
        "UNKNOWN": 0,
    }
    status_counts: dict[str, int] = {}
    stale_sector_count = 0
    missing_source_url_count = 0
    newest_asof: dt.date | None = None
    oldest_asof: dt.date | None = None
    detail_rows: list[dict[str, Any]] = []

    for sector, sector_rows in grouped.items():
        # A blank Source cell is treated as manual sheet input.
        source_kind = _resolve_source_kind(
            {_txt(r.get("Source"), "SHEET_INPUT") for r in sector_rows}
        )
        source_kind_counts[source_kind] += 1

        urls = (_txt(r.get("Source_URL")) for r in sector_rows)
        source_url = next((url for url in urls if url), "")

        parsed = (_parse_date(r.get("Source_AsOf")) for r in sector_rows)
        asof_dates = [d for d in parsed if d is not None]
        source_asof = max(asof_dates) if asof_dates else None
        if source_asof is not None:
            newest_asof = source_asof if newest_asof is None else max(newest_asof, source_asof)
            oldest_asof = source_asof if oldest_asof is None else min(oldest_asof, source_asof)
        age_days = _age_days(source_asof, today) if source_asof is not None else None

        constituent_count = len(sector_rows)
        etf_count = sum(
            1
            for r in sector_rows
            if str(r.get("Is_ETF") or "").strip().upper() in _ETF_FLAG_VALUES
        )
        weight_sum = sum(_as_float(r.get("Weight")) or 0 for r in sector_rows)

        status, transport_mode, reasons, url_missing, stale = _classify_sector(
            source_kind, source_url, age_days
        )
        if url_missing:
            missing_source_url_count += 1
        if stale:
            stale_sector_count += 1
        status_counts[status] = status_counts.get(status, 0) + 1
        transport_mode_counts[transport_mode] = transport_mode_counts.get(transport_mode, 0) + 1

        first_row = sector_rows[0]
        detail_rows.append({
            "sector": sector,
            "proxy_ticker": _txt(first_row.get("Proxy_Ticker")),
            "proxy_name": _txt(first_row.get("Proxy_Name")),
            "proxy_type": _txt(first_row.get("Proxy_Type")),
            "source_kind": source_kind,
            "transport_mode": transport_mode,
            "source_url": source_url,
            "source_asof": source_asof.isoformat() if source_asof else "",
            "age_days": age_days if age_days is not None else "",
            "constituent_count": constituent_count,
            "stock_count": constituent_count - etf_count,
            "etf_count": etf_count,
            "weight_sum": round(weight_sum, 4),
            "status": status,
            "refresh_reason": ";".join(reasons) if reasons else "OK",
        })

    # CURRENT sectors first, then alphabetically by status and sector.
    detail_rows.sort(key=lambda r: (r["status"] != "CURRENT", r["status"], r["sector"]))

    # These totals are one-per-sector of the given kind, so they equal the
    # per-kind tallies collected above.
    template_count = source_kind_counts["DEFAULT_TEMPLATE"]
    sheet_input_count = source_kind_counts["SHEET_INPUT"]
    layout_changed_count = source_kind_counts["NAVER_ETF_PAGE_FAIL_LAYOUT_CHANGED"]
    missing_count = status_counts.get("MISSING", 0)
    invalid_count = status_counts.get("INVALID", 0)

    summary = {
        "sector_count": len(grouped),
        "current_count": status_counts.get("CURRENT", 0),
        "due_count": status_counts.get("DUE", 0),
        "overdue_count": status_counts.get("OVERDUE", 0),
        "missing_count": missing_count,
        "template_count": template_count,
        "sheet_input_count": sheet_input_count,
        "naver_source_count": source_kind_counts["NAVER_ETF_PAGE"],
        "missing_source_url_count": missing_source_url_count,
        "stale_sector_count": stale_sector_count,
        "layout_changed_count": layout_changed_count,
        "oldest_source_asof": oldest_asof.isoformat() if oldest_asof else "",
        "newest_source_asof": newest_asof.isoformat() if newest_asof else "",
        "source_kind_counts": source_kind_counts,
        "transport_mode_counts": transport_mode_counts,
        "ajax_mode": "NO",
        "transport_model": "HTML_SERVER_RENDERED",
        # Appended last so existing consumers see the earlier keys unchanged.
        "invalid_count": invalid_count,
    }

    # MISSING and INVALID sectors block the gate too: a sector whose source
    # failed, is unknown, or has no as-of date cannot be vouched for as fresh.
    blocking_counts = (
        template_count,
        missing_source_url_count,
        stale_sector_count,
        layout_changed_count,
        missing_count,
        invalid_count,
    )
    if any(blocking_counts):
        gate = "FAIL"
    elif sheet_input_count > 0:
        gate = "WARN"
    else:
        gate = "PASS"

    return {
        "formula_id": "sector_universe_refresh_audit_v1",
        "gate": gate,
        "max_age_days": DEFAULT_MAX_AGE_DAYS,
        "summary": summary,
        "rows": detail_rows,
        "source": {
            "sector_rows": len(rows),
            "grouped_sectors": len(grouped),
        },
    }