Files
QuantEngineByItz/tools/update_sector_universe_from_naver.py
T

617 lines
26 KiB
Python

from __future__ import annotations
import argparse
import datetime as dt
import json
import re
import shutil
import sys
from collections import OrderedDict
from pathlib import Path
from typing import Any
from urllib.parse import urljoin, urlparse, parse_qs
import requests
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from openpyxl.styles import Alignment, Font, PatternFill
from openpyxl.utils import get_column_letter
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from src.quant_engine.sector_universe_refresh import build_sector_universe_refresh_audit
DEFAULT_INPUT_XLSX = ROOT / "GatherTradingData.xlsx"
DEFAULT_OUTPUT_XLSX = ROOT / "outputs" / "sector_universe_refresh" / "GatherTradingData_sector_universe.xlsx"
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
NAVER_BASE = "https://finance.naver.com"
NAVER_ITEM_CODE_RE = re.compile(r"(?:https?:)?//finance\.naver\.com(?P<path>/item/[^\"'\s<>]+code=(?P<code>\d+)[^\"'\s<>]*)", re.I)
NAVER_REL_CODE_RE = re.compile(r"(?P<path>/item/[^\"'\s<>]+code=(?P<code>\d+)[^\"'\s<>]*)", re.I)
TITLE_FILL = PatternFill("solid", fgColor="1F4E78")
HEADER_FILL = PatternFill("solid", fgColor="1F4E78")
SUBHEADER_FILL = PatternFill("solid", fgColor="D9EAF7")
WHITE_FONT = Font(color="FFFFFF", bold=True)
BOLD_FONT = Font(bold=True)
NOTE_FONT = Font(italic=True, color="666666")
def _kst_now() -> dt.datetime:
return dt.datetime.now(dt.timezone(dt.timedelta(hours=9)))
def _kst_today() -> str:
return _kst_now().strftime("%Y-%m-%d")
def _clean_text(value: Any) -> str:
if value is None:
return ""
return str(value).strip()
def _normalize_code(value: Any) -> str:
text = _clean_text(value)
if not text:
return ""
text = text.replace(",", "")
if text.endswith(".0"):
text = text[:-2]
if text.isdigit():
return text.zfill(6)
if re.fullmatch(r"\d+\.\d+", text):
return str(int(float(text))).zfill(6)
return text
def _parse_weight(value: str) -> float | None:
text = _clean_text(value).replace("%", "").replace(",", "")
if not text:
return None
try:
return float(text)
except Exception:
return None
def _discover_naver_candidate_urls(soup: BeautifulSoup, proxy_ticker: str) -> list[str]:
candidates: list[str] = []
seen: set[str] = set()
def add(url: str) -> None:
url = _clean_text(url)
if not url or url in seen:
return
seen.add(url)
candidates.append(url)
expected_code = _normalize_code(proxy_ticker)
for script in soup.find_all("script"):
src = _clean_text(script.get("src"))
if src:
if expected_code and expected_code in src:
if src.startswith("//"):
add(f"https:{src}")
elif src.startswith("/"):
add(urljoin(NAVER_BASE, src))
else:
add(src)
continue
text = script.get_text(" ", strip=True) or ""
if not text:
continue
for regex in (NAVER_ITEM_CODE_RE, NAVER_REL_CODE_RE):
for match in regex.finditer(text):
code = _normalize_code(match.groupdict().get("code") or "")
if expected_code and code and code != expected_code:
continue
path = match.groupdict().get("path") or ""
if path:
add(urljoin(NAVER_BASE, path))
return candidates
def _parse_naver_etf_holdings(session: requests.Session, proxy_ticker: str, limit: int) -> dict[str, Any]:
url_candidates = [
f"{NAVER_BASE}/item/main.naver?code={proxy_ticker}",
f"{NAVER_BASE}/item/coinfo.naver?code={proxy_ticker}&target=cu_more",
]
last_message = ""
for url in url_candidates:
response = session.get(url, timeout=20)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
discovered = _discover_naver_candidate_urls(soup, proxy_ticker)
for candidate in discovered:
if candidate not in url_candidates:
url_candidates.append(candidate)
section = soup.select_one("div.section.etf_asset")
table = section.select_one("table.tb_type1_a") if section is not None else None
if table is None:
# layout changed or this endpoint does not expose the constituent table
last_message = "ETF constituent table missing; page structure may have changed"
continue
holdings: list[dict[str, Any]] = []
for tr in table.select("tbody tr"):
tds = tr.find_all("td")
if len(tds) < 3:
continue
name_link = tr.find("a", href=re.compile(r"code=\d+"))
if name_link is None:
continue
name = _clean_text(name_link.get_text(" ", strip=True))
href = _clean_text(name_link.get("href"))
m = re.search(r"code=(\d+)", href)
code = _normalize_code(m.group(1) if m else "")
if not code or not name:
continue
weight = _parse_weight(tds[2].get_text(" ", strip=True))
if weight is None:
continue
holdings.append({
"Constituent_Code": code,
"Constituent_Name": name,
"Weight": round(weight / 100.0, 6),
"Source": "NAVER_ETF_PAGE",
})
if len(holdings) >= limit:
break
if holdings:
return {
"source_url": url,
"source_kind": "NAVER_ETF_PAGE",
"holdings": holdings,
"discovered_urls": discovered,
"message": "",
}
last_message = "no holdings parsed"
return {
"source_url": url_candidates[0],
"source_kind": "NAVER_ETF_PAGE_FAIL_LAYOUT_CHANGED",
"holdings": [],
"discovered_urls": [],
"message": last_message or "page structure changed; no expected values were inferred",
}
def _extract_sector_seed_rows(ws) -> list[dict[str, Any]]:
headers = [ws.cell(2, c).value for c in range(1, ws.max_column + 1)]
headers = [str(h).strip() if h is not None else "" for h in headers]
idx = {name: i for i, name in enumerate(headers)}
rows: list[dict[str, Any]] = []
for r in range(3, ws.max_row + 1):
row = {name: ws.cell(r, c + 1).value for c, name in enumerate(headers) if name}
if not any(v not in (None, "") for v in row.values()):
continue
rows.append(row)
return rows
def _group_seed_rows(rows: list[dict[str, Any]]) -> OrderedDict[str, dict[str, Any]]:
grouped: OrderedDict[str, dict[str, Any]] = OrderedDict()
for row in rows:
sector = _clean_text(row.get("Sector"))
if not sector:
continue
if sector not in grouped:
grouped[sector] = {
"meta": row,
"rows": [],
}
grouped[sector]["rows"].append(row)
return grouped
def _build_refreshed_rows(seed_rows: list[dict[str, Any]], limit: int) -> tuple[list[dict[str, Any]], dict[str, Any]]:
session = requests.Session()
session.headers.update({"User-Agent": DEFAULT_USER_AGENT})
grouped = _group_seed_rows(seed_rows)
refreshed: list[dict[str, Any]] = []
sector_stats: list[dict[str, Any]] = []
today = _kst_today()
for sector, bundle in grouped.items():
meta = bundle["meta"]
proxy_ticker = _normalize_code(meta.get("Proxy_Ticker"))
proxy_name = _clean_text(meta.get("Proxy_Name"))
proxy_type = _clean_text(meta.get("Proxy_Type")) or "ETF"
base_ticker = _normalize_code(meta.get("Base_Ticker")) or "069500"
if sector == "금융/은행":
split_specs = [
{"sector": "은행", "proxy_ticker": "091170", "proxy_name": "KODEX 은행", "proxy_type": "ETF"},
{"sector": "증권", "proxy_ticker": "0111J0", "proxy_name": "HANARO 증권고배당TOP3플러스", "proxy_type": "ETF"},
{"sector": "지주회사", "proxy_ticker": "307520", "proxy_name": "TIGER 지주회사", "proxy_type": "ETF"},
]
for spec in split_specs:
split_proxy_ticker = _normalize_code(spec["proxy_ticker"])
split_proxy_name = _clean_text(spec["proxy_name"])
split_proxy_type = _clean_text(spec["proxy_type"]) or "ETF"
split_source = "SHEET_INPUT"
split_source_url = ""
split_message = ""
split_source_kind = "SHEET_INPUT"
try:
scraped = _parse_naver_etf_holdings(session, split_proxy_ticker, limit)
split_source_url = scraped.get("source_url", "")
split_source_kind = scraped.get("source_kind", "NAVER_ETF_PAGE_FAIL")
holdings = scraped.get("holdings", [])
split_message = scraped.get("message", "")
if holdings:
split_source = "NAVER_ETF_PAGE"
weight_sum = round(sum(float(h["Weight"]) for h in holdings), 6)
for h in holdings:
refreshed.append({
"Sector": spec["sector"],
"Proxy_Ticker": split_proxy_ticker,
"Proxy_Name": split_proxy_name,
"Proxy_Type": split_proxy_type,
"Base_Ticker": base_ticker,
"Constituent_Code": h["Constituent_Code"],
"Constituent_Name": h["Constituent_Name"],
"Weight": h["Weight"],
"Is_ETF": "N",
"Enabled": "Y",
"Effective_Date": today,
"Source": split_source,
"Transport_Mode": "HTML_SERVER_RENDERED",
"Source_URL": split_source_url,
"Source_AsOf": today,
"Sector_Check": spec["sector"],
"Weight_Sum_All": weight_sum,
"Weight_Sum_Stocks_Only": weight_sum,
"ETF_Rows": 0,
"Status": "OK",
})
sector_stats.append({
"sector": spec["sector"],
"proxy_ticker": split_proxy_ticker,
"proxy_name": split_proxy_name,
"proxy_type": split_proxy_type,
"source_kind": split_source,
"transport_mode": "HTML_SERVER_RENDERED",
"source_url": split_source_url,
"source_asof": today,
"constituent_count": len(holdings),
"weight_sum": weight_sum,
"status": "CURRENT",
"refresh_reason": "NAVER_ETF_PAGE_SPLIT",
})
continue
except Exception as exc:
split_message = str(exc)
split_source_kind = "NAVER_ETF_PAGE_FAIL"
# 실패 시는 투명하게 남기고, 섹터 누락은 그대로 드러낸다.
sector_stats.append({
"sector": spec["sector"],
"proxy_ticker": split_proxy_ticker,
"proxy_name": split_proxy_name,
"proxy_type": split_proxy_type,
"source_kind": split_source_kind,
"transport_mode": "LAYOUT_CHANGED" if split_source_kind == "NAVER_ETF_PAGE_FAIL_LAYOUT_CHANGED" else "UNKNOWN",
"source_url": split_source_url,
"source_asof": today,
"constituent_count": 0,
"weight_sum": 0.0,
"status": "FAIL" if "FAIL" in split_source_kind else "WARN",
"refresh_reason": split_message or "split_sector_fallback",
})
continue
source = "SHEET_INPUT"
source_url = ""
message = ""
source_kind = "SHEET_INPUT"
if proxy_type != "ETF":
source_kind = "REPRESENTATIVE_STOCK_PROXY"
source = source_kind
source_url = f"{NAVER_BASE}/item/main.naver?code={proxy_ticker}" if proxy_ticker else ""
fallback_rows = bundle["rows"][:limit] if bundle["rows"] else []
weight_sum = 0.0
for row in fallback_rows:
weight = row.get("Weight")
try:
weight_sum += float(weight) if weight not in (None, "") else 0.0
except Exception:
pass
refreshed.append({
"Sector": sector,
"Proxy_Ticker": proxy_ticker,
"Proxy_Name": proxy_name,
"Proxy_Type": proxy_type,
"Base_Ticker": base_ticker,
"Constituent_Code": _normalize_code(row.get("Constituent_Code")),
"Constituent_Name": _clean_text(row.get("Constituent_Name")),
"Weight": float(row.get("Weight") or 0),
"Is_ETF": _clean_text(row.get("Is_ETF")) or "N",
"Enabled": "Y",
"Effective_Date": today,
"Source": source_kind,
"Transport_Mode": "HTML_SERVER_RENDERED" if source_kind == "REPRESENTATIVE_STOCK_PROXY" else "MANUAL_OR_TEMPLATE",
"Source_URL": source_url,
"Source_AsOf": today,
"Sector_Check": sector,
"Weight_Sum_All": weight_sum,
"Weight_Sum_Stocks_Only": weight_sum,
"ETF_Rows": 0,
"Status": "CURRENT",
})
sector_stats.append({
"sector": sector,
"proxy_ticker": proxy_ticker,
"proxy_name": proxy_name,
"proxy_type": proxy_type,
"source_kind": source_kind,
"transport_mode": "HTML_SERVER_RENDERED" if source_kind == "REPRESENTATIVE_STOCK_PROXY" else "MANUAL_OR_TEMPLATE",
"source_url": source_url,
"source_asof": today,
"constituent_count": len(fallback_rows),
"weight_sum": round(weight_sum, 6),
"status": "CURRENT",
"refresh_reason": "REPRESENTATIVE_STOCK_PROXY",
})
continue
if proxy_ticker:
try:
scraped = _parse_naver_etf_holdings(session, proxy_ticker, limit)
source_url = scraped.get("source_url", "")
source_kind = scraped.get("source_kind", "NAVER_ETF_PAGE_FAIL")
holdings = scraped.get("holdings", [])
message = scraped.get("message", "")
if holdings:
source = "NAVER_ETF_PAGE"
weight_sum = round(sum(float(h["Weight"]) for h in holdings), 6)
for h in holdings:
refreshed.append({
"Sector": sector,
"Proxy_Ticker": proxy_ticker,
"Proxy_Name": proxy_name,
"Proxy_Type": proxy_type,
"Base_Ticker": base_ticker,
"Constituent_Code": h["Constituent_Code"],
"Constituent_Name": h["Constituent_Name"],
"Weight": h["Weight"],
"Is_ETF": "N",
"Enabled": "Y",
"Effective_Date": today,
"Source": source,
"Transport_Mode": "HTML_SERVER_RENDERED",
"Source_URL": source_url,
"Source_AsOf": today,
"Sector_Check": sector,
"Weight_Sum_All": weight_sum,
"Weight_Sum_Stocks_Only": weight_sum,
"ETF_Rows": 0,
"Status": "OK",
})
sector_stats.append({
"sector": sector,
"proxy_ticker": proxy_ticker,
"proxy_name": proxy_name,
"proxy_type": proxy_type,
"source_kind": source,
"transport_mode": "HTML_SERVER_RENDERED",
"source_url": source_url,
"source_asof": today,
"constituent_count": len(holdings),
"weight_sum": weight_sum,
"status": "CURRENT",
"refresh_reason": "NAVER_ETF_PAGE",
})
continue
except Exception as exc:
message = str(exc)
source_kind = "NAVER_ETF_PAGE_FAIL"
# fallback: preserve seed rows but expose the failure transparently
fallback_rows = bundle["rows"][:limit] if bundle["rows"] else []
weight_sum = 0.0
for row in fallback_rows:
weight = row.get("Weight")
try:
weight_sum += float(weight) if weight not in (None, "") else 0.0
except Exception:
pass
refreshed.append({
"Sector": sector,
"Proxy_Ticker": proxy_ticker,
"Proxy_Name": proxy_name,
"Proxy_Type": proxy_type,
"Base_Ticker": base_ticker,
"Constituent_Code": _normalize_code(row.get("Constituent_Code")),
"Constituent_Name": _clean_text(row.get("Constituent_Name")),
"Weight": float(row.get("Weight") or 0),
"Is_ETF": _clean_text(row.get("Is_ETF")) or "N",
"Enabled": "Y",
"Effective_Date": today,
"Source": source_kind,
"Transport_Mode": "LAYOUT_CHANGED" if source_kind == "NAVER_ETF_PAGE_FAIL_LAYOUT_CHANGED" else "UNKNOWN",
"Source_URL": source_url,
"Source_AsOf": today,
"Sector_Check": sector,
"Weight_Sum_All": weight_sum,
"Weight_Sum_Stocks_Only": weight_sum,
"ETF_Rows": 0,
"Status": "FAIL" if source_kind.endswith("FAIL") else "WARN",
})
sector_stats.append({
"sector": sector,
"proxy_ticker": proxy_ticker,
"proxy_name": proxy_name,
"proxy_type": proxy_type,
"source_kind": source_kind,
"transport_mode": "LAYOUT_CHANGED" if source_kind == "NAVER_ETF_PAGE_FAIL_LAYOUT_CHANGED" else "UNKNOWN",
"source_url": source_url,
"source_asof": today,
"constituent_count": len(fallback_rows),
"weight_sum": round(weight_sum, 6),
"status": "FAIL" if "FAIL" in source_kind else "WARN",
"refresh_reason": message or "seed_fallback",
})
audit_payload = build_sector_universe_refresh_audit({"data": {"sector_universe": refreshed}})
return refreshed, {
"sector_universe_refresh_audit": audit_payload,
"sector_stats": sector_stats,
}
def _style_title(ws, title: str, subtitle: str) -> None:
ws.merge_cells(start_row=1, start_column=1, end_row=1, end_column=max(8, ws.max_column or 8))
ws["A1"] = title
ws["A1"].font = WHITE_FONT
ws["A1"].fill = TITLE_FILL
ws["A1"].alignment = Alignment(horizontal="left")
ws.merge_cells(start_row=2, start_column=1, end_row=2, end_column=max(8, ws.max_column or 8))
ws["A2"] = subtitle
ws["A2"].font = NOTE_FONT
def _write_table(ws, start_row: int, start_col: int, headers: list[str], rows: list[list[Any]]) -> int:
for i, header in enumerate(headers, start=start_col):
cell = ws.cell(start_row, i)
cell.value = header
cell.font = WHITE_FONT
cell.fill = HEADER_FILL
cell.alignment = Alignment(horizontal="center")
for r_idx, row in enumerate(rows, start=start_row + 1):
for c_idx, value in enumerate(row, start=start_col):
ws.cell(r_idx, c_idx).value = value
return start_row + len(rows)
def _write_sector_universe_sheet(wb, rows: list[dict[str, Any]]) -> None:
if "sector_universe" in wb.sheetnames:
del wb["sector_universe"]
ws = wb.create_sheet("sector_universe")
headers = [
"Sector", "Proxy_Ticker", "Proxy_Name", "Proxy_Type", "Base_Ticker",
"Constituent_Code", "Constituent_Name", "Weight", "Is_ETF", "Enabled",
"Effective_Date", "Source", "Transport_Mode", "Source_URL", "Source_AsOf", "Sector_Check",
"Weight_Sum_All", "Weight_Sum_Stocks_Only", "ETF_Rows", "Status",
]
now = _kst_now().strftime("%Y-%m-%d %H:%M:%S")
ws["A1"] = f"updated: {now} KST"
ws["A1"].font = Font(bold=True)
_write_table(ws, 2, 1, headers, [[r.get(h, "") for h in headers] for r in rows])
for col_idx, header in enumerate(headers, start=1):
if header in {"Proxy_Ticker", "Base_Ticker", "Constituent_Code"}:
for r in range(3, ws.max_row + 1):
ws.cell(r, col_idx).number_format = "@"
if header in {"Weight", "Weight_Sum_All", "Weight_Sum_Stocks_Only"}:
for r in range(3, ws.max_row + 1):
ws.cell(r, col_idx).number_format = "0.0000"
width = 16
if header in {"Constituent_Name", "Proxy_Name"}:
width = 22
elif header in {"Source_URL"}:
width = 42
elif header in {"Status", "Source", "Sector_Check", "Proxy_Type", "Transport_Mode"}:
width = 16
ws.column_dimensions[get_column_letter(col_idx)].width = width
ws.freeze_panes = "A3"
ws.sheet_view.showGridLines = False
def _write_audit_sheet(wb, audit_payload: dict[str, Any]) -> None:
audit = audit_payload["sector_universe_refresh_audit"]
if "sector_universe_refresh_audit" in wb.sheetnames:
del wb["sector_universe_refresh_audit"]
ws = wb.create_sheet("sector_universe_refresh_audit")
ws.sheet_view.showGridLines = False
_style_title(
ws,
"섹터 월간 갱신 감사",
"Naver ETF 페이지 기반 월간 갱신 상태와 provenance 분리 현황을 점검한다.",
)
summary = audit.get("summary", {})
summary_rows = [
["formula_id", audit.get("formula_id", "")],
["gate", audit.get("gate", "")],
["sector_count", summary.get("sector_count", 0)],
["current_count", summary.get("current_count", 0)],
["due_count", summary.get("due_count", 0)],
["overdue_count", summary.get("overdue_count", 0)],
["missing_count", summary.get("missing_count", 0)],
["template_count", summary.get("template_count", 0)],
["sheet_input_count", summary.get("sheet_input_count", 0)],
["naver_source_count", summary.get("naver_source_count", 0)],
["missing_source_url_count", summary.get("missing_source_url_count", 0)],
["stale_sector_count", summary.get("stale_sector_count", 0)],
["oldest_source_asof", summary.get("oldest_source_asof", "")],
["newest_source_asof", summary.get("newest_source_asof", "")],
]
_write_table(ws, 4, 1, ["key", "value"], summary_rows)
rows = audit.get("rows", []) or []
if rows:
headers = [
"sector", "proxy_ticker", "proxy_name", "proxy_type", "source_kind",
"source_url", "source_asof", "age_days", "constituent_count",
"stock_count", "etf_count", "weight_sum", "status", "refresh_reason",
]
_write_table(ws, 4, 4, headers, [[r.get(h, "") for h in headers] for r in rows])
for idx, header in enumerate(headers, start=4):
width = 16
if header in {"sector", "proxy_name", "refresh_reason"}:
width = 20
elif header == "source_url":
width = 42
ws.column_dimensions[get_column_letter(idx)].width = width
ws.freeze_panes = "A5"
def main() -> int:
ap = argparse.ArgumentParser()
ap.add_argument("--input", default=str(DEFAULT_INPUT_XLSX))
ap.add_argument("--output", default=str(DEFAULT_OUTPUT_XLSX))
ap.add_argument("--limit", type=int, default=10, help="Per-sector holdings limit from Naver ETF pages")
ap.add_argument("--apply", action="store_true", help="Overwrite the input workbook in place as well")
args = ap.parse_args()
input_path = Path(args.input)
output_path = Path(args.output)
if not input_path.exists():
raise FileNotFoundError(input_path)
wb = load_workbook(input_path)
if "sector_universe" not in wb.sheetnames:
raise RuntimeError("sector_universe sheet not found")
seed_ws = wb["sector_universe"]
seed_rows = _extract_sector_seed_rows(seed_ws)
refreshed_rows, audit_payload = _build_refreshed_rows(seed_rows, max(1, args.limit))
_write_sector_universe_sheet(wb, refreshed_rows)
_write_audit_sheet(wb, audit_payload)
output_path.parent.mkdir(parents=True, exist_ok=True)
wb.save(output_path)
if args.apply and input_path.resolve() != output_path.resolve():
shutil.copy2(output_path, input_path)
print(json.dumps({
"status": "OK",
"input": str(input_path),
"output": str(output_path),
"rows": len(refreshed_rows),
"sectors": len(audit_payload["sector_stats"]),
"current_count": audit_payload["sector_universe_refresh_audit"]["summary"]["current_count"],
"overdue_count": audit_payload["sector_universe_refresh_audit"]["summary"]["overdue_count"],
"template_count": audit_payload["sector_universe_refresh_audit"]["summary"]["template_count"],
}, ensure_ascii=False, indent=2))
return 0
if __name__ == "__main__":
sys.exit(main())