"""FUNDAMENTAL_RAW_INGEST_V2 — 한국 상장사 펀더멘털 raw 수집기 (고도화 버전). V2 개선 사항: 1. yfinance 연동: Beta, 52주 고저, 부채비율, 유동비율, 현금흐름 보완. 2. OpenDART 연동: 재무제표 API를 통해 정밀 재무지표 및 성장률 산출. 3. 로드맵 40개 NULL 컬럼 타겟팅 수집. 수집 지표(per ticker): roe_pct — ROE (%) opm_pct — 영업이익률 (%) eps_krw — EPS (원) ocf_krw — 영업현금흐름 (원) fcf_krw — 잉여현금흐름 (원) net_debt_krw — 순부채 (원) per — PER (Forward PE) pbr — PBR revenue_krw — 매출액 (원) op_income_krw — 영업이익 (원) beta — Beta (시장 민감도) high52w — 52주 최고가 low52w — 52주 최저가 debt_to_equity — 부채비율 (D/E) current_ratio — 유동비율 eps_growth_1y_pct — EPS 성장률 (1년) revenue_growth_pct — 매출 성장률 (1년) earnings_date — 실적 발표 예정일 as_of_date — 기준일 (YYYYMMDD) source — "data_feed" | "naver" | "yfinance" | "dart" | "fallback" is_etf — ETF 여부 (True/False) 출력: Temp/fundamental_raw_v1.json """ from __future__ import annotations import argparse import http.cookiejar import json import os import re import time import sys import urllib.parse import urllib.request from datetime import date, datetime, timedelta from pathlib import Path from typing import Any import yfinance as yf ROOT = Path(__file__).resolve().parents[1] DEFAULT_JSON = ROOT / "GatherTradingData.json" DEFAULT_OUT = TEMP = ROOT / "Temp" / "fundamental_raw_v1.json" # API Keys DART_API_KEY = os.environ.get("DART_API_KEY") DART_CORP_MAP_CACHE = TEMP / "dart_corp_map.json" # ETF 식별자 패턴 _ETF_NAME_PATTERNS = ["KODEX", "TIGER", "KINDEX", "KOSEF", "ARIRANG", "TIMEFOLIO", "HANARO"] _ETF_TICKER_RE = re.compile(r'^\d{4}[A-Z]\d$') def _ensure_utf8_stdio() -> None: if sys.stdout.encoding and sys.stdout.encoding.lower() not in ("utf-8", "utf8"): sys.stdout = open(sys.stdout.fileno(), mode="w", encoding="utf-8", buffering=1) if sys.stderr.encoding and sys.stderr.encoding.lower() not in ("utf-8", "utf8"): sys.stderr = open(sys.stderr.fileno(), mode="w", encoding="utf-8", buffering=1) def _load_json(path: Path) -> dict[str, Any]: if not path.exists(): return {} try: return json.loads(path.read_text(encoding="utf-8")) except Exception: return {} def _get_dart_corp_code(ticker: str) -> str | None: """6자리 티커를 8자리 OpenDART corp_code로 변환 (캐시 사용).""" if not DART_API_KEY: return None cache = _load_json(DART_CORP_MAP_CACHE) if ticker in cache: return cache[ticker] if not cache or (datetime.now() - datetime.fromtimestamp(DART_CORP_MAP_CACHE.stat().st_mtime) > timedelta(days=7)): print(f"\n Downloading OpenDART corpCode.xml...", end=" ", flush=True) try: import zipfile import io import xml.etree.ElementTree as ET url = "https://opendart.fss.or.kr/api/corpCode.xml" params = urllib.parse.urlencode({'crtfc_key': DART_API_KEY}) req = urllib.request.Request(f"{url}?{params}") with urllib.request.urlopen(req, timeout=30) as resp: with zipfile.ZipFile(io.BytesIO(resp.read())) as z: xml_data = z.read('CORPCODE.xml') tree = ET.fromstring(xml_data) new_cache = {} for node in tree.findall('list'): stock_code = (node.findtext('stock_code') or "").strip() if stock_code: new_cache[stock_code] = (node.findtext('corp_code') or "").strip() DART_CORP_MAP_CACHE.parent.mkdir(parents=True, exist_ok=True) DART_CORP_MAP_CACHE.write_text(json.dumps(new_cache), encoding="utf-8") cache = new_cache print("Done.") except Exception as e: print(f"Failed: {e}") return None return cache.get(ticker) def _num(v: Any, default: float = 0.0) -> float: try: if v is None or v == "": return default if isinstance(v, str): v = v.replace(",", "") return float(v) except (TypeError, ValueError): return default def _is_etf(ticker: str, name: str) -> bool: if _ETF_TICKER_RE.match(ticker): return True name_upper = (name or "").upper() return any(p in name_upper for p in _ETF_NAME_PATTERNS) def _yf_fundamentals(ticker: str) -> dict[str, Any]: """yfinance를 통한 펀더멘털 보완.""" res = {} sym = f"{ticker}.KS" if len(ticker) == 6 and ticker.isdigit() else ticker try: t = yf.Ticker(sym) info = t.info res["beta"] = info.get("beta") res["high52w"] = info.get("fiftyTwoWeekHigh") res["low52w"] = info.get("fiftyTwoWeekLow") res["debt_to_equity"] = info.get("debtToEquity") res["current_ratio"] = info.get("currentRatio") res["fcf_krw"] = info.get("freeCashflow") res["ocf_krw"] = info.get("operatingCashflow") res["revenue_growth_pct"] = info.get("revenueGrowth", 0) * 100 if info.get("revenueGrowth") else None res["eps_growth_1y_pct"] = info.get("earningsGrowth", 0) * 100 if info.get("earningsGrowth") else None if info.get("nextEarningsDate"): res["earnings_date"] = datetime.fromtimestamp(info["nextEarningsDate"]).strftime("%Y-%m-%d") res["per"] = info.get("forwardPE") or info.get("trailingPE") res["pbr"] = info.get("priceToBook") res["roe_pct"] = info.get("returnOnEquity", 0) * 100 if info.get("returnOnEquity") else None res["opm_pct"] = info.get("operatingMargins", 0) * 100 if info.get("operatingMargins") else None except Exception: pass return res def _dart_fundamentals(ticker: str) -> dict[str, Any]: res = {} if not DART_API_KEY: return res return res def _naver_summary(ticker: str) -> dict[str, float]: result: dict[str, float] = {} url = f"https://finance.naver.com/item/main.naver?code={ticker}" try: req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"}) with urllib.request.urlopen(req, timeout=10) as resp: raw = resp.read() html = raw.decode("utf-8", errors="replace") except Exception: return result def _row_values(label: str) -> list[float]: pattern = re.compile( rf'