Files
QuantEngineByItz/tools/ingest_fundamental_raw.py
T
kjh2064 eabacde438 feat: Sprint-3 (펀더멘털 피드 완성, MDD 모니터링 구축, Gitea CI/CD 파이프라인 추가) (2026-06-13)
주요 변경 사항:
- tools/ingest_fundamental_raw.py 수정:
  * yfinance 패키지를 활용한 Yahoo Finance 펀더멘털 연동 파이프라인 전면 개편
  * FCF, OCF 및 순부채(totalDebt - totalCash) 자동 폴백 계산을 구현하여 40개 NULL 컬럼 수집 완성
- src/gas_adapter_parts/gdc_01_fetch_fundamentals.gs 수정:
  * 일별 자산 및 MDD를 기록하는 logDailyAssetHistory_ 함수 구현 및 runDataFeed() 연동
- tools/build_realized_performance_v1.py 수정:
  * daily_history 탭으로부터 MDD_realized를 실시간 파싱하여 insufficient_data 제거
- .gitea/workflows/ci.yml 추가:
  * Gitea Actions 용 Spec 검증, 릴리즈 게이트 및 번들 빌드 자동화 파이프라인 구축
- docs/ROADMAP_WBS.md 수정:
  * WBS-2.1, WBS-3.4, WBS-5.1 과업의 체크박스를 완료[x] 상태로 갱신
- 검증 결과: npm run full-gate (55단계 릴리즈 게이트) PASS 검증 완료

Co-Authored-By: Antigravity AI <noreply@google.com>
2026-06-13 14:31:40 +09:00

419 lines
14 KiB
Python

"""FUNDAMENTAL_RAW_INGEST_V1 — 한국 상장사 펀더멘털 raw 수집기.
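Values already present in data_feed (Forward_PE / PBR / EPS, plus ROE_Pct and
Operating_Margin_Pct when available) are used as the primary source.
Metrics that are still missing are filled first by scraping the Naver Finance
HTML page and then from Yahoo Finance through yfinance
(ROE / OPM / OCF / FCF / net debt / revenue).

Metrics collected per ticker:
    roe_pct       -- ROE (%)
    opm_pct       -- operating margin (%)
    eps_krw       -- EPS (KRW)
    ocf_krw       -- operating cash flow (KRW)
    fcf_krw       -- free cash flow (KRW)
    net_debt_krw  -- net debt (KRW)
    per           -- PER (Forward PE)
    pbr           -- PBR
    revenue_krw   -- revenue (KRW)
    op_income_krw -- operating income (KRW)
    as_of_date    -- reference date (YYYYMMDD)
    source        -- "data_feed" | "data_feed+naver" | "naver" | "fallback";
                     "yahoo", or a "+yahoo" suffix on one of the former, when
                     the Yahoo fallback is used; "etf_skip" for ETFs
    is_etf        -- whether the ticker is an ETF (True/False)

Output: Temp/fundamental_raw_v1.json
Format: {"formula_id":"FUNDAMENTAL_RAW_INGEST_V1","gate":"PASS|CAUTION|FAIL","rows":[...]}
"""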
from __future__ import annotations
import argparse
import http.cookiejar
import json
import re
import time
import urllib.parse
import urllib.request
from datetime import date
from pathlib import Path
from typing import Any
import yfinance as yf
# Directory one level above the folder that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# Default input JSON; main() reads the data_feed rows from its "data" object.
DEFAULT_JSON = ROOT / "GatherTradingData.json"
# Default location of the generated report.
DEFAULT_OUT = ROOT / "Temp" / "fundamental_raw_v1.json"
def _yahoo_fundamentals_yf(ticker: str) -> dict[str, float]:
    """Fetch fundamentals for *ticker* from Yahoo Finance through yfinance.

    Purely numeric codes are tried with the ``.KS`` suffix and then ``.KQ``;
    the first candidate whose ``info`` carries a long or short name is used.
    Any other ticker string is handed to yfinance unchanged.

    Args:
        ticker: Stock code or Yahoo symbol.

    Returns:
        A dict with whichever of these keys could be obtained: ``roe_pct`` and
        ``opm_pct`` (ratios multiplied by 100, rounded to two decimals),
        ``eps_krw``, ``per``, ``pbr``, ``revenue_krw``, ``net_debt_krw``,
        ``fcf_krw`` and ``ocf_krw``.  The dict is empty when the ticker is
        skipped or cannot be resolved.  Errors raised while reading the data
        are printed and the values gathered so far are returned.
    """
    import math  # local import: only this helper needs it

    result: dict[str, float] = {}
    # Codes shaped like four digits, an uppercase letter and a digit are
    # skipped without any request (the file treats this shape as an ETF code).
    if re.match(r"^\d{4}[A-Z]\d$", ticker):
        return result

    def safe_float(v: Any) -> float | None:
        """Return *v* as a finite float, or None when that is not possible."""
        if v is None:
            return None
        try:
            number = float(v)
        except (ValueError, TypeError):
            return None
        # NaN / inf would be written to the output JSON as invalid tokens,
        # so they are treated exactly like a missing value.
        return number if math.isfinite(number) else None

    # 1. Resolve the Ticker object (numeric codes: try KOSPI, then KOSDAQ).
    t = None
    info = None
    if not ticker.isdigit():
        t = yf.Ticker(ticker)
    else:
        for suffix in (".KS", ".KQ"):
            candidate = yf.Ticker(f"{ticker}{suffix}")
            try:
                candidate_info = candidate.info
                if candidate_info and (
                    candidate_info.get("longName") or candidate_info.get("shortName")
                ):
                    t = candidate
                    # Keep the payload so it is not requested a second time.
                    info = candidate_info
                    break
            except Exception:
                # Best-effort probing: a failing suffix just means "try the next".
                continue
    if t is None:
        return result

    try:
        if info is None:
            info = t.info

        # 2. Metrics taken from the info payload.
        roe = safe_float(info.get("returnOnEquity"))
        if roe is not None:
            result["roe_pct"] = round(roe * 100, 2)

        opm = safe_float(info.get("operatingMargins"))
        if opm is not None:
            result["opm_pct"] = round(opm * 100, 2)

        # A missing or zero trailing EPS falls back to the forward EPS.
        eps = safe_float(info.get("trailingEps")) or safe_float(info.get("forwardEps"))
        if eps is not None:
            result["eps_krw"] = eps

        # A missing or zero forward PE falls back to the trailing PE.
        pe = safe_float(info.get("forwardPE")) or safe_float(info.get("trailingPE"))
        if pe is not None:
            result["per"] = pe

        pbr = safe_float(info.get("priceToBook"))
        if pbr is not None:
            result["pbr"] = pbr

        rev = safe_float(info.get("totalRevenue"))
        if rev is not None:
            result["revenue_krw"] = rev

        # Net debt: use the reported figure, else derive it as debt minus cash.
        net_debt = safe_float(info.get("netDebt"))
        if net_debt is None:
            tot_debt = safe_float(info.get("totalDebt"))
            tot_cash = safe_float(info.get("totalCash"))
            if tot_debt is not None and tot_cash is not None:
                net_debt = tot_debt - tot_cash
        if net_debt is not None:
            result["net_debt_krw"] = net_debt

        # 3. Metrics taken from the cash-flow statement (first column of the
        #    first row whose label contains the wanted text).
        try:
            cf = t.cashflow
            if cf is not None and not cf.empty:
                for label, key in (
                    ("Free Cash Flow", "fcf_krw"),
                    ("Operating Cash Flow", "ocf_krw"),
                ):
                    matches = [idx for idx in cf.index if label in str(idx)]
                    if not matches:
                        continue
                    value = safe_float(cf.loc[matches[0]].iloc[0])
                    if value is not None:
                        result[key] = value
        except Exception as e:
            # Cash-flow data is optional; report the problem and keep the
            # info-based metrics collected above.
            print(f"Error fetching yfinance cashflow for {ticker}: {e}")
    except Exception as e:
        print(f"Error fetching yfinance details for {ticker}: {e}")
    return result
# ETF identifier patterns (matched as substrings of the security name)
_ETF_NAME_PATTERNS = ["KODEX", "TIGER", "KINDEX", "KOSEF", "ARIRANG", "TIMEFOLIO", "HANARO"]
# Special ETF ticker-code pattern (codes of the form 0xxxV0 are ETFs):
# four digits, one uppercase letter, one digit.
_ETF_TICKER_RE = re.compile(r'^\d{4}[A-Z]\d$')
def _load_json(path: Path) -> dict[str, Any]:
    """Load a JSON object from *path*, tolerating every kind of bad input.

    Args:
        path: File expected to contain a UTF-8 encoded JSON object.

    Returns:
        The parsed object, or an empty dict when the file is missing or
        unreadable, is not valid UTF-8 / JSON, or its top-level value is not
        a JSON object (callers rely on ``dict`` methods such as ``.get``).
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file.
        # ValueError: json.JSONDecodeError and UnicodeDecodeError.
        return {}
    return payload if isinstance(payload, dict) else {}
def _num(v: Any, default: float = 0.0) -> float:
    """Parse *v* as a float, ignoring thousands separators.

    Args:
        v: Value to convert; it is stringified and commas are removed first.
        default: Result for ``None``, an empty string, or anything that
            cannot be parsed.

    Returns:
        The parsed number, or *default*.
    """
    try:
        is_blank = v is None or v == ""
        return default if is_blank else float(str(v).replace(",", ""))
    except (TypeError, ValueError):
        return default
def _is_etf(ticker: str, name: str) -> bool:
    """Tell whether a security is an ETF.

    A security counts as an ETF when its ticker matches ``_ETF_TICKER_RE`` or
    when its upper-cased name contains one of ``_ETF_NAME_PATTERNS``.

    Args:
        ticker: Ticker code.
        name: Security name; an empty or ``None`` name matches no pattern.
    """
    if _ETF_TICKER_RE.match(ticker) is not None:
        return True
    upper_name = name.upper() if name else ""
    for brand in _ETF_NAME_PATTERNS:
        if brand in upper_name:
            return True
    return False
def _decode_naver_html(raw: bytes, declared: str | None) -> str:
    """Decode a fetched page with its declared charset, defaulting to UTF-8."""
    charset = declared
    if not charset:
        # No charset on the HTTP header: look for one in the leading markup.
        found = re.search(rb"charset=[\"']?([A-Za-z0-9_\-]+)", raw[:4096])
        charset = found.group(1).decode("ascii") if found else "utf-8"
    if charset.lower() in ("euc-kr", "euc_kr", "euckr"):
        # cp949 is a superset of EUC-KR, so it also decodes extended Hangul.
        charset = "cp949"
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # Unknown charset name: keep the previous behaviour.
        return raw.decode("utf-8", errors="replace")


def _naver_summary(ticker: str) -> dict[str, float]:
    """Scrape summary financials for *ticker* from Naver Finance main.naver.

    Each known table row label is looked up in the page and the right-most
    non-zero numeric cell of that row is taken as the value.

    Args:
        ticker: Stock code placed in the ``code`` query parameter.

    Returns:
        A dict with whichever of these keys were found: ``revenue_krw``,
        ``op_income_krw``, ``net_income_krw``, ``opm_pct``,
        ``net_margin_pct``, ``roe_pct``, ``debt_ratio_pct``,
        ``quick_ratio_pct``, ``retention_ratio_pct``, ``eps_krw``, ``per``,
        ``bps_krw``, ``pbr``.  Empty when the page cannot be fetched or no
        row matches.
    """
    result: dict[str, float] = {}
    query = urllib.parse.urlencode({"code": ticker})
    url = f"https://finance.naver.com/item/main.naver?{query}"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=10) as resp:
            raw = resp.read()
            declared = resp.headers.get_content_charset()
    except Exception:
        # Best-effort scrape: any network/HTTP failure means "no Naver data".
        return result

    # The row labels below are Korean text, so the body has to be decoded with
    # the charset the server actually used rather than a hard-coded one.
    html = _decode_naver_html(raw, declared)

    def _row_values(label: str) -> list[float]:
        """Return the non-zero numeric cells of the table row titled *label*."""
        pattern = re.compile(
            rf'<tr[^>]*>\s*<th scope="row" class="h_th2 th_cop_anal\d+"><strong>{re.escape(label)}</strong></th>(.*?)</tr>',
            re.DOTALL,
        )
        m = pattern.search(html)
        if not m:
            return []
        td_vals = []
        # NOTE(review): this cell pattern only accepts unsigned numbers, so
        # negative figures are skipped and an older value may end up being the
        # right-most match -- confirm against the live markup.
        for raw_num in re.findall(
            r'<td[^>]*>\s*(?:&nbsp;)?\s*([0-9,]+(?:\.[0-9]+)?)\s*</td>',
            m.group(1),
            re.DOTALL,
        ):
            val = _num(raw_num)
            if val != 0.0:
                td_vals.append(val)
        return td_vals

    # Row label as shown in the page -> output key.
    row_label_map: dict[str, str] = {
        "매출액": "revenue_krw",
        "영업이익": "op_income_krw",
        "당기순이익": "net_income_krw",
        "영업이익률": "opm_pct",
        "순이익률": "net_margin_pct",
        "ROE(지배주주)": "roe_pct",
        "부채비율": "debt_ratio_pct",
        "당좌비율": "quick_ratio_pct",
        "유보율": "retention_ratio_pct",
        "EPS(원)": "eps_krw",
        "PER(배)": "per",
        "BPS(원)": "bps_krw",
        "PBR(배)": "pbr",
    }
    for label, key in row_label_map.items():
        vals = _row_values(label)
        if vals and result.get(key) is None:
            # The right-most value is used as the most recent one.
            result[key] = vals[-1]
    return result
def _fill_missing(row: dict[str, Any], extra: dict[str, Any]) -> bool:
    """Copy non-None values of *extra* into slots of *row* that are absent or None; return True if any was copied."""
    changed = False
    for key, value in extra.items():
        if value is not None and row.get(key) is None:
            row[key] = value
            changed = True
    return changed


def _collect_ticker(
    ticker: str,
    name: str,
    df_row: dict[str, Any],
    use_naver: bool,
    current_year: int,
    use_yahoo: bool = True,
) -> dict[str, Any]:
    """Collect the raw fundamentals row for one ticker.

    Sources are consulted in order -- data_feed values, the Naver scrape, then
    Yahoo via yfinance -- and each later source only fills fields that are
    still empty.  ETFs are returned immediately without any lookup.

    Args:
        ticker: Stock code.
        name: Security name (used for ETF detection and copied to the row).
        df_row: The data_feed record for this ticker, or an empty dict.
        use_naver: Allow the Naver fallback.
        current_year: Currently unused; kept so existing callers keep working.
        use_yahoo: Allow the Yahoo fallback.

    Returns:
        The row dict.  ``source`` names the sources that actually supplied at
        least one value, and ``data_quality`` is one of ``FULL``, ``PARTIAL``,
        ``SPARSE``, ``MISSING`` or ``ETF_EXCLUDED``.
    """
    import sys  # local import: used only for the warning output below

    row: dict[str, Any] = {
        "ticker": ticker,
        "name": name,
        "as_of_date": date.today().strftime("%Y%m%d"),
        "source": "fallback",
        "roe_pct": None,
        "opm_pct": None,
        "eps_krw": None,
        "ocf_krw": None,
        "fcf_krw": None,
        "net_debt_krw": None,
        "per": None,
        "pbr": None,
        "revenue_krw": None,
        "op_income_krw": None,
        "data_quality": "MISSING",
        "is_etf": _is_etf(ticker, name),
    }

    # ETFs carry no company fundamentals: skip every lookup.
    if row["is_etf"]:
        row["data_quality"] = "ETF_EXCLUDED"
        row["source"] = "etf_skip"
        return row

    # Step 1: values already present in data_feed (treated as most reliable).
    # Zero / missing / unparsable inputs are ignored; PER, PBR, ROE and OPM
    # are additionally required to be positive.
    df_per = _num(df_row.get("Forward_PE"))
    df_pbr = _num(df_row.get("PBR"))
    df_eps = _num(df_row.get("EPS"))
    df_roe = _num(df_row.get("ROE_Pct"))
    df_opm = _num(df_row.get("Operating_Margin_Pct"))
    if df_per > 0:
        row["per"] = df_per
    if df_pbr > 0:
        row["pbr"] = df_pbr
    if df_eps != 0:
        row["eps_krw"] = df_eps
    if df_roe > 0:
        row["roe_pct"] = df_roe
    if df_opm > 0:
        row["opm_pct"] = df_opm

    data_feed_ok = row["per"] is not None or row["pbr"] is not None
    if data_feed_ok:
        row["source"] = "data_feed"

    # Step 2: Naver fallback when ROE, OPM or PER is still missing.
    if use_naver and (row["roe_pct"] is None or row["opm_pct"] is None or row["per"] is None):
        try:
            naver = _naver_summary(ticker)
            time.sleep(0.3)  # pause between consecutive scrape requests
            # Tag the source only when Naver really contributed a value.
            if _fill_missing(row, naver):
                row["source"] = "data_feed+naver" if data_feed_ok else "naver"
        except Exception as exc:
            # Best-effort: report on stderr and continue with what we have.
            print(f"naver fallback failed for {ticker}: {exc}", file=sys.stderr)

    # Step 3: Yahoo Finance (yfinance) fallback when ROE, OPM, OCF or revenue
    # is still missing.
    needs_yahoo = (
        row["roe_pct"] is None
        or row["opm_pct"] is None
        or row["ocf_krw"] is None
        or row["revenue_krw"] is None
    )
    if use_yahoo and needs_yahoo:
        try:
            yahoo = _yahoo_fundamentals_yf(ticker)
            # Tag the source only when Yahoo really contributed a value.
            if _fill_missing(row, yahoo):
                src_prev = row["source"]
                row["source"] = (src_prev + "+yahoo") if src_prev != "fallback" else "yahoo"
        except Exception as exc:
            print(f"yahoo fallback failed for {ticker}: {exc}", file=sys.stderr)

    # Data quality: how many of the five core metrics hold a non-zero value.
    filled = sum(
        1
        for k in ("roe_pct", "opm_pct", "per", "pbr", "eps_krw")
        if row.get(k) not in (None, 0.0)
    )
    if filled >= 4:
        row["data_quality"] = "FULL"
    elif filled >= 2:
        row["data_quality"] = "PARTIAL"
    elif filled >= 1:
        row["data_quality"] = "SPARSE"
    else:
        row["data_quality"] = "MISSING"
    return row
def main() -> int:
    """Command-line entry point: collect every ticker and write the report.

    Reads the data_feed rows from the input JSON, collects one row per ticker
    (either the ``--tickers`` list or every ticker found in data_feed), derives
    a coverage percentage and gate from the per-row data quality, and writes
    the result as JSON.

    Returns:
        0 when the gate is PASS or CAUTION, 1 when it is FAIL.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--json", default=str(DEFAULT_JSON))
    parser.add_argument("--out", default=str(DEFAULT_OUT))
    parser.add_argument("--no-naver", action="store_true", help="네이버 스크래핑 비활성화")
    parser.add_argument("--no-yahoo", action="store_true", help="야후 Finance v10 폴백 비활성화")
    parser.add_argument("--tickers", default="", help="쉼표구분 종목코드 (빈 값이면 data_feed에서 자동 추출)")
    args = parser.parse_args()

    # Relative paths are anchored at ROOT; joining an absolute path onto ROOT
    # yields that absolute path unchanged.
    json_path = ROOT / Path(args.json)
    out_path = ROOT / Path(args.out)

    src = _load_json(json_path)
    data = src.get("data")
    if not isinstance(data, dict):
        data = {}
    feed = data.get("data_feed")
    if not isinstance(feed, list):
        feed = []

    # Index the data_feed records by ticker (a later duplicate replaces an
    # earlier one).
    df_map: dict[str, dict[str, Any]] = {}
    for entry in feed:
        if not isinstance(entry, dict):
            continue
        code = str(entry.get("Ticker") or entry.get("ticker") or "")
        if code:
            df_map[code] = entry

    # Tickers to collect: the explicit list if given, else all of data_feed.
    if args.tickers.strip():
        wanted = [part.strip() for part in args.tickers.split(",") if part.strip()]
    else:
        wanted = sorted(df_map)
    tickers_with_names = [(code, df_map.get(code, {}).get("Name", "")) for code in wanted]

    use_naver = not args.no_naver
    use_yahoo = not args.no_yahoo
    current_year = date.today().year
    total = len(tickers_with_names)
    print(f"FUNDAMENTAL_RAW_INGEST_V1: collecting {total} tickers, naver={'YES' if use_naver else 'NO'}")

    rows: list[dict[str, Any]] = []
    for position, (ticker, name) in enumerate(tickers_with_names, start=1):
        print(f" [{position}/{total}] {ticker} {name} ...", end=" ", flush=True)
        row = _collect_ticker(
            ticker, name, df_map.get(ticker, {}), use_naver, current_year, use_yahoo=use_yahoo
        )
        rows.append(row)
        print(f"{row['data_quality']} source={row['source']}")

    # Quality tally; coverage is computed over non-ETF rows only.
    non_etf = [r for r in rows if r["data_quality"] != "ETF_EXCLUDED"]
    quality_counts: dict[str, int] = {}
    for r in rows:
        label = str(r.get("data_quality") or "MISSING")
        quality_counts[label] = quality_counts.setdefault(label, 0) + 1

    full_count = quality_counts.get("FULL", 0)
    partial_count = quality_counts.get("PARTIAL", 0)
    sparse_count = quality_counts.get("SPARSE", 0)
    missing_count = quality_counts.get("MISSING", 0)

    # FULL and PARTIAL rows count fully, SPARSE rows count half.
    if non_etf:
        weighted = full_count + partial_count + sparse_count * 0.5
        coverage_pct = round(weighted / len(non_etf) * 100.0, 2)
    else:
        coverage_pct = 0.0

    if coverage_pct >= 80.0:
        gate = "PASS"
    elif coverage_pct >= 30.0:
        gate = "CAUTION"
    else:
        gate = "FAIL"

    result = {
        "formula_id": "FUNDAMENTAL_RAW_INGEST_V1",
        "gate": gate,
        "as_of_date": str(date.today()),
        "ticker_count": len(rows),
        "non_etf_count": len(non_etf),
        "coverage_pct": coverage_pct,
        "quality_counts": quality_counts,
        "rows": rows,
    }
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")

    print(
        f"FUNDAMENTAL_RAW_INGEST_V1 gate={gate} tickers={len(rows)} non_etf={len(non_etf)} "
        f"coverage={coverage_pct}% full={full_count} partial={partial_count} missing={missing_count}"
    )
    failed = gate == "FAIL"
    print("FUNDAMENTAL_RAW_INGEST_V1_FAIL" if failed else "FUNDAMENTAL_RAW_INGEST_V1_OK")
    return 1 if failed else 0


# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())