# QuantEngineByItz/tools/fetch_naver_market_data_v1.py

"""Naver Finance 시세/수급 수집기 — qualitative_sell_strategy_v1 입력용.

확인된 무인증 엔드포인트만 사용한다(2026-06-21 세션 실측):
  - https://finance.naver.com/item/sise_day.naver?code={code}&page=N   (일별 시세/거래량)
  - https://finance.naver.com/item/frgn.naver?code={code}&page=N       (외국인/기관 수급)
  - https://polling.finance.naver.com/api/realtime/domestic/stock/{code} (실시간 스냅샷, JSON)

investing.com 직접 스크래핑은 403(Cloudflare 차단) 확인됨 — 시도하지 않는다.
KRX 공매도 잔고(data.krx.co.kr)는 OTP 세션 필요(LOGOUT 응답) — 시도하지 않는다.
이미 GAS(gdc_01_fetch_fundamentals.gs/gas_event_calendar.gs)에서 수집 중인
외국인/기관 수급·실적발표 일정·경제지표 일정은 보유종목에 대해서는 account_snapshot/
GatherTradingData.xlsx에서 재사용하고, 이 스크립트는 그 시트에 없는 위성 후보군
티커를 평가할 때만 직접 호출한다(중복 수집 금지).
"""
from __future__ import annotations

import argparse
import datetime as dt
import json
import sys
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup

# Directory one level above the folder that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# Prepend it to sys.path once; presumably so imports rooted at that directory
# resolve when this file is executed directly (verify against the project layout).
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# Header values installed on the shared HTTP session by _session().
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
NAVER_REFERER = "https://finance.naver.com/"


def _session() -> requests.Session:
    """Build a ``requests.Session`` preloaded with the request headers this module uses.

    Returns:
        A new session whose default headers carry the module's User-Agent,
        the Naver Finance referer and a Korean-first Accept-Language value.
    """
    default_headers = {
        "User-Agent": USER_AGENT,
        "Referer": NAVER_REFERER,
        "Accept-Language": "ko-KR,ko;q=0.9,en;q=0.8",
    }
    client = requests.Session()
    client.headers.update(default_headers)
    return client


def _num(text: str) -> float:
    cleaned = text.replace(",", "").replace("+", "").strip()
    try:
        return float(cleaned)
    except ValueError:
        return 0.0


def fetch_price_history(session: requests.Session, code: str, pages: int = 3) -> dict[str, Any]:
    """Fetch daily price/volume rows for ``code`` from Naver Finance ``sise_day`` pages.

    Pages ``1..pages`` are requested in order and every 7-cell table row with a
    non-empty first cell is turned into a row dict. Rows are kept in the order
    the pages list them (the original author notes: newest first, 10 per page).

    Args:
        session: HTTP session used for every page request.
        code: Stock code interpolated into the request URL.
        pages: Number of pages to request, starting from page 1.

    Returns:
        A dict whose ``status`` is one of:

        * ``"OK"`` - ``rows`` is a list of dicts with ``date`` (dots replaced by
          dashes), ``close``, ``open``, ``high``, ``low`` and ``volume``;
          ``source_url`` is the page-less endpoint URL and ``source_as_of`` is
          the current time at UTC+9 in ISO format.
        * ``"CLOUDFLARE_BLOCKED_403"`` - a page answered HTTP 403; ``rows`` is
          empty and ``source_url`` is the page that was refused.
        * ``"FETCH_ERROR"`` - a ``requests.RequestException`` occurred; ``rows``
          is empty and ``error`` holds the exception text.
        * ``"DATA_MISSING"`` - no row could be parsed; ``source_url`` is the
          page-less endpoint URL.

        Any failing page discards rows collected from earlier pages.
    """
    base_url = f"https://finance.naver.com/item/sise_day.naver?code={code}"
    rows: list[dict[str, Any]] = []
    seen_dates: set[str] = set()
    for page in range(1, pages + 1):
        url = f"{base_url}&page={page}"
        try:
            resp = session.get(url, timeout=10)
            # 403 is reported with its own status so callers can tell a block
            # apart from an ordinary transport/HTTP failure.
            if resp.status_code == 403:
                return {
                    "status": "CLOUDFLARE_BLOCKED_403",
                    "rows": [],
                    "error": "Cloudflare rejected request (403 Forbidden)",
                    "source_url": url,
                    "wbs_ref": "WBS-7.9: Naver 스크래핑 Cloudflare 모니터링",
                }
            resp.raise_for_status()
        except requests.RequestException as e:
            return {
                "status": "FETCH_ERROR",
                "rows": [],
                "error": str(e),
                "source_url": url,
            }
        # Decode the body as EUC-KR regardless of what the response headers say.
        resp.encoding = "euc-kr"
        soup = BeautifulSoup(resp.text, "html.parser")
        table = soup.find("table", {"class": "type2"})
        if table is None:
            # No data table on this page: stop paging and use what we have.
            break
        for tr in table.find_all("tr"):
            cells = [td.get_text(strip=True) for td in tr.find_all("td")]
            # Data rows have exactly 7 cells with the date first; skip the rest.
            if len(cells) != 7 or not cells[0]:
                continue
            date = cells[0].replace(".", "-")
            # Keep each trading day once. A repeated page (e.g. when asking
            # for more pages than exist) would otherwise duplicate rows and
            # shift the positional look-backs computed from this list.
            if date in seen_dates:
                continue
            seen_dates.add(date)
            rows.append({
                "date": date,
                "close": _num(cells[1]),
                "open": _num(cells[3]),
                "high": _num(cells[4]),
                "low": _num(cells[5]),
                "volume": _num(cells[6]),
            })
    if not rows:
        # Report the endpoint that was queried, matching the OK branch.
        return {"status": "DATA_MISSING", "rows": [], "source_url": base_url}
    return {
        "status": "OK",
        "rows": rows,
        "source_url": base_url,
        "source_as_of": dt.datetime.now(dt.timezone(dt.timedelta(hours=9))).isoformat(),
    }


def fetch_foreign_institution_flow(session: requests.Session, code: str, pages: int = 2) -> dict[str, Any]:
    """Fetch daily institution/foreign flow rows for ``code`` from Naver Finance ``frgn`` pages.

    Pages ``1..pages`` are requested in order. Every ``table.type2`` on each
    page is scanned, and rows with at least 8 cells whose first cell contains a
    ``.`` (a dotted date) are kept. Per the original author's column notes,
    cell 5 is the institution figure and cell 6 the foreign figure.

    Args:
        session: HTTP session used for every page request.
        code: Stock code interpolated into the request URL.
        pages: Number of pages to request, starting from page 1.

    Returns:
        A dict whose ``status`` is one of:

        * ``"OK"`` - ``rows`` is a list of dicts with ``date`` (dots replaced by
          dashes), ``close``, ``inst_net`` and ``frgn_net``; ``source_url`` is
          the page-less endpoint URL and ``source_as_of`` is the current time at
          UTC+9 in ISO format.
        * ``"CLOUDFLARE_BLOCKED_403"`` - a page answered HTTP 403; ``rows`` is
          empty and ``source_url`` is the page that was refused.
        * ``"FETCH_ERROR"`` - a ``requests.RequestException`` occurred; ``rows``
          is empty and ``error`` holds the exception text.
        * ``"DATA_MISSING"`` - no row could be parsed; ``source_url`` is the
          page-less endpoint URL.

        Any failing page discards rows collected from earlier pages.
    """
    base_url = f"https://finance.naver.com/item/frgn.naver?code={code}"
    rows: list[dict[str, Any]] = []
    seen_dates: set[str] = set()
    for page in range(1, pages + 1):
        url = f"{base_url}&page={page}"
        try:
            resp = session.get(url, timeout=10)
            # 403 is reported with its own status so callers can tell a block
            # apart from an ordinary transport/HTTP failure.
            if resp.status_code == 403:
                return {
                    "status": "CLOUDFLARE_BLOCKED_403",
                    "rows": [],
                    "error": "Cloudflare rejected request (403 Forbidden)",
                    "source_url": url,
                    "wbs_ref": "WBS-7.9: Naver 스크래핑 Cloudflare 모니터링",
                }
            resp.raise_for_status()
        except requests.RequestException as e:
            return {
                "status": "FETCH_ERROR",
                "rows": [],
                "error": str(e),
                "source_url": url,
            }
        # Decode the body as EUC-KR regardless of what the response headers say.
        resp.encoding = "euc-kr"
        soup = BeautifulSoup(resp.text, "html.parser")
        for table in soup.find_all("table", {"class": "type2"}):
            for tr in table.find_all("tr"):
                cells = [td.get_text(strip=True) for td in tr.find_all("td")]
                # Only rows that start with a dotted date and are wide enough
                # to contain the two flow columns are data rows.
                if len(cells) < 8 or not cells[0] or "." not in cells[0]:
                    continue
                date = cells[0].replace(".", "-")
                # Keep each trading day once so a repeated page or table
                # cannot double-count a day's flow.
                if date in seen_dates:
                    continue
                seen_dates.add(date)
                rows.append({
                    "date": date,
                    "close": _num(cells[1]),
                    "inst_net": _num(cells[5]),
                    "frgn_net": _num(cells[6]),
                })
    if not rows:
        # Include the queried endpoint so every result carries its provenance.
        return {"status": "DATA_MISSING", "rows": [], "source_url": base_url}
    return {
        "status": "OK",
        "rows": rows,
        "source_url": base_url,
        "source_as_of": dt.datetime.now(dt.timezone(dt.timedelta(hours=9))).isoformat(),
    }


def compute_relative_return_20d(stock_rows: list[dict[str, Any]], benchmark_rows: list[dict[str, Any]]) -> float | None:
    """종목수익률(최신 vs 20거래일전) - 벤치마크(섹터ETF/KOSPI)수익률, %p."""
    def _ret(rows: list[dict[str, Any]]) -> float | None:
        closes = [r["close"] for r in rows if r.get("close")]
        if len(closes) < 2:
            return None
        recent, past = closes[0], closes[min(len(closes) - 1, 19)]
        if not past:
            return None
        return (recent / past - 1.0) * 100.0

    stock_ret = _ret(stock_rows)
    bench_ret = _ret(benchmark_rows)
    if stock_ret is None or bench_ret is None:
        return None
    return round(stock_ret - bench_ret, 4)


def compute_volume_ratio_5d(rows: list[dict[str, Any]]) -> float | None:
    """오늘 거래량 / 직전 5일 평균거래량."""
    volumes = [r["volume"] for r in rows if r.get("volume")]
    if len(volumes) < 6:
        return None
    today_vol = volumes[0]
    avg5 = sum(volumes[1:6]) / 5.0
    if avg5 <= 0:
        return None
    return round(today_vol / avg5, 4)


def main() -> int:
    """CLI entry point: fetch data for one stock code and print a JSON report.

    Parses ``--code`` (required) and ``--benchmark-code`` (default ``069500``),
    fetches the stock's price history, the benchmark's price history and the
    stock's institution/foreign flow through one shared session, then prints a
    JSON object with the raw price and flow results plus the two derived
    metrics. The benchmark result itself is used only for the relative return.

    Returns:
        Always ``0``; fetch failures are reported inside the printed JSON.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--code", required=True, help="6자리 종목코드")
    parser.add_argument("--benchmark-code", default="069500", help="비교 벤치마크 코드(기본 KODEX200 069500)")
    opts = parser.parse_args()

    http = _session()
    price = fetch_price_history(http, opts.code)
    benchmark = fetch_price_history(http, opts.benchmark_code)
    flow = fetch_foreign_institution_flow(http, opts.code)

    # Failed fetches still carry "rows": [], so the metrics degrade to None.
    price_rows = price.get("rows", [])
    benchmark_rows = benchmark.get("rows", [])
    report = {
        "code": opts.code,
        "price_history": price,
        "foreign_institution_flow": flow,
        "relative_return_20d": compute_relative_return_20d(price_rows, benchmark_rows),
        "volume_ratio_5d": compute_volume_ratio_5d(price_rows),
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))
    return 0


# When run as a script, execute main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())