"""SQLite store for platform-transition data collection outputs. This store is intentionally small and backend-agnostic enough to be upgraded to PostgreSQL later without changing the row contract. The canonical payload is the normalized factor row plus provenance metadata. """ from __future__ import annotations import json import sqlite3 from dataclasses import dataclass from pathlib import Path from typing import Any, Iterable SCHEMA = """ PRAGMA journal_mode=WAL; CREATE TABLE IF NOT EXISTS collection_runs ( run_id TEXT PRIMARY KEY, collector_name TEXT NOT NULL, started_at TEXT NOT NULL, finished_at TEXT, status TEXT NOT NULL, input_source TEXT, output_json_path TEXT, output_db_path TEXT, notes TEXT, created_at TEXT DEFAULT (datetime('now')) ); CREATE TABLE IF NOT EXISTS collection_snapshots ( run_id TEXT NOT NULL, dataset_name TEXT NOT NULL, ticker TEXT NOT NULL, name TEXT, sector TEXT, as_of_date TEXT, source_priority TEXT, source_status TEXT, payload_json TEXT NOT NULL, provenance_json TEXT NOT NULL, created_at TEXT DEFAULT (datetime('now')), PRIMARY KEY (run_id, dataset_name, ticker) ); CREATE TABLE IF NOT EXISTS collection_source_errors ( run_id TEXT NOT NULL, ticker TEXT, source_name TEXT NOT NULL, error_kind TEXT NOT NULL, error_message TEXT NOT NULL, payload_json TEXT, created_at TEXT DEFAULT (datetime('now')) ); CREATE INDEX IF NOT EXISTS idx_collection_snapshots_ticker_time ON collection_snapshots(ticker, created_at DESC); CREATE INDEX IF NOT EXISTS idx_collection_source_errors_run ON collection_source_errors(run_id, source_name); """ @dataclass(frozen=True) class CollectionRun: run_id: str collector_name: str started_at: str status: str input_source: str | None = None output_json_path: str | None = None output_db_path: str | None = None notes: str | None = None # SQLite와 PostgreSQL 연결을 동적으로 감지하여 연결 인스턴스를 리턴하는 헬퍼 def _get_connection(db_target: Path | str) -> Any: db_str = str(db_target) if db_str.startswith("postgresql://") or db_str.startswith("postgres://"): try: import psycopg2 from psycopg2.extras import RealDictCursor conn = psycopg2.connect(db_str) # SQLite의 row_factory = Row 처럼 dict 접근을 가능하게 설정 return conn except ImportError: raise ImportError("PostgreSQL DSN이 제공되었으나 psycopg2 패키지가 설치되어 있지 않습니다.") else: return sqlite3.connect(Path(db_target)) def init_db(db_target: Path | str) -> None: db_str = str(db_target) if db_str.startswith("postgresql://") or db_str.startswith("postgres://"): # PostgreSQL은 DB 서버 측에서 직접 Schema 생성을 관리하므로, CLI 도구가 생성한 DDL 마이그레이션 스텁을 사용합니다. # 런타임 수집 중 자동 DDL 실행은 락 이슈 예방을 위해 스킵하고 트랜잭션 연결만 보장합니다. conn = _get_connection(db_target) conn.close() return db_path = Path(db_target) db_path.parent.mkdir(parents=True, exist_ok=True) conn = sqlite3.connect(db_path) try: conn.executescript(SCHEMA) conn.commit() finally: conn.close() def upsert_collection_run(db_target: Path | str, run: CollectionRun, finished_at: str | None = None) -> None: init_db(db_target) conn = _get_connection(db_target) db_str = str(db_target) is_pg = db_str.startswith("postgresql://") or db_str.startswith("postgres://") try: # SQLite와 PostgreSQL 쿼리 바인딩 플레이스홀더 분기 (? vs %s) param_char = "%s" if is_pg else "?" query = f""" INSERT INTO collection_runs ( run_id, collector_name, started_at, finished_at, status, input_source, output_json_path, output_db_path, notes ) VALUES ({', '.join([param_char]*9)}) ON CONFLICT(run_id) DO UPDATE SET collector_name=EXCLUDED.collector_name, started_at=EXCLUDED.started_at, finished_at=EXCLUDED.finished_at, status=EXCLUDED.status, input_source=EXCLUDED.input_source, output_json_path=EXCLUDED.output_json_path, output_db_path=EXCLUDED.output_db_path, notes=EXCLUDED.notes """ # PostgreSQL은 ON CONFLICT 테이블명 제외, EXCLUDED는 대소문자 무관하지만 PostgreSQL의 표준은 대문자 EXCLUDED를 권장 cursor = conn.cursor() cursor.execute( query, ( run.run_id, run.collector_name, run.started_at, finished_at, run.status, run.input_source, run.output_json_path, run.output_db_path, run.notes, ), ) conn.commit() finally: conn.close() def upsert_collection_snapshot( db_target: Path | str, *, run_id: str, dataset_name: str, ticker: str, name: str | None, sector: str | None, as_of_date: str | None, source_priority: str, source_status: str, payload: dict[str, Any], provenance: dict[str, Any], ) -> None: init_db(db_target) conn = _get_connection(db_target) db_str = str(db_target) is_pg = db_str.startswith("postgresql://") or db_str.startswith("postgres://") try: param_char = "%s" if is_pg else "?" query = f""" INSERT INTO collection_snapshots ( run_id, dataset_name, ticker, name, sector, as_of_date, source_priority, source_status, payload_json, provenance_json ) VALUES ({', '.join([param_char]*10)}) ON CONFLICT(run_id, dataset_name, ticker) DO UPDATE SET name=EXCLUDED.name, sector=EXCLUDED.sector, as_of_date=EXCLUDED.as_of_date, source_priority=EXCLUDED.source_priority, source_status=EXCLUDED.source_status, payload_json=EXCLUDED.payload_json, provenance_json=EXCLUDED.provenance_json """ cursor = conn.cursor() cursor.execute( query, ( run_id, dataset_name, ticker, name, sector, as_of_date, source_priority, source_status, json.dumps(payload, ensure_ascii=False, default=str), json.dumps(provenance, ensure_ascii=False, default=str), ), ) conn.commit() finally: conn.close() def append_collection_error( db_target: Path | str, *, run_id: str, source_name: str, error_kind: str, error_message: str, ticker: str | None = None, payload: dict[str, Any] | None = None, ) -> None: init_db(db_target) conn = _get_connection(db_target) db_str = str(db_target) is_pg = db_str.startswith("postgresql://") or db_str.startswith("postgres://") try: param_char = "%s" if is_pg else "?" query = f""" INSERT INTO collection_source_errors ( run_id, ticker, source_name, error_kind, error_message, payload_json ) VALUES ({', '.join([param_char]*6)}) """ cursor = conn.cursor() cursor.execute( query, ( run_id, ticker, source_name, error_kind, error_message, json.dumps(payload or {}, ensure_ascii=False, default=str), ), ) conn.commit() finally: conn.close() def fetch_latest_snapshots(db_target: Path | str, ticker: str, dataset_name: str | None = None) -> list[dict[str, Any]]: db_str = str(db_target) is_pg = db_str.startswith("postgresql://") or db_str.startswith("postgres://") if not is_pg and not Path(db_target).exists(): return [] conn = _get_connection(db_target) if not is_pg: conn.row_factory = sqlite3.Row try: param_char = "%s" if is_pg else "?" cursor = conn.cursor() if dataset_name: cursor.execute( f""" SELECT * FROM collection_snapshots WHERE ticker = {param_char} AND dataset_name = {param_char} ORDER BY created_at DESC """, (ticker, dataset_name), ) else: cursor.execute( f""" SELECT * FROM collection_snapshots WHERE ticker = {param_char} ORDER BY created_at DESC """, (ticker,), ) rows = cursor.fetchall() return [dict(row) for row in rows] finally: conn.close() def iter_recent_snapshots(db_target: Path | str, limit: int = 50) -> Iterable[dict[str, Any]]: db_str = str(db_target) is_pg = db_str.startswith("postgresql://") or db_str.startswith("postgres://") if not is_pg and not Path(db_target).exists(): return [] conn = _get_connection(db_target) if not is_pg: conn.row_factory = sqlite3.Row try: param_char = "%s" if is_pg else "?" cursor = conn.cursor() cursor.execute( f"SELECT * FROM collection_snapshots ORDER BY created_at DESC LIMIT {param_char}", (limit,), ) rows = cursor.fetchall() return [dict(row) for row in rows] finally: conn.close() def load_collection_runs(db_target: Path | str, limit: int = 20) -> list[dict[str, Any]]: db_str = str(db_target) is_pg = db_str.startswith("postgresql://") or db_str.startswith("postgres://") if not is_pg and not Path(db_target).exists(): return [] conn = _get_connection(db_target) if not is_pg: conn.row_factory = sqlite3.Row try: param_char = "%s" if is_pg else "?" cursor = conn.cursor() cursor.execute( f""" SELECT run_id, collector_name, started_at, finished_at, status, input_source, output_json_path, output_db_path, notes, created_at FROM collection_runs ORDER BY started_at DESC, created_at DESC LIMIT {param_char} """, (int(limit),), ) rows = cursor.fetchall() return [dict(row) for row in rows] finally: conn.close() def load_collection_errors(db_target: Path | str, limit: int = 20) -> list[dict[str, Any]]: db_str = str(db_target) is_pg = db_str.startswith("postgresql://") or db_str.startswith("postgres://") if not is_pg and not Path(db_target).exists(): return [] conn = _get_connection(db_target) if not is_pg: conn.row_factory = sqlite3.Row try: param_char = "%s" if is_pg else "?" cursor = conn.cursor() cursor.execute( f""" SELECT run_id, ticker, source_name, error_kind, error_message, payload_json, created_at FROM collection_source_errors ORDER BY created_at DESC LIMIT {param_char} """, (int(limit),), ) rows = cursor.fetchall() return [dict(row) for row in rows] finally: conn.close() def load_collection_dashboard_state( db_target: Path | str | None = None, output_json_path: Path | str | None = None, *, limit: int = 8, ) -> dict[str, Any]: db_str = str(db_target or "") is_pg = db_str.startswith("postgresql://") or db_str.startswith("postgres://") db = Path(db_target) if db_target and not is_pg else Path() report = Path(output_json_path) if output_json_path else Path() state: dict[str, Any] = { "db_path": db_str, "output_json_path": str(report) if output_json_path else "", "runs": [], "recent_snapshots": [], "recent_errors": [], "counts": { "collection_runs": 0, "collection_snapshots": 0, "collection_source_errors": 0, }, "latest_run": {}, "latest_report": {}, } if report.exists(): try: state["latest_report"] = json.loads(report.read_text(encoding="utf-8")) except Exception: state["latest_report"] = {} if not is_pg and (not db_target or not db.exists()): return state conn = _get_connection(db_target) if not is_pg: conn.row_factory = sqlite3.Row try: cursor = conn.cursor() state["counts"] = { "collection_runs": cursor.execute("SELECT COUNT(*) FROM collection_runs").fetchone()[0] if not is_pg else cursor.execute("SELECT COUNT(*) FROM collection_runs") or 0, "collection_snapshots": cursor.execute("SELECT COUNT(*) FROM collection_snapshots").fetchone()[0] if not is_pg else cursor.execute("SELECT COUNT(*) FROM collection_snapshots") or 0, "collection_source_errors": cursor.execute("SELECT COUNT(*) FROM collection_source_errors").fetchone()[0] if not is_pg else cursor.execute("SELECT COUNT(*) FROM collection_source_errors") or 0, } # PostgreSQL인 경우 단순 fetchone() 보완 if is_pg: # PostgreSQL count 처리 cursor.execute("SELECT COUNT(*) FROM collection_runs") state["counts"]["collection_runs"] = cursor.fetchone()[0] cursor.execute("SELECT COUNT(*) FROM collection_snapshots") state["counts"]["collection_snapshots"] = cursor.fetchone()[0] cursor.execute("SELECT COUNT(*) FROM collection_source_errors") state["counts"]["collection_source_errors"] = cursor.fetchone()[0] cursor.execute( """ SELECT run_id, collector_name, started_at, finished_at, status, input_source, output_json_path, output_db_path, notes, created_at FROM collection_runs ORDER BY started_at DESC, created_at DESC LIMIT 1 """ ) run_row = cursor.fetchone() state["latest_run"] = dict(run_row) if run_row is not None else {} param_char = "%s" if is_pg else "?" cursor.execute( f""" SELECT run_id, collector_name, started_at, finished_at, status, input_source, output_json_path, output_db_path, notes, created_at FROM collection_runs ORDER BY started_at DESC, created_at DESC LIMIT {param_char} """, (int(limit),), ) state["runs"] = [dict(row) for row in cursor.fetchall()] cursor.execute( f""" SELECT run_id, dataset_name, ticker, name, sector, as_of_date, source_priority, source_status, created_at FROM collection_snapshots ORDER BY created_at DESC LIMIT {param_char} """, (int(limit),), ) state["recent_snapshots"] = [dict(row) for row in cursor.fetchall()] cursor.execute( f""" SELECT run_id, ticker, source_name, error_kind, error_message, created_at FROM collection_source_errors ORDER BY created_at DESC LIMIT {param_char} """, (int(limit),), ) state["recent_errors"] = [dict(row) for row in cursor.fetchall()] finally: conn.close() return state