from __future__ import annotations import json from pathlib import Path import pandas as pd from convert_xlsx_to_json import find_header_row, clean_dataframe, normalize_code ROOT = Path(__file__).resolve().parents[1] XLSX = ROOT / "GatherTradingData.xlsx" JSON_PATH = ROOT / "GatherTradingData.json" def validate_conversion(xlsx_path: Path, json_path: Path) -> int: print(f"Validating {xlsx_path.name} vs {json_path.name}...") payload = json.loads(json_path.read_text(encoding="utf-8")) json_data = payload["data"] xl = pd.ExcelFile(xlsx_path) errors: list[str] = [] for sheet in xl.sheet_names: if sheet.startswith("cs_chunk_"): continue if sheet not in json_data: errors.append(f"{sheet}: missing in JSON") continue header_row = find_header_row(xlsx_path, sheet) df = pd.read_excel(xlsx_path, sheet_name=sheet, header=header_row) df = clean_dataframe(df) expected_rows = len(df) actual = json_data[sheet] actual_rows = len(actual) if hasattr(actual, "__len__") else 0 if expected_rows != actual_rows: errors.append(f"{sheet}: XLSX rows={expected_rows} JSON rows={actual_rows}") continue if isinstance(actual, list) and actual: columns = set(df.columns) json_columns = set(actual[0]) if not columns <= json_columns: errors.append(f"{sheet}: JSON missing columns sample={sorted(columns - json_columns)[:10]}") if "Ticker" in columns: xlsx_ticker = normalize_code(df.iloc[0]["Ticker"]) json_ticker = str(actual[0].get("Ticker", "")) if xlsx_ticker != json_ticker: errors.append(f"{sheet}: first Ticker mismatch XLSX={xlsx_ticker} JSON={json_ticker}") if errors: print("JSON CONVERSION VALIDATION FAIL") for err in errors: print(f"- {err}") return 1 print("JSON CONVERSION VALIDATION OK") return 0 if __name__ == "__main__": raise SystemExit(validate_conversion(XLSX, JSON_PATH))