QuantEngineByItz/tools/build_document_search_index_v1.py

#!/usr/bin/env python3
from __future__ import annotations

import json
from pathlib import Path


# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Destination of the JSON report written by main().
OUT = ROOT / "Temp" / "document_search_index_v1.json"
# POSIX-style path prefixes (relative to ROOT). Files under these prefixes are
# counted as excluded and are never indexed, even when they sit below an
# include root such as "docs".
EXCLUDED_PREFIXES = ("docs/archive/", "suggest/", "artifacts/archive/")
# Top-level directories (docs, spec, governance, src, tools) and root-level
# files (AGENTS.md, README.md) that are eligible for indexing.
INCLUDED_ROOTS = ("docs", "spec", "governance", "src", "tools", "AGENTS.md", "README.md")


def _is_excluded(rel: str) -> bool:
    """Return True when *rel* begins with any entry of ``EXCLUDED_PREFIXES``.

    *rel* is expected to be a POSIX-style path relative to ``ROOT``.
    """
    for prefix in EXCLUDED_PREFIXES:
        if rel.startswith(prefix):
            return True
    return False


def _is_included(rel: str) -> bool:
    """Return True when *rel* is one of ``INCLUDED_ROOTS`` or lies beneath one."""
    return any(rel == root or rel.startswith(f"{root}/") for root in INCLUDED_ROOTS)


def main() -> int:
    """Build the document search index report and write it to ``OUT``.

    Walks every regular file under ``ROOT`` and classifies it by its
    POSIX-style path relative to ``ROOT``:

    * paths matching ``EXCLUDED_PREFIXES`` are counted as excluded (this check
      runs first, so it wins over the include roots);
    * remaining paths under ``INCLUDED_ROOTS`` are counted as indexed;
    * everything else is ignored.

    The report (counts plus the first 50 sorted paths of each class) is
    written to ``OUT`` as UTF-8 JSON and echoed to stdout.

    Returns:
        Always ``0``; intended to be used as the process exit status.
    """
    indexed: list[str] = []
    excluded: list[str] = []

    for path in ROOT.rglob("*"):
        if not path.is_file():
            continue
        rel = path.relative_to(ROOT).as_posix()
        if _is_excluded(rel):
            excluded.append(rel)
        elif _is_included(rel):
            indexed.append(rel)

    result = {
        "formula_id": "DOCUMENT_SEARCH_INDEX_V1",
        "gate": "PASS",
        "indexed_count": len(indexed),
        "excluded_count": len(excluded),
        "excluded_prefixes": list(EXCLUDED_PREFIXES),
        "indexed_sample": sorted(indexed)[:50],
        "excluded_sample": sorted(excluded)[:50],
    }
    # Serialise once so the file and the console output are guaranteed identical.
    payload = json.dumps(result, ensure_ascii=False, indent=2)
    # The output directory may not exist on a fresh checkout; without this,
    # write_text would raise FileNotFoundError.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    OUT.write_text(payload, encoding="utf-8")
    print(payload)
    return 0


# When executed as a script, run main() and use its return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())