# QuantEngineByItz/tools/refresh_trading_calendar.py

import os
import re
import sys
import hashlib
import requests
from datetime import datetime
from pathlib import Path
from bs4 import BeautifulSoup
import openpyxl

# Reconfigure stdout for UTF-8 to prevent CP949 encoding crashes on Windows.
# The call is guarded because stdout may have been replaced by a stream that
# has no ``reconfigure`` method (e.g. ``io.StringIO``) or may be ``None``;
# in that case importing this module must not fail.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8')

# Directory holding this script; the project root is its parent directory.
_SCRIPT_DIR = Path(__file__).resolve().parent
ROOT = _SCRIPT_DIR.parent

# Workbook that main() reads and writes back.
XLSX_PATH = ROOT / "GatherTradingData.xlsx"

# Base HTTP headers for the calendar request (a desktop Chrome User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/124.0.0.0 Safari/537.36'
    ),
}

# Calendar page that main() downloads and parses.
url = "https://tradingeconomics.com/calendar"

# Ordered keyword rules used by guessEventType(). Rules are tried top to
# bottom and the first rule with any key contained in the upper-cased event
# name decides the type. A ``None`` type marks the CPI rule, whose final type
# (KR_CPI / US_CPI / CUSTOM) depends on the country.
TYPE_MAP = [
    {'keys': ['FOMC', '연준', 'Federal Open Market', 'Fed Rate'], 'type': 'FOMC'},
    {'keys': ['CPI', '소비자물가', 'Consumer Price', 'Inflation'], 'type': None},
    {'keys': ['PPI', '생산자물가', 'Producer Price'], 'type': 'US_PPI'},
    {'keys': ['PCE', '개인소비지출', 'Personal Consumption'], 'type': 'US_PCE'},
    {'keys': ['NFP', '비농업', 'Nonfarm', 'Payroll'], 'type': 'US_NFP'},
    {'keys': ['실적', '잠정실적', 'Earnings', 'EPS', 'Revenue'], 'type': 'EARNINGS'},
    {'keys': ['옵션만기', '선물만기', '만기일', 'Expiry', 'Triple Witching'], 'type': 'EXPIRY'},
    {'keys': ['한국은행', '금통위', 'BOK', 'Bank of Korea'], 'type': 'BOK'},
    {'keys': ['환율', 'FX', 'Dollar', '달러'], 'type': 'FX'},
    {'keys': ['국채', '채권', 'Bond', 'Treasury', 'KTB'], 'type': 'BOND'},
    {'keys': ['BOJ', '일본은행', 'Bank of Japan', 'BOJ Rate', 'BOJ Interest'], 'type': 'BOJ'},
]

# Types that only make sense for US events.
_US_ONLY_TYPES = frozenset({'US_PPI', 'US_PCE', 'US_NFP', 'FOMC'})

# Country codes must appear as standalone tokens in the (upper-cased) name.
# A plain substring test would see "US" inside "BUSINESS"/"HOUSING" and "KR"
# inside "UKRAINE". Only ASCII letters/digits count as token characters, so a
# code directly followed by Korean text still matches.
_US_TOKEN = re.compile(r'(?<![A-Z0-9])US(?![A-Z0-9])')
_KR_TOKEN = re.compile(r'(?<![A-Z0-9])KR(?![A-Z0-9])')


def guessEventType(eventName, region):
    """Classify a calendar event by keyword, constrained by its country.

    Args:
        eventName: Event title. Matching is case-insensitive; ``None`` is
            treated as an empty title.
        region: Country code such as ``'US'``, ``'KR'`` or ``'JP'``. Case and
            surrounding whitespace are ignored; ``None`` or ``''`` means the
            country is unknown.

    Returns:
        The ``type`` of the first matching TYPE_MAP rule, with these
        adjustments:

        * CPI rule: ``'KR_CPI'`` or ``'US_CPI'`` when the region, or a country
          marker in the name, identifies Korea or the US (Korea is checked
          first); otherwise ``'CUSTOM'``.
        * US-only types and ``'BOJ'`` become ``'CUSTOM'`` when a known region
          contradicts them.

        ``'CUSTOM'`` is also returned when no rule matches.
    """
    upper = (eventName or '').upper()
    reg = (region or '').upper().strip()

    for rule in TYPE_MAP:
        if not any(k.upper() in upper for k in rule['keys']):
            continue

        rule_type = rule['type']
        if rule_type is None:
            if reg == 'KR' or '한국' in upper or _KR_TOKEN.search(upper):
                return 'KR_CPI'
            if reg == 'US' or '미국' in upper or _US_TOKEN.search(upper):
                return 'US_CPI'
            return 'CUSTOM'

        # An unknown region ('') never vetoes a keyword match.
        if reg:
            if rule_type in _US_ONLY_TYPES and reg != 'US':
                return 'CUSTOM'
            if rule_type == 'BOJ' and reg != 'JP':
                return 'CUSTOM'
        return rule_type

    return 'CUSTOM'

def guessImpact(type_str, eventName):
    """Map an event type to its default impact level.

    Args:
        type_str: Event type as produced by guessEventType().
        eventName: Accepted for interface compatibility; not used.

    Returns:
        ``'HIGH'``, ``'MEDIUM'`` or ``'LOW'`` (the fallback for any type not
        listed below).
    """
    impact_tiers = (
        ('HIGH', ('FOMC', 'US_CPI', 'US_NFP', 'BOK', 'KR_CPI', 'BOJ')),
        ('MEDIUM', ('US_PPI', 'US_PCE', 'EARNINGS', 'EXPIRY')),
    )
    for level, members in impact_tiers:
        if type_str in members:
            return level
    return 'LOW'

def build_key(date_str, event_name, type_str):
    """Return the dedup key for an event row.

    The key is the MD5 hex digest of ``"<date>|<TYPE>|<event>"``, where the
    type is upper-cased and the event name is stripped of surrounding
    whitespace.
    """
    normalized_type = type_str.upper()
    normalized_name = event_name.strip()
    payload = "{}|{}|{}".format(date_str, normalized_type, normalized_name)
    digest = hashlib.md5(payload.encode('utf-8'))
    return digest.hexdigest()

# Columns the ``event_calendar`` sheet must contain; also the header row
# written when the sheet has to be created.
_REQUIRED_HEADERS = (
    'Date', 'Event', 'Type', 'Impact', 'Alert', 'DaysLeft',
    'AlertStatus', 'LastCheckedAt', 'Source', 'SourceUrl', 'Key',
)

# The event date is read from a CSS class on the row's first cell.
_ISO_DATE_CLASS = re.compile(r'^\d{4}-\d{2}-\d{2}$')

# Events from any other country are skipped as noise.
_TRACKED_COUNTRIES = frozenset({'US', 'KR', 'JP'})


def _cell_text(td_el):
    """Return a table cell's visible text with whitespace runs collapsed."""
    # get_text() yields decoded text, so HTML entities do not leak into the
    # sheet the way they would when stripping tags from the raw markup.
    return re.sub(r'\s+', ' ', td_el.get_text(' ', strip=True)).strip()


def _ensure_headers(ws):
    """Append any missing required header to row 1 of ``ws``.

    Returns:
        ``(header_map, added)`` where ``header_map`` maps header name to its
        1-based column index and ``added`` tells whether the sheet changed.
    """
    headers_list = [cell.value for cell in ws[1]]
    header_map = {name: idx + 1 for idx, name in enumerate(headers_list) if name}
    added = False

    for req in _REQUIRED_HEADERS:
        if req in header_map:
            continue
        new_col_idx = len(headers_list) + 1
        ws.cell(row=1, column=new_col_idx, value=req)
        headers_list.append(req)
        header_map[req] = new_col_idx
        print(f"Automatically added missing header column '{req}' at column index {new_col_idx}")
        added = True

    return header_map, added


def _parse_event_row(tr, today, checked_at):
    """Convert one calendar ``<tr>`` into a sheet row mapping.

    Args:
        tr: Parsed table row element.
        today: Current date, used to compute ``DaysLeft``.
        checked_at: Timestamp string stored in ``LastCheckedAt``.

    Returns:
        A dict keyed by sheet header name, or ``None`` when the row is not an
        event row, lacks a date or event link, belongs to an untracked
        country, or is a low-impact ``CUSTOM`` event outside Korea.
    """
    if not tr.get('data-event'):
        return None

    tds = tr.find_all('td')
    if len(tds) < 9:
        return None

    # Date from td[0] class
    td0 = tds[0]
    date_str = next((c for c in td0.get('class', []) if _ISO_DATE_CLASS.match(c)), "")
    if not date_str:
        return None

    # Site-reported impact, detected from markers in td[0]'s markup
    td0_html = str(td0)
    if 'calendar-date-3' in td0_html:
        site_impact = 'HIGH'
    elif 'calendar-date-2' in td0_html:
        site_impact = 'MEDIUM'
    else:
        site_impact = 'LOW'

    # Country ISO code from td[3]; skip noise countries
    country_iso = tds[3].get_text(strip=True).upper()
    if country_iso not in _TRACKED_COUNTRIES:
        return None

    # Event name from a.calendar-event inside td[4]
    a_ev = tds[4].find('a', class_='calendar-event')
    if a_ev is None:
        return None
    event_name = a_ev.get_text(strip=True)

    type_str = guessEventType(event_name, country_iso)
    final_impact = guessImpact(type_str, event_name)
    # The type-based impact wins unless it is LOW and the site rates it higher.
    if final_impact == 'LOW' and site_impact != 'LOW':
        final_impact = site_impact

    # Skip LOW impact CUSTOM (except for South Korea)
    if type_str == 'CUSTOM' and final_impact == 'LOW' and country_iso != 'KR':
        return None

    # Actual, Previous, Consensus
    actual = _cell_text(tds[5])
    previous = _cell_text(tds[6])
    consensus = _cell_text(tds[7])

    alert_parts = []
    if actual and actual != '-':
        alert_parts.append(f"Act: {actual}")
    if consensus and consensus != '-':
        alert_parts.append(f"Est: {consensus}")
    if previous and previous != '-':
        alert_parts.append(f"Prev: {previous}")

    # The class matched the YYYY-MM-DD shape but may still be an invalid date.
    try:
        days_left = (datetime.strptime(date_str, "%Y-%m-%d").date() - today).days
    except ValueError:
        days_left = ""

    return {
        'Date': date_str,
        'Event': event_name,
        'Type': type_str,
        'Impact': final_impact,
        'Alert': " ".join(alert_parts),
        'DaysLeft': days_left,
        'LastCheckedAt': checked_at,
        'Source': 'Trading Economics',
        'SourceUrl': 'https://tradingeconomics.com/calendar',
        'Key': build_key(date_str, event_name, type_str),
    }


def main() -> int:
    """Refresh the ``event_calendar`` sheet from the Trading Economics calendar.

    Loads the workbook at ``XLSX_PATH``, creates the sheet and any missing
    header columns, downloads the calendar for today through 60 days ahead,
    and upserts one row per retained event, matched on the ``Key`` column.

    Returns:
        ``0`` on success. ``1`` when the workbook is missing, the download or
        HTML lookup fails, or any other error occurs. Workbook load and save
        errors are reported the same way instead of raising.
    """
    # Function-scope import: the file's top-level import only brings in `datetime`.
    from datetime import timedelta

    print(f"Loading Excel workbook from {XLSX_PATH}...")
    if not XLSX_PATH.exists():
        print(f"Error: {XLSX_PATH} does not exist!")
        return 1

    try:
        wb = openpyxl.load_workbook(XLSX_PATH)
        sheet_name = "event_calendar"

        # Auto-create sheet if missing
        if sheet_name not in wb.sheetnames:
            print(f"Sheet '{sheet_name}' not found. Creating a new one...")
            ws = wb.create_sheet(sheet_name)
            ws.append(list(_REQUIRED_HEADERS))
        else:
            ws = wb[sheet_name]

        # Ensure all required headers exist; persist the change right away.
        header_map, headers_added = _ensure_headers(ws)
        if headers_added:
            wb.save(XLSX_PATH)
            print("Excel workbook headers updated and saved.")

        # Index existing keys to avoid duplicates (a later row wins on repeats)
        key_col = header_map['Key']
        row_by_key = {}
        for r_idx in range(2, ws.max_row + 1):
            existing_key = ws.cell(row=r_idx, column=key_col).value
            if existing_key:
                row_by_key[existing_key] = r_idx

        # Date range: today through 60 days ahead
        today = datetime.now().date()
        today_str = today.strftime("%Y-%m-%d")
        end_date_str = (today + timedelta(days=60)).strftime("%Y-%m-%d")

        print(f"Requesting Trading Economics calendar for range: {today_str} to {end_date_str}...")
        headers_req = headers.copy()
        # The requested date range is passed to the site through this cookie.
        headers_req['Cookie'] = f"cal-custom-range={today_str}|{end_date_str}"

        resp = requests.get(url, headers=headers_req, timeout=12)
        if resp.status_code != 200:
            print(f"Error: Fetch failed with HTTP status {resp.status_code}")
            return 1

        soup = BeautifulSoup(resp.text, 'html.parser')
        table = soup.find('table', id='calendar')
        if not table:
            print("Error: Could not find calendar table in HTML response.")
            return 1

        rows = table.find_all('tr')
        print(f"Found {len(rows)} raw HTML rows. Starting parser...")

        update_count = 0
        insert_count = 0
        now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # The header layout is fixed from here on, so compute the width once.
        max_col = max(header_map.values())

        for tr in rows:
            row_data = _parse_event_row(tr, today, now_str)
            if row_data is None:
                continue

            key = row_data['Key']
            if key in row_by_key:
                # Update existing row; columns absent from row_data stay untouched
                r_num = row_by_key[key]
                for col_name, val in row_data.items():
                    ws.cell(row=r_num, column=header_map[col_name], value=val)
                update_count += 1
            else:
                # Append new row, leaving unrelated columns as empty strings
                new_row = [""] * max_col
                for col_name, val in row_data.items():
                    new_row[header_map[col_name] - 1] = val
                ws.append(new_row)
                row_by_key[key] = ws.max_row  # keep index updated
                insert_count += 1

        print(f"Parser complete. Added {insert_count} new events, Updated {update_count} existing events.")

        # Save Excel file
        print(f"Saving workbook back to {XLSX_PATH}...")
        wb.save(XLSX_PATH)
        print("Excel workbook successfully updated!")
        return 0

    except Exception as e:
        # Top-level boundary of the script: report the failure and signal it
        # through the exit status rather than a traceback.
        print("Failed to run refresh script:", e)
        return 1

# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())