#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
fetch_month.py (OKX) - monthly candle fetcher (audit-grade)

Fixes vs prior build:
- Uses OKX official paging semantics: 'after' requests older data
- Sets cursor to the *oldest candle ts returned* (no -1), which avoids systematic skipping on some OKX responses
- Deduplicates by timestamp and writes a final file sorted by ascending timestamp (gaps are not detected or filled)
- Writes a verification summary at the end (rows, min/max ts, expected rows for full month)

Endpoint:
  GET /api/v5/market/history-candles  (limit max 100)
Docs: OKX V5 "Candlesticks history" (after=older, before=newer).

CSV output (one line per candle, no header row), columns in this order:
  ts,o,h,l,c,vol,volCcy,volCcyQuote,confirm
"""

import os, sys, time, json, argparse, traceback, calendar
from datetime import datetime, timezone
import urllib.parse
import urllib.request

def utc_iso(ts=None):
    """Format a Unix timestamp (seconds) as an ISO-8601 string in UTC.

    When *ts* is None the current wall-clock time is used.
    """
    epoch_seconds = time.time() if ts is None else ts
    moment = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
    return moment.isoformat()

def ms_to_iso(ms):
    """Format a Unix timestamp given in milliseconds as an ISO-8601 UTC string."""
    epoch_seconds = ms / 1000.0
    moment = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
    return moment.isoformat()

def ensure_dir(p):
    """Create directory *p* (and missing parents); an existing directory is fine."""
    os.makedirs(name=p, exist_ok=True)

def log_line(fp, s):
    """Print *s* prefixed with the current UTC time and append it to *fp*.

    Both stdout and the file object are flushed immediately so the log stays
    current while the fetch is still running.
    """
    stamped = "[" + utc_iso() + "] " + str(s)
    print(stamped, flush=True)
    fp.write(stamped + "\n")
    fp.flush()

def month_bounds_ms(year, month):
    """Compute the UTC boundaries of a calendar month.

    Returns a 5-tuple ``(start_ms, end_ms, start_dt, end_dt, last_day)``:
    the first instant of the month and 23:59:59 on its last day, each as
    epoch milliseconds and as an aware ``datetime``, plus the number of the
    last day of the month.
    """
    first_instant = datetime(year, month, 1, 0, 0, 0, tzinfo=timezone.utc)
    _, days_in_month = calendar.monthrange(year, month)
    # The upper bound is the last whole second of the month, not the first
    # instant of the following month.
    final_second = datetime(year, month, days_in_month, 23, 59, 59, tzinfo=timezone.utc)

    def to_ms(dt):
        return int(dt.timestamp() * 1000)

    return to_ms(first_instant), to_ms(final_second), first_instant, final_second, days_in_month

def http_get_json(url, timeout=30):
    """GET *url* and return the response body parsed as JSON.

    The request carries a fixed User-Agent header. The body is decoded as
    UTF-8, with undecodable bytes replaced rather than raising. *timeout* is
    passed to ``urlopen`` (seconds).
    """
    request = urllib.request.Request(url, headers={"User-Agent": "kappa-fetch/1.2"})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read()
    text = body.decode("utf-8", errors="replace")
    return json.loads(text)

def okx_history_candles(base_url, inst_id, bar, limit=100, after=None):
    """Fetch one page from the OKX ``history-candles`` endpoint.

    The query always carries ``instId``, ``bar`` and ``limit``; ``after`` is
    added only when a cursor is supplied. Returns the ``data`` member of the
    JSON response.

    Raises RuntimeError when the response is not a dict containing ``data``,
    or when its ``code`` is anything other than None, "0" or 0.
    """
    params = {"instId": inst_id, "bar": bar, "limit": str(limit)}
    if after is not None:
        params["after"] = str(after)

    endpoint = base_url.rstrip("/") + "/api/v5/market/history-candles?"
    payload = http_get_json(endpoint + urllib.parse.urlencode(params))

    well_formed = isinstance(payload, dict) and "data" in payload
    if not well_formed:
        raise RuntimeError("Unexpected response: %s" % str(payload)[:200])

    code = payload.get("code")
    if code not in (None, "0", 0):
        raise RuntimeError("OKX error code=%s msg=%s" % (code, payload.get("msg")))

    return payload.get("data", [])

def expected_rows_for_month(last_day, bar_minutes=5):
    """Return how many candles a complete month of *last_day* days contains.

    Args:
        last_day: number of days in the month (28..31).
        bar_minutes: candle length in whole minutes. Defaults to 5, which
            reproduces the previous fixed behaviour (288 candles per day).

    Returns:
        The candle count, rounded down if *bar_minutes* does not divide the
        month evenly.

    Raises:
        ValueError: if *bar_minutes* is not positive.
    """
    if bar_minutes <= 0:
        raise ValueError("bar_minutes must be positive, got %r" % (bar_minutes,))
    minutes_in_month = last_day * 24 * 60
    return minutes_in_month // bar_minutes

def write_sorted_csv(path, rows_by_ts):
    """Write the collected rows to *path*, one CSV line per timestamp, ascending.

    Args:
        path: destination file path.
        rows_by_ts: mapping of timestamp -> sequence of string fields.

    Returns:
        ``(row_count, min_ts, max_ts)``; both timestamps are None when
        *rows_by_ts* is empty (an empty file is still written).

    The rows are written to a sibling ``<path>.tmp`` file that is then moved
    over *path* with ``os.replace``. A failure part-way through therefore
    never leaves a truncated file at *path*, which matters because callers
    treat an existing output file as a finished month.
    """
    ts_sorted = sorted(rows_by_ts)
    tmp_path = path + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.writelines(",".join(rows_by_ts[ts]) + "\n" for ts in ts_sorted)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file no longer exists; this
        # only cleans up after a failed write. Cleanup is best-effort so it
        # cannot mask the original error.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass

    if not ts_sorted:
        return 0, None, None
    return len(ts_sorted), ts_sorted[0], ts_sorted[-1]

def _bar_to_ms(bar):
    """Translate an OKX bar string such as "5m", "1H" or "1Dutc" into milliseconds.

    Returns None for bars without a fixed length (calendar months, e.g. "1M")
    and for strings that do not look like ``<count><unit>[utc]``.
    """
    unit_ms = {
        "s": 1000,
        "m": 60 * 1000,
        "H": 60 * 60 * 1000,
        "D": 24 * 60 * 60 * 1000,
        "W": 7 * 24 * 60 * 60 * 1000,
    }
    spec = bar[:-3] if bar.endswith("utc") else bar
    count, unit = spec[:-1], spec[-1:]
    # Units are case-sensitive: "m" is minutes, "M" is months (not fixed-length).
    if not count.isdecimal() or unit not in unit_ms:
        return None
    n = int(count)
    return n * unit_ms[unit] if n > 0 else None


def main():
    """Fetch one calendar month of OKX candles and write it as a sorted CSV.

    Command-line driven (see the argparse options below). Pages backwards
    from the end of the month using the ``after`` cursor, keeps only rows
    whose timestamp falls inside the month, dedupes by timestamp, and writes
    the result to ``<out-dir>/<bar>/<coin>/<YYYY-MM>.csv``. Progress and a
    final summary are appended to ``cron_fetch.log`` in the log directory.

    Returns:
        0 on success or when the output already exists and --overwrite is
        not set; 2 when no candle in range was fetched (no file is written);
        1 when an exception occurred (logged with traceback).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--coin", required=True, help="e.g. BTC-USDT")
    ap.add_argument("--bar", default="5m", help="e.g. 5m")
    ap.add_argument("--year", type=int, required=True)
    ap.add_argument("--month", type=int, required=True)
    ap.add_argument("--base-url", default="https://www.okx.com")
    ap.add_argument("--out-dir", default=None, help="override base output directory")
    ap.add_argument("--log-dir", default=None, help="override logs dir")
    ap.add_argument("--overwrite", action="store_true")
    ap.add_argument("--max-retries", type=int, default=5)
    ap.add_argument("--sleep", type=float, default=0.10)
    ap.add_argument("--limit", type=int, default=100, help="OKX history-candles max=100")
    args = ap.parse_args()

    base_dir = os.path.dirname(os.path.abspath(__file__))
    data_root = args.out_dir or os.path.join(base_dir, "daily_data")
    logs_root = args.log_dir or os.path.join(base_dir, "data_monthly")
    ensure_dir(logs_root)

    log_path = os.path.join(logs_root, "cron_fetch.log")
    with open(log_path, "a", encoding="utf-8") as logfp:
        inst = args.coin.strip()
        bar = args.bar.strip()
        year = args.year
        month = args.month
        ym = "%04d-%02d" % (year, month)

        try:
            log_line(logfp, "RUN_START fetch_month coin=%s bar=%s year=%d month=%d" % (inst, bar, year, month))

            out_dir = os.path.join(data_root, bar, inst)
            ensure_dir(out_dir)
            out_file = os.path.join(out_dir, ym + ".csv")

            if os.path.exists(out_file) and (not args.overwrite):
                log_line(logfp, "SKIP exists %s (overwrite=false)" % out_file)
                log_line(logfp, "RUN_END fetch_month ok=1")
                return 0

            start_ms, end_ms, start_dt, end_dt, last_day = month_bounds_ms(year, month)
            log_line(logfp, "BOUNDS %s .. %s (ms %d..%d)" % (start_dt.isoformat(), end_dt.isoformat(), start_ms, end_ms))

            # Size the cursor nudge from the requested bar instead of assuming
            # 5m; unrecognized or variable-length bars keep the 5-minute step.
            bar_ms = _bar_to_ms(bar)
            step_ms = bar_ms if bar_ms is not None else 5 * 60 * 1000

            # Start paging from the month end; 'after' asks for older-than cursor.
            cursor = end_ms

            rows_by_ts = {}
            batches = 0
            empty_streak = 0

            while True:
                # Fetch with retries
                last_err = None
                data = None
                for attempt in range(1, args.max_retries + 1):
                    try:
                        data = okx_history_candles(args.base_url, inst, bar, limit=args.limit, after=cursor)
                        break
                    except Exception as e:
                        last_err = e
                        log_line(logfp, "RETRY %d/%d cursor=%s error=%s" % (attempt, args.max_retries, str(cursor), str(e)))
                        # No point in backing off after the final attempt.
                        if attempt < args.max_retries:
                            time.sleep(min(2.0, 0.25 * attempt))
                if data is None:
                    raise RuntimeError("Fetch failed after retries: %s" % str(last_err))

                batches += 1

                if not data:
                    empty_streak += 1
                    finished = empty_streak >= 2
                    log_line(logfp, "BATCH_EMPTY #%d cursor=%s streak=%d -> %s" % (
                        batches, str(cursor), empty_streak, "done" if finished else "nudge older"))
                    if finished:
                        break
                    # nudge cursor older by one bar to escape edge cases
                    cursor -= step_ms
                    continue

                empty_streak = 0

                # Use min/max rather than relying on the response ordering.
                ts_list = [int(r[0]) for r in data]
                newest = max(ts_list)
                oldest = min(ts_list)

                # Keep only month rows; dedupe by ts
                in_range = 0
                for r in data:
                    ts = int(r[0])
                    if start_ms <= ts <= end_ms:
                        rows_by_ts[ts] = r
                        in_range += 1

                log_line(
                    logfp,
                    "BATCH #%d cursor=%s got=%d in_range=%d ts_range=[%d..%d] oldest=%d newest=%d kept_total=%d" %
                    (batches, str(cursor), len(data), in_range, oldest, newest, oldest, newest, len(rows_by_ts))
                )

                # Stop if we've gone earlier than the month start
                if oldest < start_ms:
                    log_line(logfp, "DONE reached older than month start oldest=%d (%s) < start_ms=%d (%s)" %
                             (oldest, ms_to_iso(oldest), start_ms, ms_to_iso(start_ms)))
                    break

                # Advance cursor to oldest candle ts (NOT -1). This avoids skipping on some OKX responses.
                # If OKX treats 'after' as inclusive, duplicates are harmless due to dedupe.
                new_cursor = oldest
                if new_cursor == cursor:
                    # safety: move one bar older
                    new_cursor = cursor - step_ms
                    log_line(logfp, "CURSOR_NO_CHANGE -> nudge older to %d" % new_cursor)

                cursor = new_cursor

                if args.sleep > 0:
                    time.sleep(args.sleep)

                # Safety cap to avoid infinite loops
                if batches > 5000:
                    log_line(logfp, "STOP safety_cap batches>5000")
                    break

            # Expected row count for a full month. Computed from the bar length
            # when it divides a day evenly; otherwise fall back to the 5m-based
            # helper, as before.
            day_ms = 24 * 60 * 60 * 1000
            if bar_ms is not None and day_ms % bar_ms == 0:
                exp = last_day * (day_ms // bar_ms)
            else:
                exp = expected_rows_for_month(last_day)

            if not rows_by_ts:
                # Do not create (or truncate) the output: an empty file would
                # make later runs skip this month as already fetched.
                log_line(logfp, "WRITE_SKIPPED rows=0 expected=%d file=%s (no candles in range; nothing written)" % (
                    exp, out_file))
                log_line(logfp, "RUN_END fetch_month ok=0 rows=0 file=%s" % out_file)
                return 2

            # Write final file sorted by ts ascending
            wrote, min_ts, max_ts = write_sorted_csv(out_file, rows_by_ts)

            log_line(logfp, "WRITE_COMPLETE rows=%d expected=%d min_ts=%s max_ts=%s file=%s" % (
                wrote, exp,
                (ms_to_iso(min_ts) if min_ts is not None else "NA"),
                (ms_to_iso(max_ts) if max_ts is not None else "NA"),
                out_file
            ))

            ok = 1 if wrote > 0 else 0
            log_line(logfp, "RUN_END fetch_month ok=%d rows=%d file=%s" % (ok, wrote, out_file))
            return 0 if ok else 2

        except Exception as e:
            # Top-level boundary: record the failure in the log and signal it
            # through the exit code instead of crashing the cron job.
            log_line(logfp, "RUN_END fetch_month ok=0 error=%s" % str(e))
            log_line(logfp, traceback.format_exc())
            return 1

# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
