#!/usr/bin/env python3
"""
build_thegraph_cl_npz_v1.py

Build NPZ for cl_fee_replay_fast_npz_v3 from TheGraph subgraph CSV.

TheGraph format differences vs on-chain RPC format:
  - column names: block_number, log_index, sqrt_price_x96, liquidity_amount
  - amount0/amount1 are ALREADY human-readable floats (not raw integers)
  - liquidity_amount = NaN (not included in subgraph swap events)
  - active_liquidity must be set from pool.liquidity snapshot (constant)

Price is computed from tick: price_token0_per_token1 = 10^(dec1-dec0) / 1.0001^tick
"""
from __future__ import annotations

import argparse
import json
import math
from pathlib import Path

import numpy as np
import pandas as pd

SCRIPT_VERSION = "build_thegraph_cl_npz_v1_2026_05_02"

# Columns without which rows can neither be ordered nor priced.
_REQUIRED_COLUMNS = ("timestamp", "tick")


def _meta_path_for(out_npz: Path) -> Path:
    """Return the sidecar ``<name>.meta.json`` path next to *out_npz*.

    Only a trailing ``.npz`` is stripped from the file name, so dotted names
    such as ``pool_0.3.npz`` map to ``pool_0.3.meta.json`` rather than losing
    the ``.3`` component (which made different outputs share one meta file).
    """
    name = out_npz.name
    if name.endswith(".npz"):
        name = name[: -len(".npz")]
    return out_npz.with_name(f"{name}.meta.json")


def _int_column(frame: pd.DataFrame, column: str, dtype: type, n: int) -> np.ndarray:
    """Return *column* of *frame* as an integer array of length *n*.

    An absent column yields zeros; NaN cells are also replaced by 0 so the
    integer cast cannot fail on rows where the value was blank or unparsable.
    """
    if column not in frame.columns:
        return np.zeros(n, dtype=dtype)
    return frame[column].fillna(0).astype(dtype).to_numpy()


def main() -> None:
    """Build the replay NPZ (plus a ``.meta.json`` sidecar) from a swap table.

    Reads the events file given by ``--events`` (parquet when the suffix is
    ``.parquet``, CSV otherwise), keeps swap rows, orders them by
    timestamp / block / log index, derives the price from ``tick`` and writes
    the arrays to ``--out-npz`` together with a JSON metadata summary.

    Raises:
        SystemExit: if the events file does not exist, lacks the ``timestamp``
            or ``tick`` column, or contains no usable swap rows.
    """
    print(f"[script_version] {__file__} SCRIPT_VERSION={SCRIPT_VERSION}")

    ap = argparse.ArgumentParser()
    ap.add_argument("--events", required=True, help="events_all.csv or swaps.csv from TheGraph")
    ap.add_argument("--out-npz", required=True)
    ap.add_argument("--pool-name", default="")
    ap.add_argument("--token0", default="USDC")
    ap.add_argument("--token1", default="WETH")
    ap.add_argument("--dec0", type=int, default=6)
    ap.add_argument("--dec1", type=int, default=18)
    ap.add_argument("--fee-rate", type=float, default=0.003, help="pool fee rate, e.g. 0.003 for 0.3%%")
    ap.add_argument("--active-liquidity", type=float, default=1798017888714385920.0,
                    help="Pool-level active liquidity (raw Uniswap units). "
                         "Use pool.liquidity from pool metadata. "
                         "Default = WETH/USDC 0.3%% mainnet snapshot 2026-05.")
    args = ap.parse_args()

    events_path = Path(args.events)
    if not events_path.exists():
        raise SystemExit(f"events file not found: {events_path}")

    # Suffix comparison is case-insensitive so "X.PARQUET" is not parsed as CSV.
    if events_path.suffix.lower() == ".parquet":
        df = pd.read_parquet(events_path)
    else:
        df = pd.read_csv(events_path)

    # Filter to Swap events only
    if "event_type" in df.columns:
        sw = df[df["event_type"].astype(str).str.lower() == "swap"].copy()
    else:
        sw = df.copy()  # assume already swaps-only (e.g. swaps.csv)

    if sw.empty:
        raise SystemExit("no swap rows found")

    # Normalize column names
    rename = {
        "block_number": "blockNumber",
        "log_index": "logIndex",
        "sqrt_price_x96": "sqrtPriceX96",
        "liquidity_amount": "liquidity",
    }
    sw = sw.rename(columns={k: v for k, v in rename.items() if k in sw.columns})

    # Fail with a readable message instead of a KeyError traceback.
    missing = [c for c in _REQUIRED_COLUMNS if c not in sw.columns]
    if missing:
        raise SystemExit(f"events file is missing required column(s): {', '.join(missing)}")

    # Coerce every numeric key column, then drop rows that cannot be ordered
    # or priced (unparsable values become NaN through errors="coerce").
    for c in ["timestamp", "blockNumber", "logIndex", "tick"]:
        if c in sw.columns:
            sw[c] = pd.to_numeric(sw[c], errors="coerce")
    sw = sw.dropna(subset=list(_REQUIRED_COLUMNS))

    # Sort by time
    sort_cols = [c for c in ["timestamp", "blockNumber", "logIndex"] if c in sw.columns]
    sw = sw.sort_values(sort_cols, kind="stable").reset_index(drop=True)

    n = len(sw)
    if n == 0:
        raise SystemExit("no valid swap rows after filtering")

    ts = sw["timestamp"].astype(np.int64).to_numpy()
    block = _int_column(sw, "blockNumber", np.int64, n)
    log_idx = _int_column(sw, "logIndex", np.int32, n)
    tick = sw["tick"].astype(np.int32).to_numpy()

    # Price: token0 per token1 (USDC/WETH)
    # price = 10^(dec1-dec0) / (1.0001^tick)
    price = (10.0 ** (args.dec1 - args.dec0)) / (1.0001 ** tick.astype(np.float64))

    # amount0_h, amount1_h: already human-readable from TheGraph
    if "amount0" in sw.columns:
        amount0_h = sw["amount0"].astype(np.float64).to_numpy()
    else:
        amount0_h = np.zeros(n, dtype=np.float64)

    if "amount1" in sw.columns:
        amount1_h = sw["amount1"].astype(np.float64).to_numpy()
    else:
        amount1_h = np.zeros(n, dtype=np.float64)

    # input_usd: use amount_usd if available, else reconstruct
    if "amount_usd" in sw.columns:
        input_usd = sw["amount_usd"].astype(np.float64).fillna(0.0).abs().to_numpy()
    else:
        # Fallback: larger of |amount0| and |amount1|*price, both expressed in
        # token0 units -- a USD figure only when token0 is a USD stablecoin
        # (the default USDC). fmax ignores a NaN on one side; rows where both
        # sides are NaN become 0, matching the zero-fill of the amount_usd path.
        estimate = np.fmax(np.abs(amount0_h), np.abs(amount1_h) * price)
        input_usd = np.where(np.isnan(estimate), 0.0, estimate)

    # Active liquidity: constant proxy from pool.liquidity snapshot
    active_liq = np.full(n, args.active_liquidity, dtype=np.float64)

    # Summary figures shared by the metadata file and the console report.
    summary = {
        "rows_swap": int(n),
        "input_usd_sum": float(np.nansum(input_usd)),
        "price_start": float(price[0]),
        "price_end": float(price[-1]),
        "price_min": float(np.nanmin(price)),
        "price_max": float(np.nanmax(price)),
    }

    meta = {
        "script_version": SCRIPT_VERSION,
        "source_events": str(events_path),
        "pool_name": args.pool_name,
        "token0": args.token0,
        "token1": args.token1,
        "dec0": args.dec0,
        "dec1": args.dec1,
        "fee_rate": args.fee_rate,
        "active_liquidity_source": "pool_snapshot_constant",
        "active_liquidity_value": args.active_liquidity,
        "rows_swap": summary["rows_swap"],
        "timestamp_start": int(ts[0]),
        "timestamp_end": int(ts[-1]),
        "price_start": summary["price_start"],
        "price_end": summary["price_end"],
        "price_min": summary["price_min"],
        "price_max": summary["price_max"],
        "input_usd_sum": summary["input_usd_sum"],
        "note": "active_liquidity is a constant snapshot proxy. Fee share calculations are approximate.",
    }

    out_npz = Path(args.out_npz)
    out_npz.parent.mkdir(parents=True, exist_ok=True)

    np.savez_compressed(
        out_npz,
        ts=ts,
        block=block,
        log_index=log_idx,
        tick=tick,
        price=price,
        amount0_h=amount0_h,
        amount1_h=amount1_h,
        input_usd=input_usd,
        active_liquidity=active_liq,
        meta_json=np.array(json.dumps(meta, ensure_ascii=False), dtype=np.str_),
    )

    meta_path = _meta_path_for(out_npz)
    meta_path.write_text(json.dumps(meta, indent=2, ensure_ascii=False), encoding="utf-8")

    print(json.dumps({
        "out_npz": str(out_npz),
        "meta": str(meta_path),
        "rows_swap": summary["rows_swap"],
        "input_usd_sum": summary["input_usd_sum"],
        "price_start": summary["price_start"],
        "price_end": summary["price_end"],
        "price_min": summary["price_min"],
        "price_max": summary["price_max"],
    }, indent=2))


# Run the builder only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
