import asyncio, os, time
from typing import Dict, List, Optional
import httpx
from tenacity import retry, wait_fixed, stop_after_attempt
from .registry import get_tasks
from ..policy import get_policy, effective_policy_for, TokenBucketManager
from ..storage.writer import write_jsonl, write_csv_latest, compose_combined_latest
from ..storage.change_index import produce_changes
from ..parsers.generic import parse_listings
from selectolax.parser import HTMLParser
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import traceback
import sys

# Shared, mutable snapshot describing the most recent run. The run functions
# below update it in place and latest_status() hands the same object out.
_latest_status: Dict = {
    "last_run": None,
    "next_run": None,
    "counts": {},
    "errors": [],
    "debug": [],
}


def latest_status() -> Dict:
    """Return the live module-level status dict (the object itself, not a copy)."""
    return _latest_status

def _dbg(entry: Dict):
    """Append *entry* to the ``"debug"`` list of the shared status dict.

    Recording is best-effort: any exception raised while storing the entry is
    deliberately ignored so that diagnostics cannot abort the caller.
    """
    try:
        debug_log = _latest_status.setdefault("debug", [])
        debug_log.append(entry)
    except Exception:
        # Intentionally silent; see docstring.
        return

@retry(wait=wait_fixed(2), stop=stop_after_attempt(3))
async def _fetch_httpx(client: httpx.AsyncClient, url: str) -> str:
    """GET *url* through *client* and return the response body as text.

    The request uses a 30-second timeout and ``raise_for_status()`` turns
    error status codes into exceptions. The ``retry`` decorator is configured
    with a fixed 2-second wait and a stop after 3 attempts.
    """
    response = await client.get(url, timeout=30)
    response.raise_for_status()
    body = response.text
    return body

async def _fetch_playwright(url: str) -> str:
    """Render *url* in headless Chromium and return the resulting page HTML.

    The blocking Playwright sync API is executed in a worker thread through
    ``asyncio.to_thread`` so the calling coroutine only awaits the result.

    Args:
        url: Address to navigate to.

    Returns:
        The markup returned by ``page.content()`` after navigation has reached
        the ``domcontentloaded`` state.

    Raises:
        Exception: Anything raised by Playwright (launch failure, navigation
            timeout after 45 s, ...) propagates to the caller. The browser
            context and browser are closed before the exception leaves.
    """
    # Switch the asyncio loop policy on Windows before starting Playwright.
    # NOTE(review): the original rationale was "Proactor loop lacks subprocess
    # support", but the asyncio platform notes describe the *selector* loop as
    # the one without subprocess support on Windows -- confirm this policy
    # change is still wanted. It is process-wide and only affects event loops
    # created after this call.
    if sys.platform.startswith("win"):
        try:
            asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
        except Exception:
            # Best-effort: keep going with whatever policy is active.
            pass

    # Use sync Playwright API in a worker thread to avoid asyncio subprocess limitations on Windows
    def _sync_fetch(u: str) -> str:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent=(
                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/123.0.0.0 Safari/537.36"
                    ),
                    viewport={"width": 1366, "height": 768},
                )
                try:
                    page = context.new_page()
                    page.goto(u, wait_until="domcontentloaded", timeout=45000)
                    return page.content()
                finally:
                    # Runs on success and on navigation errors alike.
                    context.close()
            finally:
                browser.close()

    return await asyncio.to_thread(_sync_fetch, url)

async def _scrape_task(task: Dict) -> List[Dict]:
    """Crawl one broker task and return its parsed listing rows.

    The task dict supplies ``name`` (required), ``start_urls``, ``selectors``,
    ``requires_js`` and a ``pagination`` mapping whose ``type`` selects the
    crawl strategy:

    * ``"query_param"`` -- append ``<param>=<n>`` to each start URL for
      ``max_pages`` pages beginning at ``start``. A start URL is abandoned at
      the first failed fetch or the first page that yields no rows.
    * ``"link_selector"`` -- starting at each start URL, follow the link
      matched by ``selector`` (``href`` or ``data-href``), visiting at most
      ``max_pages`` distinct pages.
    * anything else -- fetch every start URL exactly once.

    Before every request one token is taken from each of the two buckets
    returned by ``TokenBucketManager.get_buckets_for`` and the coroutine
    sleeps for the longer of the two reported delays. Fetch, parse and
    post-processing failures are appended to ``_latest_status["errors"]``
    rather than raised. When the effective policy has
    ``crawl_mode == "per_run"`` crawling stops as soon as ``per_run_limit``
    rows have been collected and the result is truncated to that size.

    Args:
        task: Broker task configuration as described above.

    Returns:
        The collected rows, each tagged with a ``"broker"`` key.

    Raises:
        KeyError: If ``task`` has no ``"name"`` entry.
    """
    broker = task["name"]
    eff = effective_policy_for(broker, get_policy())
    g_bucket, b_bucket = TokenBucketManager.get_buckets_for(broker)

    raw_limit = eff.get("per_run_limit") if eff.get("crawl_mode") == "per_run" else None
    limit = int(raw_limit) if raw_limit else None

    # ``or`` also covers an explicit ``"pagination": None`` in the task.
    pagination = task.get("pagination") or {"type": "none"}
    selectors = task.get("selectors", {})
    start_urls = task.get("start_urls", [])
    max_pages = int(pagination.get("max_pages", 1))
    start_page = int(pagination.get("start", 1))

    ptype = pagination.get("type", "none")
    # Unknown pagination types are crawled (and reported) as "none".
    mode = ptype if ptype in ("query_param", "link_selector") else "none"

    results: List[Dict] = []

    def record_error(exc: Exception, url: str, stage: str) -> None:
        """Append a description of *exc* to the shared error list.

        Must be called from inside the ``except`` block handling *exc* so
        that ``traceback.format_exc()`` sees the active exception.
        """
        text = str(exc)
        args = getattr(exc, "args", ())
        if text and args:
            # Stored as a string so the record holds plain data even when
            # args[0] is an arbitrary object.
            message = str(args[0])
        else:
            message = text or repr(exc)
        _latest_status.setdefault("errors", []).append({
            "broker": broker,
            "error": message,
            "error_type": type(exc).__name__,
            "url": url,
            "stage": stage,
            "ptype": mode,
            "trace": traceback.format_exc(),
        })

    async def throttle_take() -> None:
        """Take one token from both buckets and wait out the longer delay."""
        delay = max(g_bucket.take(1), b_bucket.take(1))
        if delay > 0:
            # asyncio.sleep keeps the event loop responsive while waiting.
            await asyncio.sleep(delay)

    async def load_page(fetch_fn, url: str):
        """Fetch and parse *url*.

        Returns ``(html, items)``, or ``None`` when the fetch or the
        post-parse bookkeeping failed (the failure is already recorded).
        A parse failure is recorded and treated as an empty page.
        """
        await throttle_take()
        try:
            html = await fetch_fn(url)
        except Exception as exc:
            record_error(exc, url, "fetch")
            return None
        try:
            items = parse_listings(html, selectors)
        except Exception as exc:
            record_error(exc, url, "parse")
            items = []
        # The steps below run for successful parses as well as failed ones.
        try:
            _dbg({"broker": broker, "url": url, "ptype": mode, "items": len(items)})
            for row in items:
                row["broker"] = broker
        except Exception as exc:
            record_error(exc, url, "outer")
            return None
        return html, items

    def collect(items: List[Dict]) -> bool:
        """Add *items* to the results; return True once the run limit is hit."""
        results.extend(items)
        if limit is not None and len(results) >= limit:
            del results[limit:]
            return True
        return False

    def find_next_url(html: str, current: str, next_sel: str) -> Optional[str]:
        """Resolve the next-page link in *html*, or None when there is none."""
        try:
            doc = HTMLParser(html)
            try:
                node = doc.css_first(next_sel)
                href = (node.attributes.get("href") or node.attributes.get("data-href")) if node else None
            except NotImplementedError:
                # Fall back to BeautifulSoup when selectolax signals that it
                # does not implement the selector.
                fallback = BeautifulSoup(html, "html.parser").select_one(next_sel)
                href = fallback.get("href") if fallback else None
            return urljoin(current, href) if href else None
        except Exception as exc:
            record_error(exc, current, "outer")
            return None

    async def crawl_with_fetch(fetch_fn) -> None:
        """Run the crawl strategy selected by ``mode`` using *fetch_fn*."""
        if mode == "query_param":
            param = pagination.get("param", "page")
            for base in start_urls:
                sep = "&" if "?" in base else "?"
                for page_num in range(start_page, start_page + max_pages):
                    url = f"{base}{sep}{param}={page_num}"
                    page = await load_page(fetch_fn, url)
                    if page is None:
                        break
                    _, items = page
                    if not items:
                        # An empty page marks the end of this listing.
                        break
                    if collect(items):
                        return
        elif mode == "link_selector":
            next_sel = pagination.get("selector")
            for start in start_urls:
                cur = start
                visited = set()
                # ``visited`` doubles as the page counter and the loop guard
                # against pagination cycles.
                while cur and len(visited) < max_pages and cur not in visited:
                    visited.add(cur)
                    page = await load_page(fetch_fn, cur)
                    if page is None:
                        break
                    html, items = page
                    if collect(items):
                        return
                    if not next_sel:
                        break
                    cur = find_next_url(html, cur, next_sel)
        else:
            # no pagination: fetch each start url once
            for url in start_urls:
                page = await load_page(fetch_fn, url)
                if page is None:
                    continue
                if collect(page[1]):
                    return

    if bool(task.get("requires_js")):
        await crawl_with_fetch(_fetch_playwright)
    else:
        async with httpx.AsyncClient(follow_redirects=True, headers={"User-Agent": "dvc_brokers/1.0"}) as client:
            async def httpx_fetch(u: str) -> str:
                return await _fetch_httpx(client, u)
            await crawl_with_fetch(httpx_fetch)
    return results

async def _run(selected_broker: Optional[str] = None) -> Dict:
    """Scrape the selected broker tasks one after another and persist the rows.

    Clears the ``errors`` and ``debug`` lists of the shared status dict,
    stamps ``last_run`` with the current UTC time in ISO format, then for each
    task awaits ``_scrape_task`` and hands the rows to ``write_jsonl`` and
    ``write_csv_latest``. After all tasks, ``compose_combined_latest`` and
    ``produce_changes`` are called and the per-broker row counts are stored
    under ``counts``.

    Args:
        selected_broker: When truthy, only tasks whose ``name`` equals this
            value are run; otherwise every task from ``get_tasks()`` is run.

    Returns:
        ``{"ok": True, "counts": {<broker name>: <row count>, ...}}``.
    """
    if selected_broker:
        tasks = [candidate for candidate in get_tasks() if candidate["name"] == selected_broker]
    else:
        tasks = list(get_tasks())

    # reset status collections at each run and stamp time
    _latest_status["errors"] = []
    _latest_status["debug"] = []
    try:
        from datetime import datetime, timezone
        stamp = datetime.now(timezone.utc).isoformat()
    except Exception:
        # Fall back to the raw epoch time as a string.
        stamp = str(time.time())
    _latest_status["last_run"] = stamp

    _dbg({"phase": "run_start", "selected": selected_broker or "all", "tasks": len(tasks)})

    counts = {}
    for current in tasks:
        rows = await _scrape_task(current)
        broker_name = current["name"]
        counts[broker_name] = len(rows)
        write_jsonl(broker_name, rows)
        write_csv_latest(broker_name, rows)

    compose_combined_latest()
    produce_changes()
    _latest_status["counts"] = counts
    summary = {"ok": True, "counts": counts}
    return summary

def run_all() -> Dict:
    """Run every registered task to completion and return the run summary.

    Drives the ``_run`` coroutine with ``asyncio.run``, so it blocks until the
    run has finished.
    """
    summary = asyncio.run(_run())
    return summary

def run_one(broker: str) -> Dict:
    """Run only the task(s) named *broker* and return the run summary.

    Drives the ``_run`` coroutine with ``asyncio.run``, so it blocks until the
    run has finished.
    """
    summary = asyncio.run(_run(selected_broker=broker))
    return summary
