#!/usr/bin/env python3
"""
Rightmove Property Scraper
===========================
Scrapes Rightmove listings near W6 8RE for properties to buy with:
  - Max price: £1,500,000 (see SEARCH_CONFIG["max_price"])
  - Min bedrooms: 3
  - Radius: filter currently disabled (commented out in SEARCH_CONFIG)

Fetched data is stored in a JSON cache (rightmove_cache.json) alongside this script.
Run daily (e.g. via cron or Windows Task Scheduler) to build a historical record.

Usage:
    python rightmove_scraper.py

Dependencies:
    pip install requests beautifulsoup4
"""

import json
import math
import os
import re
import sys
import tempfile
import time
import random
import logging
from datetime import date
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# SEARCH_CONFIG values are substituted into BASE_SEARCH_URL via str.format —
# keys here must match the {placeholders} in the URL template below.
SEARCH_CONFIG = {
    "location_identifier": "OUTCODE%5E2766",   # W6 postcode outcode id on Rightmove (%5E is URL-encoded '^')
    "postcode": "W6 8RE",                      # for reference only — not a URL placeholder
    # "radius": "0.5",           # miles — radius filter currently disabled
    "max_price": "1500000",
    "min_bedrooms": "3",
    "property_type": "residential",
    "transaction_type": "BUY",          # buy = sale
    "display_location_identifier": "W6.html",
}

# Persistent JSON cache lives next to this script so scheduled runs find it
# regardless of the working directory they start in.
CACHE_FILE = Path(__file__).parent / "rightmove_cache.json"

BASE_SEARCH_URL = (
    "https://www.rightmove.co.uk/property-for-sale/find.html"
    "?locationIdentifier={location_identifier}"
    "&sortType=6" # most recent first
    "&channel={transaction_type}"
    "&transactionType={transaction_type}"
    "&displayLocationIdentifier={display_location_identifier}"
    "&maxPrice={max_price}"
    "&minBedrooms={min_bedrooms}"
    # "&radius={radius}"    # re-enable together with SEARCH_CONFIG["radius"]
)

# Browser-like headers reduce the chance of Rightmove's bot detection
# serving a block page instead of results.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": "https://www.rightmove.co.uk/",
}

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Cache helpers
# ---------------------------------------------------------------------------

def load_cache() -> dict:
    """Return the persisted cache, or a fresh empty structure.

    A cache file that exists but cannot be read or parsed (e.g. after an
    interrupted write or storage error) is quarantined by renaming it with
    a .corrupt suffix — preserving it for inspection — before a brand-new
    empty cache is returned.
    """
    if CACHE_FILE.exists():
        try:
            return json.loads(CACHE_FILE.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError) as exc:
            quarantine = CACHE_FILE.with_suffix(".corrupt.json")
            try:
                CACHE_FILE.rename(quarantine)
                log.error("Cache corrupt (%s) — renamed to %s, starting fresh", exc, quarantine)
            except OSError:
                log.error("Cache corrupt (%s) and could not rename — starting fresh", exc)
    return {
        "meta": {
            "search_config": SEARCH_CONFIG,
            "schema_version": 1,
        },
        "listings": {},
    }


def save_cache(cache: dict) -> None:
    """Write the cache to disk atomically (temp file, then rename over).

    Opening the target directly with 'w' truncates it at once, so a crash
    mid-write would leave the cache empty or corrupt.  Instead the JSON is
    serialised into a temporary file in the same directory and swapped over
    the real file with os.replace() — atomic on POSIX — so the old cache is
    only replaced once the new one is fully written.
    """
    fd, tmp_name = tempfile.mkstemp(
        dir=CACHE_FILE.parent, prefix=".rightmove_cache_tmp_", suffix=".json"
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as handle:
            json.dump(cache, handle, indent=2, ensure_ascii=False)
        os.replace(tmp_name, CACHE_FILE)   # atomic on POSIX
        log.info("Cache saved → %s", CACHE_FILE)
    except Exception:
        # Best-effort cleanup of the temp file; always re-raise the failure.
        try:
            os.unlink(tmp_name)
        except OSError:
            pass
        raise


# ---------------------------------------------------------------------------
# Parsing helpers
# ---------------------------------------------------------------------------

def _price_text_only(tag) -> str:
    """Concatenate only the tag's immediate text nodes, ignoring child tags.

    Rightmove price elements nest extra <span>s (qualifiers such as
    'Guide price', per-unit counts, etc).  Using .get_text() would merge
    every descendant into the number (e.g. '£775,000' + '24' → '77500024'),
    so this keeps just the NavigableString children directly under the tag.
    """
    if tag is None:
        return ""
    from bs4 import NavigableString
    pieces = [str(child) for child in tag.children if isinstance(child, NavigableString)]
    return "".join(pieces).strip()


def parse_price(raw: str) -> int | None:
    """Extract integer price from a string like '£850,000' or '£775,000 Guide price'.

    Takes the FIRST £-prefixed number only.  Applies a sanity cap: any value
    >= £100,000,000 is almost certainly a parse artefact and is discarded.
    """
    m = re.search(r'£\s*([\d,]+)', raw)
    if not m:
        return None
    value = int(m.group(1).replace(',', ''))
    if value >= 100_000_000:
        log.warning("Suspiciously large price %d from %r — discarding", value, raw[:60])
        return None
    return value


def parse_sqft(raw: str) -> float | None:
    """
    Extract square footage from strings like:
      '1,234 sq ft'  /  '1234 sq. ft'  /  '114.5 sq m'
    Converts sq m → sq ft when needed.
    """
    if not raw:
        return None
    raw = raw.replace(",", "").strip()
    # sq ft
    m = re.search(r"([\d.]+)\s*sq\.?\s*ft", raw, re.IGNORECASE)
    if m:
        return round(float(m.group(1)), 1)
    # sq m  →  × 10.7639
    m = re.search(r"([\d.]+)\s*sq\.?\s*m", raw, re.IGNORECASE)
    if m:
        return round(float(m.group(1)) * 10.7639, 1)
    return None


def price_per_sqft(price: int | None, sqft: float | None) -> float | None:
    if price and sqft and sqft > 0:
        return round(price / sqft, 2)
    return None


# ---------------------------------------------------------------------------
# Scraping
# ---------------------------------------------------------------------------

def build_search_url(index: int = 0) -> str:
    """Return the Rightmove search URL for the results page starting at *index*.

    Rightmove paginates via an `index` query parameter in steps of 24;
    the parameter is omitted entirely for the first page (index 0).
    """
    base = BASE_SEARCH_URL.format(**SEARCH_CONFIG)
    return f"{base}&index={index}" if index else base


def _is_blocked(soup: BeautifulSoup) -> bool:
    """Heuristically detect a bot-block / CAPTCHA page served with HTTP 200.

    Rightmove's access-denied and bot-detection pages can come back with a
    200 status, so raise_for_status() never fires; instead the page text is
    scanned for phrases those pages are known to contain.
    """
    page_text = soup.get_text(" ", strip=True).lower()
    block_phrases = (
        "access denied", "captcha", "are you a robot",
        "unusual traffic", "verify you are human", "security check",
        "please verify", "robot or human",
    )
    return any(phrase in page_text for phrase in block_phrases)


def fetch_page(url: str, session: requests.Session, retries: int = 3) -> BeautifulSoup | None:
    """Fetch a URL and return parsed HTML, or None on failure.

    Retries up to `retries` times with exponential backoff.
    Handles HTTP 429 (rate limit) by honouring the Retry-After header.
    Detects soft blocks (HTTP 200 but CAPTCHA/access-denied content).

    Args:
        url: Absolute URL to fetch.
        session: requests session (carries the browser-like headers).
        retries: Maximum number of attempts before giving up.

    Returns:
        Parsed BeautifulSoup document, or None if every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            resp = session.get(url, timeout=20)

            # Handle rate limiting explicitly before raise_for_status
            if resp.status_code == 429:
                # RFC 9110 allows Retry-After to be delta-seconds OR an
                # HTTP-date string; int() on the date form raises ValueError
                # (not a RequestException) and would crash the whole run.
                try:
                    retry_after = int(resp.headers.get("Retry-After", 60))
                except ValueError:
                    retry_after = 60
                # Clamp so a bogus/hostile header can't stall the job for hours.
                retry_after = max(1, min(retry_after, 300))
                log.warning(
                    "Rate limited (429) — waiting %ds before retry (attempt %d/%d)",
                    retry_after, attempt, retries,
                )
                time.sleep(retry_after)
                continue

            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")

            # Detect soft blocks — HTTP 200 but CAPTCHA/access-denied content
            if _is_blocked(soup):
                log.warning(
                    "Soft block / CAPTCHA detected on %s (attempt %d/%d)",
                    url, attempt, retries,
                )
                if attempt < retries:
                    wait = 2 ** attempt * 30  # 60s, 120s
                    log.info("Waiting %ds before retry…", wait)
                    time.sleep(wait)
                continue

            log.info("Fetched %s [%d]", url, resp.status_code)
            return soup

        except requests.RequestException as exc:
            log.error("Failed to fetch %s (attempt %d/%d): %s", url, attempt, retries, exc)
            if attempt < retries:
                wait = 2 ** (attempt - 1) * 10  # 10s, 20s
                log.info("Waiting %ds before retry…", wait)
                time.sleep(wait)

    log.error("All %d attempts failed for %s — giving up", retries, url)
    return None


def parse_listing_card(card: BeautifulSoup) -> dict | None:
    """
    Extract fields from a single property card on the search results page.
    Rightmove renders results as <div data-test="propertyCard-..."> elements.

    Returns:
        A dict with keys url, property_id, address, price_raw, price,
        property_type, bedrooms, bathrooms, sqft, price_per_sqft — every
        value except url may be None when the card lacks that element.
        Returns None when no property link is found or parsing raises.
    """
    try:
        # --- URL & property ID ---
        # Try current and legacy link selectors, then any /properties/<id> anchor.
        link_tag = card.select_one(
            "a[data-testid='property-details-lozenge'], "
            "a.propertyCard-link, a[data-test='property-details']"
        )
        if not link_tag:
            link_tag = card.find("a", href=re.compile(r"/properties/\d+"))
        if not link_tag:
            return None
        href = link_tag.get("href", "")
        if not href.startswith("http"):
            href = "https://www.rightmove.co.uk" + href
        # Normalise – strip query params so URL is stable across days
        url = href.split("?")[0].rstrip("/")

        # Extract property ID from anchor id attr (e.g. id="prop171939758"),
        # falling back to the numeric id embedded in the URL path.
        anchor_tag = card.select_one("a[id^='prop']")
        prop_id = None
        if anchor_tag:
            prop_id = re.sub(r"[^\d]", "", anchor_tag.get("id", "")) or None
        if not prop_id:
            m = re.search(r"/properties/(\d+)", url)
            prop_id = m.group(1) if m else None

        # --- Price ---
        # IMPORTANT: Rightmove uses two classes that both match 'PropertyPrice_price':
        #   PropertyPrice_priceContainer___xxxx  — outer wrapper div (no direct text)
        #   PropertyPrice_price__xxxx            — inner div with the actual £ number
        # The container also contains sibling elements like "24 Hour Security" in a
        # PropertyPrice_premiumListingText div.  We must match the INNER element only.
        # Strategy: try the most-specific selectors first, each targeting the leaf node.
        price_tag = (
            # New Rightmove React class — inner price element (contains double underscore)
            card.select_one("[class*='PropertyPrice_price__']") or
            # Legacy selectors
            card.select_one("[data-test='propertyCard-priceValue']") or
            card.select_one(".propertyCard-priceValue") or
            card.select_one(".property-information span.price")
        )
        # Extract direct text children only — never .get_text() which would pull in
        # sibling elements (e.g. "24 Hour Security") that concatenate onto the number.
        price_raw = _price_text_only(price_tag)
        # Last-resort fallback: only use get_text if there is genuinely no direct text
        # AND no sibling premium-text elements exist (i.e. safe to concatenate).
        if (not price_raw or '£' not in price_raw) and price_tag:
            # Only fall back if the tag has no sibling with premiumListingText
            parent = price_tag.parent
            has_premium_sibling = parent and parent.find(
                class_=re.compile(r'[Pp]remium|[Qq]ualifier|[Ll]abel', re.IGNORECASE)
            )
            if not has_premium_sibling:
                price_raw = price_tag.get_text(strip=True)
        price = parse_price(price_raw)

        # --- Address ---
        addr_tag = card.select_one(
            "address[class*='PropertyAddress_address'], "
            "[data-test='propertyCard-address'], .propertyCard-address, "
            "address.propertyCard-address"
        )
        address = addr_tag.get_text(strip=True) if addr_tag else ""

        # --- Property info (type, beds, baths) ---
        prop_type_tag = card.select_one("[class*='PropertyInformation_propertyType']")
        prop_type = prop_type_tag.get_text(strip=True) if prop_type_tag else None

        beds_tag = card.select_one("[class*='PropertyInformation_bedroomsCount']")
        beds_raw = beds_tag.get_text(strip=True) if beds_tag else None
        beds = int(re.sub(r"[^\d]", "", beds_raw)) if beds_raw and re.search(r"\d", beds_raw) else None

        baths_tag = card.select_one("[class*='PropertyInformation_bathContainer']")
        baths_raw = baths_tag.get_text(strip=True) if baths_tag else None
        baths = int(re.sub(r"[^\d]", "", baths_raw)) if baths_raw and re.search(r"\d", baths_raw) else None

        # --- Square footage (may be absent on card; fetched from detail page) ---
        sqft_tag = card.select_one(".propertyCard-floorplanImage + span, .property-size")
        sqft_raw = sqft_tag.get_text(strip=True) if sqft_tag else ""
        sqft = parse_sqft(sqft_raw)

        return {
            "url": url,
            "property_id": prop_id,
            "address": address,
            "price_raw": price_raw,
            "price": price,
            "property_type": prop_type,
            "bedrooms": beds,
            "bathrooms": baths,
            "sqft": sqft,
            "price_per_sqft": price_per_sqft(price, sqft),
        }
    except Exception as exc:  # pylint: disable=broad-except
        # A single malformed card should never abort the whole page scrape.
        log.warning("Card parse error: %s", exc)
        return None


def fetch_sqft_from_detail(url: str, session: requests.Session) -> float | None:
    """Fetch a property's detail page and pull out its floor area in sq ft.

    Prefers JSON-LD structured data, the most reliable source when present;
    otherwise falls back to scanning all visible page text for a size pattern.
    Returns None when the page cannot be fetched or no size is found.
    """
    soup = fetch_page(url, session)
    if soup is None:
        return None

    # 1. Structured data (JSON-LD) — most reliable when present
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            payload = json.loads(script.string or "")
            floor_size = payload.get("floorSize", {})
            if isinstance(floor_size, dict):
                value = floor_size.get("value")
                unit = floor_size.get("unitCode", "")
                if value:
                    # UN/CEFACT code FTK means square feet; anything else is treated as sq m
                    unit_label = "sq ft" if "FTK" in unit.upper() else "sq m"
                    parsed = parse_sqft(f"{value} {unit_label}")
                    if parsed:
                        return parsed
        except (json.JSONDecodeError, AttributeError):
            # Malformed JSON-LD (or a top-level list) — try the next script tag.
            pass

    # 2. Fall back to a full-page text scan
    return parse_sqft(soup.get_text(" ", strip=True))


RESULTS_PER_PAGE = 24


def parse_total_results(soup: BeautifulSoup) -> int | None:
    """Pull the total result count out of a search results page, if possible.

    Tries each result-count selector Rightmove has used over time; if none
    matches (or yields no number), falls back to scanning the whole page
    text for a phrase like '123 results' / 'of 123 properties'.
    """
    selectors = (
        "[data-test='search-result-count']",
        ".searchHeader-resultCount",
        "[class*='ResultsCount_resultsCount']",
        "[class*='searchHeader-resultCount']",
    )
    for selector in selectors:
        node = soup.select_one(selector)
        if not node:
            continue
        label = node.get_text(strip=True)
        log.info("Result count element text: %r", label)
        match = re.search(r"([\d,]+)", label.replace(",", ""))
        if match:
            return int(match.group(1).replace(",", ""))

    # Fallback: free-text scan of the entire page
    page_text = soup.get_text(" ", strip=True)
    match = re.search(r"(\d[\d,]*)\s+(?:results?|properties)", page_text, re.IGNORECASE)
    if match:
        return int(match.group(1).replace(",", ""))

    return None


def scrape_all_listings(session: requests.Session) -> list[dict]:
    """Paginate through all Rightmove search results and return raw listing dicts.

    Pagination stops when any of these occurs: a page yields no property
    cards, the reported total has been collected, a partial (<24 card) page
    is seen, or 3 consecutive page fetches fail.  A single failed page is
    skipped (its index is advanced) rather than aborting the whole run.

    Args:
        session: Shared requests session carrying the browser-like headers.

    Returns:
        All listing dicts parsed from the cards (parse failures are dropped).
    """
    all_listings: list[dict] = []
    index = 0          # Rightmove result offset (steps of RESULTS_PER_PAGE)
    page = 1           # human-friendly page counter for logging only
    total_results: int | None = None
    consecutive_failures = 0
    MAX_CONSECUTIVE_FAILURES = 3

    while True:
        url = build_search_url(index)
        log.info("Scraping page %d (index=%d)…", page, index)
        soup = fetch_page(url, session)
        if not soup:
            consecutive_failures += 1
            log.warning(
                "Page %d fetch failed (%d consecutive failure(s))",
                page, consecutive_failures,
            )
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                log.error(
                    "%d consecutive page failures — aborting pagination",
                    MAX_CONSECUTIVE_FAILURES,
                )
                break
            # Skip this page and try the next rather than aborting everything
            index += RESULTS_PER_PAGE
            page += 1
            continue

        consecutive_failures = 0  # reset on success

        # Detect total results on first page to drive pagination
        if index == 0:
            total_results = parse_total_results(soup)
            if total_results is not None:
                total_pages = math.ceil(total_results / RESULTS_PER_PAGE)
                log.info("Rightmove reports %d results across %d page(s).", total_results, total_pages)
            else:
                log.warning("Could not determine total result count; will paginate until no cards found.")

        # Card container selectors — current React classes first, then legacy markup
        cards = soup.select(
            "div[class*='PropertyCard_propertyCardContainerWrapper'], "
            "div.l-searchResult, div[data-test^='propertyCard'], "
            "div.propertyCard, article.propertyCard"
        )
        if not cards:
            log.info("No more property cards found, stopping pagination.")
            break

        for card in cards:
            listing = parse_listing_card(card)
            if listing:
                all_listings.append(listing)

        log.info("Page %d: collected %d cards (running total: %d)", page, len(cards), len(all_listings))

        # Stop if we've collected all expected results
        if total_results is not None and len(all_listings) >= total_results:
            log.info("Collected all %d expected listings.", total_results)
            break

        # Stop if this page returned fewer results than a full page (last page)
        if len(cards) < RESULTS_PER_PAGE:
            log.info("Partial page (%d cards) — reached last page.", len(cards))
            break

        index += RESULTS_PER_PAGE
        page += 1
        # Be polite – randomised delay between pages
        delay = random.uniform(4, 9)
        log.info("Waiting %.1fs before next page…", delay)
        time.sleep(delay)

    log.info("Total listings scraped from search pages: %d", len(all_listings))
    return all_listings


# ---------------------------------------------------------------------------
# Cache update logic
# ---------------------------------------------------------------------------

def update_cache(cache: dict, listings: list[dict], session: requests.Session) -> dict:
    """
    Merge today's scraped listings into the persistent cache.

    For each listing: fills missing sqft (from a manual override, a prior
    history snapshot, or by fetching the detail page), appends today's
    price snapshot at most once per day, repairs historically corrupt
    prices, and finally marks vanished listings as gone — unless too few
    listings were returned (suspected partial block).

    Args:
        cache: Loaded cache dict; mutated in place and also returned.
        listings: Raw listing dicts from scrape_all_listings().
        session: requests session, used only when a detail page must be
            fetched to recover missing square footage.

    Returns:
        The same cache dict, updated.

    Cache structure:
    {
      "meta": { ... },
      "listings": {
        "<normalised_url>": {
          "url": "...",
          "address": "...",
          "history": [
            {
              "date": "YYYY-MM-DD",
              "price": 850000,
              "sqft": 1100.0,
              "price_per_sqft": 772.73
            },
            ...
          ]
        },
        ...
      }
    }
    """
    today = date.today().isoformat()
    existing = cache.setdefault("listings", {})

    log.info("━" * 60)
    log.info("Processing %d listings scraped today (%s)", len(listings), today)
    log.info("━" * 60)

    for listing in listings:
        url = listing["url"]

        # If sqft missing from card, check for a previously stored value first.
        # NOTE: manual_sqft is never written by this script — presumably a
        # hand-added override in the JSON file; verify before removing.
        if listing.get("sqft") is None:
            manual = existing.get(url, {}).get("manual_sqft")
            if manual:
                log.info("Using manual_sqft=%.0f for %s", manual, url)
                listing["sqft"] = manual
                listing["price_per_sqft"] = price_per_sqft(listing["price"], manual)
            else:
                # Check if a previous history snapshot already has sqft — avoids
                # fetching the detail page again for every known listing every day
                prev_sqft = None
                for snap in reversed(existing.get(url, {}).get("history", [])):
                    if snap.get("sqft"):
                        prev_sqft = snap["sqft"]
                        break
                if prev_sqft:
                    log.info("Reusing historical sqft=%.0f for %s", prev_sqft, url)
                    listing["sqft"] = prev_sqft
                    listing["price_per_sqft"] = price_per_sqft(listing["price"], prev_sqft)
                else:
                    log.info("Fetching detail page for sqft: %s", url)
                    listing["sqft"] = fetch_sqft_from_detail(url, session)
                    listing["price_per_sqft"] = price_per_sqft(listing["price"], listing["sqft"])
                    time.sleep(random.uniform(3, 7))

        # Today's point-in-time record, appended to the listing's history below.
        snapshot = {
            "date": today,
            "price": listing["price"],
            "sqft": listing["sqft"],
            "price_per_sqft": listing["price_per_sqft"],
        }

        # Human-readable summary fields
        price_str = f"£{listing['price']:,}"                           if listing.get("price")             else "price n/a"
        sqft_str  = f"{int(listing['sqft']):,} sqft"                   if listing.get("sqft")              else "sqft n/a"
        psf_str   = f"£{listing['price_per_sqft']:.0f}/sqft"           if listing.get("price_per_sqft")    else "psf n/a"
        beds_str  = f"{listing['bedrooms']}bed"                        if listing.get("bedrooms")          else ""
        type_str  = listing.get("property_type") or ""
        meta      = "  ·  ".join(filter(None, [beds_str, type_str]))

        if url not in existing:
            # First time we've seen this listing — create its cache entry.
            existing[url] = {
                "url": url,
                "property_id": listing.get("property_id"),
                "address": listing["address"],
                "property_type": listing.get("property_type"),
                "bedrooms": listing.get("bedrooms"),
                "bathrooms": listing.get("bathrooms"),
                "first_seen": today,
                "history": [snapshot],
            }
            log.info(
                "[NEW]  %s\n"
                "       %s  |  %s  |  %s  |  %s",
                listing["address"], price_str, sqft_str, psf_str, meta,
            )
        else:
            entry = existing[url]
            # Backfill an address if an earlier scrape stored an empty one.
            if listing["address"] and not entry.get("address"):
                entry["address"] = listing["address"]

            # Repair historically corrupt prices.
            # Two cases:
            #   1. Obvious artefacts >= £100m (caught by sanity cap in parse_price
            #      but may already be stored from before the fix was applied).
            #   2. Concatenation artefacts that slipped under £100m — e.g. £775,000
            #      + "24 Hour Security" → 77500024 (£77.5m, plausible-looking but wrong).
            #      Heuristic: if today's clean price differs from a stored price by more
            #      than 10× in either direction, the stored value is almost certainly corrupt.
            if listing["price"]:
                clean = listing["price"]
                for hist_snap in entry.get("history", []):
                    stored = hist_snap.get("price")
                    if not stored:
                        continue
                    is_obviously_corrupt = stored >= 100_000_000
                    is_concat_artefact   = stored > clean * 5 or stored < clean / 5
                    if is_obviously_corrupt or is_concat_artefact:
                        log.warning(
                            "Repairing corrupt price %d → %d for %s",
                            stored, clean, listing["address"],
                        )
                        hist_snap["price"] = clean
                        if hist_snap.get("sqft"):
                            hist_snap["price_per_sqft"] = round(clean / hist_snap["sqft"], 2)

            # Append at most one snapshot per calendar day.
            dates_recorded = [h["date"] for h in entry.get("history", [])]
            if today not in dates_recorded:
                prev_snap  = entry["history"][-1] if entry.get("history") else None
                prev_price = prev_snap.get("price") if prev_snap else None
                prev_psf   = prev_snap.get("price_per_sqft") if prev_snap else None

                # Log deltas vs the most recent snapshot, when both sides exist.
                price_chg = ""
                if prev_price and listing.get("price") and prev_price != listing["price"]:
                    d = listing["price"] - prev_price
                    price_chg = f"  (price {'+' if d>=0 else ''}{d:,})"
                psf_chg = ""
                if prev_psf and listing.get("price_per_sqft") and prev_psf != listing["price_per_sqft"]:
                    d = listing["price_per_sqft"] - prev_psf
                    psf_chg = f"  (psf {'+' if d>=0 else ''}{d:.0f})"

                entry["history"].append(snapshot)
                log.info(
                    "[UPD]  %s\n"
                    "       %s  |  %s  |  %s  |  %s%s%s",
                    listing["address"], price_str, sqft_str, psf_str, meta, price_chg, psf_chg,
                )
            else:
                log.info("[SKIP] %s — already recorded today", listing["address"])

    # ── Gone detection ────────────────────────────────────────────────────────
    # Guard against partial blocks: if fewer than 50% of known active listings
    # were returned, it's likely a scrape block rather than genuine delistings.
    # Marking 60 properties as sold because of a bad scrape day would corrupt
    # months of historical data.
    active_existing = sum(1 for e in existing.values() if not e.get("last_seen_missing"))
    active_ratio = len(listings) / max(active_existing, 1)

    if active_ratio < 0.5:
        log.warning(
            "Only %.0f%% of known active listings returned (%d of %d) — "
            "skipping gone-detection to avoid false positives from a partial block.",
            active_ratio * 100, len(listings), active_existing,
        )
    else:
        active_urls = {l["url"] for l in listings}
        gone_count = 0
        for url, entry in existing.items():
            if url not in active_urls:
                # Only mark once — keep the first date it disappeared.
                if not entry.get("last_seen_missing"):
                    entry["last_seen_missing"] = today
                    log.info("[GONE] %s — no longer in search results", entry.get("address", url))
                    gone_count += 1
        if gone_count:
            log.info("[GONE] %d listing(s) newly marked as no longer active", gone_count)

    cache["meta"]["last_run"] = today
    cache["meta"]["search_config"] = SEARCH_CONFIG
    cache["meta"]["schema_version"] = 1   # bump this if the cache structure changes
    return cache


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def main() -> None:
    """Run one full scrape cycle: load cache, scrape, merge, persist, summarise."""
    log.info("=== Rightmove Scraper starting — %s ===", date.today().isoformat())
    session = requests.Session()
    session.headers.update(HEADERS)

    # 1. Load cache
    cache = load_cache()
    log.info("Cache loaded — %d listings in history", len(cache.get("listings", {})))

    # 2. Scrape search results; an empty scrape aborts before touching the cache
    listings = scrape_all_listings(session)
    if not listings:
        log.warning("No listings returned — aborting cache update to avoid data loss.")
        sys.exit(1)   # non-zero exit so cron/systemd/git_push.sh detect the failure

    # 3. Merge into cache and persist
    cache = update_cache(cache, listings, session)
    save_cache(cache)

    # 4. Summary
    entries = cache["listings"]
    active = sum(1 for entry in entries.values() if not entry.get("last_seen_missing"))
    log.info("Done. Cache now contains %d total listings (%d currently active).", len(entries), active)


if __name__ == "__main__":  # run only when executed as a script, not on import
    main()