# monitoring-pf/app/scrapers/base.py

from __future__ import annotations

import json
import logging
import re
from dataclasses import dataclass

import httpx
from bs4 import BeautifulSoup

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Browser-like request headers. fetch_html() hands this dict to every
# httpx.Client it creates, so each request carries all of these headers.
DEFAULT_HEADERS = {
    # User-agent string of desktop Chrome 131 on 64-bit Windows 10.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    # NOTE(review): "br" advertises Brotli support. Presumably a Brotli decoder
    # is installed in the runtime environment so that httpx can decode such
    # responses; otherwise the body may come back undecoded -- confirm.
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}


@dataclass
class ScrapedListing:
    """A single listing record produced by a scraper.

    The fields from ``source`` through ``agency_name`` have no defaults and
    must always be supplied (fields typed ``... | None`` accept ``None``).
    ``building``, ``bedrooms`` and ``size_sqft`` default to ``None`` and
    ``is_active`` defaults to ``True``.
    """

    source: str            # "propertyfinder" | "bayut"
    external_id: str       # listing id on the source
    url: str               # presumably the listing's page URL -- confirm
    title: str | None
    price: float | None    # numeric amount; its currency is carried separately
    currency: str | None
    permit_number: str | None
    agent_name: str | None
    agency_name: str | None
    building: str | None = None
    bedrooms: int | None = None
    size_sqft: float | None = None  # presumably square feet, per the name -- confirm
    is_active: bool = True          # listings are treated as active unless stated otherwise


class ScraperError(Exception):
    """Error raised by the scraping helpers, e.g. when a page fetch fails or is blocked."""


def fetch_html(url: str, timeout: float = 30.0) -> str:
    """GET *url* with browser-like headers and return the response body as text.

    Redirects are followed. A 404 is not treated as an error: it yields an
    empty string so callers can tell "page gone" apart from a failure.

    Args:
        url: Address of the page to download.
        timeout: Timeout in seconds handed to the ``httpx.Client``.

    Returns:
        The decoded response text, or ``""`` when the server answers 404.

    Raises:
        ScraperError: If the site answers 403 or 429 (treated as "blocked"),
            if ``raise_for_status()`` rejects any other status, if the request
            fails at the transport level, or if *url* is malformed.
    """
    try:
        # The response body is fully read by client.get(), so it is safe to
        # inspect the response after the client has been closed.
        with httpx.Client(headers=DEFAULT_HEADERS, follow_redirects=True, timeout=timeout) as client:
            response = client.get(url)
        if response.status_code in (403, 429):
            raise ScraperError(f"Blocked by site ({response.status_code}) at {url}")
        if response.status_code == 404:
            return ""
        response.raise_for_status()
        return response.text
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # httpx.InvalidURL is not a subclass of httpx.HTTPError; catching it
        # explicitly keeps the "only ScraperError escapes" contract for
        # malformed URLs as well.
        raise ScraperError(f"HTTP error for {url}: {e}") from e


# Captures the body of the <script id="__NEXT_DATA__"> element. DOTALL lets the
# non-greedy group span newlines; it stops at the first closing </script>.
_NEXT_DATA_RE = re.compile(
    r'<script[^>]+id="__NEXT_DATA__"[^>]*>(.*?)</script>',
    flags=re.DOTALL,
)


def extract_next_data(html: str) -> dict | None:
    """Return the parsed Next.js ``__NEXT_DATA__`` JSON embedded in *html*.

    Both PF and Bayut are Next.js apps, so their pages carry this blob.
    Yields ``None`` when *html* is empty, when no usable ``__NEXT_DATA__``
    script is present, or when its content is not valid JSON (the latter is
    logged as a warning).
    """
    if not html:
        return None

    match = _NEXT_DATA_RE.search(html)
    if match is not None:
        payload = match.group(1)
    else:
        # The regex rarely misses; when it does, let BeautifulSoup find the tag.
        script = BeautifulSoup(html, "lxml").find("script", id="__NEXT_DATA__")
        payload = script.string if script else None
        if not payload:
            return None

    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError as exc:
        logger.warning("Failed to parse __NEXT_DATA__: %s", exc)
        return None
    return parsed


def parse_price(value) -> float | None:
    """Convert a scraped price value into a float.

    Numbers are returned as floats. Anything else is converted with ``str()``
    and reduced to its digits and decimal points, so thousands separators and
    currency text are ignored (``"AED 1,200,000"`` -> ``1200000.0``).

    A period that directly follows a letter is treated as an abbreviation dot
    (``"Dhs."``, ``"Rs."``, ``"yr."``) rather than a decimal point, so
    ``"Dhs. 500"`` parses as ``500.0`` instead of ``0.5``.

    Args:
        value: A number, a string such as ``"AED 1,200,000"``, or ``None``.

    Returns:
        The parsed amount, or ``None`` when *value* is ``None``, a ``bool``,
        contains no digits, or does not form a valid number.
    """
    # bool is a subclass of int; True/False are flags, not prices.
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return float(value)

    text = str(value)
    # Drop abbreviation periods (a "." right after a letter) so they are not
    # mistaken for a decimal point once the surrounding letters are stripped.
    text = re.sub(r"(?<=[^\W\d_])\.", "", text)
    digits = re.sub(r"[^\d.]", "", text)
    if not digits:
        return None
    try:
        return float(digits)
    except ValueError:
        # e.g. "1.2.3" or a lone "." -- not a single well-formed number.
        return None