"""Shared helpers for the listing scrapers.

Provides a browser-like HTML fetcher, extraction of the Next.js
``__NEXT_DATA__`` JSON payload from a page, and tolerant price parsing.
"""
from __future__ import annotations

import json
import logging
import re
from dataclasses import dataclass

import httpx
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Browser-like headers sent with every request made by ``fetch_html``.
# NOTE(review): advertising "br" in Accept-Encoding only works if the HTTP
# client can decode Brotli responses — confirm a Brotli package is installed
# alongside httpx, otherwise compressed bodies may come back undecoded.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}


@dataclass
class ScrapedListing:
    """A single listing as scraped from one of the supported sources."""

    source: str  # "propertyfinder" | "bayut"
    external_id: str  # listing id on the source
    url: str
    title: str | None
    price: float | None
    currency: str | None
    permit_number: str | None
    agent_name: str | None
    agency_name: str | None
    is_active: bool = True


class ScraperError(Exception):
    """Raised when a page cannot be fetched (blocked, HTTP or transport error)."""


def fetch_html(url: str, timeout: float = 30.0) -> str:
    """GET a URL with browser-like headers and return the response body.

    Args:
        url: Page to fetch; redirects are followed.
        timeout: Timeout in seconds passed to the HTTP client.

    Returns:
        The response text, or an empty string when the server answers 404
        (``extract_next_data`` treats empty HTML as "no data").

    Raises:
        ScraperError: On 403/429 (reported as blocked), on any other
            error status, and on transport-level ``httpx`` errors.
    """
    try:
        with httpx.Client(
            headers=DEFAULT_HEADERS,
            follow_redirects=True,
            timeout=timeout,
        ) as client:
            r = client.get(url)
            if r.status_code in (403, 429):
                # ScraperError is not an httpx.HTTPError, so it propagates
                # past the except clause below unchanged.
                raise ScraperError(f"Blocked by site ({r.status_code}) at {url}")
            if r.status_code == 404:
                return ""
            r.raise_for_status()
            return r.text
    except httpx.HTTPError as e:
        raise ScraperError(f"HTTP error for {url}: {e}") from e


# Matches the <script id="__NEXT_DATA__" ...>...</script> element and captures
# its body. DOTALL lets the (non-greedy) capture span newlines in the payload.
_NEXT_DATA_RE = re.compile(
    r'<script[^>]+id="__NEXT_DATA__"[^>]*>(.*?)</script>',
    re.DOTALL,
)


def extract_next_data(html: str) -> dict | None:
    """Extract Next.js __NEXT_DATA__ JSON blob — both PF and Bayut are Next.js apps.

    Args:
        html: Raw page HTML; an empty string yields ``None``.

    Returns:
        The decoded JSON object, or ``None`` when the script tag is absent,
        its content is not valid JSON, or the payload is not a JSON object.
    """
    if not html:
        return None

    m = _NEXT_DATA_RE.search(html)
    if m:
        raw = m.group(1)
    else:
        # Fallback via BeautifulSoup if regex misses (rare), e.g. when the
        # id attribute is quoted differently than the pattern expects.
        soup = BeautifulSoup(html, "lxml")
        tag = soup.find("script", id="__NEXT_DATA__")
        if not tag or not tag.string:
            return None
        raw = tag.string

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        logger.warning("Failed to parse __NEXT_DATA__: %s", e)
        return None

    # json.loads may yield a list or scalar; honour the declared return type.
    if not isinstance(data, dict):
        logger.warning(
            "Unexpected __NEXT_DATA__ payload type: %s", type(data).__name__
        )
        return None
    return data


def parse_price(value: object) -> float | None:
    """Convert a scraped price value to a float.

    Numbers are returned as floats. Any other value is stringified and
    stripped of everything except digits and dots (so "AED 1,250,000"
    becomes 1250000.0).

    Returns:
        The parsed price, or ``None`` for ``None``, booleans, and strings
        that contain no parsable number.
    """
    # bool is a subclass of int; a True/False "price" is not a number.
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return float(value)

    s = re.sub(r"[^\d.]", "", str(value))
    try:
        return float(s) if s else None
    except ValueError:
        # e.g. "1.2.3" or "." after stripping.
        return None