100 lines
2.7 KiB
Python
100 lines
2.7 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Headers sent with every request so it resembles a desktop Chrome page load.
# NOTE(review): "br" is advertised explicitly below; httpx only decodes brotli
# responses when a brotli package is installed — confirm the deployment has one.
DEFAULT_HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/131.0.0.0 Safari/537.36",
        )
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
|
|
|
|
|
|
@dataclass
class ScrapedListing:
    """A single listing record produced by a scraper.

    The first nine fields must always be supplied (several of them accept
    None when the value is unknown); the remaining fields are optional and
    have defaults.
    """

    # --- where the listing came from ---
    # "propertyfinder" | "bayut"
    source: str
    # listing id on the source
    external_id: str
    url: str

    # --- required fields whose value may be None ---
    title: str | None
    price: float | None
    currency: str | None
    permit_number: str | None
    agent_name: str | None
    agency_name: str | None

    # --- optional fields ---
    building: str | None = None
    bedrooms: int | None = None
    # presumably square feet, going by the field name — confirm with producers
    size_sqft: float | None = None
    is_active: bool = True
|
|
|
|
|
|
class ScraperError(Exception):
    """Error raised by the scraper, e.g. when a page cannot be fetched."""
|
|
|
|
|
|
def fetch_html(url: str, timeout: float = 30.0) -> str:
    """Download *url* with browser-like headers and return the response text.

    Redirects are followed. A 404 response is not treated as an error: it
    yields an empty string.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout passed to the ``httpx.Client``.

    Returns:
        The response body as text, or ``""`` when the server answers 404.

    Raises:
        ScraperError: If the site answers 403 or 429 (reported as a block),
            if ``raise_for_status()`` rejects any other status, or if httpx
            raises an ``httpx.HTTPError`` while performing the request.
    """
    try:
        with httpx.Client(
            headers=DEFAULT_HEADERS,
            follow_redirects=True,
            timeout=timeout,
        ) as session:
            response = session.get(url)
            status = response.status_code
            if status == 404:
                # A missing page is reported as "no content", not as a failure.
                return ""
            if status in (403, 429):
                raise ScraperError(f"Blocked by site ({status}) at {url}")
            response.raise_for_status()
            return response.text
    except httpx.HTTPError as exc:
        raise ScraperError(f"HTTP error for {url}: {exc}") from exc
|
|
|
|
|
|
# Captures the contents of the <script id="__NEXT_DATA__"> element. DOTALL lets
# the lazy group span newlines inside the embedded JSON.
_NEXT_DATA_RE = re.compile(
    r'<script[^>]+id="__NEXT_DATA__"[^>]*>(.*?)</script>',
    re.DOTALL,
)


def extract_next_data(html: str) -> dict | None:
    """Extract the Next.js ``__NEXT_DATA__`` JSON blob from a page.

    Both PF and Bayut are Next.js apps, so their pages embed state in a
    ``<script id="__NEXT_DATA__">`` element.

    Args:
        html: Page markup; may be empty.

    Returns:
        The decoded JSON object, or None when *html* is empty, the script
        element is missing or empty, the payload is not valid JSON, or the
        decoded payload is not a JSON object.
    """
    if not html:
        return None

    match = _NEXT_DATA_RE.search(html)
    if match:
        raw = match.group(1)
    else:
        # Fallback via BeautifulSoup if regex misses (rare).
        soup = BeautifulSoup(html, "lxml")
        tag = soup.find("script", id="__NEXT_DATA__")
        if not tag or not tag.string:
            return None
        raw = tag.string

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        logger.warning("Failed to parse __NEXT_DATA__: %s", e)
        return None

    # json.loads may yield a list, string, number or None for valid JSON;
    # callers are promised a dict, so reject anything else explicitly.
    if not isinstance(data, dict):
        logger.warning(
            "Unexpected __NEXT_DATA__ payload type: %s", type(data).__name__
        )
        return None
    return data
|
|
|
|
|
|
def parse_price(value) -> float | None:
    """Convert a scraped price value to a float.

    Ints and floats are returned as floats. Any other value is converted
    with ``str()`` and every character except digits and ``.`` is removed
    (currency codes, thousands separators, whitespace) before parsing.

    Args:
        value: Raw price as found in the scraped data; may be None.

    Returns:
        The price as a float, or None when *value* is None, a boolean, has
        no digits, or does not form a valid number after cleaning
        (e.g. ``"1.2.3"``).
    """
    # bool is a subclass of int: without this guard True/False would be
    # reported as the prices 1.0 / 0.0.
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return float(value)

    cleaned = re.sub(r"[^\d.]", "", str(value))
    if not cleaned:
        return None
    try:
        return float(cleaned)
    except ValueError:
        # Leftovers such as a lone "." or multiple dots are not numbers.
        return None
|