Add monitoring PF service
This commit is contained in:
96
app/scrapers/base.py
Normal file
96
app/scrapers/base.py
Normal file
@@ -0,0 +1,96 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = logging.getLogger(__name__)

# Browser-like request headers sent with every scraper request.
#
# "Accept-Encoding" is deliberately NOT set here. The HTTP client advertises
# exactly the content encodings it is able to decode in the current
# environment (brotli support depends on an optional package). Hard-coding
# "br" would invite brotli-compressed responses that cannot be decoded when
# that package is missing, leaving the response text as unreadable bytes.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
|
||||
|
||||
|
||||
@dataclass
class ScrapedListing:
    """One listing as extracted from a source site.

    Only ``source``, ``external_id`` and ``url`` are required. Every other
    field defaults to ``None`` (or ``True`` for ``is_active``) so a scraper
    can omit values the page did not provide. Field order is unchanged, so
    existing positional and keyword construction keeps working.
    """

    source: str  # "propertyfinder" | "bayut"
    external_id: str  # listing id on the source
    url: str
    title: str | None = None
    price: float | None = None
    currency: str | None = None
    permit_number: str | None = None
    agent_name: str | None = None
    agency_name: str | None = None
    is_active: bool = True
|
||||
|
||||
|
||||
class ScraperError(Exception):
    """Error raised by the scraper helpers, e.g. when a fetch is blocked or fails."""
|
||||
|
||||
|
||||
def fetch_html(url: str, timeout: float = 30.0) -> str:
    """GET *url* with browser-like headers and return the response text.

    Args:
        url: URL to request; redirects are followed.
        timeout: Timeout in seconds handed to the HTTP client.

    Returns:
        The response body as text, or ``""`` when the server answers 404
        (a missing page is reported as empty content, not as an error).

    Raises:
        ScraperError: If the site answers 403 or 429, if any other error
            status is reported by ``raise_for_status()``, if the request
            itself fails, or if *url* cannot be parsed.
    """
    try:
        with httpx.Client(
            headers=DEFAULT_HEADERS,
            follow_redirects=True,
            timeout=timeout,
        ) as client:
            r = client.get(url)
            if r.status_code in (403, 429):
                raise ScraperError(f"Blocked by site ({r.status_code}) at {url}")
            if r.status_code == 404:
                return ""
            r.raise_for_status()
            return r.text
    # httpx.InvalidURL does not derive from httpx.HTTPError, so it is listed
    # explicitly; otherwise a malformed URL would escape unwrapped.
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        raise ScraperError(f"HTTP error for {url}: {e}") from e
|
||||
|
||||
|
||||
# Captures everything between the opening <script ... id="__NEXT_DATA__" ...>
# tag and the next closing </script>; DOTALL lets "." span line breaks.
_NEXT_DATA_RE = re.compile(
    r'<script[^>]+id="__NEXT_DATA__"[^>]*>(.*?)</script>',
    flags=re.DOTALL,
)
|
||||
|
||||
|
||||
def extract_next_data(html: str) -> dict | None:
    """Extract the Next.js ``__NEXT_DATA__`` JSON blob from a page.

    Both PF and Bayut are Next.js apps, so their pages embed page data in a
    ``<script id="__NEXT_DATA__">`` element.

    Args:
        html: Page markup; may be empty.

    Returns:
        The decoded JSON object, or ``None`` when *html* is empty, the script
        element is absent or empty, the payload is not valid JSON, or the
        payload is valid JSON but not an object.
    """
    # Cheap guard: without the marker neither the regex nor the parser
    # fallback can find the element, so skip the full HTML parse.
    if not html or "__NEXT_DATA__" not in html:
        return None

    match = _NEXT_DATA_RE.search(html)
    if match:
        raw = match.group(1)
    else:
        # Fallback via BeautifulSoup if regex misses (rare).
        soup = BeautifulSoup(html, "lxml")
        tag = soup.find("script", id="__NEXT_DATA__")
        if not tag or not tag.string:
            return None
        raw = tag.string

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        logger.warning("Failed to parse __NEXT_DATA__: %s", e)
        return None

    # json.loads can yield any JSON value; honour the declared dict | None.
    if not isinstance(data, dict):
        logger.warning(
            "Unexpected __NEXT_DATA__ payload type: %s", type(data).__name__
        )
        return None
    return data
|
||||
|
||||
|
||||
def parse_price(value) -> float | None:
    """Convert a raw price value into a float.

    Args:
        value: ``None``, a number, or anything whose ``str()`` form contains
            the price (for example ``"AED 1,500,000"``).

    Returns:
        The price as a float, or ``None`` when *value* is ``None``, is a
        boolean, contains no digits, or does not form a valid number once
        everything except digits and dots is removed.
    """
    # bool is a subclass of int; a True/False flag is not an amount and must
    # not be reported as a price of 1.0 / 0.0.
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return float(value)

    # Drop currency codes, thousands separators and any other decoration.
    cleaned = re.sub(r"[^\d.]", "", str(value))
    if not cleaned:
        return None
    try:
        return float(cleaned)
    except ValueError:
        # e.g. "1.2.3" or a lone "." left over after stripping.
        return None
|
||||
Reference in New Issue
Block a user