Add monitoring PF service

2026-06-04 14:55:41 +03:00
commit dd3edd7088
41 changed files with 3194 additions and 0 deletions
--- a/app/scrapers/base.py
+++ b/app/scrapers/base.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+import json
+import logging
+import math
+import re
+from dataclasses import dataclass
+
+import httpx
+from bs4 import BeautifulSoup
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# Browser-like request headers; fetch_html passes them to its httpx.Client.
+# NOTE(review): advertising "br" presumably requires a Brotli decoder to be
+# installed alongside httpx, otherwise a Brotli-encoded body may come back
+# undecoded — confirm against the deployment's dependencies.
+DEFAULT_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/131.0.0.0 Safari/537.36"
+    ),
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.9",
+    "Accept-Encoding": "gzip, deflate, br",
+    "Connection": "keep-alive",
+    "Upgrade-Insecure-Requests": "1",
+}
+
+
+@dataclass
+class ScrapedListing:
+    """Record describing a single listing scraped from a source site.
+
+    ``source``, ``external_id`` and ``url`` are required strings; the
+    remaining descriptive fields may be None. ``is_active`` defaults to True.
+    """
+    source: str            # "propertyfinder" | "bayut"
+    external_id: str       # listing id on the source
+    url: str
+    title: str | None
+    price: float | None
+    currency: str | None
+    permit_number: str | None
+    agent_name: str | None
+    agency_name: str | None
+    is_active: bool = True
+
+
+class ScraperError(Exception):
+    """Error raised by the scraping helpers, e.g. when a fetch fails."""
+
+
+def fetch_html(url: str, timeout: float = 30.0) -> str:
+    """Download ``url`` using browser-like headers and return the page text.
+
+    Redirects are followed. A 404 response is not treated as an error and
+    yields an empty string.
+
+    Args:
+        url: Address to request.
+        timeout: Timeout handed to the httpx client (30.0 by default).
+
+    Returns:
+        The response body as text, or "" when the server answers 404.
+
+    Raises:
+        ScraperError: On a 403 or 429 response, on any other status that
+            ``raise_for_status`` rejects, or when httpx raises an HTTPError.
+    """
+    try:
+        with httpx.Client(
+            headers=DEFAULT_HEADERS,
+            follow_redirects=True,
+            timeout=timeout,
+        ) as session:
+            response = session.get(url)
+        status = response.status_code
+        if status == 404:
+            return ""
+        if status in (403, 429):
+            raise ScraperError(f"Blocked by site ({status}) at {url}")
+        # Turns any remaining error status into an httpx.HTTPError subclass,
+        # which the handler below converts to ScraperError.
+        response.raise_for_status()
+        return response.text
+    except httpx.HTTPError as exc:
+        raise ScraperError(f"HTTP error for {url}: {exc}") from exc
+
+
+# Matches a <script ... id="__NEXT_DATA__" ...> element and captures its body.
+# Only the double-quoted id form is recognised here; other spellings are left
+# to the BeautifulSoup fallback in extract_next_data.
+_NEXT_DATA_RE = re.compile(
+    r'<script[^>]+id="__NEXT_DATA__"[^>]*>(.*?)</script>',
+    re.DOTALL,
+)
+
+
+def extract_next_data(html: str) -> dict | None:
+    """Extract the Next.js ``__NEXT_DATA__`` JSON blob from a page.
+
+    Both PF and Bayut are Next.js apps, so their pages embed this blob in a
+    ``<script id="__NEXT_DATA__">`` tag.
+
+    Args:
+        html: Page markup; may be empty.
+
+    Returns:
+        The decoded JSON object, or None when ``html`` is empty, the script
+        tag is missing or has no text, the payload is not valid JSON, or the
+        decoded payload is not a JSON object.
+    """
+    if not html:
+        return None
+    match = _NEXT_DATA_RE.search(html)
+    if match:
+        raw = match.group(1)
+    else:
+        # Fallback via BeautifulSoup if regex misses (rare).
+        tag = BeautifulSoup(html, "lxml").find("script", id="__NEXT_DATA__")
+        if not tag or not tag.string:
+            return None
+        raw = tag.string
+    try:
+        data = json.loads(raw)
+    except json.JSONDecodeError as e:
+        logger.warning("Failed to parse __NEXT_DATA__: %s", e)
+        return None
+    # Honour the declared return type: a list/str/number payload would break
+    # callers that treat the result as a mapping.
+    if not isinstance(data, dict):
+        logger.warning(
+            "Unexpected __NEXT_DATA__ payload type: %s", type(data).__name__
+        )
+        return None
+    return data
+
+
+def parse_price(value: object) -> float | None:
+    """Coerce a scraped price value into a finite float.
+
+    Ints and floats are converted directly. Any other value is stringified
+    and stripped of every character except digits and dots before parsing,
+    so currency codes and thousands separators are ignored.
+
+    Args:
+        value: Raw price as found in the scraped data.
+
+    Returns:
+        The price as a float, or None when ``value`` is None, a bool, has no
+        parseable number, or does not fit in a finite float.
+    """
+    # bool is a subclass of int; True/False are never meaningful prices.
+    if value is None or isinstance(value, bool):
+        return None
+    if isinstance(value, (int, float)):
+        candidate = value
+    else:
+        candidate = re.sub(r"[^\d.]", "", str(value))
+        if not candidate:
+            return None
+    try:
+        price = float(candidate)
+    except (ValueError, OverflowError):
+        # ValueError: text such as "1.2.3"; OverflowError: oversized ints.
+        return None
+    # Reject NaN/inf, which can come from floats or very long digit strings.
+    return price if math.isfinite(price) else None