Add monitoring PF service

2026-06-04 14:55:41 +03:00
commit dd3edd7088
41 changed files with 3194 additions and 0 deletions
--- a/app/scrapers/__init__.py
+++ b/app/scrapers/__init__.py
@@ -0,0 +1,5 @@
+from app.scrapers.base import ScrapedListing
+from app.scrapers.bayut import BayutScraper
+from app.scrapers.propertyfinder import PropertyFinderScraper
+
+__all__ = ["ScrapedListing", "BayutScraper", "PropertyFinderScraper"]
--- a/app/scrapers/base.py
+++ b/app/scrapers/base.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+import json
+import logging
+import re
+from dataclasses import dataclass
+
+import httpx
+from bs4 import BeautifulSoup
+
+# Module-level logger; extract_next_data() uses it to report unparsable JSON.
+logger = logging.getLogger(__name__)
+
+# Browser-like request headers that fetch_html() hands to httpx.Client.
+DEFAULT_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/131.0.0.0 Safari/537.36"
+    ),
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.9",
+    # NOTE(review): this advertises brotli ("br"). httpx presumably only decodes
+    # brotli when a brotli package is installed — confirm the deployment has one,
+    # otherwise a br-encoded response would not be readable as text.
+    "Accept-Encoding": "gzip, deflate, br",
+    "Connection": "keep-alive",
+    "Upgrade-Insecure-Requests": "1",
+}
+
+
+@dataclass
+class ScrapedListing:
+    """Normalised snapshot of one listing as read from a source site.
+
+    Every field after ``url`` may be ``None`` when the page did not expose it;
+    ``is_active`` defaults to ``True`` and is set to ``False`` by the scrapers
+    when the listing page is gone.
+    """
+
+    # Which site the listing came from: "propertyfinder" | "bayut".
+    source: str
+    # The listing id on the source site.
+    external_id: str
+    url: str
+    title: str | None
+    price: float | None
+    currency: str | None
+    permit_number: str | None
+    agent_name: str | None
+    agency_name: str | None
+    is_active: bool = True
+
+
+class ScraperError(Exception):
+    """Raised by fetch_html() when a page is blocked or the HTTP request fails."""
+
+
+def fetch_html(url: str, timeout: float = 30.0) -> str:
+    """GET ``url`` with browser-like headers and return the response body.
+
+    Args:
+        url: Absolute URL to fetch.
+        timeout: Timeout in seconds passed to ``httpx.Client``.
+
+    Returns:
+        The response text, or ``""`` when the server answers 404 (the scrapers
+        treat an empty string as "this page no longer exists").
+
+    Raises:
+        ScraperError: on 403/429 (blocked by the site), on any other error
+            status, on transport failures, and on a malformed URL.
+    """
+    try:
+        with httpx.Client(headers=DEFAULT_HEADERS, follow_redirects=True, timeout=timeout) as client:
+            r = client.get(url)
+        if r.status_code in (403, 429):
+            raise ScraperError(f"Blocked by site ({r.status_code}) at {url}")
+        if r.status_code == 404:
+            return ""
+        r.raise_for_status()
+        return r.text
+    # httpx.InvalidURL is not an httpx.HTTPError subclass; catch it explicitly so
+    # callers that only handle ScraperError are not crashed by a bad stored URL.
+    except (httpx.HTTPError, httpx.InvalidURL) as e:
+        raise ScraperError(f"HTTP error for {url}: {e}") from e
+
+
+# Captures the body of <script id="__NEXT_DATA__" ...>...</script>.
+_NEXT_DATA_RE = re.compile(
+    r'<script[^>]+id="__NEXT_DATA__"[^>]*>(.*?)</script>',
+    re.DOTALL,
+)
+
+
+def extract_next_data(html: str) -> dict | None:
+    """Return the parsed Next.js ``__NEXT_DATA__`` JSON blob from a page.
+
+    Both PF and Bayut are Next.js apps. Returns ``None`` when ``html`` is
+    empty, when no ``__NEXT_DATA__`` script is found, or when its content is
+    not valid JSON (a warning is logged in that last case).
+    """
+    if not html:
+        return None
+
+    match = _NEXT_DATA_RE.search(html)
+    if match:
+        payload = match.group(1)
+    else:
+        # The regex missed (rare) — let an HTML parser locate the tag instead.
+        script = BeautifulSoup(html, "lxml").find("script", id="__NEXT_DATA__")
+        payload = script.string if script else None
+        if not payload:
+            return None
+
+    try:
+        parsed = json.loads(payload)
+    except json.JSONDecodeError as e:
+        logger.warning("Failed to parse __NEXT_DATA__: %s", e)
+        return None
+    return parsed
+
+
+def parse_price(value) -> float | None:
+    """Convert a raw price value to ``float``.
+
+    Numbers are converted directly. Anything else is stringified and stripped
+    of every character except digits and ``.`` (so ``"AED 1,250,000"`` becomes
+    ``1250000.0``).
+
+    Returns:
+        The parsed price, or ``None`` for ``None``, booleans, strings without
+        digits, and strings that still do not form a valid number (e.g.
+        ``"1.2.3"``).
+    """
+    # bool is a subclass of int; True/False are flags, not prices, so they must
+    # not slip through the numeric branch as 1.0 / 0.0.
+    if value is None or isinstance(value, bool):
+        return None
+    if isinstance(value, (int, float)):
+        return float(value)
+    digits = re.sub(r"[^\d.]", "", str(value))
+    if not digits:
+        return None
+    try:
+        return float(digits)
+    except ValueError:
+        return None
--- a/app/scrapers/bayut.py
+++ b/app/scrapers/bayut.py
@@ -0,0 +1,212 @@
+"""Bayut.com scraper.
+
+Two operations:
+- fetch_listing(url): read a listing detail page → ScrapedListing.
+- search_similar(building, bedrooms, deal_type): search Bayut for similar candidates.
+
+Bayut is a Next.js app; __NEXT_DATA__ contains the property in pageProps.
+Unlike PF, Bayut shows the permit number as text in the JSON.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from urllib.parse import quote_plus, urljoin
+
+from app.scrapers.base import (
+    ScrapedListing,
+    ScraperError,
+    extract_next_data,
+    fetch_html,
+    parse_price,
+)
+
+# Module-level logger for the Bayut scraper.
+logger = logging.getLogger(__name__)
+
+# Site root; the search URL and candidate detail URLs are built from it.
+BASE_URL = "https://www.bayut.com"
+# Value stored in ScrapedListing.source for every listing produced here.
+SOURCE = "bayut"
+
+
+def _path_for_deal(deal_type: str) -> str:
+    """Return the Bayut URL path segment: "to-buy" for "sale", otherwise "to-rent"."""
+    if deal_type == "sale":
+        return "to-buy"
+    return "to-rent"
+
+
+def _walk(node):
+    """Yield every dict nested anywhere inside ``node``, in depth-first pre-order.
+
+    Only dicts and lists are descended into; other values are ignored.
+    """
+    pending = [node]
+    while pending:
+        current = pending.pop()
+        if isinstance(current, dict):
+            yield current
+            # Push children reversed so the first child is popped (visited) first.
+            pending.extend(reversed(list(current.values())))
+        elif isinstance(current, list):
+            pending.extend(reversed(current))
+
+
+def _extract_price(item: dict) -> tuple[float | None, str | None]:
+    """Return ``(price, currency)`` read from ``item["price"]``.
+
+    A dict price is read from its "value"/"amount" and "currency" keys; a bare
+    number or string is parsed directly. The currency falls back to "AED".
+    """
+    raw = item.get("price")
+    if isinstance(raw, (int, float, str)):
+        return parse_price(raw), "AED"
+    if not isinstance(raw, dict):
+        # Missing price, or a shape we don't understand.
+        return None, "AED"
+    amount = raw.get("value") or raw.get("amount")
+    currency = raw.get("currency") or "AED"
+    return parse_price(amount), currency
+
+
+def _extract_broker(item: dict) -> tuple[str | None, str | None]:
+    """Return ``(agent_name, agency_name)`` for a Bayut listing dict.
+
+    The agency name comes from ``item["agency"]["name"]``. The agent name is
+    the first non-empty of ``contactName``, ``agentName`` and
+    ``ownerAgent["name"]``; either element is ``None`` when not available.
+    """
+    agency = item.get("agency") or {}
+    agency_name = agency.get("name") if isinstance(agency, dict) else None
+
+    # Resolve the ownerAgent fallback on its own line: written inline as
+    # `a or b or c if cond else d`, the conditional swallows the whole `or`
+    # chain and drops agentName whenever ownerAgent is not a dict.
+    owner = item.get("ownerAgent")
+    owner_name = owner.get("name") if isinstance(owner, dict) else None
+    agent_name = item.get("contactName") or item.get("agentName") or owner_name
+    return agent_name, agency_name
+
+
+def _extract_permit(item: dict) -> str | None:
+    """Return the first non-empty permit-like field of ``item`` as a stripped string.
+
+    Keys are tried in a fixed order; ``None`` is returned when none is set.
+    """
+    candidate_keys = ("permitNumber", "permit_number", "rera", "trakheesi", "permit")
+    found = next((item.get(k) for k in candidate_keys if item.get(k)), None)
+    return None if found is None else str(found).strip()
+
+
+# Matches the numeric id in ".../details-<id>.html", allowing a trailing
+# query string or fragment.
+_ID_FROM_URL = re.compile(r"details-(\d+)\.html(?:[?#].*)?$")
+
+
+def _extract_id_from_url(url: str) -> str | None:
+    """Return the listing id embedded in a Bayut detail URL, or ``None``."""
+    found = _ID_FROM_URL.search(url)
+    if found is None:
+        return None
+    return found.group(1)
+
+
+def _is_listing_dict(item: dict) -> bool:
+    """Heuristic: a listing dict has a "price" key plus an id-like key."""
+    if not isinstance(item, dict) or "price" not in item:
+        return False
+    for id_key in ("externalID", "id", "objectID"):
+        if id_key in item:
+            return True
+    return False
+
+
+class BayutScraper:
+    """Scraper for bayut.com listing detail pages and search results."""
+
+    source = SOURCE
+
+    def fetch_listing(self, url: str) -> ScrapedListing | None:
+        """Refetch a known Bayut listing URL.
+
+        Returns:
+            - ``ScrapedListing(is_active=False)`` when the fetch yields no HTML
+              (``fetch_html`` returns ``""`` for a 404, i.e. listing removed).
+            - ``ScrapedListing`` with current data when a listing dict is found.
+            - ``None`` when the fetch fails or nothing parsable is on the page.
+        """
+        try:
+            html = fetch_html(url)
+        except ScraperError as e:
+            logger.warning("Bayut refetch failed for %s: %s", url, e)
+            return None
+
+        if not html:
+            return ScrapedListing(
+                source=SOURCE, external_id=_extract_id_from_url(url) or "", url=url,
+                title=None, price=None, currency=None, permit_number=None,
+                agent_name=None, agency_name=None, is_active=False,
+            )
+
+        data = extract_next_data(html)
+        if not data:
+            return None
+
+        # Several nested dicts can look like a listing (price + id). Score each
+        # one by how many listing-ish keys it carries and keep the first best.
+        best = None
+        best_score = -1
+        for node in _walk(data):
+            if not _is_listing_dict(node):
+                continue
+            score = 0
+            if "title" in node or "name" in node:
+                score += 2
+            if "agency" in node or "contactName" in node:
+                score += 2
+            if "rooms" in node or "bedrooms" in node:
+                score += 1
+            if score > best_score:
+                best_score = score
+                best = node
+
+        if best is None:
+            logger.warning("Bayut: no listing dict found in __NEXT_DATA__ for %s", url)
+            return None
+
+        price, currency = _extract_price(best)
+        agent_name, agency_name = _extract_broker(best)
+        # Prefer the id from the payload; fall back to the id embedded in the URL.
+        ext_id = (
+            str(best.get("externalID") or best.get("id") or "")
+            or _extract_id_from_url(url)
+            or ""
+        )
+        return ScrapedListing(
+            source=SOURCE,
+            external_id=ext_id,
+            url=url,
+            title=best.get("title") or best.get("name"),
+            price=price,
+            currency=currency,
+            permit_number=_extract_permit(best),
+            agent_name=agent_name,
+            agency_name=agency_name,
+            is_active=True,
+        )
+
+    def search_similar(
+        self,
+        building: str | None,
+        bedrooms: int | None,
+        deal_type: str,
+        limit: int = 20,
+        location_url: str | None = None,
+    ) -> list[ScrapedListing]:
+        """Search Bayut for listings whose title or slug mentions ``building``.
+
+        Args:
+            building: Building name used as the free-text query and as the
+                match filter. Empty or whitespace-only means no search.
+            bedrooms: Optional bedroom count added as the ``beds_in`` filter.
+            deal_type: "sale" searches the buy section, anything else rent.
+            limit: Stop after collecting this many candidates.
+            location_url: Not used by this scraper.
+
+        Returns:
+            De-duplicated candidates; an empty list when there is nothing to
+            search for, the fetch fails, or the page carries no data.
+        """
+        # Normalise once: the same stripped name drives both the query and the
+        # title/slug match, so stray whitespace cannot make every result miss.
+        name = (building or "").strip()
+        if not name:
+            return []
+        name_lc = name.lower()
+        slug_token = name_lc.replace(" ", "-")
+
+        path = _path_for_deal(deal_type)
+        q = quote_plus(name)
+        url = f"{BASE_URL}/{path}/property/dubai/?q={q}"
+        if bedrooms is not None:
+            url += f"&beds_in={bedrooms}"
+        logger.info("Bayut search_similar: %s", url)
+
+        try:
+            html = fetch_html(url)
+        except ScraperError as e:
+            logger.warning("Bayut search failed: %s", e)
+            return []
+
+        data = extract_next_data(html)
+        if not data:
+            return []
+
+        results: list[ScrapedListing] = []
+        seen_ids: set[str] = set()
+        for node in _walk(data):
+            if not _is_listing_dict(node):
+                continue
+            ext_id = str(node.get("externalID") or node.get("id") or "")
+            if not ext_id or ext_id in seen_ids:
+                continue
+
+            # Keep only results that mention the building in the title, or
+            # failing that in the slug (spaces written as dashes).
+            title = node.get("title") or node.get("name") or ""
+            if name_lc not in str(title).lower():
+                slug = str(node.get("slug") or "").lower()
+                if slug_token not in slug:
+                    continue
+
+            seen_ids.add(ext_id)
+            price, currency = _extract_price(node)
+            agent_name, agency_name = _extract_broker(node)
+            cand_url = urljoin(BASE_URL, f"/property/details-{ext_id}.html")
+
+            results.append(
+                ScrapedListing(
+                    source=SOURCE,
+                    external_id=ext_id,
+                    url=cand_url,
+                    title=title or None,
+                    price=price,
+                    currency=currency,
+                    permit_number=_extract_permit(node),
+                    agent_name=agent_name,
+                    agency_name=agency_name,
+                    is_active=True,
+                )
+            )
+            if len(results) >= limit:
+                break
+        return results
--- a/app/scrapers/propertyfinder.py
+++ b/app/scrapers/propertyfinder.py
@@ -0,0 +1,325 @@
+"""PropertyFinder.ae scraper.
+
+Two operations:
+- fetch_listing(url): read a listing detail page → ScrapedListing (title/price/agent/permit).
+- search_similar(building, bedrooms, deal_type, location_url=...): search PF for
+  candidates in the same building, scoped by the location id resolved from
+  location_url, plus a bedrooms filter → list[ScrapedListing].
+
+PF is a Next.js app — listing data sits in <script id="__NEXT_DATA__">.
+Note: PF renders the Trakheesi permit as an image on the detail page. We still try
+to read the number from __NEXT_DATA__, but it may come back as None — that's fine,
+we don't depend on it.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from urllib.parse import urljoin
+
+from app.scrapers.base import (
+    ScrapedListing,
+    ScraperError,
+    extract_next_data,
+    fetch_html,
+    parse_price,
+)
+
+# Module-level logger for the PropertyFinder scraper.
+logger = logging.getLogger(__name__)
+
+# Site root; search URLs and relative candidate paths are resolved against it.
+BASE_URL = "https://www.propertyfinder.ae"
+# Value stored in ScrapedListing.source for every listing produced here.
+SOURCE = "propertyfinder"
+
+# PF location hierarchy, most specific first: a higher number means a more
+# specific location type. resolve_location_id() keeps the id of the
+# highest-ranked location node on a reference listing page, and search_similar
+# scopes its search by that id.
+_LOC_TYPE_PRIORITY = {
+    "TOWER": 5,
+    "BUILDING": 4,
+    "DEVELOPMENT": 3,
+    "SUBCOMMUNITY": 2,
+    "COMMUNITY": 1,
+    "CITY": 0,
+}
+
+
+def _category_for_deal(deal_type: str) -> int:
+    """Return the value for PF's ``c=`` search parameter: 1 for "sale", otherwise 2."""
+    if deal_type == "sale":
+        return 1
+    return 2
+
+
+def _get(d, *keys, default=None):
+    """Follow ``keys`` through nested dicts and return the value found.
+
+    Returns ``default`` as soon as a level is not a dict or the looked-up
+    value is ``None``. With no keys, ``d`` itself is returned.
+    """
+    node = d
+    for key in keys:
+        node = node.get(key) if isinstance(node, dict) else None
+        if node is None:
+            return default
+    return node
+
+
+def _walk(node):
+    """Yield every dict found in a nested JSON structure, parents before children."""
+    if isinstance(node, dict):
+        yield node
+        children = node.values()
+    elif isinstance(node, list):
+        children = node
+    else:
+        # Scalars (and any other type) contain nothing to descend into.
+        return
+    for child in children:
+        yield from _walk(child)
+
+
+def _extract_price(item: dict) -> tuple[float | None, str | None]:
+    """Return ``(price, currency)`` read from ``item["price"]``.
+
+    A dict price is read from the first non-empty of its "value", "amount",
+    "min" and "from" keys, with its own "currency". A bare number or string is
+    parsed directly and takes ``item["currency"]``. Currency falls back to "AED".
+    """
+    raw = item.get("price")
+    if isinstance(raw, (int, float, str)):
+        return parse_price(raw), item.get("currency") or "AED"
+    if not isinstance(raw, dict):
+        # Missing price, or a shape we don't understand.
+        return None, "AED"
+    amount = raw.get("value") or raw.get("amount") or raw.get("min") or raw.get("from")
+    currency = raw.get("currency") or "AED"
+    return parse_price(amount), currency
+
+
+def _extract_broker(item: dict) -> tuple[str | None, str | None]:
+    """Return ``(agent_name, agency_name)`` for a PF listing dict.
+
+    The agency is read from "broker" (or "agency"), the agent from "agent"
+    (or "contact"); a name is ``None`` when that entry is not a dict.
+    """
+
+    def _name_of(entry):
+        return entry.get("name") if isinstance(entry, dict) else None
+
+    agency = item.get("broker") or item.get("agency") or {}
+    person = item.get("agent") or item.get("contact") or {}
+    return _name_of(person), _name_of(agency)
+
+
+def _extract_permit(item: dict) -> str | None:
+    """Return the first non-empty permit-like value of ``item`` as a stripped string.
+
+    Top-level keys are tried first, then the same kind of keys inside a nested
+    "regulatory" / "regulation" dict. Returns ``None`` when nothing is set.
+    """
+    top_level_keys = ("permit_number", "permitNumber", "trakheesi", "rera", "permit")
+    for value in map(item.get, top_level_keys):
+        if value:
+            return str(value).strip()
+
+    nested = item.get("regulatory") or item.get("regulation") or {}
+    if not isinstance(nested, dict):
+        return None
+    nested_keys = ("permit", "permit_number", "trakheesi", "rera")
+    for value in map(nested.get, nested_keys):
+        if value:
+            return str(value).strip()
+    return None
+
+
+def _find_permit_on_page(data: dict) -> str | None:
+    """Return the DLD permit number found anywhere in the page data, or ``None``.
+
+    The permit sits in a regulatory block that is rendered as an image, but
+    its plain value is still in __NEXT_DATA__: the dict carrying a
+    ``permit_validation_url`` (the Trakheesi link) also holds the number under
+    ``number``. The first such dict in walk order wins.
+    """
+    carriers = (
+        node
+        for node in _walk(data)
+        if isinstance(node, dict) and node.get("permit_validation_url") and node.get("number")
+    )
+    carrier = next(carriers, None)
+    if carrier is None:
+        return None
+    return str(carrier["number"]).strip()
+
+
+# Matches the trailing "-<digits>.html" of a URL, allowing a query string or
+# fragment after it.
+_ID_FROM_URL = re.compile(r"-(\d+)\.html(?:[?#].*)?$")
+
+
+def _extract_id_from_url(url: str) -> str | None:
+    """Return the numeric id at the end of a PF listing URL, or ``None``."""
+    found = _ID_FROM_URL.search(url)
+    if found is None:
+        return None
+    return found.group(1)
+
+
+def _is_listing_dict(item: dict) -> bool:
+    """Heuristic: a listing dict contains a price plus an id-like field."""
+    if not isinstance(item, dict):
+        return False
+    id_like_keys = {"id", "reference", "listing_id", "externalID"}
+    return "price" in item and not id_like_keys.isdisjoint(item)
+
+
+class PropertyFinderScraper:
+    """Scraper for propertyfinder.ae listing detail pages and search results."""
+
+    source = SOURCE
+
+    def fetch_listing(self, url: str) -> ScrapedListing | None:
+        """Refetch a known listing URL. Returns:
+        - ScrapedListing(is_active=False) if the URL returns 404 (listing removed)
+        - ScrapedListing with current data if alive
+        - None on network/parse failure (we won't update the DB in that case)
+        """
+        try:
+            html = fetch_html(url)
+        except ScraperError as e:
+            logger.warning("PF refetch failed for %s: %s", url, e)
+            return None
+
+        if not html:
+            return ScrapedListing(
+                source=SOURCE, external_id=_extract_id_from_url(url) or "", url=url,
+                title=None, price=None, currency=None, permit_number=None,
+                agent_name=None, agency_name=None, is_active=False,
+            )
+
+        data = extract_next_data(html)
+        if not data:
+            return None
+
+        # On a PF detail page the property dict is nested in pageProps. Walk and pick
+        # the dict that has both a "price" and an id, ignoring trivial nested ones:
+        # each candidate is scored by its listing-ish keys and the first best wins.
+        best = None
+        best_score = -1
+        for node in _walk(data):
+            if not _is_listing_dict(node):
+                continue
+            score = 0
+            if "title" in node or "name" in node:
+                score += 2
+            if any(k in node for k in ("broker", "agent", "agency")):
+                score += 2
+            if "bedrooms" in node or "rooms" in node:
+                score += 1
+            if score > best_score:
+                best_score = score
+                best = node
+
+        if best is None:
+            logger.warning("PF: no listing dict found in __NEXT_DATA__ for %s", url)
+            return None
+
+        price, currency = _extract_price(best)
+        agent_name, agency_name = _extract_broker(best)
+        # Prefer an id from the payload; fall back to the id embedded in the URL.
+        ext_id = (
+            str(best.get("id") or best.get("reference") or best.get("listing_id") or "")
+            or _extract_id_from_url(url)
+            or ""
+        )
+        return ScrapedListing(
+            source=SOURCE,
+            external_id=ext_id,
+            url=url,
+            title=best.get("title") or best.get("name"),
+            price=price,
+            currency=currency,
+            # The page-wide regulatory block is tried first, then the listing dict.
+            permit_number=_find_permit_on_page(data) or _extract_permit(best),
+            agent_name=agent_name,
+            agency_name=agency_name,
+            is_active=True,
+        )
+
+    def get_permit(self, url: str) -> str | None:
+        """Fetch a listing page and return only its DLD permit number (or None).
+        Used to compare candidates against our own permit during suggestions.
+        Returns None when the fetch fails or the page carries no data."""
+        try:
+            html = fetch_html(url)
+        except ScraperError as e:
+            logger.warning("PF get_permit fetch failed for %s: %s", url, e)
+            return None
+        data = extract_next_data(html)
+        return _find_permit_on_page(data) if data else None
+
+    def resolve_location_id(self, listing_url: str) -> int | None:
+        """Read a PF listing page and return the most specific location id
+        (tower > building > development > subcommunity > community > city).
+
+        PF's search only filters by numeric location id (`l=`); the free-text
+        `q=` param does NOT scope results to a building — it returns unrelated
+        recommendations. So we derive the location id from a known listing that
+        sits in the same building (our own listing, or an already-tracked one).
+
+        Returns None when the fetch fails, the page carries no data, no location
+        node is found, or the chosen id is not convertible to int.
+        """
+        try:
+            html = fetch_html(listing_url)
+        except ScraperError as e:
+            logger.warning("PF resolve_location_id fetch failed for %s: %s", listing_url, e)
+            return None
+        data = extract_next_data(html)
+        if not data:
+            return None
+
+        best_id: object = None
+        best_rank = -1
+        for node in _walk(data):
+            if not isinstance(node, dict):
+                continue
+            # Unknown or missing "type" ranks -1 and can never beat best_rank.
+            rank = _LOC_TYPE_PRIORITY.get(str(node.get("type", "")).upper(), -1)
+            if rank > best_rank and node.get("id") and node.get("name"):
+                best_rank, best_id = rank, node.get("id")
+        try:
+            return int(best_id) if best_id is not None else None
+        except (TypeError, ValueError):
+            return None
+
+    def search_similar(
+        self,
+        building: str | None,
+        bedrooms: int | None,
+        deal_type: str,
+        limit: int = 200,
+        location_url: str | None = None,
+        max_pages: int = 8,
+    ) -> list[ScrapedListing]:
+        """Search PF for candidates in the same building, scoped by location id.
+
+        `location_url` is a reference listing in the target building (our own
+        listing or an already-tracked competitor) — we resolve it to a PF
+        location id and search by `l=`. Without it we can't reliably scope a
+        building search on PF, so we return nothing rather than garbage.
+        `building` itself is not used for the query.
+
+        Paginates: a same-permit competitor can sit on any results page (PF
+        can't be queried by permit), so we collect across pages up to
+        `max_pages`/`limit`.
+        """
+        location_id = self.resolve_location_id(location_url) if location_url else None
+        if location_id is None:
+            logger.info(
+                "PF search_similar: no location id (url=%r) — skipping (q= text search "
+                "does not filter by building on PF)", location_url,
+            )
+            return []
+
+        c = _category_for_deal(deal_type)
+        base = f"{BASE_URL}/en/search?c={c}&l={location_id}"
+        if bedrooms is not None:
+            base += f"&bf={bedrooms}&bt={bedrooms}"  # PF uses bf=bedrooms-from, bt=bedrooms-to
+
+        results: list[ScrapedListing] = []
+        seen_ids: set[str] = set()
+        for page in range(1, max_pages + 1):
+            page_url = base if page == 1 else f"{base}&page={page}"
+            try:
+                html = fetch_html(page_url)
+            except ScraperError as e:
+                logger.warning("PF search failed (page %d): %s", page, e)
+                break
+            data = extract_next_data(html)
+            if not data:
+                break
+
+            new_on_page = 0
+            for node in _walk(data):
+                if not _is_listing_dict(node):
+                    continue
+                ext_id = str(node.get("id") or node.get("reference") or "")
+                if not ext_id or ext_id in seen_ids:
+                    continue
+                seen_ids.add(ext_id)
+                new_on_page += 1
+
+                # Results are scoped to the location by l=, so no title filter.
+                title = node.get("title") or node.get("name") or ""
+                price, currency = _extract_price(node)
+                agent_name, agency_name = _extract_broker(node)
+
+                # Decide the fallback before joining: urljoin(BASE_URL, "") returns
+                # BASE_URL itself, which is truthy, so an `or page_url` after the
+                # join would never fire and the candidate would point at the site
+                # root instead of the results page it was found on.
+                share = node.get("share_url") or node.get("path")
+                if not share:
+                    cand_url = page_url
+                elif str(share).startswith("http"):
+                    cand_url = str(share)
+                else:
+                    cand_url = urljoin(BASE_URL, str(share))
+
+                results.append(
+                    ScrapedListing(
+                        source=SOURCE,
+                        external_id=ext_id,
+                        url=cand_url,
+                        title=title or None,
+                        price=price,
+                        currency=currency,
+                        permit_number=_extract_permit(node),
+                        agent_name=agent_name,
+                        agency_name=agency_name,
+                        is_active=True,
+                    )
+                )
+                if len(results) >= limit:
+                    break
+
+            # No new listings on this page → we've passed the last page.
+            if len(results) >= limit or new_on_page == 0:
+                break
+        logger.info("PF search_similar: collected %d candidates (l=%s)", len(results), location_id)
+        return results