"""Bayut.com scraper. Two operations: - fetch_listing(url): read a listing detail page → ScrapedListing. - search_similar(building, bedrooms, deal_type): search Bayut for similar candidates. Bayut is a Next.js app; __NEXT_DATA__ contains the property in pageProps. Unlike PF, Bayut shows the permit number as text in the JSON. """ from __future__ import annotations import logging import re from urllib.parse import quote_plus, urljoin from app.scrapers.base import ( ScrapedListing, ScraperError, extract_next_data, fetch_html, parse_price, ) logger = logging.getLogger(__name__) BASE_URL = "https://www.bayut.com" SOURCE = "bayut" def _path_for_deal(deal_type: str) -> str: return "to-buy" if deal_type == "sale" else "to-rent" def _walk(node): if isinstance(node, dict): yield node for v in node.values(): yield from _walk(v) elif isinstance(node, list): for it in node: yield from _walk(it) def _extract_price(item: dict) -> tuple[float | None, str | None]: price = item.get("price") if isinstance(price, dict): val = price.get("value") or price.get("amount") cur = price.get("currency") or "AED" return parse_price(val), cur if isinstance(price, (int, float, str)): return parse_price(price), "AED" return None, "AED" def _extract_broker(item: dict) -> tuple[str | None, str | None]: agency = item.get("agency") or {} agency_name = agency.get("name") if isinstance(agency, dict) else None agent_name = item.get("contactName") or item.get("agentName") or item.get("ownerAgent", {}).get("name") if isinstance(item.get("ownerAgent"), dict) else item.get("contactName") return agent_name, agency_name def _extract_permit(item: dict) -> str | None: for key in ("permitNumber", "permit_number", "rera", "trakheesi", "permit"): v = item.get(key) if v: return str(v).strip() return None _ID_FROM_URL = re.compile(r"details-(\d+)\.html(?:[?#].*)?$") def _extract_id_from_url(url: str) -> str | None: m = _ID_FROM_URL.search(url) return m.group(1) if m else None def _is_listing_dict(item: dict) -> bool: if not isinstance(item, dict): return False has_price = "price" in item has_id = any(k in item for k in ("externalID", "id", "objectID")) return has_price and has_id class BayutScraper: source = SOURCE def fetch_listing(self, url: str) -> ScrapedListing | None: try: html = fetch_html(url) except ScraperError as e: logger.warning("Bayut refetch failed for %s: %s", url, e) return None if not html: return ScrapedListing( source=SOURCE, external_id=_extract_id_from_url(url) or "", url=url, title=None, price=None, currency=None, permit_number=None, agent_name=None, agency_name=None, is_active=False, ) data = extract_next_data(html) if not data: return None best = None best_score = -1 for node in _walk(data): if not _is_listing_dict(node): continue score = 0 if "title" in node or "name" in node: score += 2 if "agency" in node or "contactName" in node: score += 2 if "rooms" in node or "bedrooms" in node: score += 1 if score > best_score: best_score = score best = node if best is None: logger.warning("Bayut: no listing dict found in __NEXT_DATA__ for %s", url) return None price, currency = _extract_price(best) agent_name, agency_name = _extract_broker(best) ext_id = ( str(best.get("externalID") or best.get("id") or "") or _extract_id_from_url(url) or "" ) return ScrapedListing( source=SOURCE, external_id=ext_id, url=url, title=best.get("title") or best.get("name"), price=price, currency=currency, permit_number=_extract_permit(best), agent_name=agent_name, agency_name=agency_name, is_active=True, ) def search_similar( self, building: str | None, bedrooms: int | None, deal_type: str, limit: int = 20, location_url: str | None = None, ) -> list[ScrapedListing]: if not building: return [] path = _path_for_deal(deal_type) q = quote_plus(building.strip()) url = f"{BASE_URL}/{path}/property/dubai/?q={q}" if bedrooms is not None: url += f"&beds_in={bedrooms}" logger.info("Bayut search_similar: %s", url) try: html = fetch_html(url) except ScraperError as e: logger.warning("Bayut search failed: %s", e) return [] data = extract_next_data(html) if not data: return [] results: list[ScrapedListing] = [] seen_ids: set[str] = set() for node in _walk(data): if not _is_listing_dict(node): continue ext_id = str(node.get("externalID") or node.get("id") or "") if not ext_id or ext_id in seen_ids: continue title = node.get("title") or node.get("name") or "" if building.lower() not in (title or "").lower(): slug = str(node.get("slug") or "").lower() building_token = building.lower().replace(" ", "-") if building_token not in slug: continue seen_ids.add(ext_id) price, currency = _extract_price(node) agent_name, agency_name = _extract_broker(node) cand_url = urljoin(BASE_URL, f"/property/details-{ext_id}.html") results.append( ScrapedListing( source=SOURCE, external_id=ext_id, url=cand_url, title=title or None, price=price, currency=currency, permit_number=_extract_permit(node), agent_name=agent_name, agency_name=agency_name, is_active=True, ) ) if len(results) >= limit: break return results