213 lines
6.4 KiB
Python
213 lines
6.4 KiB
Python
"""Bayut.com scraper.

Two operations:
- fetch_listing(url): read a listing detail page → ScrapedListing.
- search_similar(building, bedrooms, deal_type): search Bayut for similar candidates.

Bayut is a Next.js app; __NEXT_DATA__ contains the property in pageProps.
Unlike PF, Bayut shows the permit number as text in the JSON.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from urllib.parse import quote_plus, urljoin
|
|
|
|
from app.scrapers.base import (
|
|
ScrapedListing,
|
|
ScraperError,
|
|
extract_next_data,
|
|
fetch_html,
|
|
parse_price,
|
|
)
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
# Site root; search URLs and candidate detail URLs below are built from it.
BASE_URL = "https://www.bayut.com"
|
|
# Value passed as `source` on every ScrapedListing this module creates.
SOURCE = "bayut"
|
|
|
|
|
|
def _path_for_deal(deal_type: str) -> str:
|
|
return "to-buy" if deal_type == "sale" else "to-rent"
|
|
|
|
|
|
def _walk(node):
|
|
if isinstance(node, dict):
|
|
yield node
|
|
for v in node.values():
|
|
yield from _walk(v)
|
|
elif isinstance(node, list):
|
|
for it in node:
|
|
yield from _walk(it)
|
|
|
|
|
|
def _extract_price(item: dict) -> tuple[float | None, str | None]:
|
|
price = item.get("price")
|
|
if isinstance(price, dict):
|
|
val = price.get("value") or price.get("amount")
|
|
cur = price.get("currency") or "AED"
|
|
return parse_price(val), cur
|
|
if isinstance(price, (int, float, str)):
|
|
return parse_price(price), "AED"
|
|
return None, "AED"
|
|
|
|
|
|
def _extract_broker(item: dict) -> tuple[str | None, str | None]:
|
|
agency = item.get("agency") or {}
|
|
agency_name = agency.get("name") if isinstance(agency, dict) else None
|
|
agent_name = item.get("contactName") or item.get("agentName") or item.get("ownerAgent", {}).get("name") if isinstance(item.get("ownerAgent"), dict) else item.get("contactName")
|
|
return agent_name, agency_name
|
|
|
|
|
|
def _extract_permit(item: dict) -> str | None:
|
|
for key in ("permitNumber", "permit_number", "rera", "trakheesi", "permit"):
|
|
v = item.get(key)
|
|
if v:
|
|
return str(v).strip()
|
|
return None
|
|
|
|
|
|
_ID_FROM_URL = re.compile(r"details-(\d+)\.html(?:[?#].*)?$")
|
|
|
|
|
|
def _extract_id_from_url(url: str) -> str | None:
|
|
m = _ID_FROM_URL.search(url)
|
|
return m.group(1) if m else None
|
|
|
|
|
|
def _is_listing_dict(item: dict) -> bool:
|
|
if not isinstance(item, dict):
|
|
return False
|
|
has_price = "price" in item
|
|
has_id = any(k in item for k in ("externalID", "id", "objectID"))
|
|
return has_price and has_id
|
|
|
|
|
|
class BayutScraper:
    """Scraper for Bayut listing detail pages and search result pages.

    Both operations fetch HTML, pull the ``__NEXT_DATA__`` payload out of it
    with ``extract_next_data`` and then scan every nested dict for ones that
    look like a listing (see ``_is_listing_dict``).
    """

    source = SOURCE

    @staticmethod
    def _node_id(node: dict) -> str:
        """Return the node's ``externalID`` (or ``id``) as a string, else ``""``."""
        return str(node.get("externalID") or node.get("id") or "")

    @staticmethod
    def _score_candidate(node: dict, url_id: str | None) -> int:
        """Score how likely *node* is the listing the detail page is about."""
        score = 0
        if "title" in node or "name" in node:
            score += 2
        if "agency" in node or "contactName" in node:
            score += 2
        if "rooms" in node or "bedrooms" in node:
            score += 1
        # Detail URLs are built as details-<id>.html from this same id (see
        # search_similar), so a node carrying the URL's id is the requested
        # listing. The bonus exceeds every other signal combined so that an
        # unrelated listing embedded in the page cannot outrank it.
        if url_id and BayutScraper._node_id(node) == url_id:
            score += 10
        return score

    @staticmethod
    def _build_listing(
        node: dict, *, external_id: str, url: str, title: str | None
    ) -> ScrapedListing:
        """Build an active ScrapedListing from a listing dict."""
        price, currency = _extract_price(node)
        agent_name, agency_name = _extract_broker(node)
        return ScrapedListing(
            source=SOURCE,
            external_id=external_id,
            url=url,
            title=title,
            price=price,
            currency=currency,
            permit_number=_extract_permit(node),
            agent_name=agent_name,
            agency_name=agency_name,
            is_active=True,
        )

    def fetch_listing(self, url: str) -> ScrapedListing | None:
        """Fetch one listing detail page and convert it to a ScrapedListing.

        Args:
            url: Listing detail URL.

        Returns:
            ``None`` when the fetch raises ``ScraperError``, when no
            ``__NEXT_DATA__`` payload can be extracted, or when the payload
            contains no listing-like dict. A ``ScrapedListing`` with
            ``is_active=False`` and empty fields when the fetch succeeds but
            yields an empty body. Otherwise a populated ``ScrapedListing``
            with ``is_active=True``.
        """
        try:
            html = fetch_html(url)
        except ScraperError as e:
            logger.warning("Bayut refetch failed for %s: %s", url, e)
            return None

        url_id = _extract_id_from_url(url)

        if not html:
            # Empty body: report the listing as inactive rather than failing.
            return ScrapedListing(
                source=SOURCE, external_id=url_id or "", url=url,
                title=None, price=None, currency=None, permit_number=None,
                agent_name=None, agency_name=None, is_active=False,
            )

        data = extract_next_data(html)
        if not data:
            return None

        best = None
        best_score = -1
        for node in _walk(data):
            if not _is_listing_dict(node):
                continue
            score = self._score_candidate(node, url_id)
            # Strict ">" keeps the first node seen among equally good ones.
            if score > best_score:
                best_score = score
                best = node

        if best is None:
            logger.warning("Bayut: no listing dict found in __NEXT_DATA__ for %s", url)
            return None

        return self._build_listing(
            best,
            external_id=self._node_id(best) or url_id or "",
            url=url,
            title=best.get("title") or best.get("name"),
        )

    def search_similar(
        self,
        building: str | None,
        bedrooms: int | None,
        deal_type: str,
        limit: int = 20,
        location_url: str | None = None,
    ) -> list[ScrapedListing]:
        """Search Bayut for listings whose title or slug mentions *building*.

        Args:
            building: Building name used both as the search query and as the
                filter on results. Falsy or whitespace-only input returns
                ``[]`` without a request.
            bedrooms: When not ``None``, appended to the URL as ``beds_in``.
            deal_type: ``"sale"`` searches the to-buy section; anything else
                searches to-rent (see ``_path_for_deal``).
            limit: Maximum number of results; values ``<= 0`` return ``[]``.
            location_url: Currently unused; kept so existing callers that
                pass it keep working.

        Returns:
            Up to *limit* ``ScrapedListing`` objects, de-duplicated by
            external id. An empty list on fetch failure, empty response, or
            missing ``__NEXT_DATA__`` payload.
        """
        if not building or limit <= 0:
            return []

        # Normalise once: the same stripped name drives the query and the
        # title/slug filter, so padded input cannot filter out every result.
        name = building.strip()
        if not name:
            return []
        needle = name.lower()
        slug_token = needle.replace(" ", "-")

        path = _path_for_deal(deal_type)
        q = quote_plus(name)
        url = f"{BASE_URL}/{path}/property/dubai/?q={q}"
        if bedrooms is not None:
            url += f"&beds_in={bedrooms}"
        logger.info("Bayut search_similar: %s", url)

        try:
            html = fetch_html(url)
        except ScraperError as e:
            logger.warning("Bayut search failed: %s", e)
            return []

        # Same guard as fetch_listing: do not hand an empty body to the parser.
        if not html:
            return []

        data = extract_next_data(html)
        if not data:
            return []

        results: list[ScrapedListing] = []
        seen_ids: set[str] = set()
        for node in _walk(data):
            if not _is_listing_dict(node):
                continue
            ext_id = self._node_id(node)
            if not ext_id or ext_id in seen_ids:
                continue

            title = node.get("title") or node.get("name") or ""
            if needle not in title.lower():
                # Fall back to the slug, where spaces appear as hyphens.
                slug = str(node.get("slug") or "").lower()
                if slug_token not in slug:
                    continue

            seen_ids.add(ext_id)
            results.append(
                self._build_listing(
                    node,
                    external_id=ext_id,
                    url=urljoin(BASE_URL, f"/property/details-{ext_id}.html"),
                    title=title or None,
                )
            )
            if len(results) >= limit:
                break
        return results
|