Add monitoring PF service
This commit is contained in:
5
app/scrapers/__init__.py
Normal file
5
app/scrapers/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from app.scrapers.base import ScrapedListing
|
||||
from app.scrapers.bayut import BayutScraper
|
||||
from app.scrapers.propertyfinder import PropertyFinderScraper
|
||||
|
||||
__all__ = ["ScrapedListing", "BayutScraper", "PropertyFinderScraper"]
|
||||
96
app/scrapers/base.py
Normal file
96
app/scrapers/base.py
Normal file
@@ -0,0 +1,96 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Browser-like request headers attached to every request made by fetch_html.
DEFAULT_HEADERS = {
    # Desktop Chrome on Windows user agent string.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
|
||||
|
||||
|
||||
@dataclass
class ScrapedListing:
    """One listing as read from a source site (detail page or search result)."""

    # Which site the listing came from: "propertyfinder" | "bayut".
    source: str
    # The listing's id on the source site.
    external_id: str
    # URL of the listing.
    url: str
    # Every field below is None when it could not be read from the page.
    title: str | None
    price: float | None
    currency: str | None
    permit_number: str | None
    agent_name: str | None
    agency_name: str | None
    # Scrapers in this package set this to False when the listing URL returns 404.
    is_active: bool = True
|
||||
|
||||
|
||||
class ScraperError(Exception):
    """Raised by fetch_html when a page is blocked or the HTTP request fails."""
|
||||
|
||||
|
||||
def fetch_html(url: str, timeout: float = 30.0) -> str:
    """GET *url* with browser-like headers and return the response text.

    Redirects are followed. A 404 response is not an error: it yields an
    empty string so callers can tell "page is gone" apart from a failure.

    Raises:
        ScraperError: on a 403/429 response (treated as being blocked), on
            any other error status, and on any httpx transport error.
    """
    try:
        with httpx.Client(
            headers=DEFAULT_HEADERS,
            follow_redirects=True,
            timeout=timeout,
        ) as client:
            response = client.get(url)
            status = response.status_code
            if status == 404:
                return ""
            if status in (403, 429):
                raise ScraperError(f"Blocked by site ({status}) at {url}")
            # Any remaining error status surfaces as an httpx.HTTPError below.
            response.raise_for_status()
            return response.text
    except httpx.HTTPError as exc:
        raise ScraperError(f"HTTP error for {url}: {exc}") from exc
|
||||
|
||||
|
||||
# Fast path: capture the raw text between the __NEXT_DATA__ <script> tags.
_NEXT_DATA_RE = re.compile(
    r'<script[^>]+id="__NEXT_DATA__"[^>]*>(.*?)</script>',
    re.DOTALL,
)


def extract_next_data(html: str) -> dict | None:
    """Return the parsed Next.js ``__NEXT_DATA__`` JSON blob found in *html*.

    Both PF and Bayut are Next.js apps, so their page data sits in this
    script tag. Returns None when *html* is empty, when the tag is missing
    or empty, or when its content is not valid JSON (a warning is logged).
    """
    if not html:
        return None

    match = _NEXT_DATA_RE.search(html)
    if match is not None:
        payload = match.group(1)
    else:
        # Regex missed (rare): fall back to a real HTML parse.
        script = BeautifulSoup(html, "lxml").find("script", id="__NEXT_DATA__")
        if not script or not script.string:
            return None
        payload = script.string

    try:
        return json.loads(payload)
    except json.JSONDecodeError as exc:
        logger.warning("Failed to parse __NEXT_DATA__: %s", exc)
        return None
|
||||
|
||||
|
||||
def parse_price(value) -> float | None:
    """Coerce a scraped price value to a float, or return None if unusable.

    Numbers are returned as floats. Any other value is converted to a
    string, stripped of every character except digits and dots (currency
    codes, thousands separators, spaces) and then parsed.

    Returns None for None, for booleans, when no digits remain, and when
    the cleaned text is still not a valid float (for example "1.2.3").
    """
    # bool is a subclass of int; a true/false flag must not become 1.0/0.0.
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return float(value)

    cleaned = re.sub(r"[^\d.]", "", str(value))
    if not cleaned:
        return None
    try:
        return float(cleaned)
    except ValueError:
        return None
|
||||
212
app/scrapers/bayut.py
Normal file
212
app/scrapers/bayut.py
Normal file
@@ -0,0 +1,212 @@
|
||||
"""Bayut.com scraper.
|
||||
|
||||
Two operations:
|
||||
- fetch_listing(url): read a listing detail page → ScrapedListing.
|
||||
- search_similar(building, bedrooms, deal_type): search Bayut for similar candidates.
|
||||
|
||||
Bayut is a Next.js app; __NEXT_DATA__ contains the property in pageProps.
|
||||
Unlike PF, Bayut shows the permit number as text in the JSON.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import quote_plus, urljoin
|
||||
|
||||
from app.scrapers.base import (
|
||||
ScrapedListing,
|
||||
ScraperError,
|
||||
extract_next_data,
|
||||
fetch_html,
|
||||
parse_price,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Root of the Bayut site; search and candidate URLs are built on top of it.
BASE_URL = "https://www.bayut.com"
# Value stored in ScrapedListing.source for listings produced by this module.
SOURCE = "bayut"
|
||||
|
||||
|
||||
def _path_for_deal(deal_type: str) -> str:
|
||||
return "to-buy" if deal_type == "sale" else "to-rent"
|
||||
|
||||
|
||||
def _walk(node):
|
||||
if isinstance(node, dict):
|
||||
yield node
|
||||
for v in node.values():
|
||||
yield from _walk(v)
|
||||
elif isinstance(node, list):
|
||||
for it in node:
|
||||
yield from _walk(it)
|
||||
|
||||
|
||||
def _extract_price(item: dict) -> tuple[float | None, str | None]:
|
||||
price = item.get("price")
|
||||
if isinstance(price, dict):
|
||||
val = price.get("value") or price.get("amount")
|
||||
cur = price.get("currency") or "AED"
|
||||
return parse_price(val), cur
|
||||
if isinstance(price, (int, float, str)):
|
||||
return parse_price(price), "AED"
|
||||
return None, "AED"
|
||||
|
||||
|
||||
def _extract_broker(item: dict) -> tuple[str | None, str | None]:
|
||||
agency = item.get("agency") or {}
|
||||
agency_name = agency.get("name") if isinstance(agency, dict) else None
|
||||
agent_name = item.get("contactName") or item.get("agentName") or item.get("ownerAgent", {}).get("name") if isinstance(item.get("ownerAgent"), dict) else item.get("contactName")
|
||||
return agent_name, agency_name
|
||||
|
||||
|
||||
def _extract_permit(item: dict) -> str | None:
|
||||
for key in ("permitNumber", "permit_number", "rera", "trakheesi", "permit"):
|
||||
v = item.get(key)
|
||||
if v:
|
||||
return str(v).strip()
|
||||
return None
|
||||
|
||||
|
||||
_ID_FROM_URL = re.compile(r"details-(\d+)\.html(?:[?#].*)?$")
|
||||
|
||||
|
||||
def _extract_id_from_url(url: str) -> str | None:
|
||||
m = _ID_FROM_URL.search(url)
|
||||
return m.group(1) if m else None
|
||||
|
||||
|
||||
def _is_listing_dict(item: dict) -> bool:
|
||||
if not isinstance(item, dict):
|
||||
return False
|
||||
has_price = "price" in item
|
||||
has_id = any(k in item for k in ("externalID", "id", "objectID"))
|
||||
return has_price and has_id
|
||||
|
||||
|
||||
class BayutScraper:
    """Reads Bayut listing detail pages and Bayut search results."""

    source = SOURCE

    def fetch_listing(self, url: str) -> ScrapedListing | None:
        """Refetch a known Bayut listing URL.

        Returns:
            - an inactive ScrapedListing (is_active=False, data fields None)
              when the page no longer exists (fetch_html returned "");
            - a ScrapedListing with the current data when the page is alive;
            - None when the fetch fails or no listing data can be found.
        """
        try:
            html = fetch_html(url)
        except ScraperError as exc:
            logger.warning("Bayut refetch failed for %s: %s", url, exc)
            return None

        if not html:
            return ScrapedListing(
                source=SOURCE,
                external_id=_extract_id_from_url(url) or "",
                url=url,
                title=None,
                price=None,
                currency=None,
                permit_number=None,
                agent_name=None,
                agency_name=None,
                is_active=False,
            )

        data = extract_next_data(html)
        if not data:
            return None

        def richness(node: dict) -> int:
            # Prefer the listing-like dict that carries the most listing fields.
            points = 0
            if "title" in node or "name" in node:
                points += 2
            if "agency" in node or "contactName" in node:
                points += 2
            if "rooms" in node or "bedrooms" in node:
                points += 1
            return points

        candidates = (node for node in _walk(data) if _is_listing_dict(node))
        # max() keeps the first of equally scored nodes, i.e. the outermost one.
        best = max(candidates, key=richness, default=None)
        if best is None:
            logger.warning("Bayut: no listing dict found in __NEXT_DATA__ for %s", url)
            return None

        price, currency = _extract_price(best)
        agent_name, agency_name = _extract_broker(best)
        # Fall back to the id in the URL when the JSON carries none.
        json_id = str(best.get("externalID") or best.get("id") or "")
        ext_id = json_id or _extract_id_from_url(url) or ""

        return ScrapedListing(
            source=SOURCE,
            external_id=ext_id,
            url=url,
            title=best.get("title") or best.get("name"),
            price=price,
            currency=currency,
            permit_number=_extract_permit(best),
            agent_name=agent_name,
            agency_name=agency_name,
            is_active=True,
        )

    def search_similar(
        self,
        building: str | None,
        bedrooms: int | None,
        deal_type: str,
        limit: int = 20,
        location_url: str | None = None,
    ) -> list[ScrapedListing]:
        """Search Bayut for listings whose title or slug mentions *building*.

        Builds a free-text Dubai search URL (optionally filtered by
        *bedrooms*), reads a single results page and keeps at most *limit*
        unique listings. Candidate URLs are built from the listing id.
        *location_url* is accepted but not used by this implementation.

        Returns an empty list when *building* is empty, the fetch fails, or
        the page carries no __NEXT_DATA__.
        """
        if not building:
            return []

        query = quote_plus(building.strip())
        url = f"{BASE_URL}/{_path_for_deal(deal_type)}/property/dubai/?q={query}"
        if bedrooms is not None:
            url += f"&beds_in={bedrooms}"
        logger.info("Bayut search_similar: %s", url)

        try:
            html = fetch_html(url)
        except ScraperError as exc:
            logger.warning("Bayut search failed: %s", exc)
            return []

        data = extract_next_data(html)
        if not data:
            return []

        # Loop-invariant forms of the building name used for matching.
        name_needle = building.lower()
        slug_needle = name_needle.replace(" ", "-")

        found: list[ScrapedListing] = []
        seen_ids: set[str] = set()
        for node in _walk(data):
            if not _is_listing_dict(node):
                continue
            ext_id = str(node.get("externalID") or node.get("id") or "")
            if not ext_id or ext_id in seen_ids:
                continue

            title = node.get("title") or node.get("name") or ""
            # Keep only results that mention the building in the title or slug.
            if (
                name_needle not in title.lower()
                and slug_needle not in str(node.get("slug") or "").lower()
            ):
                continue

            seen_ids.add(ext_id)
            price, currency = _extract_price(node)
            agent_name, agency_name = _extract_broker(node)
            found.append(
                ScrapedListing(
                    source=SOURCE,
                    external_id=ext_id,
                    url=urljoin(BASE_URL, f"/property/details-{ext_id}.html"),
                    title=title or None,
                    price=price,
                    currency=currency,
                    permit_number=_extract_permit(node),
                    agent_name=agent_name,
                    agency_name=agency_name,
                    is_active=True,
                )
            )
            if len(found) >= limit:
                break
        return found
|
||||
325
app/scrapers/propertyfinder.py
Normal file
325
app/scrapers/propertyfinder.py
Normal file
@@ -0,0 +1,325 @@
|
||||
"""PropertyFinder.ae scraper.
|
||||
|
||||
Two operations:
|
||||
- fetch_listing(url): read a listing detail page → ScrapedListing (title/price/agent/permit).
|
||||
- search_similar(building, bedrooms, deal_type): search PF for similar candidates
|
||||
by building name + bedrooms filter → list[ScrapedListing].
|
||||
|
||||
PF is a Next.js app — listing data sits in <script id="__NEXT_DATA__">.
|
||||
Note: PF intentionally hides the Trakheesi permit as an image on the detail page,
|
||||
so permit may come back as None — that's fine, we don't depend on it.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from app.scrapers.base import (
|
||||
ScrapedListing,
|
||||
ScraperError,
|
||||
extract_next_data,
|
||||
fetch_html,
|
||||
parse_price,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Root of the PropertyFinder site; search and candidate URLs are built on it.
BASE_URL = "https://www.propertyfinder.ae"
# Value stored in ScrapedListing.source for listings produced by this module.
SOURCE = "propertyfinder"
|
||||
|
||||
# PF location hierarchy, most specific first. search_similar scopes by the most
|
||||
# specific id available on a reference listing page.
|
||||
_LOC_TYPE_PRIORITY = {
|
||||
"TOWER": 5,
|
||||
"BUILDING": 4,
|
||||
"DEVELOPMENT": 3,
|
||||
"SUBCOMMUNITY": 2,
|
||||
"COMMUNITY": 1,
|
||||
"CITY": 0,
|
||||
}
|
||||
|
||||
|
||||
def _category_for_deal(deal_type: str) -> int:
|
||||
return 1 if deal_type == "sale" else 2
|
||||
|
||||
|
||||
def _get(d, *keys, default=None):
|
||||
cur = d
|
||||
for k in keys:
|
||||
if not isinstance(cur, dict):
|
||||
return default
|
||||
cur = cur.get(k)
|
||||
if cur is None:
|
||||
return default
|
||||
return cur
|
||||
|
||||
|
||||
def _walk(node):
|
||||
"""Iterate over every dict in a nested JSON structure."""
|
||||
if isinstance(node, dict):
|
||||
yield node
|
||||
for v in node.values():
|
||||
yield from _walk(v)
|
||||
elif isinstance(node, list):
|
||||
for it in node:
|
||||
yield from _walk(it)
|
||||
|
||||
|
||||
def _extract_price(item: dict) -> tuple[float | None, str | None]:
|
||||
price = item.get("price")
|
||||
if isinstance(price, dict):
|
||||
val = price.get("value") or price.get("amount") or price.get("min") or price.get("from")
|
||||
cur = price.get("currency") or "AED"
|
||||
return parse_price(val), cur
|
||||
if isinstance(price, (int, float, str)):
|
||||
return parse_price(price), item.get("currency") or "AED"
|
||||
return None, "AED"
|
||||
|
||||
|
||||
def _extract_broker(item: dict) -> tuple[str | None, str | None]:
|
||||
broker = item.get("broker") or item.get("agency") or {}
|
||||
agent = item.get("agent") or item.get("contact") or {}
|
||||
agency_name = broker.get("name") if isinstance(broker, dict) else None
|
||||
agent_name = agent.get("name") if isinstance(agent, dict) else None
|
||||
return agent_name, agency_name
|
||||
|
||||
|
||||
def _extract_permit(item: dict) -> str | None:
|
||||
for key in ("permit_number", "permitNumber", "trakheesi", "rera", "permit"):
|
||||
v = item.get(key)
|
||||
if v:
|
||||
return str(v).strip()
|
||||
reg = item.get("regulatory") or item.get("regulation") or {}
|
||||
if isinstance(reg, dict):
|
||||
for key in ("permit", "permit_number", "trakheesi", "rera"):
|
||||
v = reg.get(key)
|
||||
if v:
|
||||
return str(v).strip()
|
||||
return None
|
||||
|
||||
|
||||
def _find_permit_on_page(data: dict) -> str | None:
    """Return the DLD permit number found anywhere in a page's data.

    The page renders the permit as an image, but the plain value is still
    in __NEXT_DATA__: the dict holding ``permit_validation_url`` (the
    Trakheesi link) also holds the number under ``number``. The first
    dict with both values truthy wins; returns None when there is none.
    """
    matches = (
        node
        for node in _walk(data)
        if isinstance(node, dict)
        and node.get("permit_validation_url")
        and node.get("number")
    )
    hit = next(matches, None)
    if hit is None:
        return None
    return str(hit["number"]).strip()
|
||||
|
||||
|
||||
_ID_FROM_URL = re.compile(r"-(\d+)\.html(?:[?#].*)?$")
|
||||
|
||||
|
||||
def _extract_id_from_url(url: str) -> str | None:
|
||||
m = _ID_FROM_URL.search(url)
|
||||
return m.group(1) if m else None
|
||||
|
||||
|
||||
def _is_listing_dict(item: dict) -> bool:
|
||||
"""Heuristic: a listing dict contains a price plus an id-like field."""
|
||||
if not isinstance(item, dict):
|
||||
return False
|
||||
has_price = "price" in item
|
||||
has_id = any(k in item for k in ("id", "reference", "listing_id", "externalID"))
|
||||
return has_price and has_id
|
||||
|
||||
|
||||
class PropertyFinderScraper:
    """Reads PropertyFinder listing pages and location-scoped search results."""

    source = SOURCE

    def fetch_listing(self, url: str) -> ScrapedListing | None:
        """Refetch a known listing URL.

        Returns:
            - ScrapedListing(is_active=False) if the URL returns 404
              (listing removed);
            - ScrapedListing with current data if the listing is alive;
            - None on network/parse failure, so the caller can leave its
              stored data untouched.
        """
        try:
            html = fetch_html(url)
        except ScraperError as e:
            logger.warning("PF refetch failed for %s: %s", url, e)
            return None

        if not html:
            # fetch_html returns "" for a 404 page.
            return ScrapedListing(
                source=SOURCE,
                external_id=_extract_id_from_url(url) or "",
                url=url,
                title=None,
                price=None,
                currency=None,
                permit_number=None,
                agent_name=None,
                agency_name=None,
                is_active=False,
            )

        data = extract_next_data(html)
        if not data:
            return None

        # The property dict is nested somewhere in the page data. Walk it and
        # pick the dict with a price and an id that carries the most listing
        # fields; on a tie the first one encountered wins.
        best = None
        best_score = -1
        for node in _walk(data):
            if not _is_listing_dict(node):
                continue
            score = 0
            if "title" in node or "name" in node:
                score += 2
            if any(k in node for k in ("broker", "agent", "agency")):
                score += 2
            if "bedrooms" in node or "rooms" in node:
                score += 1
            if score > best_score:
                best_score = score
                best = node

        if best is None:
            logger.warning("PF: no listing dict found in __NEXT_DATA__ for %s", url)
            return None

        price, currency = _extract_price(best)
        agent_name, agency_name = _extract_broker(best)
        # Fall back to the id in the URL when the JSON carries none.
        ext_id = (
            str(best.get("id") or best.get("reference") or best.get("listing_id") or "")
            or _extract_id_from_url(url)
            or ""
        )
        return ScrapedListing(
            source=SOURCE,
            external_id=ext_id,
            url=url,
            title=best.get("title") or best.get("name"),
            price=price,
            currency=currency,
            # Page-wide lookup first, then the keys on the listing dict itself.
            permit_number=_find_permit_on_page(data) or _extract_permit(best),
            agent_name=agent_name,
            agency_name=agency_name,
            is_active=True,
        )

    def get_permit(self, url: str) -> str | None:
        """Fetch a listing page and return only its DLD permit number.

        Used to compare candidates against our own permit during
        suggestions. Returns None when the fetch fails, the page has no
        __NEXT_DATA__, or no permit is found.
        """
        try:
            html = fetch_html(url)
        except ScraperError as e:
            logger.warning("PF get_permit fetch failed for %s: %s", url, e)
            return None
        data = extract_next_data(html)
        return _find_permit_on_page(data) if data else None

    def resolve_location_id(self, listing_url: str) -> int | None:
        """Read a PF listing page and return its most specific location id.

        Specificity follows _LOC_TYPE_PRIORITY
        (tower > building > development > subcommunity > community > city).

        PF's search only filters by numeric location id (`l=`); the
        free-text `q=` param does NOT scope results to a building — it
        returns unrelated recommendations. So the location id is derived
        from a known listing that sits in the same building (our own
        listing, or an already-tracked one).

        Returns None when the fetch fails, the page has no __NEXT_DATA__,
        no ranked location node is found, or the id is not an integer.
        """
        try:
            html = fetch_html(listing_url)
        except ScraperError as e:
            logger.warning("PF resolve_location_id fetch failed for %s: %s", listing_url, e)
            return None
        data = extract_next_data(html)
        if not data:
            return None

        best_id: object = None
        best_rank = -1
        for node in _walk(data):
            # Nodes whose "type" is not a known location type rank -1 and can
            # never beat the initial best_rank.
            rank = _LOC_TYPE_PRIORITY.get(str(node.get("type", "")).upper(), -1)
            if rank > best_rank and node.get("id") and node.get("name"):
                best_rank, best_id = rank, node.get("id")
        try:
            return int(best_id) if best_id is not None else None
        except (TypeError, ValueError):
            return None

    def search_similar(
        self,
        building: str | None,
        bedrooms: int | None,
        deal_type: str,
        limit: int = 200,
        location_url: str | None = None,
        max_pages: int = 8,
    ) -> list[ScrapedListing]:
        """Search PF for candidates in the same building, scoped by location id.

        `location_url` is a reference listing in the target building (our
        own listing or an already-tracked competitor); it is resolved to a
        PF location id and the search is run with `l=`. Without it a
        building search cannot be scoped reliably on PF, so an empty list
        is returned rather than garbage. `building` is currently unused
        here.

        Paginates: a same-permit competitor can sit on any results page (PF
        can't be queried by permit), so results are collected across pages
        up to `max_pages`/`limit`. Paging also stops on a fetch failure, on
        a page without __NEXT_DATA__, or on a page with no new listings.
        """
        location_id = self.resolve_location_id(location_url) if location_url else None
        if location_id is None:
            logger.info(
                "PF search_similar: no location id (url=%r) — skipping (q= text search "
                "does not filter by building on PF)", location_url,
            )
            return []

        c = _category_for_deal(deal_type)
        base = f"{BASE_URL}/en/search?c={c}&l={location_id}"
        if bedrooms is not None:
            base += f"&bf={bedrooms}&bt={bedrooms}"  # PF uses bf=bedrooms-from, bt=bedrooms-to

        results: list[ScrapedListing] = []
        seen_ids: set[str] = set()
        for page in range(1, max_pages + 1):
            page_url = base if page == 1 else f"{base}&page={page}"
            try:
                html = fetch_html(page_url)
            except ScraperError as e:
                logger.warning("PF search failed (page %d): %s", page, e)
                break
            data = extract_next_data(html)
            if not data:
                break

            new_on_page = 0
            for node in _walk(data):
                if not _is_listing_dict(node):
                    continue
                ext_id = str(node.get("id") or node.get("reference") or "")
                if not ext_id or ext_id in seen_ids:
                    continue
                seen_ids.add(ext_id)
                new_on_page += 1

                # Results are scoped to the location by l=, so no title filter.
                title = node.get("title") or node.get("name") or ""
                price, currency = _extract_price(node)
                agent_name, agency_name = _extract_broker(node)

                share = node.get("share_url") or node.get("path")
                if not share:
                    # No link on the node: point at the results page it came
                    # from. urljoin(BASE_URL, "") would yield the bare site
                    # root, which is truthy and would mask this fallback.
                    cand_url = page_url
                elif str(share).startswith("http"):
                    cand_url = share
                else:
                    cand_url = urljoin(BASE_URL, str(share))

                results.append(
                    ScrapedListing(
                        source=SOURCE,
                        external_id=ext_id,
                        url=cand_url,
                        title=title or None,
                        price=price,
                        currency=currency,
                        permit_number=_extract_permit(node),
                        agent_name=agent_name,
                        agency_name=agency_name,
                        is_active=True,
                    )
                )
                if len(results) >= limit:
                    break

            # No new listings on this page → we've passed the last page.
            if len(results) >= limit or new_on_page == 0:
                break
        logger.info("PF search_similar: collected %d candidates (l=%s)", len(results), location_id)
        return results
|
||||
Reference in New Issue
Block a user