Validate PF project listings and sync own price
All checks were successful
CI / go (push) Successful in 41s
CI / python (push) Successful in 2s
Build and Deploy / build-and-deploy (push) Successful in 33s

This commit is contained in:
Grendgi
2026-06-11 14:46:35 +03:00
parent d53ecb2add
commit c763ff423d
3 changed files with 171 additions and 6 deletions

View File

@@ -119,6 +119,15 @@ def _extract_id_from_url(url: str) -> str | None:
return m.group(1) if m else None
def is_listing_url(url: str) -> bool:
    """Tell whether ``url`` addresses one specific PF listing.

    A URL qualifies only when ``_extract_id_from_url`` finds a listing id in
    it; a falsy ``url`` is checked as the empty string.

    PF search pages embed listing-like JSON as well, so handling one as a
    detail page can bind monitoring to an arbitrary search result. Callers
    are expected to reject any URL for which this returns False.
    """
    candidate = url if url else ""
    listing_id = _extract_id_from_url(candidate)
    return True if listing_id else False
def _is_listing_dict(item: dict) -> bool:
"""Heuristic: a listing dict contains a price plus an id-like field."""
if not isinstance(item, dict):
@@ -131,12 +140,22 @@ def _is_listing_dict(item: dict) -> bool:
class PropertyFinderScraper:
source = SOURCE
def is_listing_url(self, url: str) -> bool:
    """Method form of the module-level ``is_listing_url`` check for ``url``."""
    verdict = is_listing_url(url)
    return verdict
def listing_id_from_url(self, url: str) -> str | None:
    """Return the id ``_extract_id_from_url`` finds in ``url``, else None."""
    listing_id = _extract_id_from_url(url)
    return listing_id
def fetch_listing(self, url: str) -> ScrapedListing | None:
"""Refetch a known listing URL. Returns:
- ScrapedListing(is_active=False) if the URL returns 404 (listing removed)
- ScrapedListing with current data if alive
- None on network/parse failure (we won't update the DB in that case)
"""
if not is_listing_url(url):
logger.warning("PF fetch_listing rejected non-listing URL: %s", url)
return None
try:
html = fetch_html(url)
except ScraperError as e:
@@ -199,6 +218,9 @@ class PropertyFinderScraper:
def get_permit(self, url: str) -> str | None:
"""Fetch a listing page and return only its DLD permit number (or None).
Used to compare candidates against our own permit during suggestions."""
if not is_listing_url(url):
logger.warning("PF get_permit rejected non-listing URL: %s", url)
return None
try:
html = fetch_html(url)
except ScraperError as e:
@@ -300,6 +322,8 @@ class PropertyFinderScraper:
agent_name, agency_name = _extract_broker(node)
share = node.get("share_url") or node.get("path")
cand_url = share if str(share).startswith("http") else urljoin(BASE_URL, str(share or ""))
if not is_listing_url(cand_url):
continue
results.append(
ScrapedListing(