Validate PF project listings and sync own price

2026-06-11 14:46:35 +03:00
parent d53ecb2add
commit c763ff423d
3 changed files with 171 additions and 6 deletions
--- a/app/scrapers/propertyfinder.py
+++ b/app/scrapers/propertyfinder.py
@@ -119,6 +119,15 @@ def _extract_id_from_url(url: str) -> str | None:
    return m.group(1) if m else None


+def is_listing_url(url: str) -> bool:
+    """Return True only if a listing id can be extracted from ``url``.
+
+    PF search pages also contain listing-like JSON; treating one as a detail
+    page can bind monitoring to a random result, so callers must reject them.
+    """
+    return bool(_extract_id_from_url(url or ""))  # falsy url is checked as ""
+
+
 def _is_listing_dict(item: dict) -> bool:
    """Heuristic: a listing dict contains a price plus an id-like field."""
    if not isinstance(item, dict):
@@ -131,12 +140,22 @@ def _is_listing_dict(item: dict) -> bool:
 class PropertyFinderScraper:
    source = SOURCE

+    def is_listing_url(self, url: str) -> bool:
+        return is_listing_url(url)  # resolves to the module-level function, not this method
+
+    def listing_id_from_url(self, url: str) -> str | None:
+        return _extract_id_from_url(url or "")  # coerce falsy url to "" as is_listing_url does
+
    def fetch_listing(self, url: str) -> ScrapedListing | None:
        """Refetch a known listing URL. Returns:
        - ScrapedListing(is_active=False) if the URL returns 404 (listing removed)
        - ScrapedListing with current data if alive
        - None on network/parse failure (we won't update the DB in that case)
        """
+        if not is_listing_url(url):
+            logger.warning("PF fetch_listing rejected non-listing URL: %s", url)
+            return None
+
        try:
            html = fetch_html(url)
        except ScraperError as e:
@@ -199,6 +218,9 @@ class PropertyFinderScraper:
    def get_permit(self, url: str) -> str | None:
        """Fetch a listing page and return only its DLD permit number (or None).
        Used to compare candidates against our own permit during suggestions."""
+        if not is_listing_url(url):
+            logger.warning("PF get_permit rejected non-listing URL: %s", url)
+            return None
        try:
            html = fetch_html(url)
        except ScraperError as e:
@@ -300,6 +322,8 @@ class PropertyFinderScraper:
                agent_name, agency_name = _extract_broker(node)
                share = node.get("share_url") or node.get("path")
                cand_url = share if str(share).startswith("http") else urljoin(BASE_URL, str(share or ""))
+                if not is_listing_url(cand_url):
+                    continue

                results.append(
                    ScrapedListing(