Add monitoring PF service

This commit is contained in:
Grendgi
2026-06-04 14:55:41 +03:00
commit dd3edd7088
41 changed files with 3194 additions and 0 deletions

96
app/scrapers/base.py Normal file
View File

@@ -0,0 +1,96 @@
from __future__ import annotations

import json
import logging
import math
import re
from dataclasses import dataclass

import httpx
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
# Browser-like request headers; fetch_html passes them to its httpx client.
DEFAULT_HEADERS = {
# Desktop Chrome (Windows 10, x64) user-agent string.
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/131.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
# NOTE(review): advertising "br" presumably relies on a brotli decoder being
# installed alongside httpx; without one a brotli-encoded body may not be
# decoded -- TODO confirm the dependency is pinned.
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
@dataclass
class ScrapedListing:
source: str # "propertyfinder" | "bayut"
external_id: str # listing id on the source
url: str
title: str | None
price: float | None
currency: str | None
permit_number: str | None
agent_name: str | None
agency_name: str | None
is_active: bool = True
class ScraperError(Exception):
    """Error raised by the scraping helpers when a page cannot be retrieved."""
def fetch_html(url: str, timeout: float = 30.0) -> str:
    """GET ``url`` with browser-like headers and return the response text.

    Redirects are followed. A 404 is not treated as an error: it yields an
    empty string so callers can tell "page is gone" from "request failed".

    Args:
        url: Address to fetch.
        timeout: Timeout passed to the httpx client, in seconds.

    Returns:
        The decoded response body, or ``""`` when the server answers 404.

    Raises:
        ScraperError: If the site answers 403 or 429, if any other error
            status is returned, if the request itself fails, or if ``url``
            is malformed.
    """
    try:
        with httpx.Client(
            headers=DEFAULT_HEADERS,
            follow_redirects=True,
            timeout=timeout,
        ) as client:
            response = client.get(url)
            if response.status_code in (403, 429):
                raise ScraperError(
                    f"Blocked by site ({response.status_code}) at {url}"
                )
            if response.status_code == 404:
                return ""
            response.raise_for_status()
            return response.text
    # httpx.InvalidURL does not derive from httpx.HTTPError, so it is listed
    # explicitly; otherwise a malformed URL would escape as a raw httpx error.
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        raise ScraperError(f"HTTP error for {url}: {e}") from e
_NEXT_DATA_RE = re.compile(
r'<script[^>]+id="__NEXT_DATA__"[^>]*>(.*?)</script>',
re.DOTALL,
)
def extract_next_data(html: str) -> dict | None:
"""Extract Next.js __NEXT_DATA__ JSON blob — both PF and Bayut are Next.js apps."""
if not html:
return None
m = _NEXT_DATA_RE.search(html)
if not m:
# Fallback via BeautifulSoup if regex misses (rare).
soup = BeautifulSoup(html, "lxml")
tag = soup.find("script", id="__NEXT_DATA__")
if not tag or not tag.string:
return None
raw = tag.string
else:
raw = m.group(1)
try:
return json.loads(raw)
except json.JSONDecodeError as e:
logger.warning("Failed to parse __NEXT_DATA__: %s", e)
return None
def parse_price(value) -> float | None:
if value is None:
return None
if isinstance(value, (int, float)):
return float(value)
s = re.sub(r"[^\d.]", "", str(value))
try:
return float(s) if s else None
except ValueError:
return None