feat: parse project metadata from PF links
All checks were successful
CI / hygiene (push) Successful in 2s
Build and Deploy / build-and-deploy (push) Successful in 38s
CI / go (push) Successful in 26s
CI / python (push) Successful in 16s

This commit is contained in:
Grendgi
2026-06-24 14:03:25 +03:00
parent 31c498af39
commit 6750722429
7 changed files with 276 additions and 0 deletions

View File

@@ -35,6 +35,9 @@ class ScrapedListing:
permit_number: str | None
agent_name: str | None
agency_name: str | None
building: str | None = None
bedrooms: int | None = None
size_sqft: float | None = None
is_active: bool = True

View File

@@ -100,6 +100,110 @@ def _extract_permit(item: dict) -> str | None:
return None
def _parse_int(value) -> int | None:
if value is None:
return None
if isinstance(value, bool):
return None
if isinstance(value, (int, float)):
return int(value)
text = str(value).strip().lower()
if text in {"studio", "студия"}:
return 0
m = re.search(r"\d+", text)
return int(m.group(0)) if m else None
def _first_bedroom_count(*candidates) -> int | None:
    """Return the first candidate that `_parse_int` turns into a number."""
    for candidate in candidates:
        parsed = _parse_int(candidate)
        if parsed is not None:
            return parsed
    return None


def _extract_bedrooms(item: dict) -> int | None:
    """Find the bedroom count of a listing node.

    Well-known top-level keys are tried first; each may hold a scalar or a
    dict wrapping the number under "value" / "count" / "name". If none of them
    parses, every dict yielded by `_walk(item)` whose name/label/key mentions
    "bed" or "спал" is inspected.

    Candidates are parsed one by one rather than chained with ``or``: an
    explicit ``0`` is falsy, so an ``or`` chain would skip it and fall through
    to an unrelated field. Since `_parse_int` maps "studio" to 0, a numeric 0
    is treated as a real count as well.

    Args:
        item: Listing dict from the scraped payload.

    Returns:
        The bedroom count, or ``None`` when nothing parsable is found.
    """
    for key in ("bedrooms", "bedroom", "beds", "rooms", "bedroom_count", "bedrooms_count"):
        value = item.get(key)
        if isinstance(value, dict):
            parsed = _first_bedroom_count(
                value.get("value"), value.get("count"), value.get("name")
            )
        else:
            parsed = _parse_int(value)
        if parsed is not None:
            return parsed
    for node in _walk(item):
        if not isinstance(node, dict):
            continue
        name = str(node.get("name") or node.get("label") or node.get("key") or "").lower()
        if "bed" not in name and "спал" not in name:
            continue
        parsed = _first_bedroom_count(
            node.get("value"), node.get("count"), node.get("text")
        )
        if parsed is not None:
            return parsed
    return None
def _area_to_sqft(value, unit: str | None = None) -> float | None:
    """Normalise an area value to square feet.

    Args:
        value: Raw area, handed to `parse_price` for numeric parsing
            (presumably accepts numbers and formatted strings; confirm
            against its definition).
        unit: Optional unit label. When it names square metres the value is
            converted; any other or missing unit is taken to be square feet
            already.

    Returns:
        The area in square feet (rounded to 2 decimals when converted), or
        ``None`` when `parse_price` cannot parse the value.
    """
    parsed = parse_price(value)
    if parsed is None:
        return None
    unit_text = (unit or "").lower()
    # The token list must not contain an empty string: "" is a substring of
    # every string, which would make the check always true and convert
    # square-foot and unit-less values as if they were square metres.
    metric_tokens = (
        "sqm", "sq m", "sq.m", "m2", "m²", "㎡",
        "meter", "metre", "м2", "м²", "метр",
    )
    if any(token in unit_text for token in metric_tokens):
        return round(parsed * 10.7639, 2)
    return parsed
def _extract_size_sqft(item: dict) -> float | None:
    """Locate the listing area and return it via `_area_to_sqft`.

    Two passes are made. First a fixed set of top-level keys is checked; a
    dict value is unwrapped into its unit and its value/amount/text. Then
    every dict yielded by `_walk(item)` whose name/label/key looks
    area-related is tried. The first candidate `_area_to_sqft` accepts wins.

    Args:
        item: Listing dict from the scraped payload.

    Returns:
        The area as returned by `_area_to_sqft`, or ``None`` if nothing parses.
    """
    top_level_keys = (
        "size", "area", "property_size", "built_up_area", "builtup_area", "plot_area",
    )
    for field in top_level_keys:
        raw = item.get(field)
        unit_label = None
        if isinstance(raw, dict):
            # Read the unit before `raw` is rebound to the inner value.
            unit_label = raw.get("unit") or raw.get("unit_label") or raw.get("unitLabel")
            raw = raw.get("value") or raw.get("amount") or raw.get("text")
        area = _area_to_sqft(raw, unit_label)
        if area is not None:
            return area

    name_markers = ("size", "area", "sqft", "sq ft", "площад")
    for entry in (n for n in _walk(item) if isinstance(n, dict)):
        label = str(entry.get("name") or entry.get("label") or entry.get("key") or "").lower()
        if all(marker not in label for marker in name_markers):
            continue
        raw = entry.get("value") or entry.get("amount") or entry.get("text")
        unit_label = str(entry.get("unit") or entry.get("unit_label") or "")
        area = _area_to_sqft(raw, unit_label)
        if area is not None:
            return area
    return None
def _location_candidate(node: dict) -> tuple[int, str] | None:
    """Turn a location-like dict into a ``(rank, name)`` pair.

    The rank is looked up in `_LOC_TYPE_PRIORITY` by the upper-cased "type"
    value; types missing from that table rank as -1.

    Returns:
        ``(rank, name)`` when the type has a non-negative rank and the
        stripped name is non-empty, otherwise ``None``.
    """
    type_key = str(node.get("type", "")).upper()
    rank = _LOC_TYPE_PRIORITY.get(type_key, -1)
    name = str(node.get("name") or "").strip()
    if rank >= 0 and name:
        return rank, name
    return None
def _extract_building_from(node) -> str | None:
    """Pick the highest-ranked location name found under ``node``.

    Every dict yielded by `_walk(node)` is offered to `_location_candidate`.
    The candidate with the strictly greatest rank is kept, so the earliest one
    wins a tie.

    Returns:
        The winning name, or ``None`` when no dict qualifies.
    """
    # Seed with rank -1 so any candidate accepted by the comparison beats it.
    best: tuple[int, str | None] = (-1, None)
    for entry in _walk(node):
        if not isinstance(entry, dict):
            continue
        found = _location_candidate(entry)
        if found and found[0] > best[0]:
            best = found
    return best[1]
def _extract_building(data: dict, item: dict) -> str | None:
    """Resolve the building name for a listing.

    The listing's own location-like fields are searched in a fixed order; the
    first one that yields a name wins. If none does, the whole ``data``
    payload is searched as a fallback.

    Args:
        data: Full page payload, used only as the fallback search root.
        item: The listing dict whose location fields are tried first.

    Returns:
        The building name, or ``None`` if nothing suitable is found.
    """
    location_keys = ("location", "location_tree", "locations", "locationTree", "community")
    for field in location_keys:
        subtree = item.get(field)
        if not subtree:
            continue
        found = _extract_building_from(subtree)
        if found:
            return found
    return _extract_building_from(data)
def _find_permit_on_page(data: dict) -> str | None:
"""The DLD permit number lives in a regulatory block rendered as an image,
but its plain value is still in __NEXT_DATA__: the dict that carries a
@@ -212,6 +316,9 @@ class PropertyFinderScraper:
permit_number=_find_permit_on_page(data) or _extract_permit(best),
agent_name=agent_name,
agency_name=agency_name,
building=_extract_building(data, best),
bedrooms=_extract_bedrooms(best),
size_sqft=_extract_size_sqft(best),
is_active=True,
)
@@ -336,6 +443,9 @@ class PropertyFinderScraper:
permit_number=_extract_permit(node),
agent_name=agent_name,
agency_name=agency_name,
building=_extract_building(data, node),
bedrooms=_extract_bedrooms(node),
size_sqft=_extract_size_sqft(node),
is_active=True,
)
)

View File

@@ -189,6 +189,43 @@ def add_competitor_url(db: Session, project: Project, url: str) -> tuple[Competi
return listing, ""
def parse_our_listing_url(url: str) -> dict:
    """Scrape our own listing page and return its project metadata.

    The Go API calls this before validating a project, so a user can paste
    just the listing URL and have price, permit, building, bedrooms and area
    filled in by the service.

    Args:
        url: Listing URL as entered by the user; ``None`` or blank is rejected.

    Returns:
        A dict with the keys ``title``, ``our_price``, ``dld_permit``,
        ``building``, ``bedrooms``, ``size_sqft`` and ``currency`` (the
        currency falls back to "AED" when the scrape has none).

    Raises:
        ValueError: With a user-facing message when the URL is empty, is from
            an unknown or disabled source, is not a single-listing URL, could
            not be fetched, or points at an inactive listing.
    """
    cleaned = (url or "").strip()
    if not cleaned:
        raise ValueError("URL пустой")

    source = detect_source_from_url(cleaned)
    if source is None:
        raise ValueError("URL должен быть с propertyfinder.ae или bayut.com")

    bayut_disabled = source == Source.BAYUT and not BAYUT_ENABLED
    if bayut_disabled:
        raise ValueError(
            "Bayut временно не поддерживается — площадка перешла на защищённый "
            "рендеринг. Используйте ссылку PropertyFinder."
        )

    if not _is_supported_listing_url(source, cleaned):
        raise ValueError("Укажите ссылку на конкретное объявление, а не на страницу поиска")

    listing = _scraper_for(source).fetch_listing(cleaned)
    if listing is None:
        raise ValueError("Не удалось загрузить страницу — сайт мог заблокировать запрос, попробуйте позже")
    if not listing.is_active:
        raise ValueError("Страница объявления вернула 404 — ссылка битая или объявление снято")

    metadata = {
        "title": listing.title,
        "our_price": listing.price,
        "dld_permit": listing.permit_number,
        "building": listing.building,
        "bedrooms": listing.bedrooms,
        "size_sqft": listing.size_sqft,
        "currency": listing.currency or "AED",
    }
    return metadata
def add_competitor_urls(db: Session, project: Project, urls: list[str]) -> dict:
"""Add several pasted/selected URLs in one go (used by the suggest page's
multi-select). Processes them sequentially — each one re-fetches the page —
@@ -421,6 +458,12 @@ def refresh_our_listing(db: Session, project: Project, *, now: datetime | None =
changed: list[str] = []
if scraped.permit_number and not project.dld_permit:
project.dld_permit = scraped.permit_number
if scraped.building and not project.building:
project.building = scraped.building
if scraped.bedrooms is not None and project.bedrooms is None:
project.bedrooms = scraped.bedrooms
if scraped.size_sqft is not None and project.size_sqft is None:
project.size_sqft = scraped.size_sqft
old_price = project.our_price
new_price = scraped.price

View File

@@ -17,6 +17,8 @@ from app.models import Project
from app.services.monitor import (
BAYUT_ENABLED,
add_competitor_url,
notify_project_changes,
parse_our_listing_url,
run_check_all,
run_check_for_project,
sync_permit_competitors,
@@ -133,6 +135,14 @@ def cmd_suggest(payload: dict[str, Any]) -> None:
db.close()
def cmd_parse_own_listing(payload: dict[str, Any]) -> None:
    """Handle the "parse-own-listing" command.

    Reads ``payload["url"]`` (a missing or falsy value becomes ""), passes it
    to `parse_our_listing_url` and emits the result through `_write`. A
    ``ValueError`` raised along the way is reported through `_fail` with its
    message.
    """
    listing_url = str(payload.get("url") or "")
    try:
        metadata = parse_our_listing_url(listing_url)
        # Kept inside the try so a ValueError from _write is reported too.
        _write(metadata)
    except ValueError as error:
        _fail(str(error))
def cmd_health(_: dict[str, Any]) -> None:
db = SessionLocal()
try:
@@ -149,6 +159,7 @@ COMMANDS = {
"check-project": cmd_check_project,
"check-all": cmd_check_all,
"suggest": cmd_suggest,
"parse-own-listing": cmd_parse_own_listing,
}

View File

@@ -356,6 +356,10 @@ func (a *App) CreateProject(ctx context.Context, ownerID int64, p ProjectPayload
}
p.Title = title
p.DealType = deal
p, err = a.enrichProjectPayloadFromURL(ctx, p)
if err != nil {
return nil, err
}
if err := validateProjectRequired(p); err != nil {
return nil, err
}
@@ -395,6 +399,10 @@ func (a *App) UpdateProject(ctx context.Context, ownerID, projectID int64, p Pro
p = mergeProjectPayload(current, p)
p.Title = title
p.DealType = deal
p, err = a.enrichProjectPayloadFromURL(ctx, p)
if err != nil {
return nil, err
}
if err := validateProjectRequired(p); err != nil {
return nil, err
}
@@ -437,6 +445,51 @@ func mergeProjectPayload(current *Project, p ProjectPayload) ProjectPayload {
return p
}
// enrichProjectPayloadFromURL asks the worker to parse the payload's own
// listing URL and merges the parsed metadata into the payload.
//
// The payload is returned untouched when it has no usable URL or when no
// worker is configured. A parse failure is returned as an error only while
// the payload still lacks one of the parsed fields; otherwise it is ignored
// and the payload is returned as is.
func (a *App) enrichProjectPayloadFromURL(ctx context.Context, p ProjectPayload) (ProjectPayload, error) {
	listingURL := cleanPtr(p.OurURL)
	if listingURL == nil || a.Worker == nil {
		return p, nil
	}

	parsed, err := a.Worker.ParseOwnListing(ctx, *listingURL)
	switch {
	case err == nil:
		return applyParsedOwnListing(p, parsed), nil
	case projectMissingParsedFields(p):
		return p, fmt.Errorf("parse our_url: %w", err)
	default:
		// Every parsed field is already present, so the failure is not fatal.
		return p, nil
	}
}
// projectMissingParsedFields reports whether the payload lacks at least one
// of the fields that parsing the own listing can supply: price, DLD permit,
// building, bedrooms or size.
func projectMissingParsedFields(p ProjectPayload) bool {
	switch {
	case p.OurPrice == nil:
		return true
	case cleanPtr(p.DLDPermit) == nil:
		return true
	case cleanPtr(p.Building) == nil:
		return true
	case p.Bedrooms == nil:
		return true
	case p.SizeSqft == nil:
		return true
	}
	return false
}
// applyParsedOwnListing copies usable values from a parsed listing onto the
// payload and returns the updated copy.
//
// A nil listing leaves the payload unchanged. Price and size are copied only
// when positive, permit and building only when cleanPtr keeps them, and
// bedrooms whenever a value is present (zero included). Copied values
// replace whatever the payload already held.
func applyParsedOwnListing(p ProjectPayload, parsed *ParsedOwnListing) ProjectPayload {
	if parsed == nil {
		return p
	}

	if price := parsed.OurPrice; price != nil && *price > 0 {
		p.OurPrice = price
	}

	permit := cleanPtr(parsed.DLDPermit)
	if permit != nil {
		p.DLDPermit = permit
	}

	building := cleanPtr(parsed.Building)
	if building != nil {
		p.Building = building
	}

	if beds := parsed.Bedrooms; beds != nil {
		p.Bedrooms = beds
	}

	if size := parsed.SizeSqft; size != nil && *size > 0 {
		p.SizeSqft = size
	}

	return p
}
func validateProjectRequired(p ProjectPayload) error {
if cleanString(p.Title) == "" {
return fmt.Errorf("title is required")

View File

@@ -65,3 +65,41 @@ func TestValidateProjectRequiredRejectsListingLikeURLWithoutID(t *testing.T) {
t.Fatalf("unexpected error: %v", err)
}
}
// TestApplyParsedOwnListingFillsProjectMetadata checks that a payload holding
// only title, deal type and listing URL passes validateProjectRequired once
// parsed listing metadata has been applied, and that every parsed field ends
// up on the payload.
func TestApplyParsedOwnListingFillsProjectMetadata(t *testing.T) {
	const (
		wantPrice    = 3500000
		wantPermit   = "7140504127"
		wantBuilding = "Harbour Gate Tower 2"
		wantBedrooms = 2
		wantSize     = 1081
	)

	base := ProjectPayload{
		Title:    "Full Park View",
		DealType: "sale",
		OurURL: strPtr(
			"https://www.propertyfinder.ae/en/plp/buy/apartment-for-sale-dubai-dubai-creek-harbour-the-lagoons-harbour-gate-harbour-gate-tower-2-86176216.html",
		),
	}
	listing := &ParsedOwnListing{
		OurPrice:  float64Ptr(wantPrice),
		DLDPermit: strPtr(wantPermit),
		Building:  strPtr(wantBuilding),
		Bedrooms:  int64Ptr(wantBedrooms),
		SizeSqft:  float64Ptr(wantSize),
	}

	got := applyParsedOwnListing(base, listing)

	// The enriched payload must satisfy the required-field validation.
	if err := validateProjectRequired(got); err != nil {
		t.Fatalf("validateProjectRequired() after parsed metadata returned error: %v", err)
	}

	if got.OurPrice == nil || *got.OurPrice != wantPrice {
		t.Fatalf("our_price was not applied: %#v", got.OurPrice)
	}
	if got.DLDPermit == nil || *got.DLDPermit != wantPermit {
		t.Fatalf("dld_permit was not applied: %#v", got.DLDPermit)
	}
	if got.Building == nil || *got.Building != wantBuilding {
		t.Fatalf("building was not applied: %#v", got.Building)
	}
	if got.Bedrooms == nil || *got.Bedrooms != wantBedrooms {
		t.Fatalf("bedrooms was not applied: %#v", got.Bedrooms)
	}
	if got.SizeSqft == nil || *got.SizeSqft != wantSize {
		t.Fatalf("size_sqft was not applied: %#v", got.SizeSqft)
	}
}

View File

@@ -51,6 +51,16 @@ type Suggestion struct {
IsActive bool `json:"is_active"`
}
// ParsedOwnListing is the result that Worker.ParseOwnListing decodes from the
// worker's "parse-own-listing" command.
//
// Every field is a pointer so that an absent value can be told apart from a
// zero value (presumably nil when the worker sends null or omits the key;
// confirm against the decoding done in Worker.call).
type ParsedOwnListing struct {
// Title of the listing; not read by applyParsedOwnListing.
Title *string `json:"title"`
// OurPrice is applied to a project payload only when it is greater than zero.
OurPrice *float64 `json:"our_price"`
// DLDPermit is applied only when cleanPtr returns a non-nil value for it.
DLDPermit *string `json:"dld_permit"`
// Building is applied only when cleanPtr returns a non-nil value for it.
Building *string `json:"building"`
// Bedrooms is applied whenever it is non-nil, so a zero count is kept.
Bedrooms *int64 `json:"bedrooms"`
// SizeSqft is applied to a project payload only when it is greater than zero.
SizeSqft *float64 `json:"size_sqft"`
// Currency of the price; not read by applyParsedOwnListing.
Currency *string `json:"currency"`
}
type SuggestionsResponse struct {
OurPermit *string `json:"our_permit"`
BayutEnabled bool `json:"bayut_enabled"`
@@ -108,6 +118,14 @@ func (w *Worker) Suggest(ctx context.Context, projectID int64) (*SuggestionsResp
return &out, nil
}
// ParseOwnListing runs the worker's "parse-own-listing" command for the given
// listing URL and returns the decoded metadata. It returns a nil result and
// the error from the call when the call fails.
func (w *Worker) ParseOwnListing(ctx context.Context, url string) (*ParsedOwnListing, error) {
	args := map[string]any{"url": url}
	result := new(ParsedOwnListing)
	err := w.call(ctx, "parse-own-listing", args, result)
	if err != nil {
		return nil, err
	}
	return result, nil
}
func (w *Worker) Health(ctx context.Context) error {
var out HealthResult
if err := w.call(ctx, "health", map[string]any{}, &out); err != nil {