diff --git a/app/scrapers/base.py b/app/scrapers/base.py index dab18b6..89e3f6c 100644 --- a/app/scrapers/base.py +++ b/app/scrapers/base.py @@ -35,6 +35,9 @@ class ScrapedListing: permit_number: str | None agent_name: str | None agency_name: str | None + building: str | None = None + bedrooms: int | None = None + size_sqft: float | None = None is_active: bool = True diff --git a/app/scrapers/propertyfinder.py b/app/scrapers/propertyfinder.py index f95f160..ffe2f02 100644 --- a/app/scrapers/propertyfinder.py +++ b/app/scrapers/propertyfinder.py @@ -100,6 +100,110 @@ def _extract_permit(item: dict) -> str | None: return None +def _parse_int(value) -> int | None: + if value is None: + return None + if isinstance(value, bool): + return None + if isinstance(value, (int, float)): + return int(value) + text = str(value).strip().lower() + if text in {"studio", "студия"}: + return 0 + m = re.search(r"\d+", text) + return int(m.group(0)) if m else None + + +def _extract_bedrooms(item: dict) -> int | None: + for key in ("bedrooms", "bedroom", "beds", "rooms", "bedroom_count", "bedrooms_count"): + value = item.get(key) + if isinstance(value, dict): + value = value.get("value") or value.get("count") or value.get("name") + parsed = _parse_int(value) + if parsed is not None: + return parsed + + for node in _walk(item): + if not isinstance(node, dict): + continue + name = str(node.get("name") or node.get("label") or node.get("key") or "").lower() + if "bed" not in name and "спал" not in name: + continue + parsed = _parse_int(node.get("value") or node.get("count") or node.get("text")) + if parsed is not None: + return parsed + return None + + +def _area_to_sqft(value, unit: str | None = None) -> float | None: + parsed = parse_price(value) + if parsed is None: + return None + unit_text = (unit or "").lower() + if any(token in unit_text for token in ("sqm", "sq m", "m2", "m²", "метр")): + return round(parsed * 10.7639, 2) + return parsed + + +def _extract_size_sqft(item: dict) -> 
float | None: + for key in ("size", "area", "property_size", "built_up_area", "builtup_area", "plot_area"): + value = item.get(key) + unit = None + if isinstance(value, dict): + unit = value.get("unit") or value.get("unit_label") or value.get("unitLabel") + value = value.get("value") or value.get("amount") or value.get("text") + parsed = _area_to_sqft(value, unit) + if parsed is not None: + return parsed + + for node in _walk(item): + if not isinstance(node, dict): + continue + name = str(node.get("name") or node.get("label") or node.get("key") or "").lower() + if not any(token in name for token in ("size", "area", "sqft", "sq ft", "площад")): + continue + parsed = _area_to_sqft( + node.get("value") or node.get("amount") or node.get("text"), + str(node.get("unit") or node.get("unit_label") or ""), + ) + if parsed is not None: + return parsed + return None + + +def _location_candidate(node: dict) -> tuple[int, str] | None: + rank = _LOC_TYPE_PRIORITY.get(str(node.get("type", "")).upper(), -1) + name = str(node.get("name") or "").strip() + if rank < 0 or not name: + return None + return rank, name + + +def _extract_building_from(node) -> str | None: + best_name: str | None = None + best_rank = -1 + for item in _walk(node): + if not isinstance(item, dict): + continue + candidate = _location_candidate(item) + if not candidate: + continue + rank, name = candidate + if rank > best_rank: + best_rank, best_name = rank, name + return best_name + + +def _extract_building(data: dict, item: dict) -> str | None: + for key in ("location", "location_tree", "locations", "locationTree", "community"): + value = item.get(key) + if value: + building = _extract_building_from(value) + if building: + return building + return _extract_building_from(data) + + def _find_permit_on_page(data: dict) -> str | None: """The DLD permit number lives in a regulatory block rendered as an image, but its plain value is still in __NEXT_DATA__: the dict that carries a @@ -212,6 +316,9 @@ class 
def parse_our_listing_url(url: str) -> dict:
    """Scrape one of our own listings and return its project metadata.

    The Go API calls this before validating a project, so a user can paste
    just the listing URL and have price, permit, building, bedrooms and
    area filled in. PropertyFinder URLs are accepted; Bayut URLs only while
    ``BAYUT_ENABLED`` is true.

    Raises:
        ValueError: with a user-facing message when the URL is blank, from
            an unknown site, a disabled Bayut link, not a single-listing
            page, or when the page could not be fetched or is inactive.
    """
    cleaned = (url or "").strip()
    if not cleaned:
        raise ValueError("URL пустой")

    source = detect_source_from_url(cleaned)
    if source is None:
        raise ValueError("URL должен быть с propertyfinder.ae или bayut.com")

    bayut_disabled = source == Source.BAYUT and not BAYUT_ENABLED
    if bayut_disabled:
        raise ValueError(
            "Bayut временно не поддерживается — площадка перешла на защищённый "
            "рендеринг. Используйте ссылку PropertyFinder."
        )

    if not _is_supported_listing_url(source, cleaned):
        raise ValueError("Укажите ссылку на конкретное объявление, а не на страницу поиска")

    listing = _scraper_for(source).fetch_listing(cleaned)
    if listing is None:
        raise ValueError("Не удалось загрузить страницу — сайт мог заблокировать запрос, попробуйте позже")
    if not listing.is_active:
        raise ValueError("Страница объявления вернула 404 — ссылка битая или объявление снято")

    # Keys mirror the JSON field names the Go side decodes.
    return {
        "title": listing.title,
        "our_price": listing.price,
        "dld_permit": listing.permit_number,
        "building": listing.building,
        "bedrooms": listing.bedrooms,
        "size_sqft": listing.size_sqft,
        "currency": listing.currency or "AED",
    }


def cmd_parse_own_listing(payload: dict[str, Any]) -> None:
    """Worker command: parse the listing at ``payload["url"]``.

    The parsed metadata is passed to ``_write``; a ``ValueError`` raised
    while parsing or writing is reported through ``_fail`` with its message.
    """
    listing_url = str(payload.get("url") or "")
    try:
        metadata = parse_our_listing_url(listing_url)
        _write(metadata)
    except ValueError as exc:
        _fail(str(exc))
str(payload.get("url") or "") + try: + _write(parse_our_listing_url(url)) + except ValueError as exc: + _fail(str(exc)) + + def cmd_health(_: dict[str, Any]) -> None: db = SessionLocal() try: @@ -149,6 +159,7 @@ COMMANDS = { "check-project": cmd_check_project, "check-all": cmd_check_all, "suggest": cmd_suggest, + "parse-own-listing": cmd_parse_own_listing, } diff --git a/internal/pf/store.go b/internal/pf/store.go index 5dcbc37..2b06816 100644 --- a/internal/pf/store.go +++ b/internal/pf/store.go @@ -356,6 +356,10 @@ func (a *App) CreateProject(ctx context.Context, ownerID int64, p ProjectPayload } p.Title = title p.DealType = deal + p, err = a.enrichProjectPayloadFromURL(ctx, p) + if err != nil { + return nil, err + } if err := validateProjectRequired(p); err != nil { return nil, err } @@ -395,6 +399,10 @@ func (a *App) UpdateProject(ctx context.Context, ownerID, projectID int64, p Pro p = mergeProjectPayload(current, p) p.Title = title p.DealType = deal + p, err = a.enrichProjectPayloadFromURL(ctx, p) + if err != nil { + return nil, err + } if err := validateProjectRequired(p); err != nil { return nil, err } @@ -437,6 +445,51 @@ func mergeProjectPayload(current *Project, p ProjectPayload) ProjectPayload { return p } +func (a *App) enrichProjectPayloadFromURL(ctx context.Context, p ProjectPayload) (ProjectPayload, error) { + url := cleanPtr(p.OurURL) + if url == nil || a.Worker == nil { + return p, nil + } + parsed, err := a.Worker.ParseOwnListing(ctx, *url) + if err != nil { + if projectMissingParsedFields(p) { + return p, fmt.Errorf("parse our_url: %w", err) + } + return p, nil + } + return applyParsedOwnListing(p, parsed), nil +} + +func projectMissingParsedFields(p ProjectPayload) bool { + return p.OurPrice == nil || + cleanPtr(p.DLDPermit) == nil || + cleanPtr(p.Building) == nil || + p.Bedrooms == nil || + p.SizeSqft == nil +} + +func applyParsedOwnListing(p ProjectPayload, parsed *ParsedOwnListing) ProjectPayload { + if parsed == nil { + return p + } + if 
parsed.OurPrice != nil && *parsed.OurPrice > 0 { + p.OurPrice = parsed.OurPrice + } + if permit := cleanPtr(parsed.DLDPermit); permit != nil { + p.DLDPermit = permit + } + if building := cleanPtr(parsed.Building); building != nil { + p.Building = building + } + if parsed.Bedrooms != nil { + p.Bedrooms = parsed.Bedrooms + } + if parsed.SizeSqft != nil && *parsed.SizeSqft > 0 { + p.SizeSqft = parsed.SizeSqft + } + return p +} + func validateProjectRequired(p ProjectPayload) error { if cleanString(p.Title) == "" { return fmt.Errorf("title is required") diff --git a/internal/pf/store_test.go b/internal/pf/store_test.go index 036bf9c..f16060c 100644 --- a/internal/pf/store_test.go +++ b/internal/pf/store_test.go @@ -65,3 +65,41 @@ func TestValidateProjectRequiredRejectsListingLikeURLWithoutID(t *testing.T) { t.Fatalf("unexpected error: %v", err) } } + +func TestApplyParsedOwnListingFillsProjectMetadata(t *testing.T) { + payload := ProjectPayload{ + Title: "Full Park View", + DealType: "sale", + OurURL: strPtr( + "https://www.propertyfinder.ae/en/plp/buy/apartment-for-sale-dubai-dubai-creek-harbour-the-lagoons-harbour-gate-harbour-gate-tower-2-86176216.html", + ), + } + parsed := &ParsedOwnListing{ + OurPrice: float64Ptr(3500000), + DLDPermit: strPtr("7140504127"), + Building: strPtr("Harbour Gate Tower 2"), + Bedrooms: int64Ptr(2), + SizeSqft: float64Ptr(1081), + } + + payload = applyParsedOwnListing(payload, parsed) + + if err := validateProjectRequired(payload); err != nil { + t.Fatalf("validateProjectRequired() after parsed metadata returned error: %v", err) + } + if payload.OurPrice == nil || *payload.OurPrice != 3500000 { + t.Fatalf("our_price was not applied: %#v", payload.OurPrice) + } + if payload.DLDPermit == nil || *payload.DLDPermit != "7140504127" { + t.Fatalf("dld_permit was not applied: %#v", payload.DLDPermit) + } + if payload.Building == nil || *payload.Building != "Harbour Gate Tower 2" { + t.Fatalf("building was not applied: %#v", payload.Building) + } 
+ if payload.Bedrooms == nil || *payload.Bedrooms != 2 { + t.Fatalf("bedrooms was not applied: %#v", payload.Bedrooms) + } + if payload.SizeSqft == nil || *payload.SizeSqft != 1081 { + t.Fatalf("size_sqft was not applied: %#v", payload.SizeSqft) + } +} diff --git a/internal/pf/worker.go b/internal/pf/worker.go index 752ee9b..48646d0 100644 --- a/internal/pf/worker.go +++ b/internal/pf/worker.go @@ -51,6 +51,16 @@ type Suggestion struct { IsActive bool `json:"is_active"` } +type ParsedOwnListing struct { + Title *string `json:"title"` + OurPrice *float64 `json:"our_price"` + DLDPermit *string `json:"dld_permit"` + Building *string `json:"building"` + Bedrooms *int64 `json:"bedrooms"` + SizeSqft *float64 `json:"size_sqft"` + Currency *string `json:"currency"` +} + type SuggestionsResponse struct { OurPermit *string `json:"our_permit"` BayutEnabled bool `json:"bayut_enabled"` @@ -108,6 +118,14 @@ func (w *Worker) Suggest(ctx context.Context, projectID int64) (*SuggestionsResp return &out, nil } +func (w *Worker) ParseOwnListing(ctx context.Context, url string) (*ParsedOwnListing, error) { + var out ParsedOwnListing + if err := w.call(ctx, "parse-own-listing", map[string]any{"url": url}, &out); err != nil { + return nil, err + } + return &out, nil +} + func (w *Worker) Health(ctx context.Context) error { var out HealthResult if err := w.call(ctx, "health", map[string]any{}, &out); err != nil {