feat: parse project metadata from PF links
This commit is contained in:
@@ -35,6 +35,9 @@ class ScrapedListing:
|
|||||||
permit_number: str | None
|
permit_number: str | None
|
||||||
agent_name: str | None
|
agent_name: str | None
|
||||||
agency_name: str | None
|
agency_name: str | None
|
||||||
|
building: str | None = None
|
||||||
|
bedrooms: int | None = None
|
||||||
|
size_sqft: float | None = None
|
||||||
is_active: bool = True
|
is_active: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -100,6 +100,110 @@ def _extract_permit(item: dict) -> str | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_int(value) -> int | None:
|
||||||
|
if value is None:
|
||||||
|
return None
|
||||||
|
if isinstance(value, bool):
|
||||||
|
return None
|
||||||
|
if isinstance(value, (int, float)):
|
||||||
|
return int(value)
|
||||||
|
text = str(value).strip().lower()
|
||||||
|
if text in {"studio", "студия"}:
|
||||||
|
return 0
|
||||||
|
m = re.search(r"\d+", text)
|
||||||
|
return int(m.group(0)) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_bedrooms(item: dict) -> int | None:
    """Find the bedroom count of a listing payload.

    Well-known top-level keys are tried first. If none of them yields a
    number, every dict produced by ``_walk(item)`` whose name/label/key
    mentions "bed" or "спал" is inspected.

    Args:
        item: Listing payload dict.

    Returns:
        The bedroom count (``0`` is a valid result, e.g. a studio), or
        ``None`` when nothing parseable is found.
    """

    def _first_int(mapping: dict, keys: tuple[str, ...]) -> int | None:
        # Parse each key in order and keep the first one that parses. An
        # ``a or b or c`` chain would discard a legitimate 0 and fall
        # through to an unrelated key.
        for key in keys:
            parsed = _parse_int(mapping.get(key))
            if parsed is not None:
                return parsed
        return None

    for key in ("bedrooms", "bedroom", "beds", "rooms", "bedroom_count", "bedrooms_count"):
        value = item.get(key)
        if isinstance(value, dict):
            parsed = _first_int(value, ("value", "count", "name"))
        else:
            parsed = _parse_int(value)
        if parsed is not None:
            return parsed

    # Fallback: look for a labelled attribute node anywhere in the payload.
    for node in _walk(item):
        if not isinstance(node, dict):
            continue
        name = str(node.get("name") or node.get("label") or node.get("key") or "").lower()
        if "bed" not in name and "спал" not in name:
            continue
        parsed = _first_int(node, ("value", "count", "text"))
        if parsed is not None:
            return parsed
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _area_to_sqft(value, unit: str | None = None) -> float | None:
    """Convert a scraped area value to square feet.

    Args:
        value: Raw area value; it is handed to ``parse_price`` for numeric
            parsing.
        unit: Optional unit label. Labels containing a square-metre marker
            ("sqm", "sq m", "m2", "m²", "метр") trigger conversion.

    Returns:
        ``None`` when ``parse_price`` returns ``None``. Otherwise the parsed
        number, multiplied by 10.7639 (sq ft per sq m) and rounded to two
        decimals when the unit is metric, or returned unchanged for any
        other unit.
    """
    parsed = parse_price(value)
    if parsed is None:
        return None
    # Callers pass raw JSON values here; str() keeps a non-string unit
    # (e.g. a nested object) from raising AttributeError on .lower().
    unit_text = str(unit or "").lower()
    if any(token in unit_text for token in ("sqm", "sq m", "m2", "m²", "метр")):
        return round(parsed * 10.7639, 2)
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_size_sqft(item: dict) -> float | None:
    """Find the listing area and return it in square feet.

    Well-known top-level keys are tried first. If none of them yields a
    value, every dict produced by ``_walk(item)`` whose name/label/key
    mentions size/area is inspected. Unit handling is delegated to
    ``_area_to_sqft``.

    Args:
        item: Listing payload dict.

    Returns:
        The first non-``None`` result of ``_area_to_sqft``, or ``None``.
    """

    def _unit_of(mapping: dict) -> str:
        # One unit lookup for both strategies so they accept the same keys
        # (including the camelCase "unitLabel") and always pass a string.
        return str(
            mapping.get("unit") or mapping.get("unit_label") or mapping.get("unitLabel") or ""
        )

    for key in ("size", "area", "property_size", "built_up_area", "builtup_area", "plot_area"):
        value = item.get(key)
        unit = None
        if isinstance(value, dict):
            unit = _unit_of(value)
            value = value.get("value") or value.get("amount") or value.get("text")
        parsed = _area_to_sqft(value, unit)
        if parsed is not None:
            return parsed

    # Fallback: look for a labelled attribute node anywhere in the payload.
    for node in _walk(item):
        if not isinstance(node, dict):
            continue
        name = str(node.get("name") or node.get("label") or node.get("key") or "").lower()
        if not any(token in name for token in ("size", "area", "sqft", "sq ft", "площад")):
            continue
        parsed = _area_to_sqft(
            node.get("value") or node.get("amount") or node.get("text"),
            _unit_of(node),
        )
        if parsed is not None:
            return parsed
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _location_candidate(node: dict) -> tuple[int, str] | None:
    """Return ``(rank, name)`` for a location node, or ``None`` if unusable.

    The rank is looked up in ``_LOC_TYPE_PRIORITY`` by the upper-cased
    ``type`` of the node. Nodes with an unknown type or a blank name are
    rejected.
    """
    label = str(node.get("name") or "").strip()
    if not label:
        return None
    type_key = str(node.get("type", "")).upper()
    priority = _LOC_TYPE_PRIORITY.get(type_key, -1)
    if priority < 0:
        return None
    return priority, label
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_building_from(node) -> str | None:
    """Return the name of the highest-ranked location found under ``node``.

    Every dict produced by ``_walk(node)`` is scored with
    ``_location_candidate``. When several candidates share the top rank, the
    one encountered first wins. Returns ``None`` when there is no candidate.
    """
    scored = [
        candidate
        for candidate in (
            _location_candidate(entry)
            for entry in _walk(node)
            if isinstance(entry, dict)
        )
        if candidate
    ]
    if not scored:
        return None
    # max() returns the first maximal element, matching a strict ">" scan.
    _, winner = max(scored, key=lambda pair: pair[0])
    return winner
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_building(data: dict, item: dict) -> str | None:
    """Resolve the building name for a listing.

    The listing's own location sub-trees are searched first, in a fixed key
    order. If none of them yields a name, the whole ``data`` payload is
    searched instead.

    NOTE(review): when ``data`` holds more than one listing, the fallback
    may pick a location that belongs to a different listing — confirm
    against the callers.
    """
    location_keys = ("location", "location_tree", "locations", "locationTree", "community")
    for subtree in filter(None, map(item.get, location_keys)):
        found = _extract_building_from(subtree)
        if found:
            return found
    return _extract_building_from(data)
|
||||||
|
|
||||||
|
|
||||||
def _find_permit_on_page(data: dict) -> str | None:
|
def _find_permit_on_page(data: dict) -> str | None:
|
||||||
"""The DLD permit number lives in a regulatory block rendered as an image,
|
"""The DLD permit number lives in a regulatory block rendered as an image,
|
||||||
but its plain value is still in __NEXT_DATA__: the dict that carries a
|
but its plain value is still in __NEXT_DATA__: the dict that carries a
|
||||||
@@ -212,6 +316,9 @@ class PropertyFinderScraper:
|
|||||||
permit_number=_find_permit_on_page(data) or _extract_permit(best),
|
permit_number=_find_permit_on_page(data) or _extract_permit(best),
|
||||||
agent_name=agent_name,
|
agent_name=agent_name,
|
||||||
agency_name=agency_name,
|
agency_name=agency_name,
|
||||||
|
building=_extract_building(data, best),
|
||||||
|
bedrooms=_extract_bedrooms(best),
|
||||||
|
size_sqft=_extract_size_sqft(best),
|
||||||
is_active=True,
|
is_active=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -336,6 +443,9 @@ class PropertyFinderScraper:
|
|||||||
permit_number=_extract_permit(node),
|
permit_number=_extract_permit(node),
|
||||||
agent_name=agent_name,
|
agent_name=agent_name,
|
||||||
agency_name=agency_name,
|
agency_name=agency_name,
|
||||||
|
building=_extract_building(data, node),
|
||||||
|
bedrooms=_extract_bedrooms(node),
|
||||||
|
size_sqft=_extract_size_sqft(node),
|
||||||
is_active=True,
|
is_active=True,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -189,6 +189,43 @@ def add_competitor_url(db: Session, project: Project, url: str) -> tuple[Competi
|
|||||||
return listing, ""
|
return listing, ""
|
||||||
|
|
||||||
|
|
||||||
|
def parse_our_listing_url(url: str) -> dict:
    """Scrape our own listing page and return the metadata used for a project.

    Called by the Go API before project validation, so a user can paste just
    the listing URL and have price/permit/building/area filled in.

    Args:
        url: Listing URL; surrounding whitespace is ignored.

    Returns:
        A dict with the keys ``title``, ``our_price``, ``dld_permit``,
        ``building``, ``bedrooms``, ``size_sqft`` and ``currency``
        (``"AED"`` when the scraper reports no currency).

    Raises:
        ValueError: If the URL is empty, comes from an unrecognised source,
            points at Bayut while Bayut is disabled, is not a single-listing
            URL, could not be fetched, or the listing is inactive.
    """
    cleaned = (url or "").strip()
    if not cleaned:
        raise ValueError("URL пустой")

    source = detect_source_from_url(cleaned)
    if source is None:
        raise ValueError("URL должен быть с propertyfinder.ae или bayut.com")

    bayut_blocked = source == Source.BAYUT and not BAYUT_ENABLED
    if bayut_blocked:
        raise ValueError(
            "Bayut временно не поддерживается — площадка перешла на защищённый "
            "рендеринг. Используйте ссылку PropertyFinder."
        )

    if not _is_supported_listing_url(source, cleaned):
        raise ValueError("Укажите ссылку на конкретное объявление, а не на страницу поиска")

    listing = _scraper_for(source).fetch_listing(cleaned)
    if listing is None:
        raise ValueError("Не удалось загрузить страницу — сайт мог заблокировать запрос, попробуйте позже")
    if not listing.is_active:
        raise ValueError("Страница объявления вернула 404 — ссылка битая или объявление снято")

    return dict(
        title=listing.title,
        our_price=listing.price,
        dld_permit=listing.permit_number,
        building=listing.building,
        bedrooms=listing.bedrooms,
        size_sqft=listing.size_sqft,
        currency=listing.currency or "AED",
    )
|
||||||
|
|
||||||
|
|
||||||
def add_competitor_urls(db: Session, project: Project, urls: list[str]) -> dict:
|
def add_competitor_urls(db: Session, project: Project, urls: list[str]) -> dict:
|
||||||
"""Add several pasted/selected URLs in one go (used by the suggest page's
|
"""Add several pasted/selected URLs in one go (used by the suggest page's
|
||||||
multi-select). Processes them sequentially — each one re-fetches the page —
|
multi-select). Processes them sequentially — each one re-fetches the page —
|
||||||
@@ -421,6 +458,12 @@ def refresh_our_listing(db: Session, project: Project, *, now: datetime | None =
|
|||||||
changed: list[str] = []
|
changed: list[str] = []
|
||||||
if scraped.permit_number and not project.dld_permit:
|
if scraped.permit_number and not project.dld_permit:
|
||||||
project.dld_permit = scraped.permit_number
|
project.dld_permit = scraped.permit_number
|
||||||
|
if scraped.building and not project.building:
|
||||||
|
project.building = scraped.building
|
||||||
|
if scraped.bedrooms is not None and project.bedrooms is None:
|
||||||
|
project.bedrooms = scraped.bedrooms
|
||||||
|
if scraped.size_sqft is not None and project.size_sqft is None:
|
||||||
|
project.size_sqft = scraped.size_sqft
|
||||||
|
|
||||||
old_price = project.our_price
|
old_price = project.our_price
|
||||||
new_price = scraped.price
|
new_price = scraped.price
|
||||||
|
|||||||
@@ -17,6 +17,8 @@ from app.models import Project
|
|||||||
from app.services.monitor import (
|
from app.services.monitor import (
|
||||||
BAYUT_ENABLED,
|
BAYUT_ENABLED,
|
||||||
add_competitor_url,
|
add_competitor_url,
|
||||||
|
notify_project_changes,
|
||||||
|
parse_our_listing_url,
|
||||||
run_check_all,
|
run_check_all,
|
||||||
run_check_for_project,
|
run_check_for_project,
|
||||||
sync_permit_competitors,
|
sync_permit_competitors,
|
||||||
@@ -133,6 +135,14 @@ def cmd_suggest(payload: dict[str, Any]) -> None:
|
|||||||
db.close()
|
db.close()
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_parse_own_listing(payload: dict[str, Any]) -> None:
    """Handle the ``parse-own-listing`` command.

    Reads ``url`` from the payload (a missing or falsy value becomes ``""``),
    parses the listing and writes the result. Any ``ValueError`` is reported
    through ``_fail`` with its message.
    """
    listing_url = str(payload.get("url") or "")
    try:
        metadata = parse_our_listing_url(listing_url)
        _write(metadata)
    except ValueError as exc:
        _fail(str(exc))
|
||||||
|
|
||||||
|
|
||||||
def cmd_health(_: dict[str, Any]) -> None:
|
def cmd_health(_: dict[str, Any]) -> None:
|
||||||
db = SessionLocal()
|
db = SessionLocal()
|
||||||
try:
|
try:
|
||||||
@@ -149,6 +159,7 @@ COMMANDS = {
|
|||||||
"check-project": cmd_check_project,
|
"check-project": cmd_check_project,
|
||||||
"check-all": cmd_check_all,
|
"check-all": cmd_check_all,
|
||||||
"suggest": cmd_suggest,
|
"suggest": cmd_suggest,
|
||||||
|
"parse-own-listing": cmd_parse_own_listing,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -356,6 +356,10 @@ func (a *App) CreateProject(ctx context.Context, ownerID int64, p ProjectPayload
|
|||||||
}
|
}
|
||||||
p.Title = title
|
p.Title = title
|
||||||
p.DealType = deal
|
p.DealType = deal
|
||||||
|
p, err = a.enrichProjectPayloadFromURL(ctx, p)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
if err := validateProjectRequired(p); err != nil {
|
if err := validateProjectRequired(p); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -395,6 +399,10 @@ func (a *App) UpdateProject(ctx context.Context, ownerID, projectID int64, p Pro
|
|||||||
p = mergeProjectPayload(current, p)
|
p = mergeProjectPayload(current, p)
|
||||||
p.Title = title
|
p.Title = title
|
||||||
p.DealType = deal
|
p.DealType = deal
|
||||||
|
p, err = a.enrichProjectPayloadFromURL(ctx, p)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
if err := validateProjectRequired(p); err != nil {
|
if err := validateProjectRequired(p); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -437,6 +445,51 @@ func mergeProjectPayload(current *Project, p ProjectPayload) ProjectPayload {
|
|||||||
return p
|
return p
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) enrichProjectPayloadFromURL(ctx context.Context, p ProjectPayload) (ProjectPayload, error) {
|
||||||
|
url := cleanPtr(p.OurURL)
|
||||||
|
if url == nil || a.Worker == nil {
|
||||||
|
return p, nil
|
||||||
|
}
|
||||||
|
parsed, err := a.Worker.ParseOwnListing(ctx, *url)
|
||||||
|
if err != nil {
|
||||||
|
if projectMissingParsedFields(p) {
|
||||||
|
return p, fmt.Errorf("parse our_url: %w", err)
|
||||||
|
}
|
||||||
|
return p, nil
|
||||||
|
}
|
||||||
|
return applyParsedOwnListing(p, parsed), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func projectMissingParsedFields(p ProjectPayload) bool {
|
||||||
|
return p.OurPrice == nil ||
|
||||||
|
cleanPtr(p.DLDPermit) == nil ||
|
||||||
|
cleanPtr(p.Building) == nil ||
|
||||||
|
p.Bedrooms == nil ||
|
||||||
|
p.SizeSqft == nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func applyParsedOwnListing(p ProjectPayload, parsed *ParsedOwnListing) ProjectPayload {
|
||||||
|
if parsed == nil {
|
||||||
|
return p
|
||||||
|
}
|
||||||
|
if parsed.OurPrice != nil && *parsed.OurPrice > 0 {
|
||||||
|
p.OurPrice = parsed.OurPrice
|
||||||
|
}
|
||||||
|
if permit := cleanPtr(parsed.DLDPermit); permit != nil {
|
||||||
|
p.DLDPermit = permit
|
||||||
|
}
|
||||||
|
if building := cleanPtr(parsed.Building); building != nil {
|
||||||
|
p.Building = building
|
||||||
|
}
|
||||||
|
if parsed.Bedrooms != nil {
|
||||||
|
p.Bedrooms = parsed.Bedrooms
|
||||||
|
}
|
||||||
|
if parsed.SizeSqft != nil && *parsed.SizeSqft > 0 {
|
||||||
|
p.SizeSqft = parsed.SizeSqft
|
||||||
|
}
|
||||||
|
return p
|
||||||
|
}
|
||||||
|
|
||||||
func validateProjectRequired(p ProjectPayload) error {
|
func validateProjectRequired(p ProjectPayload) error {
|
||||||
if cleanString(p.Title) == "" {
|
if cleanString(p.Title) == "" {
|
||||||
return fmt.Errorf("title is required")
|
return fmt.Errorf("title is required")
|
||||||
|
|||||||
@@ -65,3 +65,41 @@ func TestValidateProjectRequiredRejectsListingLikeURLWithoutID(t *testing.T) {
|
|||||||
t.Fatalf("unexpected error: %v", err)
|
t.Fatalf("unexpected error: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestApplyParsedOwnListingFillsProjectMetadata(t *testing.T) {
|
||||||
|
payload := ProjectPayload{
|
||||||
|
Title: "Full Park View",
|
||||||
|
DealType: "sale",
|
||||||
|
OurURL: strPtr(
|
||||||
|
"https://www.propertyfinder.ae/en/plp/buy/apartment-for-sale-dubai-dubai-creek-harbour-the-lagoons-harbour-gate-harbour-gate-tower-2-86176216.html",
|
||||||
|
),
|
||||||
|
}
|
||||||
|
parsed := &ParsedOwnListing{
|
||||||
|
OurPrice: float64Ptr(3500000),
|
||||||
|
DLDPermit: strPtr("7140504127"),
|
||||||
|
Building: strPtr("Harbour Gate Tower 2"),
|
||||||
|
Bedrooms: int64Ptr(2),
|
||||||
|
SizeSqft: float64Ptr(1081),
|
||||||
|
}
|
||||||
|
|
||||||
|
payload = applyParsedOwnListing(payload, parsed)
|
||||||
|
|
||||||
|
if err := validateProjectRequired(payload); err != nil {
|
||||||
|
t.Fatalf("validateProjectRequired() after parsed metadata returned error: %v", err)
|
||||||
|
}
|
||||||
|
if payload.OurPrice == nil || *payload.OurPrice != 3500000 {
|
||||||
|
t.Fatalf("our_price was not applied: %#v", payload.OurPrice)
|
||||||
|
}
|
||||||
|
if payload.DLDPermit == nil || *payload.DLDPermit != "7140504127" {
|
||||||
|
t.Fatalf("dld_permit was not applied: %#v", payload.DLDPermit)
|
||||||
|
}
|
||||||
|
if payload.Building == nil || *payload.Building != "Harbour Gate Tower 2" {
|
||||||
|
t.Fatalf("building was not applied: %#v", payload.Building)
|
||||||
|
}
|
||||||
|
if payload.Bedrooms == nil || *payload.Bedrooms != 2 {
|
||||||
|
t.Fatalf("bedrooms was not applied: %#v", payload.Bedrooms)
|
||||||
|
}
|
||||||
|
if payload.SizeSqft == nil || *payload.SizeSqft != 1081 {
|
||||||
|
t.Fatalf("size_sqft was not applied: %#v", payload.SizeSqft)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -51,6 +51,16 @@ type Suggestion struct {
|
|||||||
IsActive bool `json:"is_active"`
|
IsActive bool `json:"is_active"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ParsedOwnListing is the JSON result of the worker's "parse-own-listing"
// command. Every field is a pointer so that a null or missing JSON value
// decodes to nil instead of a zero value.
type ParsedOwnListing struct {
	Title     *string  `json:"title"`      // listing title
	OurPrice  *float64 `json:"our_price"`  // listing price
	DLDPermit *string  `json:"dld_permit"` // DLD permit number
	Building  *string  `json:"building"`   // building name
	Bedrooms  *int64   `json:"bedrooms"`   // bedroom count
	SizeSqft  *float64 `json:"size_sqft"`  // area in square feet
	Currency  *string  `json:"currency"`   // price currency code
}
|
||||||
|
|
||||||
type SuggestionsResponse struct {
|
type SuggestionsResponse struct {
|
||||||
OurPermit *string `json:"our_permit"`
|
OurPermit *string `json:"our_permit"`
|
||||||
BayutEnabled bool `json:"bayut_enabled"`
|
BayutEnabled bool `json:"bayut_enabled"`
|
||||||
@@ -108,6 +118,14 @@ func (w *Worker) Suggest(ctx context.Context, projectID int64) (*SuggestionsResp
|
|||||||
return &out, nil
|
return &out, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (w *Worker) ParseOwnListing(ctx context.Context, url string) (*ParsedOwnListing, error) {
|
||||||
|
var out ParsedOwnListing
|
||||||
|
if err := w.call(ctx, "parse-own-listing", map[string]any{"url": url}, &out); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &out, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (w *Worker) Health(ctx context.Context) error {
|
func (w *Worker) Health(ctx context.Context) error {
|
||||||
var out HealthResult
|
var out HealthResult
|
||||||
if err := w.call(ctx, "health", map[string]any{}, &out); err != nil {
|
if err := w.call(ctx, "health", map[string]any{}, &out); err != nil {
|
||||||
|
|||||||
Reference in New Issue
Block a user