feat: parse project metadata from PF links
All checks were successful
CI / hygiene (push) Successful in 2s
Build and Deploy / build-and-deploy (push) Successful in 38s
CI / go (push) Successful in 26s
CI / python (push) Successful in 16s

This commit is contained in:
Grendgi
2026-06-24 14:03:25 +03:00
parent 31c498af39
commit 6750722429
7 changed files with 276 additions and 0 deletions

View File

@@ -100,6 +100,110 @@ def _extract_permit(item: dict) -> str | None:
return None
def _parse_int(value) -> int | None:
if value is None:
return None
if isinstance(value, bool):
return None
if isinstance(value, (int, float)):
return int(value)
text = str(value).strip().lower()
if text in {"studio", "студия"}:
return 0
m = re.search(r"\d+", text)
return int(m.group(0)) if m else None
def _first_parsed_int(mapping: dict, keys: tuple[str, ...]) -> int | None:
    """Return the first value under ``keys`` (in order) that ``_parse_int`` accepts."""
    for key in keys:
        parsed = _parse_int(mapping.get(key))
        if parsed is not None:
            return parsed
    return None


def _extract_bedrooms(item: dict) -> int | None:
    """Find the bedroom count of a listing.

    Looks first at a fixed set of top-level keys on ``item``; a dict value is
    unwrapped through its "value" / "count" / "name" entries. If none of those
    parse, every dict produced by ``_walk(item)`` (defined elsewhere in this
    module) whose name/label/key mentions "bed" or "спал" is inspected.

    Args:
        item: The listing dict.

    Returns:
        The bedroom count (``0`` for a studio), or ``None`` if nothing parsed.
    """
    for key in ("bedrooms", "bedroom", "beds", "rooms", "bedroom_count", "bedrooms_count"):
        value = item.get(key)
        if isinstance(value, dict):
            # Parse each candidate separately instead of chaining with `or`:
            # a numeric 0 (studio) is falsy and would otherwise be skipped.
            parsed = _first_parsed_int(value, ("value", "count", "name"))
        else:
            parsed = _parse_int(value)
        if parsed is not None:
            return parsed
    for node in _walk(item):
        if not isinstance(node, dict):
            continue
        name = str(node.get("name") or node.get("label") or node.get("key") or "").lower()
        if "bed" not in name and "спал" not in name:
            continue
        # Same 0-is-valid reasoning as above.
        parsed = _first_parsed_int(node, ("value", "count", "text"))
        if parsed is not None:
            return parsed
    return None
def _area_to_sqft(value, unit: str | None = None) -> float | None:
    """Normalise an area reading to square feet.

    Args:
        value: Raw area value; it is passed to ``parse_price`` (defined
            elsewhere in this module -- presumably returns a number or
            ``None``; confirm against its definition).
        unit: Optional unit label. When it names square metres the parsed
            value is multiplied by 10.7639 and rounded to 2 decimals.

    Returns:
        The area in square feet, or ``None`` when ``parse_price`` yields
        ``None``. A missing or unrecognised unit leaves the value unchanged.
    """
    parsed = parse_price(value)
    if parsed is None:
        return None
    # str() guards against a non-string unit object reaching .lower().
    unit_text = str(unit or "").lower()
    # The token list previously contained an empty string, which is a
    # substring of every string and therefore converted *every* value
    # (including sqft and unit-less ones). Spell the square-metre variants
    # out explicitly instead.
    sqm_tokens = ("sqm", "sq m", "m2", "m²", "㎡", "м2", "м²", "метр")
    if any(token in unit_text for token in sqm_tokens):
        return round(parsed * 10.7639, 2)
    return parsed
def _extract_size_sqft(item: dict) -> float | None:
    """Return the listing's area via ``_area_to_sqft``, or ``None`` if not found.

    Direct keys on ``item`` are tried first; a dict value contributes both a
    unit label and a raw value. Failing that, each dict yielded by
    ``_walk(item)`` (defined elsewhere in this module) whose name/label/key
    mentions a size-like word is tried.
    """
    direct_fields = ("size", "area", "property_size", "built_up_area", "builtup_area", "plot_area")
    for field in direct_fields:
        raw = item.get(field)
        unit_hint = None
        if isinstance(raw, dict):
            holder = raw
            unit_hint = holder.get("unit") or holder.get("unit_label") or holder.get("unitLabel")
            raw = holder.get("value") or holder.get("amount") or holder.get("text")
        area = _area_to_sqft(raw, unit_hint)
        if area is not None:
            return area

    label_markers = ("size", "area", "sqft", "sq ft", "площад")
    for entry in _walk(item):
        if not isinstance(entry, dict):
            continue
        label = str(entry.get("name") or entry.get("label") or entry.get("key") or "").lower()
        if any(marker in label for marker in label_markers):
            raw = entry.get("value") or entry.get("amount") or entry.get("text")
            unit_hint = str(entry.get("unit") or entry.get("unit_label") or "")
            area = _area_to_sqft(raw, unit_hint)
            if area is not None:
                return area
    return None
def _location_candidate(node: dict) -> tuple[int, str] | None:
    """Return ``(rank, name)`` for a location node, or ``None`` if unusable.

    The rank is looked up in ``_LOC_TYPE_PRIORITY`` (defined elsewhere in this
    module) by the upper-cased ``type``; an unknown type or a blank name
    disqualifies the node.
    """
    type_key = str(node.get("type", "")).upper()
    priority = _LOC_TYPE_PRIORITY.get(type_key, -1)
    label = str(node.get("name") or "").strip()
    if priority >= 0 and label:
        return priority, label
    return None
def _extract_building_from(node) -> str | None:
    """Return the name of the highest-ranked location found under ``node``.

    Every dict yielded by ``_walk(node)`` (defined elsewhere in this module)
    is scored with ``_location_candidate``; on equal rank the earliest one
    wins. Returns ``None`` when no node qualifies.
    """
    scored = [
        found
        for found in (
            _location_candidate(entry)
            for entry in _walk(node)
            if isinstance(entry, dict)
        )
        if found
    ]
    if not scored:
        return None
    # max() keeps the first of several equal maxima, matching a strict ">" scan.
    _, winner = max(scored, key=lambda pair: pair[0])
    return winner
def _extract_building(data: dict, item: dict) -> str | None:
    """Resolve a building name for ``item``, falling back to the whole ``data``.

    Each location-like key on ``item`` is searched in order with
    ``_extract_building_from``; the first non-empty result is returned. If
    none produces a name, ``data`` itself is searched.
    """
    location_fields = ("location", "location_tree", "locations", "locationTree", "community")
    for field in location_fields:
        subtree = item.get(field)
        if not subtree:
            continue
        found = _extract_building_from(subtree)
        if found:
            return found
    return _extract_building_from(data)
def _find_permit_on_page(data: dict) -> str | None:
"""The DLD permit number lives in a regulatory block rendered as an image,
but its plain value is still in __NEXT_DATA__: the dict that carries a
@@ -212,6 +316,9 @@ class PropertyFinderScraper:
permit_number=_find_permit_on_page(data) or _extract_permit(best),
agent_name=agent_name,
agency_name=agency_name,
building=_extract_building(data, best),
bedrooms=_extract_bedrooms(best),
size_sqft=_extract_size_sqft(best),
is_active=True,
)
@@ -336,6 +443,9 @@ class PropertyFinderScraper:
permit_number=_extract_permit(node),
agent_name=agent_name,
agency_name=agency_name,
building=_extract_building(data, node),
bedrooms=_extract_bedrooms(node),
size_sqft=_extract_size_sqft(node),
is_active=True,
)
)