# monitoring-tg/src/parser_bot/extractors.py

"""Heuristic extractors for Telegram message text.

Russian-first, regex/keyword based, no ML deps. Goal is to surface signals for
the UI: phone numbers, person names (FIO), Telegram handles, and real-estate
intent (sale/rent/purchase). False positives are tolerable — the operator
triages in the UI.

Output shape (used as JSONB in messages.extracted):
{
  "phones": ["+79123456789", ...],
  "names":  ["Иван Петров", ...],
  "tg_handles": ["@username", ...],
  "real_estate": {
    "kind": "sale" | "rent" | "purchase" | null,
    "property_type": str | null,    # квартира, дом, ...
    "rooms": str | null,            # "2-к" or "студия"
    "area_m2": float | null,
    "price": str | null,            # raw matched string
  } | null
}
"""
from __future__ import annotations

import re
from typing import Any

# --- Telegram @handles ---------------------------------------------------

# Plain @username — Telegram allows 5–32 chars, letters/digits/_, no leading digit.
# The lookbehind skips e-mail addresses (user@host) and "/@name" URL segments.
_TG_HANDLE_RE = re.compile(r"(?<![\w/])@([A-Za-z][A-Za-z0-9_]{4,31})\b")
# t.me / telegram.me links to a user/channel handle (not joinchat / +invite).
# The leading lookbehind keeps the host from matching as the tail of another
# domain ("internet.me/profile"); host names are case-insensitive, hence
# IGNORECASE ("T.me/..." is common when a keyboard auto-capitalizes).
_TG_LINK_RE = re.compile(
    r"(?<![\w.\-])(?:https?://)?(?:www\.)?(?:t|telegram)\.me/"
    r"(?!joinchat/|\+)([A-Za-z][A-Za-z0-9_]{4,31})\b",
    re.IGNORECASE,
)
# First path segments that t.me uses for service deep links (share buttons,
# sticker packs, proxies, ...) rather than usernames. Only segments long
# enough to pass the 5-char username rule need listing.
_TG_RESERVED_PATHS = frozenset(
    {
        "joinchat",
        "addstickers",
        "addemoji",
        "addtheme",
        "addlist",
        "share",
        "proxy",
        "socks",
        "setlanguage",
        "login",
        "confirmphone",
        "invoice",
        "giftcode",
        "boost",
        "contact",
    }
)


def extract_tg_handles(text: str | None) -> list[str]:
    """Return unique Telegram handles mentioned in *text*, each as "@name".

    Plain ``@username`` mentions are collected first, then handles taken from
    ``t.me/<name>`` / ``telegram.me/<name>`` links. Deduplication is
    case-insensitive and keeps the first spelling seen. Links whose first path
    segment is a Telegram service path (e.g. ``t.me/share/url?...``) are
    ignored. Empty or ``None`` input yields an empty list.
    """
    if not text:
        return []
    # dict keeps insertion order, so it doubles as an ordered "seen" set:
    # lowercase key -> first spelling encountered.
    found: dict[str, str] = {}
    for name in _TG_HANDLE_RE.findall(text):
        found.setdefault(name.lower(), "@" + name)
    for name in _TG_LINK_RE.findall(text):
        key = name.lower()
        if key in _TG_RESERVED_PATHS:
            continue
        found.setdefault(key, "@" + name)
    return list(found.values())


# --- Phones --------------------------------------------------------------

# Russian-format: starts with +7, 7, or 8 (no plus), 11 digits total.
# The lookbehind also rejects a leading "+" so that the "8" alternative cannot
# start inside another country code ("+84...", "+81..."), which would otherwise
# be rewritten into a bogus +7 number.
_PHONE_RU_RE = re.compile(
    r"(?<![\d+])(?:\+?7|8)[\s\-().]*\d{3}[\s\-().]*\d{3}[\s\-().]*\d{2}[\s\-().]*\d{2}(?!\d)"
)

# International format: starts with `+<country code>` then 7–14 more digits
# with optional separators. Catches +971 (UAE), +1 (US), +44 (UK), etc.
_PHONE_INTL_RE = re.compile(
    r"(?<!\w)\+\d{1,3}[\s\-().]*(?:\d[\s\-().]*){6,14}\d(?!\d)"
)


def extract_phones(text: str | None) -> list[str]:
    """Return unique phone numbers found in *text*, in ``+<digits>`` form.

    Russian-style numbers (``+7``/``7``/``8`` prefix) are collected first and
    normalized to ``+7XXXXXXXXXX``; any other ``+<country>`` numbers follow
    with separators stripped. Empty or ``None`` input yields an empty list.
    """
    if not text:
        return []
    out: list[str] = []
    seen: set[str] = set()

    # Pass 1: Russian-style. The regex guarantees exactly 11 digits with a
    # leading 7 or 8, so the trunk prefix is simply replaced with +7.
    for raw in _PHONE_RU_RE.findall(text):
        normalized = "+7" + re.sub(r"\D", "", raw)[1:]
        if normalized not in seen:
            seen.add(normalized)
            out.append(normalized)

    # Pass 2: international "+<country>...". Keep the plus-prefix; just
    # collapse separators so the result is +<digits>.
    for raw in _PHONE_INTL_RE.findall(text):
        digits = re.sub(r"\D", "", raw)
        # The regex can span up to 18 digits; keep only plausible lengths.
        if not (8 <= len(digits) <= 15):
            continue
        normalized = "+" + digits
        # A +7 number is picked up by both passes — report it once.
        if normalized in seen:
            continue
        seen.add(normalized)
        out.append(normalized)
    return out


# --- Names (ФИО) ---------------------------------------------------------

# Two or three capitalized Cyrillic tokens in a row. Allows hyphens (Иванов-Петров).
_NAME_RE = re.compile(
    r"\b([А-ЯЁ][а-яё]+(?:\-[А-ЯЁ][а-яё]+)?(?:\s+[А-ЯЁ][а-яё]+(?:\-[А-ЯЁ][а-яё]+)?){1,2})\b"
)

# Common false positives — geo/places/orgs/etc. Skip exact matches.
_NAME_BLOCKLIST = {
    "Российская Федерация",
    "Санкт Петербург",
    "Санкт-Петербург",
    "Нижний Новгород",
    "Великий Новгород",
    "Ростов На Дону",
    "Ростов-На-Дону",
    "Москва Сити",
    "Красная Площадь",
    "Чёрное Море",
    "Чёрного Моря",
    "Без Депозита",
    "Без Залога",
    "Без Комиссии",
    "Сдам Квартиру",
    "Продам Квартиру",
    "Куплю Квартиру",
    "Сдам Студию",
    "Продам Студию",
}

# Words that look like names but rarely are (months, weekdays, common nouns).
_NAME_TOKEN_BLOCK = {
    "Январь", "Февраль", "Март", "Апрель", "Май", "Июнь",
    "Июль", "Август", "Сентябрь", "Октябрь", "Ноябрь", "Декабрь",
    "Понедельник", "Вторник", "Среда", "Четверг", "Пятница", "Суббота", "Воскресенье",
    "Москва", "Питер", "Россия", "Кремль", "Метро",
}


def extract_names(text: str | None) -> list[str]:
    """Return unique candidate person names (2–3 capitalized Cyrillic words).

    Candidates are reported with single spaces between words, in order of
    first appearance. A candidate is dropped when it is an exact blocklist
    phrase, contains a blocked token (month, weekday, ...), or has no token of
    at least four letters. Empty or ``None`` input yields an empty list.
    """
    if not text:
        return []
    out: list[str] = []
    seen: set[str] = set()
    for match in _NAME_RE.findall(text):
        # The regex accepts any whitespace run between words (double spaces,
        # line breaks). Collapse it so blocklist lookups and deduplication
        # see one canonical spelling and the output never carries newlines.
        candidate = " ".join(match.split())
        if candidate in _NAME_BLOCKLIST:
            continue
        tokens = re.split(r"[\s\-]+", candidate)
        if any(t in _NAME_TOKEN_BLOCK for t in tokens):
            continue
        # Heuristic: at least one token must have len >= 4, which filters
        # out runs of very short capitalized words.
        if not any(len(t) >= 4 for t in tokens):
            continue
        if candidate not in seen:
            seen.add(candidate)
            out.append(candidate)
    return out


# --- Real estate ---------------------------------------------------------

_DEAL_KEYWORDS: dict[str, tuple[str, ...]] = {
    "rent": (
        # ru
        "сдаётся", "сдается", "сдаю", "сдадим", "сдам", "сдаём",
        "аренда", "арендую", "арендуем", "снять",
        "посуточно", "помесячно",
        # en
        "for rent", "to let", "rental", "renting", "lease", "leasing",
        "per year", "per month", "/year", "/month", "/mo",
    ),
    "sale": (
        # ru
        "продаётся", "продается", "продаю", "продадим", "продам", "продаём",
        "продажа", "к продаже",
        # en
        "for sale", "#forsale", "selling", "selling price", "sale price",
    ),
    "purchase": (
        # ru
        "куплю", "купим", "покупаю", "покупка", "ищу квартиру",
        "ищу дом", "ищем квартиру", "рассматриваю покупку",
        # en
        "looking for", "want to buy", "wanted", "requirement", "wtb",
    ),
}

_PROPERTY_TYPES: tuple[tuple[str, str], ...] = (
    # ru
    ("квартир", "квартира"),
    ("студи", "студия"),
    ("апартамент", "апартаменты"),
    ("комнат", "комната"),
    ("таунхаус", "таунхаус"),
    ("коттедж", "коттедж"),
    ("дача", "дача"),
    ("дом", "дом"),
    ("офис", "офис"),
    ("склад", "склад"),
    ("помещен", "помещение"),
    ("земельн", "земельный участок"),
    ("участок", "участок"),
    ("гараж", "гараж"),
    ("машиномест", "машиноместо"),
    # en — kept as Russian labels for UI consistency
    ("villa", "дом"),
    ("townhouse", "таунхаус"),
    ("penthouse", "апартаменты"),
    ("apartment", "квартира"),
    ("studio", "студия"),
    ("plot", "участок"),
    (" land ", "участок"),
    ("office", "офис"),
    ("warehouse", "склад"),
    ("retail", "помещение"),
    ("garage", "гараж"),
)

_AREA_M2_RE = re.compile(
    r"(\d[\d\s,]*\d|\d)\s*(?:м[²2]|кв\.?\s*м|кв\.\s*метр)",
    re.IGNORECASE,
)
_AREA_SQFT_RE = re.compile(
    r"(\d[\d\s,]*\d|\d)\s*(?:sqft|sq\.?\s*ft|sq\s+ft|square\s+feet)",
    re.IGNORECASE,
)


def _parse_number(s: str) -> float | None:
    cleaned = s.replace(" ", "").replace(",", "")
    try:
        return float(cleaned)
    except ValueError:
        return None
_ROOMS_RE = re.compile(
    r"\b(\d)[\-\s]*(?:к\b|комн|комнатн|-комнат|br\b|bed\b|bedroom|-bed)",
    re.IGNORECASE,
)
# Studio is a special-case "0 rooms" indicator; not extracted as rooms count.
_PRICE_RE = re.compile(
    r"(\d[\d\s.,]*\d|\d)\s*(млн|млрд|тыс|тысяч|миллионов?|миллиардов?|руб(?:лей)?|₽|р/мес|/мес|р\b)",
    re.IGNORECASE,
)


def _detect_kind(low: str) -> str | None:
    for kind, words in _DEAL_KEYWORDS.items():
        for w in words:
            if w in low:
                return kind
    return None


def _detect_property_type(low: str) -> str | None:
    for stem, label in _PROPERTY_TYPES:
        if stem in low:
            return label
    return None


def extract_real_estate(text: str | None) -> dict[str, Any] | None:
    if not text:
        return None
    low = text.lower()
    kind = _detect_kind(low)
    prop = _detect_property_type(low)
    if kind is None and prop is None:
        return None

    rooms_m = _ROOMS_RE.search(low)
    rooms = f"{rooms_m.group(1)}-к" if rooms_m else None
    if rooms is None and ("студи" in low or "studio" in low):
        rooms = "студия"

    area: float | None = None
    area_m = _AREA_M2_RE.search(text)
    if area_m:
        area = _parse_number(area_m.group(1))
    if area is None:
        sqft_m = _AREA_SQFT_RE.search(text)
        if sqft_m:
            sqft = _parse_number(sqft_m.group(1))
            if sqft is not None:
                area = round(sqft * 0.0929, 1)

    price_m = _PRICE_RE.search(text)
    price = price_m.group(0).strip() if price_m else None

    return {
        "kind": kind,
        "property_type": prop,
        "rooms": rooms,
        "area_m2": area,
        "price": price,
    }


# --- Top-level analyzer --------------------------------------------------


def analyze(text: str | None) -> dict[str, Any]:
    """Run every regex extractor over *text* and collect the results.

    Synchronous and regex-only, so it is cheap enough to run at insert time.
    The result maps "phones", "names", "tg_handles" and "real_estate" to the
    output of the corresponding ``extract_*`` function.
    """
    extractors = (
        ("phones", extract_phones),
        ("names", extract_names),
        ("tg_handles", extract_tg_handles),
        ("real_estate", extract_real_estate),
    )
    return {field: extractor(text) for field, extractor in extractors}