Clean up monitoring TG Python adapter

2026-06-04 16:10:13 +03:00
parent 76f1be8b2a
commit 7a01eebb5b
12 changed files with 85 additions and 1968 deletions
--- a/.env.example
+++ b/.env.example
@@ -24,16 +24,17 @@ POSTGRES_PORT=5432
 POLL_INTERVAL_SECONDS=60
 POLL_HISTORY_LIMIT=50

-# API
+# Go public API
 API_HOST=0.0.0.0
 API_PORT=8000
+PUBLIC_BASE_PATH=/api/monitoring-tg
+PYTHON_BASE_URL=http://127.0.0.1:8001

 # Media (downloaded photos / small videos / docs from parsed messages)
 MEDIA_DIR=/data/media
 MEDIA_MAX_BYTES=20971520

-# OpenAI-compatible LLM endpoint. In production this can point to the same
-# vLLM server/model used by telephony.
+# OpenAI-compatible LLM endpoint used by the Go classifier.
 LLM_ENABLED=true
 LLM_BASE_URL=http://10.2.3.5:8002
 LLM_API_KEY=
@@ -41,8 +42,6 @@ LLM_MODEL=qwen2.5-14b
 LLM_TIMEOUT_SECONDS=120
 LLM_MAX_TOKENS=600
 LLM_MIN_TEXT_LENGTH=20
-LLM_CLASSIFIER_OWNER=python
-# How often the background classifier wakes up and how many messages it
-# processes per tick. With 5/20s ≈ 900 messages/hour at ~3-6s per call.
+LLM_CLASSIFIER_OWNER=go
 LLM_CLASSIFY_INTERVAL_SECONDS=20
 LLM_CLASSIFY_BATCH_SIZE=5
--- a/README.md
+++ b/README.md
@@ -1,8 +1,9 @@
 # monitoring-tg

-Backend-сервис мониторинга Telegram-каналов для Portal. Python-часть отвечает
-за MTProto/Telethon, API и опрос каналов, а фоновая AI-классификация вынесена
-в Go-воркер. Сервис сохраняет сообщения в Postgres, раскладывает каналы по
+Backend-сервис мониторинга Telegram-каналов для Portal. Публичный API и
+AI-классификация работают на Go, Python оставлен только как внутренний
+MTProto/Telethon-адаптер для авторизации, опроса каналов и дозагрузки медиа.
+Сервис сохраняет сообщения в Postgres, раскладывает каналы по
 вертикалям/подразделам и выполняет AI-анализ через OpenAI-compatible endpoint,
 общий с другими сервисами портала.

@@ -37,6 +38,7 @@ POSTGRES_PASSWORD=parser
 POSTGRES_DB=parser

 PUBLIC_BASE_PATH=/api/monitoring-tg
+PYTHON_BASE_URL=http://127.0.0.1:8001

 LLM_ENABLED=true
 LLM_BASE_URL=http://10.2.3.5:8002
@@ -57,18 +59,19 @@ LLM_CLASSIFIER_OWNER=go
 kubectl apply -k k8s
 ```

-Миграции выполняются entrypoint-ом контейнера перед запуском API.
+Миграции выполняются entrypoint-ом контейнера перед запуском процессов.

 ## Структура

 ```text
 src/parser_bot/
-├── api/         FastAPI роуты + Pydantic-схемы
+├── api/         внутренние FastAPI роуты Telegram-адаптера
 ├── db/          SQLAlchemy модели + сессии
 ├── scheduler/   APScheduler-воркер периодического опроса
 ├── telegram/    Telethon-клиент
 ├── config.py    pydantic-settings
-└── main.py      FastAPI lifespan + uvicorn
+└── main.py      FastAPI lifespan + uvicorn для внутреннего адаптера
+cmd/server/      Go API для Portal
 cmd/classifier/  Go-воркер фоновой LLM-классификации сообщений
 alembic/         миграции
 k8s/             манифесты для портала
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "parser-tg-bot"
 version = "0.1.0"
-description = "Telegram channel parser — periodic polling + storage, future Go microservice"
+description = "Telegram channel monitoring service with Go API/classifier and Python Telethon adapter"
 requires-python = ">=3.11"
 dependencies = [
    "telethon>=1.36",
@@ -15,7 +15,6 @@ dependencies = [
    "pydantic-settings>=2.6",
    "python-dotenv>=1.0",
    "structlog>=24.4",
-    "httpx>=0.27",
 ]

 [project.optional-dependencies]
--- a/src/parser_bot/api/routes.py
+++ b/src/parser_bot/api/routes.py
--- a/src/parser_bot/api/schemas.py
+++ b/src/parser_bot/api/schemas.py
@@ -1,219 +0,0 @@
-import re
-from datetime import datetime
-from typing import Literal
-
-from pydantic import BaseModel, ConfigDict, Field, field_validator
-
-Vertical = Literal["real_estate", "hr"]
-
-# Section slugs are used as URL segments — keep them URL-safe.
-_SLUG_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,62}[a-z0-9]$|^[a-z0-9]$")
-
-
-class SectionCreate(BaseModel):
-    vertical: Vertical
-    slug: str = Field(..., min_length=1, max_length=64)
-    title: str = Field(..., min_length=1, max_length=255)
-    emoji: str | None = Field(None, max_length=8)
-    description: str | None = None
-
-    @field_validator("slug")
-    @classmethod
-    def _check_slug(cls, v: str) -> str:
-        if not _SLUG_RE.match(v):
-            raise ValueError(
-                "slug must be lowercase letters/digits with '-' or '_' separators"
-            )
-        return v
-
-
-class SectionUpdate(BaseModel):
-    title: str | None = Field(None, min_length=1, max_length=255)
-    emoji: str | None = Field(None, max_length=8)
-    description: str | None = None
-
-
-class SectionOut(BaseModel):
-    model_config = ConfigDict(from_attributes=True)
-
-    id: int
-    vertical: Vertical
-    department_id: str | None = None
-    slug: str
-    title: str
-    emoji: str | None
-    description: str | None
-    created_at: datetime
-
-
-class SectionWithStats(SectionOut):
-    """Section payload enriched with rollup counts for the section chooser page."""
-
-    channels_total: int = 0
-    channels_active: int = 0
-    messages_total: int = 0
-    leads_total: int = 0
-
-
-class ChannelCreate(BaseModel):
-    identifier: str = Field(..., min_length=1, max_length=255, description="@username or t.me link")
-    vertical: Vertical = "real_estate"
-    section: str = Field(
-        ..., min_length=1, max_length=64,
-        description="Slug of the section inside the vertical (e.g. 'dubai')",
-    )
-
-
-class ChannelUpdate(BaseModel):
-    is_active: bool | None = None
-    vertical: Vertical | None = None
-    section: str | None = Field(
-        None, min_length=1, max_length=64,
-        description="Move the channel to another section in the same vertical",
-    )
-
-
-class ChannelOut(BaseModel):
-    model_config = ConfigDict(from_attributes=True)
-
-    id: int
-    tg_id: int | None
-    identifier: str
-    title: str | None
-    vertical: Vertical
-    section_id: int
-    section_slug: str | None = None
-    is_active: bool
-    last_message_id: int | None
-    last_polled_at: datetime | None
-    created_at: datetime
-
-
-class ChannelStats(BaseModel):
-    channel_id: int
-    identifier: str
-    title: str | None
-    vertical: Vertical
-    section_slug: str | None = None
-    is_active: bool
-    last_polled_at: datetime | None
-    message_count: int
-    last_message_at: datetime | None
-
-
-class MediaFile(BaseModel):
-    kind: str  # photo | video | document | audio | sticker | unknown
-    url: str | None = None
-    mime: str | None = None
-    size: int | None = None
-    skipped: str | None = None  # set when not downloaded (e.g. "too_large")
-
-
-class RealEstate(BaseModel):
-    kind: str | None = None
-    property_type: str | None = None
-    rooms: str | None = None
-    area_m2: float | None = None
-    price: str | None = None
-
-
-class Lead(BaseModel):
-    is_listing: bool
-    kind: str | None = None  # sale | rent | purchase
-    property_type: str | None = None
-    rooms: str | None = None
-    area_m2: float | None = None
-    price_text: str | None = None
-    price_value: float | None = None
-    currency: str | None = None  # RUB | USD | EUR | AED | GBP | CNY | TRY | KZT | BYN | UAH
-    location: str | None = None
-    contact_phone: str | None = None
-    contact_name: str | None = None
-    summary: str | None = None
-    confidence: float = 0.0
-
-
-class HrLead(BaseModel):
-    """LLM verdict for HR-vertical messages (jobs / resumes / bare contacts)."""
-
-    is_lead: bool
-    kind: str | None = None  # vacancy | resume | contact
-    title: str | None = None
-    company: str | None = None
-    candidate_name: str | None = None
-    experience_years: float | None = None
-    skills: list[str] = []
-    location: str | None = None
-    remote: bool | None = None
-    employment_type: str | None = None
-    salary_text: str | None = None
-    salary_value: float | None = None
-    currency: str | None = None
-    contact_phone: str | None = None
-    contact_name: str | None = None
-    summary: str | None = None
-    confidence: float = 0.0
-
-
-class Extracted(BaseModel):
-    phones: list[str] = []
-    names: list[str] = []
-    tg_handles: list[str] = []
-    real_estate: RealEstate | None = None
-    lead: Lead | None = None
-    hr_lead: HrLead | None = None
-
-
-class MessageOut(BaseModel):
-    model_config = ConfigDict(from_attributes=True)
-
-    id: int
-    channel_id: int
-    channel_vertical: Vertical | None = None
-    channel_section_slug: str | None = None
-    tg_message_id: int
-    grouped_id: int | None = None
-    group_size: int = 1
-    date: datetime
-    text: str | None
-    sender_id: int | None
-    has_media: bool
-    media_files: list[MediaFile] | None = None
-    extracted: Extracted | None = None
-    sender_username: str | None = None
-    sender_name: str | None = None
-    post_url: str | None = None
-    views: int | None
-    forwards: int | None
-    fetched_at: datetime
-
-
-class GlobalStats(BaseModel):
-    vertical: Vertical
-    section_slug: str | None = None
-    channels_total: int
-    channels_active: int
-    messages_total: int
-    messages_last_24h: int
-    leads_total: int = 0
-    leads_last_24h: int = 0
-    poll_interval_seconds: int
-    last_poll_at: datetime | None
-
-
-class AuthStatus(BaseModel):
-    authorized: bool
-    username: str | None = None
-    phone: str | None = None
-
-
-class AuthCode(BaseModel):
-    code: str = Field(..., min_length=3, max_length=12)
-
-
-class AuthPassword(BaseModel):
-    password: str = Field(..., min_length=1)
-
-
-class AuthCodeResult(BaseModel):
-    needs_password: bool
--- a/src/parser_bot/config.py
+++ b/src/parser_bot/config.py
@@ -29,18 +29,6 @@ class Settings(BaseSettings):
    media_dir: str = Field("/data/media", alias="MEDIA_DIR")
    media_max_bytes: int = Field(20 * 1024 * 1024, alias="MEDIA_MAX_BYTES")

-    # OpenAI-compatible LLM endpoint, shared with telephony/vLLM in production.
-    llm_enabled: bool = Field(True, alias="LLM_ENABLED")
-    llm_base_url: str = Field("http://10.2.3.5:8002", alias="LLM_BASE_URL")
-    llm_api_key: str = Field("", alias="LLM_API_KEY")
-    llm_model: str = Field("qwen2.5-14b", alias="LLM_MODEL")
-    llm_timeout_seconds: int = Field(120, alias="LLM_TIMEOUT_SECONDS")
-    llm_max_tokens: int = Field(600, alias="LLM_MAX_TOKENS")
-    llm_min_text_length: int = Field(20, alias="LLM_MIN_TEXT_LENGTH")
-    llm_classify_interval_seconds: int = Field(20, alias="LLM_CLASSIFY_INTERVAL_SECONDS")
-    llm_classify_batch_size: int = Field(5, alias="LLM_CLASSIFY_BATCH_SIZE")
-    llm_classifier_owner: str = Field("python", alias="LLM_CLASSIFIER_OWNER")
-
    @property
    def database_url(self) -> str:
        return (
--- a/src/parser_bot/extractors.py
+++ b/src/parser_bot/extractors.py
@@ -310,26 +310,3 @@ def analyze(text: str | None) -> dict[str, Any]:
        "tg_handles": extract_tg_handles(text),
        "real_estate": extract_real_estate(text),
    }
-
-
-async def analyze_with_llm(
-    text: str | None,
-    vertical: str = "real_estate",
-    department_id: str | None = None,
-    section_slug: str | None = None,
-) -> dict[str, Any]:
-    """Regex extraction + local LLM lead classification, routed by vertical.
-
-    `department_id` + `section_slug` let the classifier pick a department and
-    section-specific system prompt with fallback to the department vertical
-    prompt. The LLM verdict goes under `lead` for RE and
-    under `hr_lead` for HR. Falls back to regex-only if the LLM is unavailable.
-    """
-    base = analyze(text)
-    # Lazy import to avoid hard dep on httpx in environments where LLM is off.
-    from parser_bot.llm import classify
-
-    verdict = await classify(text, vertical, department_id, section_slug)  # type: ignore[arg-type]
-    if verdict is not None:
-        base["hr_lead" if vertical == "hr" else "lead"] = verdict
-    return base
--- a/src/parser_bot/links.py
+++ b/src/parser_bot/links.py
@@ -1,44 +0,0 @@
-"""Build Telegram URLs from stored channel metadata."""
-from __future__ import annotations
-
-import re
-
-_USERNAME_RE = re.compile(r"^@?([A-Za-z][A-Za-z0-9_]{4,31})$")
-_TME_URL_RE = re.compile(
-    r"^(?:https?://)?(?:t|telegram)\.me/(?:s/)?([A-Za-z][A-Za-z0-9_]{4,31})(?:/.*)?$"
-)
-
-
-def channel_username(identifier: str | None) -> str | None:
-    """Extract the public username from a channel identifier if any.
-
-    Returns None for private channels (joinchat, +invite, raw IDs).
-    """
-    if not identifier:
-        return None
-    s = identifier.strip()
-    m = _USERNAME_RE.match(s)
-    if m:
-        return m.group(1)
-    m = _TME_URL_RE.match(s)
-    if m:
-        return m.group(1)
-    return None
-
-
-def post_url(identifier: str | None, tg_id: int | None, tg_message_id: int) -> str | None:
-    """Build a deep link to a Telegram post.
-
-    Public channel: https://t.me/<username>/<msg_id>
-    Private channel (no public username, only tg_id): https://t.me/c/<short>/<msg_id>
-    where <short> is the absolute id with the leading -100 stripped.
-    """
-    username = channel_username(identifier)
-    if username:
-        return f"https://t.me/{username}/{tg_message_id}"
-    if tg_id is None:
-        return None
-    raw = abs(tg_id)
-    s = str(raw)
-    short = s[3:] if s.startswith("100") and len(s) > 3 else s
-    return f"https://t.me/c/{short}/{tg_message_id}"
--- a/src/parser_bot/llm.py
+++ b/src/parser_bot/llm.py
@@ -1,387 +0,0 @@
-"""OpenAI-compatible LLM client for lead classification & extraction.
-
-Two verticals share one model and one process:
-  - real_estate: high recall on listings (sale/rent/purchase),
-  - hr:          vacancies, resumes, bare contact leads.
-
-The system prompt and JSON schema differ per vertical; the rest of the
-plumbing (timeouts, single-lock concurrency, JSON-mode parsing) is shared.
-On any error returns `None` and the caller falls back to regex-only extraction.
-"""
-from __future__ import annotations
-
-import asyncio
-import json
-from typing import Any, Literal
-
-import httpx
-import structlog
-
-from parser_bot.config import settings
-
-log = structlog.get_logger()
-
-
-# Single shared lock so we never run two LLM requests at once on the GPU —
-# they would just thrash VRAM and finish slower than sequential.
-_lock = asyncio.Lock()
-
-
-Vertical = Literal["real_estate", "hr"]
-
-
-DEFAULT_RE_SYSTEM_PROMPT = """\
-Ты — аналитик объявлений о недвижимости. Тебе дают текст из Telegram-канала.
-Сообщение МОЖЕТ БЫТЬ НА ЛЮБОМ ЯЗЫКЕ — русский, английский, арабский, любой
-другой. Обрабатывай его одинаково независимо от языка.
-
-Задача: определить, является ли это РЕАЛЬНЫМ объявлением о покупке, продаже
-или аренде НЕДВИЖИМОСТИ (квартира, дом/villa, студия/studio, апартаменты,
-комната, таунхаус/townhouse, дача, коттедж, пентхаус/penthouse, офис,
-склад, помещение, земельный участок/plot/land, гараж, машиноместо).
-Учитывай намёки и нечёткие формулировки — лучше отметить сомнительный лид
-как `is_listing=true` с низкой confidence, чем пропустить.
-
-Сигналы что это ОБЪЯВЛЕНИЕ (kind):
-— продажа/sale: «продаётся», «продаю», «продажа», «for sale», «#forsale»,
-  «selling price», «selling», «price», «AED 33M», ценник в любой валюте.
-— аренда/rent: «сдаётся», «сдаю», «аренда», «for rent», «to let», «rental»,
-  «per year», «per month», «AED ... /year».
-— покупка/purchase: «куплю», «куплю в», «looking for», «want to buy»,
-  «wanted», «requirement».
-
-ОДНО сообщение может быть и про продажу, И про аренду одновременно
-(«FOR SALE | RENT» / «продажа или аренда»). В таком случае выбирай
-основное намерение по самому тексту; если равноценно — `kind="sale"`
-и упомяни аренду в summary.
-
-НЕ объявления (is_listing=false):
-— общие новости / статьи / аналитика рынка;
-— воспоминания и истории («когда-то продавал квартиру»);
-— шутки, мемы, цитаты;
-— реклама услуг агентств без конкретного объекта;
-— чужие пересланные объявления без контактов и явного предложения от автора.
-
-Отвечай СТРОГО валидным JSON по схеме (никаких комментариев, никакого markdown):
-{
-  "is_listing": boolean,
-  "kind": "sale" | "rent" | "purchase" | null,
-  "property_type": "квартира" | "дом" | "студия" | "апартаменты" | "комната" | "таунхаус" | "дача" | "коттедж" | "офис" | "склад" | "помещение" | "участок" | "гараж" | "машиноместо" | null,
-  "rooms": "студия" | "1-к" | "2-к" | "3-к" | "4-к" | "5+к" | null,
-  "area_m2": number | null,
-  "price_text": string | null,
-  "price_value": number | null,
-  "currency": "RUB" | "USD" | "EUR" | "AED" | "GBP" | "CNY" | "TRY" | "KZT" | "BYN" | "UAH" | null,
-  "location": string | null,
-  "contact_phone": string | null,
-  "contact_name": string | null,
-  "summary": string,
-  "confidence": number
-}
-
-Поля:
- summary — ОДНО короткое предложение НА РУССКОМ языке (даже если исходный
-  текст на английском или другом). Это нужно для единообразного UI.
- property_type — пиши значение по-русски (villa→дом, apartment→квартира,
-  townhouse→таунхаус, plot/land→участок, studio→студия, penthouse→апартаменты,
-  house→дом, office→офис, warehouse→склад, retail→помещение).
- rooms — для англоязычного «3BR», «3 BR», «3 bed», «3-bedroom» возвращай
-  «3-к»; для «studio» → «студия».
- area_m2 — площадь В КВАДРАТНЫХ МЕТРАХ. Если в тексте sqft / sq.ft / sq ft /
-  square feet — переведи: m² = sqft × 0.0929. Округляй до целого.
- confidence ∈ [0, 1]: 0.9+ если явное объявление с ценой/контактом,
-  0.5–0.8 если правдоподобно, 0.2–0.4 если намёк.
- price_text — точная цитата из текста («2.5 млн ₽», «AED 850 000», «$320k»,
-  «300 тыс. дирхам», «د.إ 1.2M», «70,000,000 AED», «AED 4.3M», «AED 1.75M»).
- price_value — числовая величина цены В УКАЗАННОЙ ВАЛЮТЕ (не конвертируй).
-  Раскрывай сокращения: «AED 4.3M» → 4300000, «$320k» → 320000.
- currency — определяй гибко: ₽/руб/р/RUB/рублей → RUB; $/USD/долл/бакс → USD;
-  €/EUR/евро → EUR; AED/дирхам/дирхамов/дирхама/dh/dhs/د.إ/Dirhams → AED;
-  ₺/TRY/лир/лира → TRY; ¥/CNY/юань → CNY; ₸/KZT/тенге → KZT;
-  Br/BYN/бел.руб → BYN; ₴/UAH/грн → UAH. Если не уверен — null.
- contact_phone — любой номер телефона в тексте (с + или без, российский,
-  ОАЭ, любой международный).
-"""
-
-
-DEFAULT_HR_SYSTEM_PROMPT = """\
-Ты — аналитик HR-объявлений. Тебе дают текст из Telegram-канала. Сообщение
-МОЖЕТ БЫТЬ НА ЛЮБОМ ЯЗЫКЕ — обрабатывай одинаково.
-
-Задача: определить, относится ли сообщение к рынку труда, и какого типа лид
-это. Допускаются три типа (`kind`):
-— vacancy — компания/наниматель ищет сотрудника («ищем разработчика»,
-  «hiring backend engineer», «требуется бухгалтер», «we are looking for»);
-— resume — соискатель ищет работу («ищу работу», «open to work», «available
-  for hire», «рассматриваю предложения», «my CV», «резюме»);
-— contact — короткое сообщение с именем/контактом и намёком на профессию,
-  без явной вакансии/резюме («Иван Петров, Python, +7…», «@nick — UI/UX,
-  Дубай»). Используй, когда vacancy и resume не подходят, но из текста ясно,
-  что это HR-контакт.
-
-Лучше отметить сомнительный случай `is_lead=true` с низкой confidence,
-чем пропустить. НО полностью исключай:
-— общие новости и аналитика рынка труда без конкретной вакансии/резюме;
-— реклама курсов, школ, маркетплейсов услуг (Profi.ru и т.п.);
-— чужие пересланные посты без контактов и без явного предложения от автора;
-— объявления о продаже/аренде недвижимости, услуг и товаров;
-— мемы, шутки, цитаты.
-
-Отвечай СТРОГО валидным JSON по схеме (никаких комментариев, никакого markdown):
-{
-  "is_lead": boolean,
-  "kind": "vacancy" | "resume" | "contact" | null,
-  "title": string | null,
-  "company": string | null,
-  "candidate_name": string | null,
-  "experience_years": number | null,
-  "skills": string[],
-  "location": string | null,
-  "remote": boolean | null,
-  "employment_type": "full-time" | "part-time" | "contract" | "internship" | null,
-  "salary_text": string | null,
-  "salary_value": number | null,
-  "currency": "RUB" | "USD" | "EUR" | "AED" | "GBP" | "CNY" | "TRY" | "KZT" | "BYN" | "UAH" | null,
-  "contact_phone": string | null,
-  "contact_name": string | null,
-  "summary": string,
-  "confidence": number
-}
-
-Поля:
- title — должность/роль ОДНОЙ строкой («Senior Python Developer», «Бухгалтер»,
-  «UI/UX-дизайнер»). Для resume — желаемая роль. Для contact — то, что заявлено.
- company — название компании-нанимателя, если оно явно указано (vacancy).
- candidate_name — ФИО или ник кандидата (resume / contact).
- experience_years — стаж в годах числом. «5+ years» → 5. Если не указан — null.
- skills — короткий массив ключевых навыков/технологий (до ~10 элементов).
- remote — true для «удалёнка / remote / WFH / hybrid: remote», false для
-  «офис / on-site», null если не указано.
- employment_type — full-time для «полная занятость / full-time», part-time
-  для «частичная / part-time», contract для «договор/контракт/freelance»,
-  internship для «стажировка/internship». Иначе null.
- salary_text — точная цитата с зарплатой («200–300k ₽», «$5k/mo», «AED 18,000 per month»).
- salary_value — число В УКАЗАННОЙ ВАЛЮТЕ. Если диапазон — нижняя граница.
-  Раскрывай сокращения: «200k» → 200000, «1.5M» → 1500000.
- currency — определяй гибко: ₽/руб/RUB → RUB; $/USD/долл → USD; €/EUR/евро → EUR;
-  AED/дирхам/dh/dhs → AED; ₺/TRY/лир → TRY; ¥/CNY/юань → CNY; ₸/KZT/тенге → KZT;
-  Br/BYN/бел.руб → BYN; ₴/UAH/грн → UAH. Если не уверен — null.
- contact_phone — любой номер телефона (RU / международный, с + или без).
- contact_name — имя контактного лица (рекрутер / соискатель / автор).
- summary — ОДНО короткое предложение НА РУССКОМ языке.
- confidence ∈ [0, 1]: 0.9+ если явная вакансия/резюме с деталями, 0.5–0.8
-  если правдоподобно, 0.2–0.4 если намёк.
-"""
-
-
-# Back-compat alias — older imports referenced DEFAULT_SYSTEM_PROMPT.
-DEFAULT_SYSTEM_PROMPT = DEFAULT_RE_SYSTEM_PROMPT
-
-
-def _build_user_prompt(text: str) -> str:
-    return f"Текст сообщения:\n```\n{text}\n```\nВерни JSON."
-
-
-_VALID_CURRENCIES = {
-    "RUB", "USD", "EUR", "AED", "GBP", "CNY", "TRY", "KZT", "BYN", "UAH"
-}
-
-
-def _coerce_real_estate(payload: Any) -> dict | None:
-    if not isinstance(payload, dict):
-        return None
-    is_listing = bool(payload.get("is_listing"))
-    currency = payload.get("currency")
-    if currency is not None:
-        currency = str(currency).upper()
-        if currency not in _VALID_CURRENCIES:
-            currency = None
-    return {
-        "is_listing": is_listing,
-        "kind": payload.get("kind") if payload.get("kind") in ("sale", "rent", "purchase") else None,
-        "property_type": payload.get("property_type") or None,
-        "rooms": payload.get("rooms") or None,
-        "area_m2": _as_float(payload.get("area_m2")),
-        "price_text": payload.get("price_text") or None,
-        "price_value": _as_float(payload.get("price_value")),
-        "currency": currency,
-        "location": payload.get("location") or None,
-        "contact_phone": payload.get("contact_phone") or None,
-        "contact_name": payload.get("contact_name") or None,
-        "summary": (payload.get("summary") or "")[:300],
-        "confidence": max(0.0, min(1.0, _as_float(payload.get("confidence")) or 0.0)),
-    }
-
-
-def _coerce_hr(payload: Any) -> dict | None:
-    if not isinstance(payload, dict):
-        return None
-    is_lead = bool(payload.get("is_lead"))
-    currency = payload.get("currency")
-    if currency is not None:
-        currency = str(currency).upper()
-        if currency not in _VALID_CURRENCIES:
-            currency = None
-    skills_raw = payload.get("skills") or []
-    if isinstance(skills_raw, str):
-        skills = [s.strip() for s in skills_raw.split(",") if s.strip()]
-    elif isinstance(skills_raw, list):
-        skills = [str(s).strip() for s in skills_raw if str(s).strip()]
-    else:
-        skills = []
-    skills = skills[:15]
-    employment = payload.get("employment_type")
-    if employment is not None and employment not in (
-        "full-time", "part-time", "contract", "internship"
-    ):
-        employment = None
-    remote_raw = payload.get("remote")
-    remote = bool(remote_raw) if isinstance(remote_raw, bool) else None
-    return {
-        "is_lead": is_lead,
-        "kind": payload.get("kind") if payload.get("kind") in ("vacancy", "resume", "contact") else None,
-        "title": payload.get("title") or None,
-        "company": payload.get("company") or None,
-        "candidate_name": payload.get("candidate_name") or None,
-        "experience_years": _as_float(payload.get("experience_years")),
-        "skills": skills,
-        "location": payload.get("location") or None,
-        "remote": remote,
-        "employment_type": employment,
-        "salary_text": payload.get("salary_text") or None,
-        "salary_value": _as_float(payload.get("salary_value")),
-        "currency": currency,
-        "contact_phone": payload.get("contact_phone") or None,
-        "contact_name": payload.get("contact_name") or None,
-        "summary": (payload.get("summary") or "")[:300],
-        "confidence": max(0.0, min(1.0, _as_float(payload.get("confidence")) or 0.0)),
-    }
-
-
-def _as_float(v: Any) -> float | None:
-    if v is None or isinstance(v, bool):
-        return None
-    try:
-        return float(v)
-    except (TypeError, ValueError):
-        return None
-
-
-async def is_ready() -> bool:
-    """Check that the OpenAI-compatible model endpoint is reachable."""
-    try:
-        async with httpx.AsyncClient(timeout=5) as client:
-            headers = _headers()
-            r = await client.get(f"{_base_url()}/v1/models", headers=headers)
-            r.raise_for_status()
-            models = r.json().get("data", [])
-            if not models:
-                return True
-            model_ids = {
-                str(m.get("id") or m.get("name") or "")
-                for m in models
-                if isinstance(m, dict)
-            }
-            return settings.llm_model in model_ids or any(
-                mid.startswith(settings.llm_model) for mid in model_ids
-            )
-    except Exception:
-        return False
-
-
-def _base_url() -> str:
-    return settings.llm_base_url.rstrip("/")
-
-
-def _headers() -> dict[str, str]:
-    headers = {"Content-Type": "application/json"}
-    if settings.llm_api_key:
-        headers["Authorization"] = f"Bearer {settings.llm_api_key}"
-    return headers
-
-
-def default_prompt(vertical: Vertical) -> str:
-    return DEFAULT_HR_SYSTEM_PROMPT if vertical == "hr" else DEFAULT_RE_SYSTEM_PROMPT
-
-
-async def classify(
-    text: str | None,
-    vertical: Vertical = "real_estate",
-    department_id: str | None = None,
-    section_slug: str | None = None,
-) -> dict | None:
-    """Classify a message text under the given vertical/section.
-
-    The system prompt is resolved with `section → vertical → built-in` fallback,
-    so a per-section prompt can fine-tune extraction (e.g. AED/sqft for Dubai)
-    while unconfigured sections keep using the vertical-wide prompt.
-    Returns a vertical-specific structured dict or None on error / short text.
-    """
-    if not settings.llm_enabled:
-        return None
-    if not text or len(text.strip()) < settings.llm_min_text_length:
-        return None
-
-    # Lazy import to avoid a circular: prompt_store -> db.session -> config.
-    from parser_bot import prompt_store
-
-    system = await prompt_store.resolve(
-        vertical, department_id, section_slug, default_prompt(vertical)
-    )
-    payload = {
-        "model": settings.llm_model,
-        "messages": [
-            {"role": "system", "content": system},
-            {"role": "user", "content": _build_user_prompt(text)},
-        ],
-        "temperature": 0.1,
-        "max_tokens": settings.llm_max_tokens,
-        "response_format": {"type": "json_object"},
-    }
-    async with _lock:
-        try:
-            async with httpx.AsyncClient(timeout=settings.llm_timeout_seconds) as client:
-                r = await client.post(
-                    f"{_base_url()}/v1/chat/completions",
-                    headers=_headers(),
-                    json=payload,
-                )
-                if r.status_code != 200:
-                    log.warning(
-                        "llm_request_failed",
-                        status=r.status_code,
-                        model=settings.llm_model,
-                        vertical=vertical,
-                        section=section_slug,
-                        body=r.text[:300],
-                    )
-                    return None
-                data = r.json()
-        except Exception as exc:
-            log.warning(
-                "llm_request_failed", error=str(exc), model=settings.llm_model, vertical=vertical
-            )
-            return None
-
-    choices = data.get("choices") or []
-    message = choices[0].get("message") if choices and isinstance(choices[0], dict) else None
-    raw = ((message or {}).get("content") or "").strip()
-    if not raw:
-        return None
-    try:
-        parsed = json.loads(raw)
-    except json.JSONDecodeError:
-        # Best effort: extract first {...} block.
-        start, end = raw.find("{"), raw.rfind("}")
-        if start == -1 or end == -1:
-            log.warning("llm_invalid_json", raw=raw[:200], vertical=vertical)
-            return None
-        try:
-            parsed = json.loads(raw[start : end + 1])
-        except json.JSONDecodeError:
-            log.warning("llm_invalid_json", raw=raw[:200], vertical=vertical)
-            return None
-
-    if vertical == "hr":
-        return _coerce_hr(parsed)
-    return _coerce_real_estate(parsed)
--- a/src/parser_bot/main.py
+++ b/src/parser_bot/main.py
@@ -1,5 +1,4 @@
 from contextlib import asynccontextmanager
-from pathlib import Path

 import structlog
 import uvicorn
@@ -7,7 +6,6 @@ from fastapi import Depends, FastAPI
 from fastapi.openapi.docs import get_redoc_html, get_swagger_ui_html
 from fastapi.openapi.utils import get_openapi
 from fastapi.responses import JSONResponse
-from fastapi.staticfiles import StaticFiles

 from parser_bot.access import require_admin
 from parser_bot.api.routes import router
@@ -105,10 +103,6 @@ def create_app() -> FastAPI:
            title=app.title + " — redoc",
        )

-    media_dir = Path(settings.media_dir)
-    media_dir.mkdir(parents=True, exist_ok=True)
-    # /media is fine to cache — file names are content-stable.
-    app.mount("/media", StaticFiles(directory=media_dir), name="media")
    return app


--- a/src/parser_bot/prompt_store.py
+++ b/src/parser_bot/prompt_store.py
@@ -1,137 +0,0 @@
-"""Runtime-editable LLM system prompts, persisted in app_settings.
-
-Three resolution levels with fallback (more specific → less specific):
-  1. `llm_system_prompt:<department_id>:<vertical>:<section_slug>` — section override
-  2. `llm_system_prompt:<department_id>:<vertical>` — department vertical override
-  3. built-in DEFAULT_RE_SYSTEM_PROMPT / DEFAULT_HR_SYSTEM_PROMPT
-
-The prompt is read on every classification call but cached for a short
-window so the DB isn't hit per-message. Edits via the API invalidate the
-cache for that level, so a save in the UI takes effect within seconds.
-"""
-from __future__ import annotations
-
-import time
-from typing import Literal
-
-from sqlalchemy import select
-from sqlalchemy.dialects.postgresql import insert as pg_insert
-
-from parser_bot.db.models import AppSetting
-from parser_bot.db.session import session_scope
-
-Vertical = Literal["real_estate", "hr"]
-
-_KEY_PREFIX = "llm_system_prompt:"
-_CACHE_TTL_S = 5.0
-_cache: dict[str, tuple[float, str | None]] = {}
-
-
-def _key(
-    vertical: Vertical,
-    department_id: str | None,
-    section_slug: str | None = None,
-) -> str:
-    dept = department_id or "global"
-    if section_slug:
-        return f"{_KEY_PREFIX}{dept}:{vertical}:{section_slug}"
-    return f"{_KEY_PREFIX}{dept}:{vertical}"
-
-
-async def _load(key: str) -> str | None:
-    """Read a stored prompt by exact key. None if missing or empty."""
-    now = time.monotonic()
-    cached_at, cached_value = _cache.get(key, (0.0, None))
-    if now - cached_at < _CACHE_TTL_S:
-        return cached_value
-
-    async with session_scope() as session:
-        row = await session.execute(
-            select(AppSetting.value).where(AppSetting.key == key)
-        )
-        value = row.scalar_one_or_none()
-
-    text = value if isinstance(value, str) and value.strip() else None
-    _cache[key] = (now, text)
-    return text
-
-
-async def resolve(
-    vertical: Vertical, department_id: str | None, section_slug: str | None, default: str
-) -> str:
-    """Pick the most specific prompt available, falling back to `default`.
-
-    Consults section-level (only when `section_slug` is set) → vertical-level →
-    default. This is what the classifier uses for every message.
-    """
-    if section_slug:
-        text = await _load(_key(vertical, department_id, section_slug))
-        if text is not None:
-            return text
-    text = await _load(_key(vertical, department_id))
-    if text is not None:
-        return text
-    return default
-
-
-async def get(
-    vertical: Vertical, department_id: str | None, section_slug: str | None, default: str
-) -> tuple[str, str]:
-    """For the settings UI: return (text, source) where source is one of
-    'section' | 'vertical' | 'default'. Lets the editor show which override
-    is currently active without a second round-trip.
-    """
-    if section_slug:
-        text = await _load(_key(vertical, department_id, section_slug))
-        if text is not None:
-            return text, "section"
-    text = await _load(_key(vertical, department_id))
-    if text is not None:
-        return text, "vertical"
-    return default, "default"
-
-
-async def set_prompt(
-    vertical: Vertical, department_id: str | None, section_slug: str | None, text: str
-) -> None:
-    """Save a new prompt at the given level (section or vertical)."""
-    if not isinstance(text, str) or not text.strip():
-        raise ValueError("prompt must be a non-empty string")
-    key = _key(vertical, department_id, section_slug)
-    async with session_scope() as session:
-        stmt = (
-            pg_insert(AppSetting)
-            .values(key=key, value=text)
-            .on_conflict_do_update(
-                index_elements=["key"], set_={"value": text}
-            )
-        )
-        await session.execute(stmt)
-    invalidate(key)
-
-
-async def reset(
-    vertical: Vertical, department_id: str | None, section_slug: str | None
-) -> None:
-    """Drop the override at the given level."""
-    key = _key(vertical, department_id, section_slug)
-    async with session_scope() as session:
-        await session.execute(
-            AppSetting.__table__.delete().where(AppSetting.key == key)
-        )
-    invalidate(key)
-
-
-def invalidate(key: str | None = None) -> None:
-    if key is None:
-        _cache.clear()
-    else:
-        _cache.pop(key, None)
-
-
-async def is_overridden(
-    vertical: Vertical, department_id: str | None, section_slug: str | None = None
-) -> bool:
-    """True iff a custom prompt is stored at this exact level."""
-    text = await _load(_key(vertical, department_id, section_slug))
-    return text is not None
--- a/src/parser_bot/scheduler/poller.py
+++ b/src/parser_bot/scheduler/poller.py
@@ -6,9 +6,9 @@ from sqlalchemy import func, select
 from sqlalchemy.dialects.postgresql import insert as pg_insert

 from parser_bot.config import settings
-from parser_bot.db.models import Channel, Message, Section
+from parser_bot.db.models import Channel, Message
 from parser_bot.db.session import session_scope
-from parser_bot.extractors import analyze, analyze_with_llm
+from parser_bot.extractors import analyze
 from parser_bot.telegram.client import (
    fetch_new_messages,
    fetch_specific_messages_with_media,
@@ -19,29 +19,6 @@ from parser_bot.telegram.client import (
 log = structlog.get_logger()


-def _verdict_key(vertical: str) -> str:
-    """JSONB key under `extracted` where the LLM verdict lives for this vertical."""
-    return "hr_lead" if vertical == "hr" else "lead"
-
-
-def _needs_work_clause(vertical: str | None):
-    """Rows that still need LLM classification.
-
-    A row needs work when:
-      - extracted IS NULL (never analyzed), or
-      - the verdict for this vertical is missing.
-
-    Without `vertical`, matches only rows with no verdict at all (neither `lead`
-    nor `hr_lead`) — used by aggregate /llm/queue display when no vertical is selected.
-    """
-    if vertical is None:
-        return (Message.extracted.is_(None)) | (
-            Message.extracted["lead"].is_(None) & Message.extracted["hr_lead"].is_(None)
-        )
-    key = _verdict_key(vertical)
-    return (Message.extracted.is_(None)) | (Message.extracted[key].is_(None))
-
-
 async def poll_channel(channel_id: int) -> int:
    """Poll one channel for new messages. Returns count of inserted rows."""
    async with session_scope() as session:
@@ -63,9 +40,8 @@ async def poll_channel(channel_id: int) -> int:

        inserted = 0
        for m in msgs:
-            # Only the cheap regex pass runs in the poll path. LLM classification
-            # is handled by `classify_pending` in a background scheduler job so
-            # that a poll request never blocks on a 5s/message LLM call.
+            # Only the cheap regex pass runs in Python. LLM classification is
+            # handled by the Go classifier so Telegram polling stays lightweight.
            stmt = (
                pg_insert(Message)
                .values(
@@ -182,162 +158,6 @@ async def backfill_media(channel_id: int, batch_size: int = 50) -> dict[str, int
    return {"updated": updated, "pending": max(0, pending_total - updated)}


-async def reanalyze_channel(channel_id: int, batch_size: int = 5) -> dict[str, int]:
-    """Re-run extractors (regex + LLM) over messages missing this channel's verdict.
-
-    Picks the vertical AND section from the channel row so the right LLM
-    prompt is used. Only reanalyzes rows where the corresponding verdict key
-    is missing. Newest first so fresh leads surface during long backfills.
-    """
-    async with session_scope() as session:
-        result = await session.execute(
-            select(Channel, Section.slug, Section.department_id)
-            .join(Section, Section.id == Channel.section_id)
-            .where(Channel.id == channel_id)
-        )
-        row = result.one_or_none()
-        if row is None:
-            return {"updated": 0, "pending": 0}
-        channel, section_slug, department_id = row
-        vertical = channel.vertical
-        needs_work = _needs_work_clause(vertical)
-
-        pending_total = (
-            await session.execute(
-                select(func.count(Message.id)).where(
-                    Message.channel_id == channel_id,
-                    Message.text.is_not(None),
-                    needs_work,
-                )
-            )
-        ).scalar_one()
-
-        rows = (
-            await session.execute(
-                select(Message.id, Message.text)
-                .where(
-                    Message.channel_id == channel_id,
-                    Message.text.is_not(None),
-                    needs_work,
-                )
-                .order_by(Message.id.desc())
-                .limit(batch_size)
-            )
-        ).all()
-        if not rows:
-            return {"updated": 0, "pending": 0}
-
-        updated = 0
-        for db_id, text in rows:
-            extracted = (
-                await analyze_with_llm(text, vertical, department_id, section_slug)
-                if settings.llm_enabled
-                else analyze(text)
-            )
-            msg = await session.get(Message, db_id)
-            if msg is None:
-                continue
-            msg.extracted = extracted
-            updated += 1
-
-    log.info(
-        "reanalyzed_channel",
-        channel_id=channel_id,
-        vertical=vertical,
-        section=section_slug,
-        updated=updated,
-        remaining=max(0, pending_total - updated),
-    )
-    return {"updated": updated, "pending": max(0, pending_total - updated)}
-
-
-async def pending_llm_count(
-    vertical: str | None = None,
-    section_slug: str | None = None,
-    department_id: str | None = None,
-) -> int:
-    """How many text messages still need LLM classification.
-
-    When `vertical` is set, only counts messages from channels of that vertical
-    (and optionally that section) whose vertical-specific verdict is missing.
-    """
-    if not settings.llm_enabled:
-        return 0
-    needs_work = _needs_work_clause(vertical)
-    async with session_scope() as session:
-        stmt = select(func.count(Message.id)).where(
-            Message.text.is_not(None),
-            needs_work,
-        )
-        if vertical is not None:
-            stmt = stmt.join(Channel, Channel.id == Message.channel_id).where(
-                Channel.vertical == vertical
-            )
-        if section_slug is not None or department_id is not None:
-            if vertical is None:
-                stmt = stmt.join(Channel, Channel.id == Message.channel_id)
-            stmt = stmt.join(Section, Section.id == Channel.section_id)
-            if section_slug is not None:
-                stmt = stmt.where(Section.slug == section_slug)
-            if department_id is not None:
-                stmt = stmt.where(Section.department_id == department_id)
-        return (await session.execute(stmt)).scalar_one()
-
-
-async def classify_pending(batch_size: int = 5) -> int:
-    """Run LLM over a batch of unclassified messages across all channels.
-
-    Walks newest-first and picks the prompt/vertical/section from each
-    message's channel, so RE and HR channels (and per-section overrides)
-    share the same classifier worker without crosstalk.
-    """
-    if not settings.llm_enabled:
-        return 0
-    needs_work = _needs_work_clause(None)
-
-    async with session_scope() as session:
-        rows = (
-            await session.execute(
-                select(
-                    Message.id,
-                    Message.text,
-                    Channel.vertical,
-                    Section.slug,
-                    Section.department_id,
-                )
-                .join(Channel, Channel.id == Message.channel_id)
-                .join(Section, Section.id == Channel.section_id)
-                .where(Message.text.is_not(None), needs_work)
-                .order_by(Message.id.desc())
-                .limit(batch_size)
-            )
-        ).all()
-        if not rows:
-            return 0
-
-        updated = 0
-        for db_id, text, vertical, section_slug, department_id in rows:
-            # Skip if extracted already has THIS vertical's verdict. The batch filter
-            # above only matches rows with neither `lead` nor `hr_lead`, so this
-            # re-read is a defensive check rather than the primary filter.
-            existing = (
-                await session.execute(select(Message.extracted).where(Message.id == db_id))
-            ).scalar_one_or_none()
-            key = _verdict_key(vertical)
-            if existing and existing.get(key) is not None:
-                continue
-            extracted = await analyze_with_llm(text, vertical, department_id, section_slug)
-            msg = await session.get(Message, db_id)
-            if msg is None:
-                continue
-            msg.extracted = extracted
-            updated += 1
-
-    if updated:
-        log.info("classify_pending_batch", updated=updated)
-    return updated
-
-
 def build_scheduler() -> AsyncIOScheduler:
    scheduler = AsyncIOScheduler()
    scheduler.add_job(
@@ -348,14 +168,4 @@ def build_scheduler() -> AsyncIOScheduler:
        max_instances=1,
        coalesce=True,
    )
-    if settings.llm_enabled and settings.llm_classifier_owner != "go":
-        scheduler.add_job(
-            classify_pending,
-            "interval",
-            seconds=settings.llm_classify_interval_seconds,
-            id="classify_pending",
-            max_instances=1,
-            coalesce=True,
-            kwargs={"batch_size": settings.llm_classify_batch_size},
-        )
    return scheduler