Clean up monitoring TG Python adapter
This commit is contained in:
11
.env.example
11
.env.example
@@ -24,16 +24,17 @@ POSTGRES_PORT=5432
|
||||
POLL_INTERVAL_SECONDS=60
|
||||
POLL_HISTORY_LIMIT=50
|
||||
|
||||
# API
|
||||
# Go public API
|
||||
API_HOST=0.0.0.0
|
||||
API_PORT=8000
|
||||
PUBLIC_BASE_PATH=/api/monitoring-tg
|
||||
PYTHON_BASE_URL=http://127.0.0.1:8001
|
||||
|
||||
# Media (downloaded photos / small videos / docs from parsed messages)
|
||||
MEDIA_DIR=/data/media
|
||||
MEDIA_MAX_BYTES=20971520
|
||||
|
||||
# OpenAI-compatible LLM endpoint. In production this can point to the same
|
||||
# vLLM server/model used by telephony.
|
||||
# OpenAI-compatible LLM endpoint used by the Go classifier.
|
||||
LLM_ENABLED=true
|
||||
LLM_BASE_URL=http://10.2.3.5:8002
|
||||
LLM_API_KEY=
|
||||
@@ -41,8 +42,6 @@ LLM_MODEL=qwen2.5-14b
|
||||
LLM_TIMEOUT_SECONDS=120
|
||||
LLM_MAX_TOKENS=600
|
||||
LLM_MIN_TEXT_LENGTH=20
|
||||
LLM_CLASSIFIER_OWNER=python
|
||||
# How often the background classifier wakes up and how many messages it
|
||||
# processes per tick. With 5/20s ≈ 900 messages/hour at ~3-6s per call.
|
||||
LLM_CLASSIFIER_OWNER=go
|
||||
LLM_CLASSIFY_INTERVAL_SECONDS=20
|
||||
LLM_CLASSIFY_BATCH_SIZE=5
|
||||
|
||||
15
README.md
15
README.md
@@ -1,8 +1,9 @@
|
||||
# monitoring-tg
|
||||
|
||||
Backend-сервис мониторинга Telegram-каналов для Portal. Python-часть отвечает
|
||||
за MTProto/Telethon, API и опрос каналов, а фоновая AI-классификация вынесена
|
||||
в Go-воркер. Сервис сохраняет сообщения в Postgres, раскладывает каналы по
|
||||
Backend-сервис мониторинга Telegram-каналов для Portal. Публичный API и
|
||||
AI-классификация работают на Go, Python оставлен только как внутренний
|
||||
MTProto/Telethon-адаптер для авторизации, опроса каналов и дозагрузки медиа.
|
||||
Сервис сохраняет сообщения в Postgres, раскладывает каналы по
|
||||
вертикалям/подразделам и выполняет AI-анализ через OpenAI-compatible endpoint,
|
||||
общий с другими сервисами портала.
|
||||
|
||||
@@ -37,6 +38,7 @@ POSTGRES_PASSWORD=parser
|
||||
POSTGRES_DB=parser
|
||||
|
||||
PUBLIC_BASE_PATH=/api/monitoring-tg
|
||||
PYTHON_BASE_URL=http://127.0.0.1:8001
|
||||
|
||||
LLM_ENABLED=true
|
||||
LLM_BASE_URL=http://10.2.3.5:8002
|
||||
@@ -57,18 +59,19 @@ LLM_CLASSIFIER_OWNER=go
|
||||
kubectl apply -k k8s
|
||||
```
|
||||
|
||||
Миграции выполняются entrypoint-ом контейнера перед запуском API.
|
||||
Миграции выполняются entrypoint-ом контейнера перед запуском процессов.
|
||||
|
||||
## Структура
|
||||
|
||||
```text
|
||||
src/parser_bot/
|
||||
├── api/ FastAPI роуты + Pydantic-схемы
|
||||
├── api/ внутренние FastAPI роуты Telegram-адаптера
|
||||
├── db/ SQLAlchemy модели + сессии
|
||||
├── scheduler/ APScheduler-воркер периодического опроса
|
||||
├── telegram/ Telethon-клиент
|
||||
├── config.py pydantic-settings
|
||||
└── main.py FastAPI lifespan + uvicorn
|
||||
└── main.py FastAPI lifespan + uvicorn для внутреннего адаптера
|
||||
cmd/server/ Go API для Portal
|
||||
cmd/classifier/ Go-воркер фоновой LLM-классификации сообщений
|
||||
alembic/ миграции
|
||||
k8s/ манифесты для портала
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[project]
|
||||
name = "parser-tg-bot"
|
||||
version = "0.1.0"
|
||||
description = "Telegram channel parser — periodic polling + storage, future Go microservice"
|
||||
description = "Telegram channel monitoring service with Go API/classifier and Python Telethon adapter"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"telethon>=1.36",
|
||||
@@ -15,7 +15,6 @@ dependencies = [
|
||||
"pydantic-settings>=2.6",
|
||||
"python-dotenv>=1.0",
|
||||
"structlog>=24.4",
|
||||
"httpx>=0.27",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,219 +0,0 @@
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
||||
|
||||
Vertical = Literal["real_estate", "hr"]
|
||||
|
||||
# Section slugs are used as URL segments — keep them URL-safe.
|
||||
_SLUG_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,62}[a-z0-9]$|^[a-z0-9]$")
|
||||
|
||||
|
||||
class SectionCreate(BaseModel):
|
||||
vertical: Vertical
|
||||
slug: str = Field(..., min_length=1, max_length=64)
|
||||
title: str = Field(..., min_length=1, max_length=255)
|
||||
emoji: str | None = Field(None, max_length=8)
|
||||
description: str | None = None
|
||||
|
||||
@field_validator("slug")
|
||||
@classmethod
|
||||
def _check_slug(cls, v: str) -> str:
|
||||
if not _SLUG_RE.match(v):
|
||||
raise ValueError(
|
||||
"slug must be lowercase letters/digits with '-' or '_' separators"
|
||||
)
|
||||
return v
|
||||
|
||||
|
||||
class SectionUpdate(BaseModel):
|
||||
title: str | None = Field(None, min_length=1, max_length=255)
|
||||
emoji: str | None = Field(None, max_length=8)
|
||||
description: str | None = None
|
||||
|
||||
|
||||
class SectionOut(BaseModel):
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
id: int
|
||||
vertical: Vertical
|
||||
department_id: str | None = None
|
||||
slug: str
|
||||
title: str
|
||||
emoji: str | None
|
||||
description: str | None
|
||||
created_at: datetime
|
||||
|
||||
|
||||
class SectionWithStats(SectionOut):
|
||||
"""Section payload enriched with rollup counts for the section chooser page."""
|
||||
|
||||
channels_total: int = 0
|
||||
channels_active: int = 0
|
||||
messages_total: int = 0
|
||||
leads_total: int = 0
|
||||
|
||||
|
||||
class ChannelCreate(BaseModel):
|
||||
identifier: str = Field(..., min_length=1, max_length=255, description="@username or t.me link")
|
||||
vertical: Vertical = "real_estate"
|
||||
section: str = Field(
|
||||
..., min_length=1, max_length=64,
|
||||
description="Slug of the section inside the vertical (e.g. 'dubai')",
|
||||
)
|
||||
|
||||
|
||||
class ChannelUpdate(BaseModel):
|
||||
is_active: bool | None = None
|
||||
vertical: Vertical | None = None
|
||||
section: str | None = Field(
|
||||
None, min_length=1, max_length=64,
|
||||
description="Move the channel to another section in the same vertical",
|
||||
)
|
||||
|
||||
|
||||
class ChannelOut(BaseModel):
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
id: int
|
||||
tg_id: int | None
|
||||
identifier: str
|
||||
title: str | None
|
||||
vertical: Vertical
|
||||
section_id: int
|
||||
section_slug: str | None = None
|
||||
is_active: bool
|
||||
last_message_id: int | None
|
||||
last_polled_at: datetime | None
|
||||
created_at: datetime
|
||||
|
||||
|
||||
class ChannelStats(BaseModel):
|
||||
channel_id: int
|
||||
identifier: str
|
||||
title: str | None
|
||||
vertical: Vertical
|
||||
section_slug: str | None = None
|
||||
is_active: bool
|
||||
last_polled_at: datetime | None
|
||||
message_count: int
|
||||
last_message_at: datetime | None
|
||||
|
||||
|
||||
class MediaFile(BaseModel):
|
||||
kind: str # photo | video | document | audio | sticker | unknown
|
||||
url: str | None = None
|
||||
mime: str | None = None
|
||||
size: int | None = None
|
||||
skipped: str | None = None # set when not downloaded (e.g. "too_large")
|
||||
|
||||
|
||||
class RealEstate(BaseModel):
|
||||
kind: str | None = None
|
||||
property_type: str | None = None
|
||||
rooms: str | None = None
|
||||
area_m2: float | None = None
|
||||
price: str | None = None
|
||||
|
||||
|
||||
class Lead(BaseModel):
|
||||
is_listing: bool
|
||||
kind: str | None = None # sale | rent | purchase
|
||||
property_type: str | None = None
|
||||
rooms: str | None = None
|
||||
area_m2: float | None = None
|
||||
price_text: str | None = None
|
||||
price_value: float | None = None
|
||||
currency: str | None = None # RUB | USD | EUR | AED | GBP | CNY | TRY | KZT | BYN | UAH
|
||||
location: str | None = None
|
||||
contact_phone: str | None = None
|
||||
contact_name: str | None = None
|
||||
summary: str | None = None
|
||||
confidence: float = 0.0
|
||||
|
||||
|
||||
class HrLead(BaseModel):
|
||||
"""LLM verdict for HR-vertical messages (jobs / resumes / bare contacts)."""
|
||||
|
||||
is_lead: bool
|
||||
kind: str | None = None # vacancy | resume | contact
|
||||
title: str | None = None
|
||||
company: str | None = None
|
||||
candidate_name: str | None = None
|
||||
experience_years: float | None = None
|
||||
skills: list[str] = []
|
||||
location: str | None = None
|
||||
remote: bool | None = None
|
||||
employment_type: str | None = None
|
||||
salary_text: str | None = None
|
||||
salary_value: float | None = None
|
||||
currency: str | None = None
|
||||
contact_phone: str | None = None
|
||||
contact_name: str | None = None
|
||||
summary: str | None = None
|
||||
confidence: float = 0.0
|
||||
|
||||
|
||||
class Extracted(BaseModel):
|
||||
phones: list[str] = []
|
||||
names: list[str] = []
|
||||
tg_handles: list[str] = []
|
||||
real_estate: RealEstate | None = None
|
||||
lead: Lead | None = None
|
||||
hr_lead: HrLead | None = None
|
||||
|
||||
|
||||
class MessageOut(BaseModel):
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
id: int
|
||||
channel_id: int
|
||||
channel_vertical: Vertical | None = None
|
||||
channel_section_slug: str | None = None
|
||||
tg_message_id: int
|
||||
grouped_id: int | None = None
|
||||
group_size: int = 1
|
||||
date: datetime
|
||||
text: str | None
|
||||
sender_id: int | None
|
||||
has_media: bool
|
||||
media_files: list[MediaFile] | None = None
|
||||
extracted: Extracted | None = None
|
||||
sender_username: str | None = None
|
||||
sender_name: str | None = None
|
||||
post_url: str | None = None
|
||||
views: int | None
|
||||
forwards: int | None
|
||||
fetched_at: datetime
|
||||
|
||||
|
||||
class GlobalStats(BaseModel):
|
||||
vertical: Vertical
|
||||
section_slug: str | None = None
|
||||
channels_total: int
|
||||
channels_active: int
|
||||
messages_total: int
|
||||
messages_last_24h: int
|
||||
leads_total: int = 0
|
||||
leads_last_24h: int = 0
|
||||
poll_interval_seconds: int
|
||||
last_poll_at: datetime | None
|
||||
|
||||
|
||||
class AuthStatus(BaseModel):
|
||||
authorized: bool
|
||||
username: str | None = None
|
||||
phone: str | None = None
|
||||
|
||||
|
||||
class AuthCode(BaseModel):
|
||||
code: str = Field(..., min_length=3, max_length=12)
|
||||
|
||||
|
||||
class AuthPassword(BaseModel):
|
||||
password: str = Field(..., min_length=1)
|
||||
|
||||
|
||||
class AuthCodeResult(BaseModel):
|
||||
needs_password: bool
|
||||
@@ -29,18 +29,6 @@ class Settings(BaseSettings):
|
||||
media_dir: str = Field("/data/media", alias="MEDIA_DIR")
|
||||
media_max_bytes: int = Field(20 * 1024 * 1024, alias="MEDIA_MAX_BYTES")
|
||||
|
||||
# OpenAI-compatible LLM endpoint, shared with telephony/vLLM in production.
|
||||
llm_enabled: bool = Field(True, alias="LLM_ENABLED")
|
||||
llm_base_url: str = Field("http://10.2.3.5:8002", alias="LLM_BASE_URL")
|
||||
llm_api_key: str = Field("", alias="LLM_API_KEY")
|
||||
llm_model: str = Field("qwen2.5-14b", alias="LLM_MODEL")
|
||||
llm_timeout_seconds: int = Field(120, alias="LLM_TIMEOUT_SECONDS")
|
||||
llm_max_tokens: int = Field(600, alias="LLM_MAX_TOKENS")
|
||||
llm_min_text_length: int = Field(20, alias="LLM_MIN_TEXT_LENGTH")
|
||||
llm_classify_interval_seconds: int = Field(20, alias="LLM_CLASSIFY_INTERVAL_SECONDS")
|
||||
llm_classify_batch_size: int = Field(5, alias="LLM_CLASSIFY_BATCH_SIZE")
|
||||
llm_classifier_owner: str = Field("python", alias="LLM_CLASSIFIER_OWNER")
|
||||
|
||||
@property
|
||||
def database_url(self) -> str:
|
||||
return (
|
||||
|
||||
@@ -310,26 +310,3 @@ def analyze(text: str | None) -> dict[str, Any]:
|
||||
"tg_handles": extract_tg_handles(text),
|
||||
"real_estate": extract_real_estate(text),
|
||||
}
|
||||
|
||||
|
||||
async def analyze_with_llm(
|
||||
text: str | None,
|
||||
vertical: str = "real_estate",
|
||||
department_id: str | None = None,
|
||||
section_slug: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""Regex extraction + local LLM lead classification, routed by vertical.
|
||||
|
||||
`department_id` + `section_slug` let the classifier pick a department and
|
||||
section-specific system prompt with fallback to the department vertical
|
||||
prompt. The LLM verdict goes under `lead` for RE and
|
||||
under `hr_lead` for HR. Falls back to regex-only if the LLM is unavailable.
|
||||
"""
|
||||
base = analyze(text)
|
||||
# Lazy import to avoid hard dep on httpx in environments where LLM is off.
|
||||
from parser_bot.llm import classify
|
||||
|
||||
verdict = await classify(text, vertical, department_id, section_slug) # type: ignore[arg-type]
|
||||
if verdict is not None:
|
||||
base["hr_lead" if vertical == "hr" else "lead"] = verdict
|
||||
return base
|
||||
|
||||
@@ -1,44 +0,0 @@
|
||||
"""Build Telegram URLs from stored channel metadata."""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
_USERNAME_RE = re.compile(r"^@?([A-Za-z][A-Za-z0-9_]{4,31})$")
|
||||
_TME_URL_RE = re.compile(
|
||||
r"^(?:https?://)?(?:t|telegram)\.me/(?:s/)?([A-Za-z][A-Za-z0-9_]{4,31})(?:/.*)?$"
|
||||
)
|
||||
|
||||
|
||||
def channel_username(identifier: str | None) -> str | None:
|
||||
"""Extract the public username from a channel identifier if any.
|
||||
|
||||
Returns None for private channels (joinchat, +invite, raw IDs).
|
||||
"""
|
||||
if not identifier:
|
||||
return None
|
||||
s = identifier.strip()
|
||||
m = _USERNAME_RE.match(s)
|
||||
if m:
|
||||
return m.group(1)
|
||||
m = _TME_URL_RE.match(s)
|
||||
if m:
|
||||
return m.group(1)
|
||||
return None
|
||||
|
||||
|
||||
def post_url(identifier: str | None, tg_id: int | None, tg_message_id: int) -> str | None:
|
||||
"""Build a deep link to a Telegram post.
|
||||
|
||||
Public channel: https://t.me/<username>/<msg_id>
|
||||
Private channel (no public username, only tg_id): https://t.me/c/<short>/<msg_id>
|
||||
where <short> is the absolute id with the leading -100 stripped.
|
||||
"""
|
||||
username = channel_username(identifier)
|
||||
if username:
|
||||
return f"https://t.me/{username}/{tg_message_id}"
|
||||
if tg_id is None:
|
||||
return None
|
||||
raw = abs(tg_id)
|
||||
s = str(raw)
|
||||
short = s[3:] if s.startswith("100") and len(s) > 3 else s
|
||||
return f"https://t.me/c/{short}/{tg_message_id}"
|
||||
@@ -1,387 +0,0 @@
|
||||
"""OpenAI-compatible LLM client for lead classification & extraction.
|
||||
|
||||
Two verticals share one model and one process:
|
||||
- real_estate: high recall on listings (sale/rent/purchase),
|
||||
- hr: vacancies, resumes, bare contact leads.
|
||||
|
||||
The system prompt and JSON schema differ per vertical; the rest of the
|
||||
plumbing (timeouts, single-lock concurrency, JSON-mode parsing) is shared.
|
||||
On any error returns `None` and the caller falls back to regex-only extraction.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any, Literal
|
||||
|
||||
import httpx
|
||||
import structlog
|
||||
|
||||
from parser_bot.config import settings
|
||||
|
||||
log = structlog.get_logger()
|
||||
|
||||
|
||||
# Single shared lock so we never run two LLM requests at once on the GPU —
|
||||
# they would just thrash VRAM and finish slower than sequential.
|
||||
_lock = asyncio.Lock()
|
||||
|
||||
|
||||
Vertical = Literal["real_estate", "hr"]
|
||||
|
||||
|
||||
DEFAULT_RE_SYSTEM_PROMPT = """\
|
||||
Ты — аналитик объявлений о недвижимости. Тебе дают текст из Telegram-канала.
|
||||
Сообщение МОЖЕТ БЫТЬ НА ЛЮБОМ ЯЗЫКЕ — русский, английский, арабский, любой
|
||||
другой. Обрабатывай его одинаково независимо от языка.
|
||||
|
||||
Задача: определить, является ли это РЕАЛЬНЫМ объявлением о покупке, продаже
|
||||
или аренде НЕДВИЖИМОСТИ (квартира, дом/villa, студия/studio, апартаменты,
|
||||
комната, таунхаус/townhouse, дача, коттедж, пентхаус/penthouse, офис,
|
||||
склад, помещение, земельный участок/plot/land, гараж, машиноместо).
|
||||
Учитывай намёки и нечёткие формулировки — лучше отметить сомнительный лид
|
||||
как `is_listing=true` с низкой confidence, чем пропустить.
|
||||
|
||||
Сигналы что это ОБЪЯВЛЕНИЕ (kind):
|
||||
— продажа/sale: «продаётся», «продаю», «продажа», «for sale», «#forsale»,
|
||||
«selling price», «selling», «price», «AED 33M», ценник в любой валюте.
|
||||
— аренда/rent: «сдаётся», «сдаю», «аренда», «for rent», «to let», «rental»,
|
||||
«per year», «per month», «AED ... /year».
|
||||
— покупка/purchase: «куплю», «куплю в», «looking for», «want to buy»,
|
||||
«wanted», «requirement».
|
||||
|
||||
ОДНО сообщение может быть и про продажу, И про аренду одновременно
|
||||
(«FOR SALE | RENT» / «продажа или аренда»). В таком случае выбирай
|
||||
основное намерение по самому тексту; если равноценно — `kind="sale"`
|
||||
и упомяни аренду в summary.
|
||||
|
||||
НЕ объявления (is_listing=false):
|
||||
— общие новости / статьи / аналитика рынка;
|
||||
— воспоминания и истории («когда-то продавал квартиру»);
|
||||
— шутки, мемы, цитаты;
|
||||
— реклама услуг агентств без конкретного объекта;
|
||||
— чужие пересланные объявления без контактов и явного предложения от автора.
|
||||
|
||||
Отвечай СТРОГО валидным JSON по схеме (никаких комментариев, никакого markdown):
|
||||
{
|
||||
"is_listing": boolean,
|
||||
"kind": "sale" | "rent" | "purchase" | null,
|
||||
"property_type": "квартира" | "дом" | "студия" | "апартаменты" | "комната" | "таунхаус" | "дача" | "коттедж" | "офис" | "склад" | "помещение" | "участок" | "гараж" | "машиноместо" | null,
|
||||
"rooms": "студия" | "1-к" | "2-к" | "3-к" | "4-к" | "5+к" | null,
|
||||
"area_m2": number | null,
|
||||
"price_text": string | null,
|
||||
"price_value": number | null,
|
||||
"currency": "RUB" | "USD" | "EUR" | "AED" | "GBP" | "CNY" | "TRY" | "KZT" | "BYN" | "UAH" | null,
|
||||
"location": string | null,
|
||||
"contact_phone": string | null,
|
||||
"contact_name": string | null,
|
||||
"summary": string,
|
||||
"confidence": number
|
||||
}
|
||||
|
||||
Поля:
|
||||
- summary — ОДНО короткое предложение НА РУССКОМ языке (даже если исходный
|
||||
текст на английском или другом). Это нужно для единообразного UI.
|
||||
- property_type — пиши значение по-русски (villa→дом, apartment→квартира,
|
||||
townhouse→таунхаус, plot/land→участок, studio→студия, penthouse→апартаменты,
|
||||
house→дом, office→офис, warehouse→склад, retail→помещение).
|
||||
- rooms — для англоязычного «3BR», «3 BR», «3 bed», «3-bedroom» возвращай
|
||||
«3-к»; для «studio» → «студия».
|
||||
- area_m2 — площадь В КВАДРАТНЫХ МЕТРАХ. Если в тексте sqft / sq.ft / sq ft /
|
||||
square feet — переведи: m² = sqft × 0.0929. Округляй до целого.
|
||||
- confidence ∈ [0, 1]: 0.9+ если явное объявление с ценой/контактом,
|
||||
0.5–0.8 если правдоподобно, 0.2–0.4 если намёк.
|
||||
- price_text — точная цитата из текста («2.5 млн ₽», «AED 850 000», «$320k»,
|
||||
«300 тыс. дирхам», «د.إ 1.2M», «70,000,000 AED», «AED 4.3M», «AED 1.75M»).
|
||||
- price_value — числовая величина цены В УКАЗАННОЙ ВАЛЮТЕ (не конвертируй).
|
||||
Раскрывай сокращения: «AED 4.3M» → 4300000, «$320k» → 320000.
|
||||
- currency — определяй гибко: ₽/руб/р/RUB/рублей → RUB; $/USD/долл/бакс → USD;
|
||||
€/EUR/евро → EUR; AED/дирхам/дирхамов/дирхама/dh/dhs/د.إ/Dirhams → AED;
|
||||
₺/TRY/лир/лира → TRY; ¥/CNY/юань → CNY; ₸/KZT/тенге → KZT;
|
||||
Br/BYN/бел.руб → BYN; ₴/UAH/грн → UAH. Если не уверен — null.
|
||||
- contact_phone — любой номер телефона в тексте (с + или без, российский,
|
||||
ОАЭ, любой международный).
|
||||
"""
|
||||
|
||||
|
||||
DEFAULT_HR_SYSTEM_PROMPT = """\
|
||||
Ты — аналитик HR-объявлений. Тебе дают текст из Telegram-канала. Сообщение
|
||||
МОЖЕТ БЫТЬ НА ЛЮБОМ ЯЗЫКЕ — обрабатывай одинаково.
|
||||
|
||||
Задача: определить, относится ли сообщение к рынку труда, и какого типа лид
|
||||
это. Допускаются три типа (`kind`):
|
||||
— vacancy — компания/наниматель ищет сотрудника («ищем разработчика»,
|
||||
«hiring backend engineer», «требуется бухгалтер», «we are looking for»);
|
||||
— resume — соискатель ищет работу («ищу работу», «open to work», «available
|
||||
for hire», «рассматриваю предложения», «my CV», «резюме»);
|
||||
— contact — короткое сообщение с именем/контактом и намёком на профессию,
|
||||
без явной вакансии/резюме («Иван Петров, Python, +7…», «@nick — UI/UX,
|
||||
Дубай»). Используй, когда vacancy и resume не подходят, но из текста ясно,
|
||||
что это HR-контакт.
|
||||
|
||||
Лучше отметить сомнительный случай `is_lead=true` с низкой confidence,
|
||||
чем пропустить. НО полностью исключай:
|
||||
— общие новости и аналитика рынка труда без конкретной вакансии/резюме;
|
||||
— реклама курсов, школ, маркетплейсов услуг (Profi.ru и т.п.);
|
||||
— чужие пересланные посты без контактов и без явного предложения от автора;
|
||||
— объявления о продаже/аренде недвижимости, услуг и товаров;
|
||||
— мемы, шутки, цитаты.
|
||||
|
||||
Отвечай СТРОГО валидным JSON по схеме (никаких комментариев, никакого markdown):
|
||||
{
|
||||
"is_lead": boolean,
|
||||
"kind": "vacancy" | "resume" | "contact" | null,
|
||||
"title": string | null,
|
||||
"company": string | null,
|
||||
"candidate_name": string | null,
|
||||
"experience_years": number | null,
|
||||
"skills": string[],
|
||||
"location": string | null,
|
||||
"remote": boolean | null,
|
||||
"employment_type": "full-time" | "part-time" | "contract" | "internship" | null,
|
||||
"salary_text": string | null,
|
||||
"salary_value": number | null,
|
||||
"currency": "RUB" | "USD" | "EUR" | "AED" | "GBP" | "CNY" | "TRY" | "KZT" | "BYN" | "UAH" | null,
|
||||
"contact_phone": string | null,
|
||||
"contact_name": string | null,
|
||||
"summary": string,
|
||||
"confidence": number
|
||||
}
|
||||
|
||||
Поля:
|
||||
- title — должность/роль ОДНОЙ строкой («Senior Python Developer», «Бухгалтер»,
|
||||
«UI/UX-дизайнер»). Для resume — желаемая роль. Для contact — то, что заявлено.
|
||||
- company — название компании-нанимателя, если оно явно указано (vacancy).
|
||||
- candidate_name — ФИО или ник кандидата (resume / contact).
|
||||
- experience_years — стаж в годах числом. «5+ years» → 5. Если не указан — null.
|
||||
- skills — короткий массив ключевых навыков/технологий (до ~10 элементов).
|
||||
- remote — true для «удалёнка / remote / WFH / hybrid: remote», false для
|
||||
«офис / on-site», null если не указано.
|
||||
- employment_type — full-time для «полная занятость / full-time», part-time
|
||||
для «частичная / part-time», contract для «договор/контракт/freelance»,
|
||||
internship для «стажировка/internship». Иначе null.
|
||||
- salary_text — точная цитата с зарплатой («200–300k ₽», «$5k/mo», «AED 18,000 per month»).
|
||||
- salary_value — число В УКАЗАННОЙ ВАЛЮТЕ. Если диапазон — нижняя граница.
|
||||
Раскрывай сокращения: «200k» → 200000, «1.5M» → 1500000.
|
||||
- currency — определяй гибко: ₽/руб/RUB → RUB; $/USD/долл → USD; €/EUR/евро → EUR;
|
||||
AED/дирхам/dh/dhs → AED; ₺/TRY/лир → TRY; ¥/CNY/юань → CNY; ₸/KZT/тенге → KZT;
|
||||
Br/BYN/бел.руб → BYN; ₴/UAH/грн → UAH. Если не уверен — null.
|
||||
- contact_phone — любой номер телефона (RU / международный, с + или без).
|
||||
- contact_name — имя контактного лица (рекрутер / соискатель / автор).
|
||||
- summary — ОДНО короткое предложение НА РУССКОМ языке.
|
||||
- confidence ∈ [0, 1]: 0.9+ если явная вакансия/резюме с деталями, 0.5–0.8
|
||||
если правдоподобно, 0.2–0.4 если намёк.
|
||||
"""
|
||||
|
||||
|
||||
# Back-compat alias — older imports referenced DEFAULT_SYSTEM_PROMPT.
|
||||
DEFAULT_SYSTEM_PROMPT = DEFAULT_RE_SYSTEM_PROMPT
|
||||
|
||||
|
||||
def _build_user_prompt(text: str) -> str:
|
||||
return f"Текст сообщения:\n```\n{text}\n```\nВерни JSON."
|
||||
|
||||
|
||||
_VALID_CURRENCIES = {
|
||||
"RUB", "USD", "EUR", "AED", "GBP", "CNY", "TRY", "KZT", "BYN", "UAH"
|
||||
}
|
||||
|
||||
|
||||
def _coerce_real_estate(payload: Any) -> dict | None:
|
||||
if not isinstance(payload, dict):
|
||||
return None
|
||||
is_listing = bool(payload.get("is_listing"))
|
||||
currency = payload.get("currency")
|
||||
if currency is not None:
|
||||
currency = str(currency).upper()
|
||||
if currency not in _VALID_CURRENCIES:
|
||||
currency = None
|
||||
return {
|
||||
"is_listing": is_listing,
|
||||
"kind": payload.get("kind") if payload.get("kind") in ("sale", "rent", "purchase") else None,
|
||||
"property_type": payload.get("property_type") or None,
|
||||
"rooms": payload.get("rooms") or None,
|
||||
"area_m2": _as_float(payload.get("area_m2")),
|
||||
"price_text": payload.get("price_text") or None,
|
||||
"price_value": _as_float(payload.get("price_value")),
|
||||
"currency": currency,
|
||||
"location": payload.get("location") or None,
|
||||
"contact_phone": payload.get("contact_phone") or None,
|
||||
"contact_name": payload.get("contact_name") or None,
|
||||
"summary": (payload.get("summary") or "")[:300],
|
||||
"confidence": max(0.0, min(1.0, _as_float(payload.get("confidence")) or 0.0)),
|
||||
}
|
||||
|
||||
|
||||
def _coerce_hr(payload: Any) -> dict | None:
|
||||
if not isinstance(payload, dict):
|
||||
return None
|
||||
is_lead = bool(payload.get("is_lead"))
|
||||
currency = payload.get("currency")
|
||||
if currency is not None:
|
||||
currency = str(currency).upper()
|
||||
if currency not in _VALID_CURRENCIES:
|
||||
currency = None
|
||||
skills_raw = payload.get("skills") or []
|
||||
if isinstance(skills_raw, str):
|
||||
skills = [s.strip() for s in skills_raw.split(",") if s.strip()]
|
||||
elif isinstance(skills_raw, list):
|
||||
skills = [str(s).strip() for s in skills_raw if str(s).strip()]
|
||||
else:
|
||||
skills = []
|
||||
skills = skills[:15]
|
||||
employment = payload.get("employment_type")
|
||||
if employment is not None and employment not in (
|
||||
"full-time", "part-time", "contract", "internship"
|
||||
):
|
||||
employment = None
|
||||
remote_raw = payload.get("remote")
|
||||
remote = bool(remote_raw) if isinstance(remote_raw, bool) else None
|
||||
return {
|
||||
"is_lead": is_lead,
|
||||
"kind": payload.get("kind") if payload.get("kind") in ("vacancy", "resume", "contact") else None,
|
||||
"title": payload.get("title") or None,
|
||||
"company": payload.get("company") or None,
|
||||
"candidate_name": payload.get("candidate_name") or None,
|
||||
"experience_years": _as_float(payload.get("experience_years")),
|
||||
"skills": skills,
|
||||
"location": payload.get("location") or None,
|
||||
"remote": remote,
|
||||
"employment_type": employment,
|
||||
"salary_text": payload.get("salary_text") or None,
|
||||
"salary_value": _as_float(payload.get("salary_value")),
|
||||
"currency": currency,
|
||||
"contact_phone": payload.get("contact_phone") or None,
|
||||
"contact_name": payload.get("contact_name") or None,
|
||||
"summary": (payload.get("summary") or "")[:300],
|
||||
"confidence": max(0.0, min(1.0, _as_float(payload.get("confidence")) or 0.0)),
|
||||
}
|
||||
|
||||
|
||||
def _as_float(v: Any) -> float | None:
|
||||
if v is None or isinstance(v, bool):
|
||||
return None
|
||||
try:
|
||||
return float(v)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
async def is_ready() -> bool:
|
||||
"""Check that the OpenAI-compatible model endpoint is reachable."""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=5) as client:
|
||||
headers = _headers()
|
||||
r = await client.get(f"{_base_url()}/v1/models", headers=headers)
|
||||
r.raise_for_status()
|
||||
models = r.json().get("data", [])
|
||||
if not models:
|
||||
return True
|
||||
model_ids = {
|
||||
str(m.get("id") or m.get("name") or "")
|
||||
for m in models
|
||||
if isinstance(m, dict)
|
||||
}
|
||||
return settings.llm_model in model_ids or any(
|
||||
mid.startswith(settings.llm_model) for mid in model_ids
|
||||
)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _base_url() -> str:
|
||||
return settings.llm_base_url.rstrip("/")
|
||||
|
||||
|
||||
def _headers() -> dict[str, str]:
|
||||
headers = {"Content-Type": "application/json"}
|
||||
if settings.llm_api_key:
|
||||
headers["Authorization"] = f"Bearer {settings.llm_api_key}"
|
||||
return headers
|
||||
|
||||
|
||||
def default_prompt(vertical: Vertical) -> str:
|
||||
return DEFAULT_HR_SYSTEM_PROMPT if vertical == "hr" else DEFAULT_RE_SYSTEM_PROMPT
|
||||
|
||||
|
||||
async def classify(
|
||||
text: str | None,
|
||||
vertical: Vertical = "real_estate",
|
||||
department_id: str | None = None,
|
||||
section_slug: str | None = None,
|
||||
) -> dict | None:
|
||||
"""Classify a message text under the given vertical/section.
|
||||
|
||||
The system prompt is resolved with `section → vertical → built-in` fallback,
|
||||
so a per-section prompt can fine-tune extraction (e.g. AED/sqft for Dubai)
|
||||
while unconfigured sections keep using the vertical-wide prompt.
|
||||
Returns a vertical-specific structured dict or None on error / short text.
|
||||
"""
|
||||
if not settings.llm_enabled:
|
||||
return None
|
||||
if not text or len(text.strip()) < settings.llm_min_text_length:
|
||||
return None
|
||||
|
||||
# Lazy import to avoid a circular: prompt_store -> db.session -> config.
|
||||
from parser_bot import prompt_store
|
||||
|
||||
system = await prompt_store.resolve(
|
||||
vertical, department_id, section_slug, default_prompt(vertical)
|
||||
)
|
||||
payload = {
|
||||
"model": settings.llm_model,
|
||||
"messages": [
|
||||
{"role": "system", "content": system},
|
||||
{"role": "user", "content": _build_user_prompt(text)},
|
||||
],
|
||||
"temperature": 0.1,
|
||||
"max_tokens": settings.llm_max_tokens,
|
||||
"response_format": {"type": "json_object"},
|
||||
}
|
||||
async with _lock:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=settings.llm_timeout_seconds) as client:
|
||||
r = await client.post(
|
||||
f"{_base_url()}/v1/chat/completions",
|
||||
headers=_headers(),
|
||||
json=payload,
|
||||
)
|
||||
if r.status_code != 200:
|
||||
log.warning(
|
||||
"llm_request_failed",
|
||||
status=r.status_code,
|
||||
model=settings.llm_model,
|
||||
vertical=vertical,
|
||||
section=section_slug,
|
||||
body=r.text[:300],
|
||||
)
|
||||
return None
|
||||
data = r.json()
|
||||
except Exception as exc:
|
||||
log.warning(
|
||||
"llm_request_failed", error=str(exc), model=settings.llm_model, vertical=vertical
|
||||
)
|
||||
return None
|
||||
|
||||
choices = data.get("choices") or []
|
||||
message = choices[0].get("message") if choices and isinstance(choices[0], dict) else None
|
||||
raw = ((message or {}).get("content") or "").strip()
|
||||
if not raw:
|
||||
return None
|
||||
try:
|
||||
parsed = json.loads(raw)
|
||||
except json.JSONDecodeError:
|
||||
# Best effort: extract first {...} block.
|
||||
start, end = raw.find("{"), raw.rfind("}")
|
||||
if start == -1 or end == -1:
|
||||
log.warning("llm_invalid_json", raw=raw[:200], vertical=vertical)
|
||||
return None
|
||||
try:
|
||||
parsed = json.loads(raw[start : end + 1])
|
||||
except json.JSONDecodeError:
|
||||
log.warning("llm_invalid_json", raw=raw[:200], vertical=vertical)
|
||||
return None
|
||||
|
||||
if vertical == "hr":
|
||||
return _coerce_hr(parsed)
|
||||
return _coerce_real_estate(parsed)
|
||||
@@ -1,5 +1,4 @@
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
|
||||
import structlog
|
||||
import uvicorn
|
||||
@@ -7,7 +6,6 @@ from fastapi import Depends, FastAPI
|
||||
from fastapi.openapi.docs import get_redoc_html, get_swagger_ui_html
|
||||
from fastapi.openapi.utils import get_openapi
|
||||
from fastapi.responses import JSONResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
from parser_bot.access import require_admin
|
||||
from parser_bot.api.routes import router
|
||||
@@ -105,10 +103,6 @@ def create_app() -> FastAPI:
|
||||
title=app.title + " — redoc",
|
||||
)
|
||||
|
||||
media_dir = Path(settings.media_dir)
|
||||
media_dir.mkdir(parents=True, exist_ok=True)
|
||||
# /media is fine to cache — file names are content-stable.
|
||||
app.mount("/media", StaticFiles(directory=media_dir), name="media")
|
||||
return app
|
||||
|
||||
|
||||
|
||||
@@ -1,137 +0,0 @@
|
||||
"""Runtime-editable LLM system prompts, persisted in app_settings.
|
||||
|
||||
Three resolution levels with fallback (more specific → less specific):
|
||||
1. `llm_system_prompt:<department_id>:<vertical>:<section_slug>` — section override
|
||||
2. `llm_system_prompt:<department_id>:<vertical>` — department vertical override
|
||||
3. built-in DEFAULT_RE_SYSTEM_PROMPT / DEFAULT_HR_SYSTEM_PROMPT
|
||||
|
||||
The prompt is read on every classification call but cached for a short
|
||||
window so the DB isn't hit per-message. Edits via the API invalidate the
|
||||
cache for that level, so a save in the UI takes effect within seconds.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from typing import Literal
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
|
||||
from parser_bot.db.models import AppSetting
|
||||
from parser_bot.db.session import session_scope
|
||||
|
||||
Vertical = Literal["real_estate", "hr"]
|
||||
|
||||
_KEY_PREFIX = "llm_system_prompt:"
|
||||
_CACHE_TTL_S = 5.0
|
||||
_cache: dict[str, tuple[float, str | None]] = {}
|
||||
|
||||
|
||||
def _key(
|
||||
vertical: Vertical,
|
||||
department_id: str | None,
|
||||
section_slug: str | None = None,
|
||||
) -> str:
|
||||
dept = department_id or "global"
|
||||
if section_slug:
|
||||
return f"{_KEY_PREFIX}{dept}:{vertical}:{section_slug}"
|
||||
return f"{_KEY_PREFIX}{dept}:{vertical}"
|
||||
|
||||
|
||||
async def _load(key: str) -> str | None:
|
||||
"""Read a stored prompt by exact key. None if missing or empty."""
|
||||
now = time.monotonic()
|
||||
cached_at, cached_value = _cache.get(key, (0.0, None))
|
||||
if now - cached_at < _CACHE_TTL_S:
|
||||
return cached_value
|
||||
|
||||
async with session_scope() as session:
|
||||
row = await session.execute(
|
||||
select(AppSetting.value).where(AppSetting.key == key)
|
||||
)
|
||||
value = row.scalar_one_or_none()
|
||||
|
||||
text = value if isinstance(value, str) and value.strip() else None
|
||||
_cache[key] = (now, text)
|
||||
return text
|
||||
|
||||
|
||||
async def resolve(
|
||||
vertical: Vertical, department_id: str | None, section_slug: str | None, default: str
|
||||
) -> str:
|
||||
"""Pick the most specific prompt available, falling back to `default`.
|
||||
|
||||
Always consults section-level → vertical-level → default. This is what
|
||||
the classifier uses for every message.
|
||||
"""
|
||||
if section_slug:
|
||||
text = await _load(_key(vertical, department_id, section_slug))
|
||||
if text is not None:
|
||||
return text
|
||||
text = await _load(_key(vertical, department_id))
|
||||
if text is not None:
|
||||
return text
|
||||
return default
|
||||
|
||||
|
||||
async def get(
|
||||
vertical: Vertical, department_id: str | None, section_slug: str | None, default: str
|
||||
) -> tuple[str, str]:
|
||||
"""For the settings UI: return (text, source) where source is one of
|
||||
'section' | 'vertical' | 'default'. Lets the editor show which override
|
||||
is currently active without a second round-trip.
|
||||
"""
|
||||
if section_slug:
|
||||
text = await _load(_key(vertical, department_id, section_slug))
|
||||
if text is not None:
|
||||
return text, "section"
|
||||
text = await _load(_key(vertical, department_id))
|
||||
if text is not None:
|
||||
return text, "vertical"
|
||||
return default, "default"
|
||||
|
||||
|
||||
async def set_prompt(
|
||||
vertical: Vertical, department_id: str | None, section_slug: str | None, text: str
|
||||
) -> None:
|
||||
"""Save a new prompt at the given level (section or vertical)."""
|
||||
if not isinstance(text, str) or not text.strip():
|
||||
raise ValueError("prompt must be a non-empty string")
|
||||
key = _key(vertical, department_id, section_slug)
|
||||
async with session_scope() as session:
|
||||
stmt = (
|
||||
pg_insert(AppSetting)
|
||||
.values(key=key, value=text)
|
||||
.on_conflict_do_update(
|
||||
index_elements=["key"], set_={"value": text}
|
||||
)
|
||||
)
|
||||
await session.execute(stmt)
|
||||
invalidate(key)
|
||||
|
||||
|
||||
async def reset(
|
||||
vertical: Vertical, department_id: str | None, section_slug: str | None
|
||||
) -> None:
|
||||
"""Drop the override at the given level."""
|
||||
key = _key(vertical, department_id, section_slug)
|
||||
async with session_scope() as session:
|
||||
await session.execute(
|
||||
AppSetting.__table__.delete().where(AppSetting.key == key)
|
||||
)
|
||||
invalidate(key)
|
||||
|
||||
|
||||
def invalidate(key: str | None = None) -> None:
|
||||
if key is None:
|
||||
_cache.clear()
|
||||
else:
|
||||
_cache.pop(key, None)
|
||||
|
||||
|
||||
async def is_overridden(
|
||||
vertical: Vertical, department_id: str | None, section_slug: str | None = None
|
||||
) -> bool:
|
||||
"""True iff a custom prompt is stored at this exact level."""
|
||||
text = await _load(_key(vertical, department_id, section_slug))
|
||||
return text is not None
|
||||
@@ -6,9 +6,9 @@ from sqlalchemy import func, select
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
|
||||
from parser_bot.config import settings
|
||||
from parser_bot.db.models import Channel, Message, Section
|
||||
from parser_bot.db.models import Channel, Message
|
||||
from parser_bot.db.session import session_scope
|
||||
from parser_bot.extractors import analyze, analyze_with_llm
|
||||
from parser_bot.extractors import analyze
|
||||
from parser_bot.telegram.client import (
|
||||
fetch_new_messages,
|
||||
fetch_specific_messages_with_media,
|
||||
@@ -19,29 +19,6 @@ from parser_bot.telegram.client import (
|
||||
log = structlog.get_logger()
|
||||
|
||||
|
||||
def _verdict_key(vertical: str) -> str:
|
||||
"""JSONB key under `extracted` where the LLM verdict lives for this vertical."""
|
||||
return "hr_lead" if vertical == "hr" else "lead"
|
||||
|
||||
|
||||
def _needs_work_clause(vertical: str | None):
|
||||
"""Rows that still need LLM classification.
|
||||
|
||||
A row needs work when:
|
||||
- extracted IS NULL (never analyzed), or
|
||||
- the verdict for this vertical is missing.
|
||||
|
||||
Without `vertical`, falls back to "missing any verdict" — used by
|
||||
aggregate /llm/queue display when no vertical is selected.
|
||||
"""
|
||||
if vertical is None:
|
||||
return (Message.extracted.is_(None)) | (
|
||||
Message.extracted["lead"].is_(None) & Message.extracted["hr_lead"].is_(None)
|
||||
)
|
||||
key = _verdict_key(vertical)
|
||||
return (Message.extracted.is_(None)) | (Message.extracted[key].is_(None))
|
||||
|
||||
|
||||
async def poll_channel(channel_id: int) -> int:
|
||||
"""Poll one channel for new messages. Returns count of inserted rows."""
|
||||
async with session_scope() as session:
|
||||
@@ -63,9 +40,8 @@ async def poll_channel(channel_id: int) -> int:
|
||||
|
||||
inserted = 0
|
||||
for m in msgs:
|
||||
# Only the cheap regex pass runs in the poll path. LLM classification
|
||||
# is handled by `classify_pending` in a background scheduler job so
|
||||
# that a poll request never blocks on a 5s/message LLM call.
|
||||
# Only the cheap regex pass runs in Python. LLM classification is
|
||||
# handled by the Go classifier so Telegram polling stays lightweight.
|
||||
stmt = (
|
||||
pg_insert(Message)
|
||||
.values(
|
||||
@@ -182,162 +158,6 @@ async def backfill_media(channel_id: int, batch_size: int = 50) -> dict[str, int
|
||||
return {"updated": updated, "pending": max(0, pending_total - updated)}
|
||||
|
||||
|
||||
async def reanalyze_channel(channel_id: int, batch_size: int = 5) -> dict[str, int]:
|
||||
"""Re-run extractors (regex + LLM) over messages missing this channel's verdict.
|
||||
|
||||
Picks the vertical AND section from the channel row so the right LLM
|
||||
prompt is used. Only reanalyzes rows where the corresponding verdict key
|
||||
is missing. Newest first so fresh leads surface during long backfills.
|
||||
"""
|
||||
async with session_scope() as session:
|
||||
result = await session.execute(
|
||||
select(Channel, Section.slug, Section.department_id)
|
||||
.join(Section, Section.id == Channel.section_id)
|
||||
.where(Channel.id == channel_id)
|
||||
)
|
||||
row = result.one_or_none()
|
||||
if row is None:
|
||||
return {"updated": 0, "pending": 0}
|
||||
channel, section_slug, department_id = row
|
||||
vertical = channel.vertical
|
||||
needs_work = _needs_work_clause(vertical)
|
||||
|
||||
pending_total = (
|
||||
await session.execute(
|
||||
select(func.count(Message.id)).where(
|
||||
Message.channel_id == channel_id,
|
||||
Message.text.is_not(None),
|
||||
needs_work,
|
||||
)
|
||||
)
|
||||
).scalar_one()
|
||||
|
||||
rows = (
|
||||
await session.execute(
|
||||
select(Message.id, Message.text)
|
||||
.where(
|
||||
Message.channel_id == channel_id,
|
||||
Message.text.is_not(None),
|
||||
needs_work,
|
||||
)
|
||||
.order_by(Message.id.desc())
|
||||
.limit(batch_size)
|
||||
)
|
||||
).all()
|
||||
if not rows:
|
||||
return {"updated": 0, "pending": 0}
|
||||
|
||||
updated = 0
|
||||
for db_id, text in rows:
|
||||
extracted = (
|
||||
await analyze_with_llm(text, vertical, department_id, section_slug)
|
||||
if settings.llm_enabled
|
||||
else analyze(text)
|
||||
)
|
||||
msg = await session.get(Message, db_id)
|
||||
if msg is None:
|
||||
continue
|
||||
msg.extracted = extracted
|
||||
updated += 1
|
||||
|
||||
log.info(
|
||||
"reanalyzed_channel",
|
||||
channel_id=channel_id,
|
||||
vertical=vertical,
|
||||
section=section_slug,
|
||||
updated=updated,
|
||||
remaining=max(0, pending_total - updated),
|
||||
)
|
||||
return {"updated": updated, "pending": max(0, pending_total - updated)}
|
||||
|
||||
|
||||
async def pending_llm_count(
|
||||
vertical: str | None = None,
|
||||
section_slug: str | None = None,
|
||||
department_id: str | None = None,
|
||||
) -> int:
|
||||
"""How many text messages still need LLM classification.
|
||||
|
||||
When `vertical` is set, only counts messages from channels of that vertical
|
||||
(and optionally that section) whose vertical-specific verdict is missing.
|
||||
"""
|
||||
if not settings.llm_enabled:
|
||||
return 0
|
||||
needs_work = _needs_work_clause(vertical)
|
||||
async with session_scope() as session:
|
||||
stmt = select(func.count(Message.id)).where(
|
||||
Message.text.is_not(None),
|
||||
needs_work,
|
||||
)
|
||||
if vertical is not None:
|
||||
stmt = stmt.join(Channel, Channel.id == Message.channel_id).where(
|
||||
Channel.vertical == vertical
|
||||
)
|
||||
if section_slug is not None or department_id is not None:
|
||||
if vertical is None:
|
||||
stmt = stmt.join(Channel, Channel.id == Message.channel_id)
|
||||
stmt = stmt.join(Section, Section.id == Channel.section_id)
|
||||
if section_slug is not None:
|
||||
stmt = stmt.where(Section.slug == section_slug)
|
||||
if department_id is not None:
|
||||
stmt = stmt.where(Section.department_id == department_id)
|
||||
return (await session.execute(stmt)).scalar_one()
|
||||
|
||||
|
||||
async def classify_pending(batch_size: int = 5) -> int:
|
||||
"""Run LLM over a batch of unclassified messages across all channels.
|
||||
|
||||
Walks newest-first and picks the prompt/vertical/section from each
|
||||
message's channel, so RE and HR channels (and per-section overrides)
|
||||
share the same classifier worker without crosstalk.
|
||||
"""
|
||||
if not settings.llm_enabled:
|
||||
return 0
|
||||
needs_work = _needs_work_clause(None)
|
||||
|
||||
async with session_scope() as session:
|
||||
rows = (
|
||||
await session.execute(
|
||||
select(
|
||||
Message.id,
|
||||
Message.text,
|
||||
Channel.vertical,
|
||||
Section.slug,
|
||||
Section.department_id,
|
||||
)
|
||||
.join(Channel, Channel.id == Message.channel_id)
|
||||
.join(Section, Section.id == Channel.section_id)
|
||||
.where(Message.text.is_not(None), needs_work)
|
||||
.order_by(Message.id.desc())
|
||||
.limit(batch_size)
|
||||
)
|
||||
).all()
|
||||
if not rows:
|
||||
return 0
|
||||
|
||||
updated = 0
|
||||
for db_id, text, vertical, section_slug, department_id in rows:
|
||||
# If extracted already has THIS vertical's verdict, skip — needs_work
|
||||
# uses an OR over both keys and would otherwise re-run RE channels
|
||||
# that already have a lead just because hr_lead is null.
|
||||
existing = (
|
||||
await session.execute(select(Message.extracted).where(Message.id == db_id))
|
||||
).scalar_one_or_none()
|
||||
key = _verdict_key(vertical)
|
||||
if existing and existing.get(key) is not None:
|
||||
continue
|
||||
extracted = await analyze_with_llm(text, vertical, department_id, section_slug)
|
||||
msg = await session.get(Message, db_id)
|
||||
if msg is None:
|
||||
continue
|
||||
msg.extracted = extracted
|
||||
updated += 1
|
||||
|
||||
if updated:
|
||||
log.info("classify_pending_batch", updated=updated)
|
||||
return updated
|
||||
|
||||
|
||||
def build_scheduler() -> AsyncIOScheduler:
|
||||
scheduler = AsyncIOScheduler()
|
||||
scheduler.add_job(
|
||||
@@ -348,14 +168,4 @@ def build_scheduler() -> AsyncIOScheduler:
|
||||
max_instances=1,
|
||||
coalesce=True,
|
||||
)
|
||||
if settings.llm_enabled and settings.llm_classifier_owner != "go":
|
||||
scheduler.add_job(
|
||||
classify_pending,
|
||||
"interval",
|
||||
seconds=settings.llm_classify_interval_seconds,
|
||||
id="classify_pending",
|
||||
max_instances=1,
|
||||
coalesce=True,
|
||||
kwargs={"batch_size": settings.llm_classify_batch_size},
|
||||
)
|
||||
return scheduler
|
||||
|
||||
Reference in New Issue
Block a user