From f9e072774ca19ab2f3e619f250a14653cc84b0bb Mon Sep 17 00:00:00 2001 From: Grendgi Date: Thu, 4 Jun 2026 14:55:41 +0300 Subject: [PATCH] Add monitoring TG service --- .dockerignore | 21 + .env.example | 54 + .gitea/workflows/deploy.yaml | 58 + .gitignore | 14 + Dockerfile | 28 + README.md | 123 ++ alembic.ini | 39 + alembic/env.py | 52 + alembic/script.py.mako | 25 + alembic/versions/0001_initial.py | 71 ++ alembic/versions/0002_add_media_files.py | 28 + alembic/versions/0003_add_grouped_id.py | 39 + alembic/versions/0004_add_extracted.py | 34 + alembic/versions/0005_add_sender_info.py | 30 + alembic/versions/0006_add_app_settings.py | 35 + alembic/versions/0007_add_channel_vertical.py | 37 + alembic/versions/0008_add_sections.py | 110 ++ .../versions/0009_add_section_access_code.py | 24 + docker-compose.yml | 64 + docker/entrypoint.sh | 16 + k8s/configmap.yaml | 20 + k8s/kustomization.yaml | 12 + k8s/namespace.yaml | 4 + k8s/postgres.yaml | 65 + k8s/secrets.yaml | 25 + k8s/server-deployment.yaml | 70 ++ k8s/server-service.yaml | 18 + pyproject.toml | 44 + src/parser_bot/__init__.py | 0 src/parser_bot/access.py | 116 ++ src/parser_bot/api/__init__.py | 0 src/parser_bot/api/routes.py | 1048 +++++++++++++++++ src/parser_bot/api/schemas.py | 231 ++++ src/parser_bot/auth.py | 51 + src/parser_bot/config.py | 64 + src/parser_bot/db/__init__.py | 0 src/parser_bot/db/models.py | 119 ++ src/parser_bot/db/session.py | 25 + src/parser_bot/extractors.py | 334 ++++++ src/parser_bot/links.py | 44 + src/parser_bot/llm.py | 363 ++++++ src/parser_bot/main.py | 205 ++++ src/parser_bot/prompt_store.py | 130 ++ src/parser_bot/scheduler/__init__.py | 0 src/parser_bot/scheduler/poller.py | 349 ++++++ src/parser_bot/telegram/__init__.py | 0 src/parser_bot/telegram/client.py | 319 +++++ src/parser_bot/web/static/admin.html | 36 + src/parser_bot/web/static/auth.html | 85 ++ src/parser_bot/web/static/css/app.css | 241 ++++ src/parser_bot/web/static/hr/index.html | 99 ++ 
.../web/static/hr/section/channels.html | 48 + .../web/static/hr/section/index.html | 43 + .../web/static/hr/section/messages.html | 78 ++ .../web/static/hr/section/settings.html | 66 ++ src/parser_bot/web/static/index.html | 76 ++ src/parser_bot/web/static/js/access.js | 41 + src/parser_bot/web/static/js/admin.js | 49 + src/parser_bot/web/static/js/api.js | 192 +++ src/parser_bot/web/static/js/auth.js | 120 ++ src/parser_bot/web/static/js/channels.js | 132 +++ src/parser_bot/web/static/js/dashboard.js | 87 ++ src/parser_bot/web/static/js/messages.js | 433 +++++++ src/parser_bot/web/static/js/nav-status.js | 25 + src/parser_bot/web/static/js/nav.js | 71 ++ src/parser_bot/web/static/js/sections-list.js | 202 ++++ src/parser_bot/web/static/js/settings.js | 118 ++ src/parser_bot/web/static/js/slugify.js | 22 + src/parser_bot/web/static/js/vertical.js | 76 ++ .../web/static/real-estate/index.html | 99 ++ .../static/real-estate/section/channels.html | 48 + .../web/static/real-estate/section/index.html | 43 + .../static/real-estate/section/messages.html | 78 ++ .../static/real-estate/section/settings.html | 66 ++ 74 files changed, 7232 insertions(+) create mode 100644 .dockerignore create mode 100644 .env.example create mode 100644 .gitea/workflows/deploy.yaml create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 alembic.ini create mode 100644 alembic/env.py create mode 100644 alembic/script.py.mako create mode 100644 alembic/versions/0001_initial.py create mode 100644 alembic/versions/0002_add_media_files.py create mode 100644 alembic/versions/0003_add_grouped_id.py create mode 100644 alembic/versions/0004_add_extracted.py create mode 100644 alembic/versions/0005_add_sender_info.py create mode 100644 alembic/versions/0006_add_app_settings.py create mode 100644 alembic/versions/0007_add_channel_vertical.py create mode 100644 alembic/versions/0008_add_sections.py create mode 100644 
alembic/versions/0009_add_section_access_code.py create mode 100644 docker-compose.yml create mode 100644 docker/entrypoint.sh create mode 100644 k8s/configmap.yaml create mode 100644 k8s/kustomization.yaml create mode 100644 k8s/namespace.yaml create mode 100644 k8s/postgres.yaml create mode 100644 k8s/secrets.yaml create mode 100644 k8s/server-deployment.yaml create mode 100644 k8s/server-service.yaml create mode 100644 pyproject.toml create mode 100644 src/parser_bot/__init__.py create mode 100644 src/parser_bot/access.py create mode 100644 src/parser_bot/api/__init__.py create mode 100644 src/parser_bot/api/routes.py create mode 100644 src/parser_bot/api/schemas.py create mode 100644 src/parser_bot/auth.py create mode 100644 src/parser_bot/config.py create mode 100644 src/parser_bot/db/__init__.py create mode 100644 src/parser_bot/db/models.py create mode 100644 src/parser_bot/db/session.py create mode 100644 src/parser_bot/extractors.py create mode 100644 src/parser_bot/links.py create mode 100644 src/parser_bot/llm.py create mode 100644 src/parser_bot/main.py create mode 100644 src/parser_bot/prompt_store.py create mode 100644 src/parser_bot/scheduler/__init__.py create mode 100644 src/parser_bot/scheduler/poller.py create mode 100644 src/parser_bot/telegram/__init__.py create mode 100644 src/parser_bot/telegram/client.py create mode 100644 src/parser_bot/web/static/admin.html create mode 100644 src/parser_bot/web/static/auth.html create mode 100644 src/parser_bot/web/static/css/app.css create mode 100644 src/parser_bot/web/static/hr/index.html create mode 100644 src/parser_bot/web/static/hr/section/channels.html create mode 100644 src/parser_bot/web/static/hr/section/index.html create mode 100644 src/parser_bot/web/static/hr/section/messages.html create mode 100644 src/parser_bot/web/static/hr/section/settings.html create mode 100644 src/parser_bot/web/static/index.html create mode 100644 src/parser_bot/web/static/js/access.js create mode 100644 
src/parser_bot/web/static/js/admin.js create mode 100644 src/parser_bot/web/static/js/api.js create mode 100644 src/parser_bot/web/static/js/auth.js create mode 100644 src/parser_bot/web/static/js/channels.js create mode 100644 src/parser_bot/web/static/js/dashboard.js create mode 100644 src/parser_bot/web/static/js/messages.js create mode 100644 src/parser_bot/web/static/js/nav-status.js create mode 100644 src/parser_bot/web/static/js/nav.js create mode 100644 src/parser_bot/web/static/js/sections-list.js create mode 100644 src/parser_bot/web/static/js/settings.js create mode 100644 src/parser_bot/web/static/js/slugify.js create mode 100644 src/parser_bot/web/static/js/vertical.js create mode 100644 src/parser_bot/web/static/real-estate/index.html create mode 100644 src/parser_bot/web/static/real-estate/section/channels.html create mode 100644 src/parser_bot/web/static/real-estate/section/index.html create mode 100644 src/parser_bot/web/static/real-estate/section/messages.html create mode 100644 src/parser_bot/web/static/real-estate/section/settings.html diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..a62a50b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,21 @@ +.git/ +.gitignore +.gitea/ +.env +.venv/ +venv/ +__pycache__/ +**/__pycache__/ +*.pyc +*.pyo +*.egg-info/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.idea/ +.vscode/ +.claude/ +.DS_Store +data/ +*.session +*.session-journal diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..12fd497 --- /dev/null +++ b/.env.example @@ -0,0 +1,54 @@ +# Telegram MTProto credentials — get from https://my.telegram.org +TG_API_ID= +TG_API_HASH= +TG_PHONE= + +# --- ONE OF THE TWO BELOW IS REQUIRED --- +# Preferred (no volumes, k8s-friendly): get the string by running +# docker compose run --rm -it app python -m parser_bot.auth +# It prints `TG_SESSION_STRING=...` — paste that line here. +TG_SESSION_STRING= + +# Fallback (file-based): only used if TG_SESSION_STRING is empty. 
+# Requires mounting ./data/session as a volume. +TG_SESSION_PATH=/data/session/parser.session + +# Postgres +POSTGRES_USER=parser +POSTGRES_PASSWORD=parser +POSTGRES_DB=parser +POSTGRES_HOST=db +POSTGRES_PORT=5432 + +# Polling +POLL_INTERVAL_SECONDS=60 +POLL_HISTORY_LIMIT=50 + +# API +API_HOST=0.0.0.0 +API_PORT=8000 + +# Media (downloaded photos / small videos / docs from parsed messages); 20971520 bytes = 20 MiB +MEDIA_DIR=/data/media +MEDIA_MAX_BYTES=20971520 + +# Local LLM (Ollama) — runs Qwen 2.5 7B Q4 on CPU. Set LLM_ENABLED=false to disable. +LLM_ENABLED=true +LLM_BASE_URL=http://ollama:11434 +LLM_MODEL=qwen2.5:7b-instruct-q4_K_M +LLM_TIMEOUT_SECONDS=120 +LLM_MIN_TEXT_LENGTH=20 +# How often the background classifier wakes up and how many messages it +# processes per tick. The defaults below (5 messages every 20 s) give ≈ 900 messages/hour at ~3-6 s per call. +LLM_CLASSIFY_INTERVAL_SECONDS=20 +LLM_CLASSIFY_BATCH_SIZE=5 + +# Admin allowlist for /auth.html, /docs, /openapi.json, /redoc and the +# /api/v1/auth/* endpoints. Comma-separated list of client IPs. +# Empty = no restriction (everyone is admin) — convenient for local dev. +# Example: ADMIN_ALLOWED_IPS=89.110.109.221,127.0.0.1 +ADMIN_ALLOWED_IPS= +# Honor X-Forwarded-For / X-Real-IP from a reverse proxy (Docker port- +# forward, nginx, traefik) when resolving the client IP for the allowlist. 
+TRUST_PROXY_HEADERS=true + diff --git a/.gitea/workflows/deploy.yaml b/.gitea/workflows/deploy.yaml new file mode 100644 index 0000000..c42bff2 --- /dev/null +++ b/.gitea/workflows/deploy.yaml @@ -0,0 +1,58 @@ +name: Build and Deploy + +on: + push: + branches: [main] + +env: + INTERNAL_REGISTRY: gitea-http.gitea.svc.cluster.local:3000 + NODE_REGISTRY: localhost:30300 + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Docker CLI + run: | + curl -fsSL https://download.docker.com/linux/static/stable/x86_64/docker-27.5.1.tgz \ + | tar xz --strip-components=1 -C /usr/local/bin docker/docker + docker version + + - name: Install kubectl + run: | + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + chmod +x kubectl + mv kubectl /usr/local/bin/ + kubectl version --client + + - name: Login to Gitea Registry + run: | + echo "${{ secrets.REGISTRY_PASSWORD }}" | \ + docker login ${{ env.INTERNAL_REGISTRY }} \ + -u ${{ secrets.REGISTRY_USERNAME }} --password-stdin + + - name: Build and push server + run: | + docker build -f Dockerfile \ + -t ${{ env.INTERNAL_REGISTRY }}/admin/monitoring-tg-server:${{ github.sha }} \ + -t ${{ env.INTERNAL_REGISTRY }}/admin/monitoring-tg-server:latest \ + . 
+ docker push ${{ env.INTERNAL_REGISTRY }}/admin/monitoring-tg-server:${{ github.sha }} + docker push ${{ env.INTERNAL_REGISTRY }}/admin/monitoring-tg-server:latest + + - name: Deploy to Kubernetes + env: + KUBECONFIG: /kubeconfig/config + run: | + kubectl apply -f k8s/namespace.yaml + kubectl apply -f k8s/secrets.yaml + kubectl apply -f k8s/configmap.yaml + kubectl apply -f k8s/postgres.yaml + kubectl apply -f k8s/server-deployment.yaml + kubectl apply -f k8s/server-service.yaml + kubectl -n monitoring-tg set image deployment/monitoring-tg-server \ + monitoring-tg-server=${{ env.NODE_REGISTRY }}/admin/monitoring-tg-server:${{ github.sha }} + kubectl -n monitoring-tg rollout status deployment/monitoring-tg-server --timeout=180s diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9b6dffa --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +__pycache__/ +*.py[cod] +*.egg-info/ +.venv/ +venv/ +.env +*.session +*.session-journal +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.idea/ +.vscode/ +data/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..67a3295 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,28 @@ +FROM python:3.11-slim + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_NO_CACHE_DIR=1 + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +COPY pyproject.toml ./ +COPY src ./src +COPY alembic.ini ./ +COPY alembic ./alembic + +RUN pip install --upgrade pip && pip install -e . + +RUN mkdir -p /data/session /data/media + +COPY docker/entrypoint.sh /usr/local/bin/entrypoint.sh +RUN chmod +x /usr/local/bin/entrypoint.sh + +EXPOSE 8000 + +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] +CMD ["python", "-m", "parser_bot.main"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..632a9db --- /dev/null +++ b/README.md @@ -0,0 +1,123 @@ +# parser-tg-bot + +Парсер публичных Telegram-каналов на Telethon (MTProto). 
Сохраняет сообщения в Postgres, +управляется через REST API. Период опроса настраивается через `.env`. На следующем шаге +легко перевести на realtime через `events.NewMessage`. + +## Стек + +- Python 3.11, Telethon, FastAPI, SQLAlchemy 2 (async) + Alembic, APScheduler, Postgres 16 + +## Структура + +```text +src/parser_bot/ +├── api/ # FastAPI роуты + Pydantic-схемы +├── db/ # SQLAlchemy модели + сессии +├── scheduler/ # APScheduler-воркер периодического опроса +├── telegram/ # Telethon-клиент (resolve, fetch) +├── web/static/ # SPA-странички (HTML/CSS/JS, без бандлера) +├── config.py # pydantic-settings +└── main.py # FastAPI lifespan + uvicorn +alembic/ # миграции +``` + +## Первый запуск (локально, через Docker) + +1. Получить `api_id` и `api_hash` на [my.telegram.org](https://my.telegram.org) → API development tools. +2. Скопировать `.env.example` в `.env` и заполнить `TG_API_ID`, `TG_API_HASH`, `TG_PHONE`. +3. Поднять Postgres + накатить миграции: + + ```bash + docker compose up -d db + docker compose run --rm app alembic upgrade head + ``` + +4. Запуск: + + ```bash + docker compose up -d + docker compose logs app --tail=50 + ``` + +5. **Авторизация Telegram** — открыть [http://localhost:8000/auth.html](http://localhost:8000/auth.html) + и нажать «Отправить код». Telegram пришлёт код на номер из `TG_PHONE` → + ввести код (и 2FA-пароль, если включён). Готово, парсер начнёт опрос. + + Сессия сохраняется в `./data/session/parser.session` — рестарты её переиспользуют, + повторно входить не нужно. + +### Админ-доступ и коды подразделов + +- `ADMIN_PASSWORD` — дополнительный пароль для админских функций. Если не задан, + остаётся прежний режим: доступ определяется только `ADMIN_ALLOWED_IPS`. +- [http://localhost:8000/admin.html](http://localhost:8000/admin.html) — вход по + админ-паролю. После входа доступны удаление и редактирование подразделов, + просмотр их кодов, управление каналами, ручной опрос, промпты, авторизация + Telegram и Swagger. 
+- При создании подраздела обязательно задаётся `Код доступа`. Пользователь вводит + этот код при первом открытии данных подраздела; после входа он может добавлять + каналы в этот подраздел. Админ видит код в списке подразделов. + +### Прод-вариант: без UI и без volume (k8s-friendly) + +Сделай интерактивный логин **один раз** на dev-машине и получи опаковую строку: + +```bash +docker compose run --rm -it app python -m parser_bot.auth +``` + +Скрипт напечатает строку вида `TG_SESSION_STRING=1AbcD...`. Положи её в +`.env` или k8s Secret — после этого приложение поднимается без UI и без +монтирования сессионного файла: + +```ini +TG_SESSION_STRING=1AbcDef... # вместо TG_SESSION_PATH/volume +``` + +> ⚠️ **`ApiIdPublishedFloodError`** — Telegram заблокировал твою пару +> `api_id`/`api_hash` (попала в публичный доступ). Создай **новое** приложение +> на [my.telegram.org](https://my.telegram.org) и не публикуй креды нигде. +> Старый `api_id` восстановить нельзя. + +## UI + +После запуска доступны страницы: + +- [Дашборд](http://localhost:8000/) — общая статистика, топ каналов, кнопка опросить всех +- [Каналы](http://localhost:8000/channels.html) — добавить / удалить / включить-выключить / опросить вручную +- [Сообщения](http://localhost:8000/messages.html) — фильтр по каналу, поиск по тексту, пагинация, raw JSON +- [Настройки](http://localhost:8000/settings.html) — текущая конфигурация и подсказки +- [Авторизация](http://localhost:8000/auth.html) — веб-логин в Telegram (код + 2FA) +- [Swagger UI](http://localhost:8000/docs) — интерактивный API + +Глубокая ссылка `messages.html?channel_id=42` открывает ленту конкретного канала. 
+ +## API + +- `GET /healthz` — health check +- `GET /api/v1/auth/status` — авторизован ли клиент +- `POST /api/v1/auth/send-code` — отправить код на `TG_PHONE` +- `POST /api/v1/auth/submit-code` `{"code": "12345"}` — подтвердить код +- `POST /api/v1/auth/submit-password` `{"password": "..."}` — 2FA-пароль +- `POST /api/v1/auth/logout` — завершить сессию +- `GET /api/v1/stats` — глобальные счётчики +- `GET /api/v1/settings` — read-only вид конфигурации +- `GET /api/v1/channels` — список каналов +- `POST /api/v1/channels` `{"identifier": "@durov"}` — добавить +- `GET /api/v1/channels/{id}` — карточка +- `PATCH /api/v1/channels/{id}` `{"is_active": false}` — включить/выключить +- `DELETE /api/v1/channels/{id}` — удалить +- `GET /api/v1/channels/{id}/stats` — счётчики по каналу +- `POST /api/v1/channels/{id}/poll` — форсировать опрос одного канала +- `POST /api/v1/poll` — форсировать опрос всех активных каналов +- `GET /api/v1/messages?channel_id=...&q=...&limit=50&offset=0` — лента +- `GET /api/v1/messages/{id}` — одно сообщение (с `raw` JSONB) + +## Дальше + +- **Realtime**: заменить APScheduler на `client.add_event_handler(handler, events.NewMessage)`, + оставив periodic poll как фоновый «доводчик» для пропущенных сообщений. +- **Go-микросервис**: контракт = таблицы `channels` / `messages` в Postgres. + Go-сервис может либо читать ту же БД, либо ходить в `/api/v1/messages`. +- **k8s**: добавить Helm-чарт; `data/session/` маппится на PVC, `.env` — в Secret. 
diff --git a/alembic.ini b/alembic.ini new file mode 100644 index 0000000..958f295 --- /dev/null +++ b/alembic.ini @@ -0,0 +1,39 @@ +[alembic] +script_location = alembic +prepend_sys_path = src +version_path_separator = os +sqlalchemy.url = postgresql+asyncpg://parser:parser@db:5432/parser + +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/alembic/env.py b/alembic/env.py new file mode 100644 index 0000000..dbde691 --- /dev/null +++ b/alembic/env.py @@ -0,0 +1,52 @@ +import asyncio +from logging.config import fileConfig + +from alembic import context +from sqlalchemy import pool +from sqlalchemy.engine import Connection +from sqlalchemy.ext.asyncio import async_engine_from_config + +from parser_bot.config import settings +from parser_bot.db.models import Base + +config = context.config +config.set_main_option("sqlalchemy.url", settings.database_url) + +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +target_metadata = Base.metadata + + +def run_migrations_offline() -> None: + context.configure( + url=settings.database_url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + with context.begin_transaction(): + context.run_migrations() + + +def do_run_migrations(connection: Connection) -> None: + context.configure(connection=connection, target_metadata=target_metadata) + with context.begin_transaction(): + context.run_migrations() + + +async def run_migrations_online() -> None: + connectable = async_engine_from_config( + 
config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + async with connectable.connect() as connection: + await connection.run_sync(do_run_migrations) + await connectable.dispose() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + asyncio.run(run_migrations_online()) diff --git a/alembic/script.py.mako b/alembic/script.py.mako new file mode 100644 index 0000000..17dcba0 --- /dev/null +++ b/alembic/script.py.mako @@ -0,0 +1,25 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/alembic/versions/0001_initial.py b/alembic/versions/0001_initial.py new file mode 100644 index 0000000..523423f --- /dev/null +++ b/alembic/versions/0001_initial.py @@ -0,0 +1,71 @@ +"""initial schema: channels + messages + +Revision ID: 0001 +Revises: +Create Date: 2026-05-05 + +""" +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +revision: str = "0001" +down_revision: Union[str, None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "channels", + sa.Column("id", sa.Integer(), primary_key=True), + sa.Column("tg_id", sa.BigInteger(), nullable=True, unique=True), + sa.Column("identifier", sa.String(length=255), nullable=False, unique=True), + sa.Column("title", 
sa.String(length=512), nullable=True), + sa.Column("is_active", sa.Boolean(), nullable=False, server_default=sa.text("true")), + sa.Column("last_message_id", sa.BigInteger(), nullable=True), + sa.Column("last_polled_at", sa.DateTime(timezone=True), nullable=True), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + nullable=False, + server_default=sa.func.now(), + ), + ) + + op.create_table( + "messages", + sa.Column("id", sa.Integer(), primary_key=True), + sa.Column( + "channel_id", + sa.Integer(), + sa.ForeignKey("channels.id", ondelete="CASCADE"), + nullable=False, + ), + sa.Column("tg_message_id", sa.BigInteger(), nullable=False), + sa.Column("date", sa.DateTime(timezone=True), nullable=False), + sa.Column("text", sa.Text(), nullable=True), + sa.Column("sender_id", sa.BigInteger(), nullable=True), + sa.Column("has_media", sa.Boolean(), nullable=False, server_default=sa.text("false")), + sa.Column("views", sa.Integer(), nullable=True), + sa.Column("forwards", sa.Integer(), nullable=True), + sa.Column("raw", postgresql.JSONB(), nullable=True), + sa.Column( + "fetched_at", + sa.DateTime(timezone=True), + nullable=False, + server_default=sa.func.now(), + ), + sa.UniqueConstraint("channel_id", "tg_message_id", name="uq_channel_message"), + ) + op.create_index( + "ix_messages_channel_date", "messages", ["channel_id", "date"], unique=False + ) + + +def downgrade() -> None: + op.drop_index("ix_messages_channel_date", table_name="messages") + op.drop_table("messages") + op.drop_table("channels") diff --git a/alembic/versions/0002_add_media_files.py b/alembic/versions/0002_add_media_files.py new file mode 100644 index 0000000..ff2309e --- /dev/null +++ b/alembic/versions/0002_add_media_files.py @@ -0,0 +1,28 @@ +"""add media_files JSONB column to messages + +Revision ID: 0002 +Revises: 0001 +Create Date: 2026-05-05 + +""" +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + 
+revision: str = "0002" +down_revision: Union[str, None] = "0001" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column( + "messages", + sa.Column("media_files", postgresql.JSONB(), nullable=True), + ) + + +def downgrade() -> None: + op.drop_column("messages", "media_files") diff --git a/alembic/versions/0003_add_grouped_id.py b/alembic/versions/0003_add_grouped_id.py new file mode 100644 index 0000000..fb20d3f --- /dev/null +++ b/alembic/versions/0003_add_grouped_id.py @@ -0,0 +1,39 @@ +"""add grouped_id to messages (Telegram album/media-group key) + +Revision ID: 0003 +Revises: 0002 +Create Date: 2026-05-05 + +""" +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +revision: str = "0003" +down_revision: Union[str, None] = "0002" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column("messages", sa.Column("grouped_id", sa.BigInteger(), nullable=True)) + op.create_index( + "ix_messages_grouped_id", "messages", ["channel_id", "grouped_id"] + ) + # Backfill grouped_id from the stored raw JSONB for existing rows so that + # albums saved before this migration are grouped retroactively. 
+ op.execute( + """ + UPDATE messages + SET grouped_id = (raw->>'grouped_id')::bigint + WHERE grouped_id IS NULL + AND raw IS NOT NULL + AND raw->>'grouped_id' IS NOT NULL + """ + ) + + +def downgrade() -> None: + op.drop_index("ix_messages_grouped_id", table_name="messages") + op.drop_column("messages", "grouped_id") diff --git a/alembic/versions/0004_add_extracted.py b/alembic/versions/0004_add_extracted.py new file mode 100644 index 0000000..589cae2 --- /dev/null +++ b/alembic/versions/0004_add_extracted.py @@ -0,0 +1,34 @@ +"""add extracted JSONB column to messages + +Revision ID: 0004 +Revises: 0003 +Create Date: 2026-05-05 + +""" +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +revision: str = "0004" +down_revision: Union[str, None] = "0003" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column( + "messages", + sa.Column("extracted", postgresql.JSONB(), nullable=True), + ) + # GIN index for json queries (e.g. filter by extracted->'real_estate'->>'kind'). 
+ op.execute( + "CREATE INDEX IF NOT EXISTS ix_messages_extracted_gin " + "ON messages USING GIN (extracted)" + ) + + +def downgrade() -> None: + op.execute("DROP INDEX IF EXISTS ix_messages_extracted_gin") + op.drop_column("messages", "extracted") diff --git a/alembic/versions/0005_add_sender_info.py b/alembic/versions/0005_add_sender_info.py new file mode 100644 index 0000000..2a7f108 --- /dev/null +++ b/alembic/versions/0005_add_sender_info.py @@ -0,0 +1,30 @@ +"""add sender_username and sender_name to messages + +Revision ID: 0005 +Revises: 0004 +Create Date: 2026-05-06 + +""" +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +revision: str = "0005" +down_revision: Union[str, None] = "0004" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column( + "messages", sa.Column("sender_username", sa.String(length=64), nullable=True) + ) + op.add_column( + "messages", sa.Column("sender_name", sa.String(length=255), nullable=True) + ) + + +def downgrade() -> None: + op.drop_column("messages", "sender_name") + op.drop_column("messages", "sender_username") diff --git a/alembic/versions/0006_add_app_settings.py b/alembic/versions/0006_add_app_settings.py new file mode 100644 index 0000000..efd8596 --- /dev/null +++ b/alembic/versions/0006_add_app_settings.py @@ -0,0 +1,35 @@ +"""key/value store for runtime-editable settings (LLM prompt, etc.) 
+ +Revision ID: 0006 +Revises: 0005 +Create Date: 2026-05-06 + +""" +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +revision: str = "0006" +down_revision: Union[str, None] = "0005" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "app_settings", + sa.Column("key", sa.String(length=64), primary_key=True), + sa.Column("value", postgresql.JSONB(), nullable=False), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + nullable=False, + server_default=sa.func.now(), + ), + ) + + +def downgrade() -> None: + op.drop_table("app_settings") diff --git a/alembic/versions/0007_add_channel_vertical.py b/alembic/versions/0007_add_channel_vertical.py new file mode 100644 index 0000000..8d33460 --- /dev/null +++ b/alembic/versions/0007_add_channel_vertical.py @@ -0,0 +1,37 @@ +"""split channels into two verticals: real_estate / hr + +Existing rows get `real_estate` per the migration decision — the service was +real-estate-only before this column existed. 
+ +Revision ID: 0007 +Revises: 0006 +Create Date: 2026-05-19 + +""" +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +revision: str = "0007" +down_revision: Union[str, None] = "0006" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column( + "channels", + sa.Column( + "vertical", + sa.String(length=32), + nullable=False, + server_default="real_estate", + ), + ) + op.create_index("ix_channels_vertical", "channels", ["vertical"]) + + +def downgrade() -> None: + op.drop_index("ix_channels_vertical", table_name="channels") + op.drop_column("channels", "vertical") diff --git a/alembic/versions/0008_add_sections.py b/alembic/versions/0008_add_sections.py new file mode 100644 index 0000000..19dc3cd --- /dev/null +++ b/alembic/versions/0008_add_sections.py @@ -0,0 +1,110 @@ +"""sub-sections inside each vertical (e.g. Real Estate → Dubai / Moscow) + +A channel now belongs to exactly one section, and each section to exactly +one vertical. The migration auto-creates a `default` section (titled `Общий`, or +`Общий HR` for the `hr` vertical) per vertical that has at least one channel and pins +all existing channels there, so the service keeps working without manual reclassification after upgrade. 
# --- alembic/versions/0008_add_sections.py ---------------------------------
"""sub-sections inside each vertical (e.g. Real Estate → Dubai / Moscow)

A channel now belongs to exactly one section, and each section to exactly
one vertical. The migration auto-creates a `default` section (titled
`Общий` / `Общий HR`) per vertical that has at least one channel and pins
all existing channels there, so the service keeps working without manual
reclassification after upgrade.

Revision ID: 0008
Revises: 0007
Create Date: 2026-05-20

"""
from typing import Sequence, Union

import sqlalchemy as sa
from alembic import op

revision: str = "0008"
down_revision: Union[str, None] = "0007"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    # Step 1: the `sections` table itself. (vertical, slug) is the natural
    # key, enforced by the unique constraint.
    op.create_table(
        "sections",
        sa.Column("id", sa.Integer(), primary_key=True),
        sa.Column("vertical", sa.String(length=32), nullable=False),
        sa.Column("slug", sa.String(length=64), nullable=False),
        sa.Column("title", sa.String(length=255), nullable=False),
        sa.Column("emoji", sa.String(length=8), nullable=True),
        sa.Column("description", sa.Text(), nullable=True),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.func.now(),
        ),
        sa.UniqueConstraint("vertical", "slug", name="uq_section_vertical_slug"),
    )
    op.create_index("ix_sections_vertical", "sections", ["vertical"])

    # Step 2: auto-create a `default` section for each vertical that already
    # has channels, so the backfill below has somewhere to point.
    # `ON CONFLICT ... DO NOTHING` keeps the statement safe to re-run.
    op.execute(
        """
        INSERT INTO sections (vertical, slug, title, emoji)
        SELECT DISTINCT c.vertical,
               'default',
               CASE c.vertical
                   WHEN 'hr' THEN 'Общий HR'
                   ELSE 'Общий'
               END,
               CASE c.vertical WHEN 'hr' THEN '👥' ELSE '🏠' END
        FROM channels c
        ON CONFLICT (vertical, slug) DO NOTHING
        """
    )

    # Step 3: add section_id as nullable first so the backfill can populate it.
    op.add_column(
        "channels",
        sa.Column("section_id", sa.Integer(), nullable=True),
    )
    # RESTRICT: a section cannot be deleted while channels still point at it.
    op.create_foreign_key(
        "fk_channels_section",
        "channels",
        "sections",
        ["section_id"],
        ["id"],
        ondelete="RESTRICT",
    )
    op.create_index("ix_channels_section_id", "channels", ["section_id"])

    # Step 4: pin every existing channel to the `default` section of its
    # own vertical.
    op.execute(
        """
        UPDATE channels c
        SET section_id = s.id
        FROM sections s
        WHERE s.vertical = c.vertical AND s.slug = 'default'
        """
    )

    # Step 5: every channel row has a section now, so section_id can be
    # made mandatory.
    op.alter_column("channels", "section_id", nullable=False)

    # Per-section LLM prompt keys have the form
    # `llm_system_prompt:<vertical>:<slug>`; with slugs of up to 64 chars
    # they can exceed the old 64-char limit, so widen the key column.
    op.alter_column(
        "app_settings",
        "key",
        existing_type=sa.String(length=64),
        type_=sa.String(length=128),
        existing_nullable=False,
    )


def downgrade() -> None:
    # Reverse order of `upgrade`: narrow the settings key, detach channels
    # from sections, then drop the sections table.
    # NOTE(review): narrowing back to 64 chars presumably fails if any
    # stored key is longer than 64 chars — confirm before downgrading a
    # database that holds per-section prompts.
    op.alter_column(
        "app_settings",
        "key",
        existing_type=sa.String(length=128),
        type_=sa.String(length=64),
        existing_nullable=False,
    )
    op.drop_index("ix_channels_section_id", table_name="channels")
    op.drop_constraint("fk_channels_section", "channels", type_="foreignkey")
    op.drop_column("channels", "section_id")
    op.drop_index("ix_sections_vertical", table_name="sections")
    op.drop_table("sections")


# --- alembic/versions/0009_add_section_access_code.py ----------------------
"""add access code to sections

Revision ID: 0009
Revises: 0008
Create Date: 2026-05-29

"""
from typing import Sequence, Union

import sqlalchemy as sa
from alembic import op

revision: str = "0009"
down_revision: Union[str, None] = "0008"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    # Nullable: existing sections get no access code.
    op.add_column("sections", sa.Column("access_code", sa.String(length=255), nullable=True))


def downgrade() -> None:
    op.drop_column("sections", "access_code")
#!/bin/sh
set -e

# Run migrations on every container start. Idempotent: alembic skips
# revisions already applied. Skipped for one-shot commands like `alembic`
# itself (would deadlock when explicitly invoked), for interactive shells,
# and for the auth helper.

# Single-word commands: the first argument alone identifies them.
case "$1" in
  alembic|/bin/sh|sh|bash)
    exec "$@"
    ;;
esac

# The auth helper is invoked as several words (`python -m parser_bot.auth`),
# so "$1" is just `python` and can never equal the full command line.
# Match on the joined arguments instead.
case "$*" in
  "python -m parser_bot.auth"|"python -m parser_bot.auth "*)
    exec "$@"
    ;;
esac

echo "[entrypoint] running alembic upgrade head"
alembic upgrade head

exec "$@"
+ matchLabels: + app: postgres + template: + metadata: + labels: + app: postgres + spec: + containers: + - name: postgres + image: postgres:16-alpine + ports: + - containerPort: 5432 + envFrom: + - secretRef: + name: postgres-secret + volumeMounts: + - name: pgdata + mountPath: /var/lib/postgresql/data + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + livenessProbe: + exec: + command: ["pg_isready", "-U", "parser", "-d", "parser"] + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + exec: + command: ["pg_isready", "-U", "parser", "-d", "parser"] + initialDelaySeconds: 5 + periodSeconds: 5 + volumeClaimTemplates: + - metadata: + name: pgdata + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-path + resources: + requests: + storage: 5Gi diff --git a/k8s/secrets.yaml b/k8s/secrets.yaml new file mode 100644 index 0000000..d8e9308 --- /dev/null +++ b/k8s/secrets.yaml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: Secret +metadata: + name: monitoring-tg-secrets + namespace: monitoring-tg +type: Opaque +stringData: + TG_API_ID: "0" + TG_API_HASH: "CHANGE_ME" + TG_PHONE: "CHANGE_ME" + TG_SESSION_STRING: "" + POSTGRES_PASSWORD: "parser" + ADMIN_ALLOWED_IPS: "" + ADMIN_PASSWORD: "CHANGE_ME" +--- +apiVersion: v1 +kind: Secret +metadata: + name: postgres-secret + namespace: monitoring-tg +type: Opaque +stringData: + POSTGRES_USER: "parser" + POSTGRES_PASSWORD: "parser" + POSTGRES_DB: "parser" diff --git a/k8s/server-deployment.yaml b/k8s/server-deployment.yaml new file mode 100644 index 0000000..5890dcb --- /dev/null +++ b/k8s/server-deployment.yaml @@ -0,0 +1,70 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: monitoring-tg-data + namespace: monitoring-tg +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: local-path + resources: + requests: + storage: 10Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: monitoring-tg-server + namespace: monitoring-tg +spec: + replicas: 1 
+ selector: + matchLabels: + app: monitoring-tg-server + template: + metadata: + labels: + app: monitoring-tg-server + spec: + terminationGracePeriodSeconds: 20 + securityContext: + fsGroup: 1000 + containers: + - name: monitoring-tg-server + image: localhost:30300/admin/monitoring-tg-server:latest + ports: + - containerPort: 8000 + envFrom: + - configMapRef: + name: monitoring-tg-config + - secretRef: + name: monitoring-tg-secrets + volumeMounts: + - name: app-data + mountPath: /data + startupProbe: + httpGet: + path: /healthz + port: 8000 + periodSeconds: 5 + failureThreshold: 30 + livenessProbe: + httpGet: + path: /healthz + port: 8000 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /healthz + port: 8000 + periodSeconds: 5 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 800m + memory: 1Gi + volumes: + - name: app-data + persistentVolumeClaim: + claimName: monitoring-tg-data diff --git a/k8s/server-service.yaml b/k8s/server-service.yaml new file mode 100644 index 0000000..db9f4d2 --- /dev/null +++ b/k8s/server-service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: monitoring-tg-server + namespace: monitoring-tg + annotations: + portal.estateliga.work/enabled: "true" + portal.estateliga.work/name: "Мониторинг TG" + portal.estateliga.work/description: "Парсер и анализ Telegram-каналов" + portal.estateliga.work/icon: "pulse" + portal.estateliga.work/path: "/api/monitoring-tg" + portal.estateliga.work/code: "monitoring_tg" +spec: + selector: + app: monitoring-tg-server + ports: + - port: 80 + targetPort: 8000 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..82c5664 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,44 @@ +[project] +name = "parser-tg-bot" +version = "0.1.0" +description = "Telegram channel parser — periodic polling + storage, future Go microservice" +requires-python = ">=3.11" +dependencies = [ + "telethon>=1.36", + "fastapi>=0.115", + "uvicorn[standard]>=0.32", + 
"sqlalchemy[asyncio]>=2.0", + "asyncpg>=0.30", + "alembic>=1.14", + "apscheduler>=3.10", + "pydantic>=2.9", + "pydantic-settings>=2.6", + "python-dotenv>=1.0", + "structlog>=24.4", + "httpx>=0.27", +] + +[project.optional-dependencies] +dev = [ + "ruff>=0.7", + "mypy>=1.13", + "pytest>=8.3", + "pytest-asyncio>=0.24", +] + +[build-system] +requires = ["setuptools>=68"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +"parser_bot.web" = ["static/*", "static/**/*"] + +[tool.ruff] +line-length = 100 +target-version = "py311" + +[tool.pytest.ini_options] +asyncio_mode = "auto" diff --git a/src/parser_bot/__init__.py b/src/parser_bot/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/parser_bot/access.py b/src/parser_bot/access.py new file mode 100644 index 0000000..dbbb460 --- /dev/null +++ b/src/parser_bot/access.py @@ -0,0 +1,116 @@ +"""Admin access helpers for admin-only surfaces (auth, OpenAPI docs). + +Resolution: + 1. If `ADMIN_ALLOWED_IPS` is empty → no network restriction. + 2. Otherwise the request's client IP must be in the allowlist. + 3. When `TRUST_PROXY_HEADERS=true` (default) and one of the proxy headers + is present, the first IP in `X-Forwarded-For` (or `X-Real-IP`) is used. + Without this, behind a Docker port-forward the source IP is always the + gateway, which is useless for ACLs. + 4. If `ADMIN_PASSWORD` is set, the request must also present a valid signed + admin cookie or the password in `X-Admin-Password`. 
ADMIN_COOKIE = "parser_admin"
_ADMIN_TOKEN_MESSAGE = b"parser-tg-bot-admin-v1"


def _constant_time_equal(left: str, right: str) -> bool:
    """Compare two strings in constant time, for any Unicode content.

    `secrets.compare_digest` accepts `str` arguments only when they are pure
    ASCII and raises `TypeError` otherwise, so both sides are encoded to
    bytes first. Without this, a non-ASCII password or a crafted
    header/cookie value would surface as an unhandled 500.
    """
    return secrets.compare_digest(
        left.encode("utf-8", "surrogatepass"),
        right.encode("utf-8", "surrogatepass"),
    )


def client_ip(request: Request) -> str:
    """Best-effort source IP of the request.

    With `settings.trust_proxy_headers` on, the first entry of
    `X-Forwarded-For` wins, then `X-Real-IP`; otherwise (or when neither
    header is present) the socket peer address is used, falling back to
    `0.0.0.0` when the peer is unknown.
    """
    if settings.trust_proxy_headers:
        xff = request.headers.get("x-forwarded-for")
        if xff:
            # Standard form: "client, proxy1, proxy2" — first is closest to user.
            # NOTE(review): the first entry is client-supplied unless the
            # fronting proxy overwrites the header — confirm the proxy does
            # so before relying on the IP allowlist.
            return xff.split(",")[0].strip()
        real = request.headers.get("x-real-ip")
        if real:
            return real.strip()
    return request.client.host if request.client else "0.0.0.0"


def is_admin_network_allowed(request: Request) -> bool:
    """Return True when the client IP passes the allowlist (empty list = open)."""
    allowed = settings.admin_ip_set
    if not allowed:
        return True
    return client_ip(request) in allowed


def admin_password_enabled() -> bool:
    """Return True when an admin password is configured."""
    return bool(settings.admin_password)


def verify_admin_password(password: str | None) -> bool:
    """Check a submitted admin password.

    Returns True unconditionally when no admin password is configured,
    False when one is configured but none was submitted, and otherwise the
    result of a constant-time comparison.
    """
    if not settings.admin_password:
        return True
    if password is None:
        return False
    return _constant_time_equal(password, settings.admin_password)


def admin_token() -> str:
    """Return the admin cookie value: HMAC-SHA256 keyed by the admin password.

    The token changes whenever the password changes, so previously issued
    cookies stop validating after a password change.
    """
    return hmac.new(
        settings.admin_password.encode("utf-8"),
        _ADMIN_TOKEN_MESSAGE,
        hashlib.sha256,
    ).hexdigest()


def verify_admin_token(token: str | None) -> bool:
    """Check an admin cookie value; same None/disabled rules as the password."""
    if not settings.admin_password:
        return True
    if token is None:
        return False
    return _constant_time_equal(token, admin_token())


def set_admin_cookie(response: Response) -> None:
    """Attach the 30-day admin cookie to `response`."""
    response.set_cookie(
        ADMIN_COOKIE,
        admin_token(),
        httponly=True,
        samesite="lax",
        # NOTE(review): secure=False allows the cookie over plain HTTP —
        # confirm this is intended for the deployed (TLS) environment.
        secure=False,
        max_age=60 * 60 * 24 * 30,
    )


def clear_admin_cookie(response: Response) -> None:
    """Remove the admin cookie from the client."""
    response.delete_cookie(ADMIN_COOKIE)


def is_admin_request(request: Request) -> bool:
    """Return True when the request passes the IP allowlist and, if a
    password is configured, carries a valid admin cookie or the password in
    `X-Admin-Password`.
    """
    if not is_admin_network_allowed(request):
        return False
    if not settings.admin_password:
        return True
    return verify_admin_token(request.cookies.get(ADMIN_COOKIE)) or verify_admin_password(
        request.headers.get("x-admin-password")
    )


def require_admin_network(request: Request) -> None:
    """FastAPI dependency for the admin login page/API.

    This keeps the IP allowlist useful even before the password cookie exists.

    Raises:
        HTTPException: 404 when the client IP is outside the allowlist.
    """
    if not is_admin_network_allowed(request):
        raise HTTPException(status_code=404)


def require_admin(request: Request) -> None:
    """FastAPI dependency: 404 for non-admins.

    Admin endpoints keep returning 404 instead of 403 to avoid advertising
    their existence to clients outside the admin boundary.

    Raises:
        HTTPException: 404 when `is_admin_request` is False.
    """
    if not is_admin_request(request):
        raise HTTPException(status_code=404)
def _verdict_key(vertical: str) -> str:
    """Return the key under `Message.extracted` that holds the LLM verdict."""
    return "hr_lead" if vertical == "hr" else "lead"


async def _get_section(
    session: AsyncSession, vertical: Vertical, slug: str
) -> Section:
    """Find a section by (vertical, slug) or 404."""
    result = await session.execute(
        select(Section).where(Section.vertical == vertical, Section.slug == slug)
    )
    section = result.scalar_one_or_none()
    if section is None:
        raise HTTPException(
            status_code=404, detail=f"section {vertical}:{slug} not found"
        )
    return section


def _section_cookie_name(vertical: str, slug: str) -> str:
    """Return the name of the per-section unlock cookie."""
    return f"parser_section_{vertical}_{slug}"


def _section_token(section: Section) -> str:
    """Return the unlock-cookie value for `section`.

    HMAC-SHA256 keyed by the access code over `vertical:slug`, so the token
    is invalidated when the code changes. Sections without an access code
    yield an empty string.
    """
    if not section.access_code:
        return ""
    return hmac.new(
        section.access_code.encode("utf-8"),
        f"{section.vertical}:{section.slug}".encode("utf-8"),
        hashlib.sha256,
    ).hexdigest()


def _set_section_cookie(response: Response, section: Section) -> None:
    """Attach the 30-day unlock cookie for `section` to `response`."""
    response.set_cookie(
        _section_cookie_name(section.vertical, section.slug),
        _section_token(section),
        httponly=True,
        samesite="lax",
        secure=False,
        max_age=60 * 60 * 24 * 30,
    )


def _section_is_unlocked(request: Request, section: Section) -> bool:
    """Return True when the request may read `section`.

    Sections without an access code are always open. Otherwise the request
    must carry either the plain code in `X-Section-Code` or a valid unlock
    cookie.

    Comparisons are done on UTF-8 bytes: `secrets.compare_digest` raises
    `TypeError` for non-ASCII `str` input, and both the stored code and the
    client-supplied header/cookie may contain non-ASCII characters.
    """
    if not section.access_code:
        return True
    direct_code = request.headers.get("x-section-code")
    if direct_code and secrets.compare_digest(
        direct_code.encode("utf-8", "surrogatepass"),
        section.access_code.encode("utf-8", "surrogatepass"),
    ):
        return True
    cookie_token = request.cookies.get(
        _section_cookie_name(section.vertical, section.slug)
    )
    return secrets.compare_digest(
        (cookie_token or "").encode("utf-8", "surrogatepass"),
        _section_token(section).encode("utf-8"),
    )


async def _require_scope_access(
    request: Request,
    session: AsyncSession,
    vertical: Vertical | None,
    section_slug: str | None,
) -> Section | None:
    """Authorize a read/write against a (vertical, section) scope.

    Admins pass unconditionally and get `None` back. Everyone else must name
    both a vertical and a section and have that section unlocked; the
    resolved section is returned.

    Raises:
        HTTPException: 401 when the scope is incomplete or locked, 404 when
            the section does not exist.
    """
    if is_admin_request(request):
        return None
    if vertical is None or section_slug is None:
        raise HTTPException(status_code=401, detail="section code required")
    section = await _get_section(session, vertical, section_slug)
    if not _section_is_unlocked(request, section):
        raise HTTPException(status_code=401, detail="section code required")
    return section


async def _get_channel_in_scope(
    session: AsyncSession,
    channel_id: int,
    vertical: Vertical | None,
    section_slug: str | None,
) -> tuple[Channel, str]:
    """Load a channel and its section slug; 404 if any scope constraint fails.

    All per-id endpoints route through this so a vertical/section-scoped UI
    cannot accidentally read or mutate something from another scope.
    """
    result = await session.execute(
        select(Channel, Section.slug)
        .join(Section, Section.id == Channel.section_id)
        .where(Channel.id == channel_id)
    )
    row = result.one_or_none()
    if row is None:
        raise HTTPException(status_code=404)
    channel, ch_section_slug = row
    # A `None` scope value means "no constraint on this axis".
    if vertical is not None and channel.vertical != vertical:
        raise HTTPException(status_code=404)
    if section_slug is not None and ch_section_slug != section_slug:
        raise HTTPException(status_code=404)
    return channel, ch_section_slug


def _channel_out(channel: Channel, section_slug: str | None) -> dict[str, Any]:
    """Build the `ChannelOut` response payload for `channel`."""
    return {
        "id": channel.id,
        "tg_id": channel.tg_id,
        "identifier": channel.identifier,
        "title": channel.title,
        "vertical": channel.vertical,
        "section_id": channel.section_id,
        "section_slug": section_slug,
        "is_active": channel.is_active,
        "last_message_id": channel.last_message_id,
        "last_polled_at": channel.last_polled_at,
        "created_at": channel.created_at,
    }
# --- Access (admin allowlist) -------------------------------------------


@router.get("/access/me")
async def access_me(request: Request) -> dict[str, Any]:
    """Tell the frontend whether the current client is on the admin allowlist.

    Used by JS to show/hide the «Авторизация» and «API» nav links. Always
    returns 200 (so it's safe to call from every page); the boolean is what
    the UI keys off. The client IP is echoed back only to admins.
    """
    admin = is_admin_request(request)
    ip_allowed = is_admin_network_allowed(request)
    return {
        "is_admin": admin,
        "admin_password_enabled": admin_password_enabled(),
        "admin_ip_allowed": ip_allowed,
        "ip": client_ip(request) if admin else None,
        "restricted": bool(settings.admin_ip_set),
    }


@router.post(
    "/access/admin-login",
    status_code=204,
    dependencies=[Depends(require_admin_network)],
)
async def admin_login(payload: AdminLogin, response: Response) -> None:
    """Exchange the admin password for the admin cookie.

    Raises:
        HTTPException: 401 on a wrong password (404 from the dependency when
            the client is outside the IP allowlist).
    """
    if not verify_admin_password(payload.password):
        raise HTTPException(status_code=401, detail="invalid admin password")
    # Without a configured password there is no token to issue.
    if admin_password_enabled():
        set_admin_cookie(response)


@router.post("/access/admin-logout", status_code=204)
async def admin_logout(response: Response) -> None:
    """Drop the admin cookie."""
    clear_admin_cookie(response)


@router.post("/access/section-login", status_code=204)
async def section_login(
    payload: SectionLogin,
    response: Response,
    session: AsyncSession = Depends(get_session),
) -> None:
    """Exchange a section access code for that section's unlock cookie.

    Sections without an access code always succeed.

    Raises:
        HTTPException: 404 for an unknown section, 401 for a wrong code.
    """
    section = await _get_section(session, payload.vertical, payload.section)
    if not section.access_code:
        _set_section_cookie(response, section)
        return
    # Compare as UTF-8 bytes: `secrets.compare_digest` raises TypeError on
    # non-ASCII `str`, which would turn a non-ASCII code into a 500.
    if not secrets.compare_digest(
        payload.code.encode("utf-8", "surrogatepass"),
        section.access_code.encode("utf-8", "surrogatepass"),
    ):
        raise HTTPException(status_code=401, detail="invalid section code")
    _set_section_cookie(response, section)


# --- Auth (admin-only) --------------------------------------------------
# Telegram session controls are an admin surface — gate with the same
# IP allowlist so an unauth visitor can't even probe the login state.
@router.get("/auth/status", response_model=AuthStatus, dependencies=[Depends(require_admin)])
async def auth_status() -> AuthStatus:
    """Report whether the Telegram session is logged in, and as whom."""
    username = await tg.current_username()
    return AuthStatus(
        authorized=username is not None,
        username=username,
        phone=settings.tg_phone,
    )


@router.post(
    "/auth/send-code", status_code=204, dependencies=[Depends(require_admin)]
)
async def auth_send_code() -> None:
    """Ask Telegram to send a login code; any failure becomes a 400."""
    try:
        await tg.send_login_code()
    except Exception as exc:
        # Chain the cause so the original traceback survives in logs.
        raise HTTPException(status_code=400, detail=str(exc)) from exc


@router.post(
    "/auth/submit-code",
    response_model=AuthCodeResult,
    dependencies=[Depends(require_admin)],
)
async def auth_submit_code(payload: AuthCode) -> AuthCodeResult:
    """Submit the login code; the result says whether a password is also needed."""
    try:
        needs_password = await tg.submit_login_code(payload.code)
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    return AuthCodeResult(needs_password=needs_password)


@router.post(
    "/auth/submit-password", status_code=204, dependencies=[Depends(require_admin)]
)
async def auth_submit_password(payload: AuthPassword) -> None:
    """Submit the Telegram account password; any failure becomes a 400."""
    try:
        await tg.submit_login_password(payload.password)
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc


@router.post(
    "/auth/logout", status_code=204, dependencies=[Depends(require_admin)]
)
async def auth_logout() -> None:
    """Log the Telegram session out."""
    await tg.logout()


# --- Sections -----------------------------------------------------------


@router.get("/sections", response_model=list[SectionWithStats])
async def list_sections(
    request: Request,
    vertical: Vertical = Query(..., description="required: real_estate | hr"),
    session: AsyncSession = Depends(get_session),
) -> list[SectionWithStats]:
    """List sub-sections inside a vertical, each with rollup counts.

    Used by the section-chooser page. Counts are computed in a single query
    via LEFT JOINs so empty sections still appear. `access_code` is included
    only for admin requests.
    """
    # Per-section channel counts.
    ch_total_sub = (
        select(Channel.section_id, func.count(Channel.id).label("ct"))
        .group_by(Channel.section_id)
        .subquery()
    )
    ch_active_sub = (
        select(Channel.section_id, func.count(Channel.id).label("ca"))
        .where(Channel.is_active.is_(True))
        .group_by(Channel.section_id)
        .subquery()
    )
    # Per-section message counts.
    msg_total_sub = (
        select(Channel.section_id, func.count(Message.id).label("mt"))
        .join(Message, Message.channel_id == Channel.id)
        .group_by(Channel.section_id)
        .subquery()
    )
    # Per-section leads (uses vertical-appropriate verdict key).
    if vertical == "hr":
        lead_clause = Message.extracted["hr_lead"]["is_lead"].astext == "true"
    else:
        lead_clause = Message.extracted["lead"]["is_listing"].astext == "true"
    leads_sub = (
        select(Channel.section_id, func.count(Message.id).label("lt"))
        .join(Message, Message.channel_id == Channel.id)
        .where(lead_clause)
        .group_by(Channel.section_id)
        .subquery()
    )

    rows = (
        await session.execute(
            select(
                Section,
                # A section with no matching rows gets NULL from the outer
                # join; coalesce turns that into 0.
                func.coalesce(ch_total_sub.c.ct, 0),
                func.coalesce(ch_active_sub.c.ca, 0),
                func.coalesce(msg_total_sub.c.mt, 0),
                func.coalesce(leads_sub.c.lt, 0),
            )
            .where(Section.vertical == vertical)
            .outerjoin(ch_total_sub, ch_total_sub.c.section_id == Section.id)
            .outerjoin(ch_active_sub, ch_active_sub.c.section_id == Section.id)
            .outerjoin(msg_total_sub, msg_total_sub.c.section_id == Section.id)
            .outerjoin(leads_sub, leads_sub.c.section_id == Section.id)
            .order_by(Section.slug)
        )
    ).all()

    can_view_codes = is_admin_request(request)
    return [
        SectionWithStats(
            id=s.id,
            vertical=s.vertical,
            slug=s.slug,
            title=s.title,
            emoji=s.emoji,
            description=s.description,
            access_code=s.access_code if can_view_codes else None,
            created_at=s.created_at,
            channels_total=ct,
            channels_active=ca,
            messages_total=mt,
            leads_total=lt,
        )
        for (s, ct, ca, mt, lt) in rows
    ]
@router.post(
    "/sections",
    response_model=SectionOut,
    status_code=201,
    # Creating a section (including its access code) is an admin action,
    # matching the guards on PATCH/DELETE below.
    dependencies=[Depends(require_admin)],
)
async def create_section(
    payload: SectionCreate, session: AsyncSession = Depends(get_session)
) -> Section:
    """Create a section inside a vertical.

    Raises:
        HTTPException: 409 when (vertical, slug) is already taken.
    """
    # Reject duplicates with a friendly message instead of a constraint error.
    existing = await session.execute(
        select(Section).where(
            Section.vertical == payload.vertical, Section.slug == payload.slug
        )
    )
    if existing.scalar_one_or_none() is not None:
        raise HTTPException(
            status_code=409,
            detail=f"section {payload.vertical}:{payload.slug} already exists",
        )
    section = Section(**payload.model_dump())
    session.add(section)
    await session.commit()
    await session.refresh(section)
    return section


@router.get("/sections/{vertical}/{slug}", response_model=SectionOut)
async def get_section(
    vertical: Vertical,
    slug: str,
    request: Request,
    session: AsyncSession = Depends(get_session),
) -> dict[str, Any]:
    """Return one section's metadata; `access_code` only for admin requests."""
    section = await _get_section(session, vertical, slug)
    return {
        "id": section.id,
        "vertical": section.vertical,
        "slug": section.slug,
        "title": section.title,
        "emoji": section.emoji,
        "description": section.description,
        "access_code": section.access_code if is_admin_request(request) else None,
        "created_at": section.created_at,
    }


@router.patch(
    "/sections/{vertical}/{slug}",
    response_model=SectionOut,
    dependencies=[Depends(require_admin)],
)
async def update_section(
    vertical: Vertical,
    slug: str,
    payload: SectionUpdate,
    session: AsyncSession = Depends(get_session),
) -> Section:
    """Apply the fields present in `payload` to the section."""
    section = await _get_section(session, vertical, slug)
    # exclude_unset: only fields the client actually sent are written.
    data = payload.model_dump(exclude_unset=True)
    for k, v in data.items():
        setattr(section, k, v)
    await session.commit()
    await session.refresh(section)
    return section


@router.delete(
    "/sections/{vertical}/{slug}",
    status_code=204,
    dependencies=[Depends(require_admin)],
)
async def delete_section(
    vertical: Vertical, slug: str, session: AsyncSession = Depends(get_session)
) -> None:
    """Delete an empty section together with its per-section LLM prompt.

    Raises:
        HTTPException: 409 while channels are still attached.
    """
    section = await _get_section(session, vertical, slug)
    # Block deletion when channels are still attached — keeps data referenceable.
    count = (
        await session.execute(
            select(func.count(Channel.id)).where(Channel.section_id == section.id)
        )
    ).scalar_one()
    if count:
        raise HTTPException(
            status_code=409,
            detail=f"section has {count} channels — move or delete them first",
        )
    # Drop the per-section LLM prompt too if any.
    await prompt_store.reset(vertical, slug)
    await session.delete(section)
    await session.commit()


# --- Channels -----------------------------------------------------------


@router.get("/channels", response_model=list[ChannelOut])
async def list_channels(
    request: Request,
    vertical: Vertical = Query(..., description="required: real_estate | hr"),
    section: str | None = Query(None, description="optional section slug filter"),
    session: AsyncSession = Depends(get_session),
) -> list[dict[str, Any]]:
    """List channels of a vertical, optionally narrowed to one section."""
    await _require_scope_access(request, session, vertical, section)
    stmt = (
        select(Channel, Section.slug)
        .join(Section, Section.id == Channel.section_id)
        .where(Channel.vertical == vertical)
        .order_by(Channel.id)
    )
    if section is not None:
        stmt = stmt.where(Section.slug == section)
    rows = (await session.execute(stmt)).all()
    return [_channel_out(ch, slug) for (ch, slug) in rows]


@router.post(
    "/channels",
    response_model=ChannelOut,
    status_code=201,
)
async def add_channel(
    payload: ChannelCreate,
    request: Request,
    session: AsyncSession = Depends(get_session),
) -> dict[str, Any]:
    """Resolve a Telegram channel and attach it to a section.

    Raises:
        HTTPException: 409 for a duplicate identifier, 404 for an unknown
            section, 401 when the Telegram session is not logged in, 400
            when the identifier cannot be resolved.
    """
    await _require_scope_access(request, session, payload.vertical, payload.section)
    existing = await session.execute(
        select(Channel).where(Channel.identifier == payload.identifier)
    )
    if existing.scalar_one_or_none() is not None:
        raise HTTPException(status_code=409, detail="channel already exists")

    section = await _get_section(session, payload.vertical, payload.section)

    if not await tg.is_authorized():
        raise HTTPException(status_code=401, detail="not authorized: log in at /auth.html")
    try:
        resolved = await tg.resolve_channel(payload.identifier)
    except Exception as exc:
        raise HTTPException(
            status_code=400, detail=f"cannot resolve channel: {exc}"
        ) from exc

    channel = Channel(
        identifier=payload.identifier,
        tg_id=resolved.tg_id,
        title=resolved.title,
        vertical=payload.vertical,
        section_id=section.id,
    )
    session.add(channel)
    await session.commit()
    await session.refresh(channel)
    return _channel_out(channel, section.slug)


@router.get("/channels/{channel_id}", response_model=ChannelOut)
async def get_channel(
    channel_id: int,
    request: Request,
    vertical: Vertical | None = Query(None, description="scope: 404 if mismatched"),
    section: str | None = Query(None, description="scope: 404 if mismatched"),
    session: AsyncSession = Depends(get_session),
) -> dict[str, Any]:
    """Return one channel, provided it lies inside the requested scope."""
    await _require_scope_access(request, session, vertical, section)
    channel, section_slug = await _get_channel_in_scope(
        session, channel_id, vertical, section
    )
    return _channel_out(channel, section_slug)


@router.patch(
    "/channels/{channel_id}",
    response_model=ChannelOut,
    dependencies=[Depends(require_admin)],
)
async def update_channel(
    channel_id: int,
    payload: ChannelUpdate,
    vertical: Vertical | None = Query(None, description="scope: 404 if mismatched"),
    section: str | None = Query(None, description="scope: 404 if mismatched"),
    session: AsyncSession = Depends(get_session),
) -> dict[str, Any]:
    """Toggle a channel's activity and/or move it to another vertical/section.

    Raises:
        HTTPException: 404 when the channel or target section is not found,
            400 when the vertical is changed without naming a section of
            the new vertical.
    """
    channel, _ = await _get_channel_in_scope(session, channel_id, vertical, section)

    # A channel's section must belong to the channel's vertical. Changing
    # only the vertical would leave the channel attached to a section of the
    # old vertical, so refuse before mutating anything.
    if payload.vertical is not None and payload.section is None:
        current_section = await session.get(Section, channel.section_id)
        if current_section is not None and current_section.vertical != payload.vertical:
            raise HTTPException(
                status_code=400,
                detail="section is required when moving a channel to another vertical",
            )

    if payload.is_active is not None:
        channel.is_active = payload.is_active
    if payload.vertical is not None:
        channel.vertical = payload.vertical
    if payload.section is not None:
        # Looked up in the (possibly just updated) vertical.
        new_section = await _get_section(session, channel.vertical, payload.section)
        channel.section_id = new_section.id
    await session.commit()
    await session.refresh(channel)
    # Reload the section slug since it may have changed.
    section_row = await session.get(Section, channel.section_id)
    return _channel_out(channel, section_row.slug if section_row else None)


@router.delete(
    "/channels/{channel_id}",
    status_code=204,
    dependencies=[Depends(require_admin)],
)
async def delete_channel(
    channel_id: int,
    vertical: Vertical | None = Query(None, description="scope: 404 if mismatched"),
    section: str | None = Query(None, description="scope: 404 if mismatched"),
    session: AsyncSession = Depends(get_session),
) -> None:
    """Delete a channel that lies inside the requested scope."""
    channel, _ = await _get_channel_in_scope(session, channel_id, vertical, section)
    await session.delete(channel)
    await session.commit()


@router.post("/channels/{channel_id}/poll", dependencies=[Depends(require_admin)])
async def trigger_poll(
    channel_id: int,
    vertical: Vertical | None = Query(None, description="scope: 404 if mismatched"),
    section: str | None = Query(None, description="scope: 404 if mismatched"),
    session: AsyncSession = Depends(get_session),
) -> dict[str, int]:
    """Poll one channel now and report how many messages were inserted."""
    # Called only for its 404-on-scope-mismatch side effect.
    await _get_channel_in_scope(session, channel_id, vertical, section)
    inserted = await poll_channel(channel_id)
    return {"inserted": inserted}


@router.post(
    "/channels/{channel_id}/backfill-media",
    dependencies=[Depends(require_admin)],
)
async def trigger_backfill_media(
    channel_id: int,
    batch: int = Query(50, ge=1, le=200),
    vertical: Vertical | None = Query(None, description="scope: 404 if mismatched"),
    section: str | None = Query(None, description="scope: 404 if mismatched"),
    session: AsyncSession = Depends(get_session),
) -> dict[str, int]:
    """Run one media backfill batch for a channel; `RuntimeError` becomes a 400."""
    await _get_channel_in_scope(session, channel_id, vertical, section)
    try:
        return await backfill_media(channel_id, batch_size=batch)
    except RuntimeError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc


@router.post(
    "/channels/{channel_id}/reanalyze",
    dependencies=[Depends(require_admin)],
)
async def trigger_reanalyze(
    channel_id: int,
    batch: int = Query(500, ge=1, le=2000),
    vertical: Vertical | None = Query(None, description="scope: 404 if mismatched"),
    section: str | None = Query(None, description="scope: 404 if mismatched"),
    session: AsyncSession = Depends(get_session),
) -> dict[str, int]:
    """Run one reanalysis batch for a channel."""
    await _get_channel_in_scope(session, channel_id, vertical, section)
    return await reanalyze_channel(channel_id, batch_size=batch)


@router.get("/channels/{channel_id}/stats", response_model=ChannelStats)
async def channel_stats(
    channel_id: int,
    request: Request,
    vertical: Vertical | None = Query(None, description="scope: 404 if mismatched"),
    section: str | None = Query(None, description="scope: 404 if mismatched"),
    session: AsyncSession = Depends(get_session),
) -> ChannelStats:
    """Return message count and latest message date for one channel."""
    await _require_scope_access(request, session, vertical, section)
    channel, section_slug = await _get_channel_in_scope(
        session, channel_id, vertical, section
    )
    counts = await session.execute(
        select(func.count(Message.id), func.max(Message.date)).where(
            Message.channel_id == channel_id
        )
    )
    msg_count, last_date = counts.one()
    return ChannelStats(
        channel_id=channel.id,
        identifier=channel.identifier,
        title=channel.title,
        vertical=channel.vertical,
        section_slug=section_slug,
        is_active=channel.is_active,
        last_polled_at=channel.last_polled_at,
        message_count=msg_count or 0,
        last_message_at=last_date,
    )
hr_kind: str | None = Query( + None, description="filter by HR lead kind: any|vacancy|resume|contact" + ), + leads_only: bool = Query(False, description="only LLM-confirmed leads"), + min_confidence: float = Query( + 0.5, ge=0.0, le=1.0, description="minimum LLM confidence when leads_only=true" + ), + has_phone: bool = Query(False, description="only messages with extracted phone numbers"), + limit: int = Query(50, ge=1, le=500), + offset: int = Query(0, ge=0), + session: AsyncSession = Depends(get_session), +) -> list[MessageOut]: + """Return messages grouped by Telegram album (`grouped_id`). + + Every query joins through channels (and optionally sections) so a + cross-vertical / cross-section read is impossible at the API boundary. + """ + await _require_scope_access(request, session, vertical, section) + + # Step 1: pick the group keys for this page, ordered by latest activity. + group_key = func.coalesce( + Message.grouped_id, -Message.id # solo messages get a unique negative key + ).label("group_key") + g_stmt = ( + select(group_key, func.max(Message.date).label("group_date")) + .join(Channel, Channel.id == Message.channel_id) + .where(Channel.vertical == vertical) + .group_by(group_key) + .order_by(func.max(Message.date).desc()) + .limit(limit) + .offset(offset) + ) + if section is not None: + g_stmt = g_stmt.join(Section, Section.id == Channel.section_id).where( + Section.slug == section + ) + if channel_id is not None: + g_stmt = g_stmt.where(Message.channel_id == channel_id) + if q: + g_stmt = g_stmt.where(Message.text.ilike(f"%{q}%")) + if real_estate == "any": + g_stmt = g_stmt.where(Message.extracted["real_estate"].is_not(None)) + elif real_estate in ("sale", "rent", "purchase"): + g_stmt = g_stmt.where( + Message.extracted["real_estate"]["kind"].astext == real_estate + ) + if hr_kind == "any": + g_stmt = g_stmt.where(Message.extracted["hr_lead"].is_not(None)) + elif hr_kind in ("vacancy", "resume", "contact"): + g_stmt = g_stmt.where( + 
Message.extracted["hr_lead"]["kind"].astext == hr_kind + ) + if has_phone: + g_stmt = g_stmt.where( + func.jsonb_array_length(Message.extracted["phones"]) > 0 + ) + if leads_only: + if vertical == "hr": + g_stmt = g_stmt.where( + Message.extracted["hr_lead"]["is_lead"].astext == "true", + Message.extracted["hr_lead"]["confidence"].astext.cast(sa.Float) >= min_confidence, + ) + else: + g_stmt = g_stmt.where( + Message.extracted["lead"]["is_listing"].astext == "true", + Message.extracted["lead"]["confidence"].astext.cast(sa.Float) >= min_confidence, + ) + + page = (await session.execute(g_stmt)).all() + if not page: + return [] + page_keys = [row.group_key for row in page] + + rows_stmt = ( + select(Message) + .join(Channel, Channel.id == Message.channel_id) + .where(group_key.in_(page_keys), Channel.vertical == vertical) + .order_by(Message.tg_message_id.asc()) + ) + if section is not None: + rows_stmt = rows_stmt.join(Section, Section.id == Channel.section_id).where( + Section.slug == section + ) + if channel_id is not None: + rows_stmt = rows_stmt.where(Message.channel_id == channel_id) + rows = list((await session.execute(rows_stmt)).scalars().all()) + + by_key: dict[int, list[Message]] = {} + for r in rows: + key = r.grouped_id if r.grouped_id is not None else -r.id + by_key.setdefault(key, []).append(r) + + channel_ids = {r.channel_id for r in rows} + channels_map: dict[int, tuple[Channel, str]] = {} + if channel_ids: + ch_rows = ( + await session.execute( + select(Channel, Section.slug) + .join(Section, Section.id == Channel.section_id) + .where(Channel.id.in_(channel_ids)) + ) + ).all() + channels_map = {c.id: (c, slug) for (c, slug) in ch_rows} + + out: list[MessageOut] = [] + for key in page_keys: + group_rows = by_key.get(key) + if not group_rows: + continue + canonical = min(group_rows, key=lambda m: m.tg_message_id) + text = next((m.text for m in group_rows if m.text), None) + media: list = [] + for m in group_rows: + if m.media_files: + 
media.extend(m.media_files) + ch_pair = channels_map.get(canonical.channel_id) + ch, ch_slug = (ch_pair if ch_pair else (None, None)) + url = build_post_url( + ch.identifier if ch else None, + ch.tg_id if ch else None, + canonical.tg_message_id, + ) + out.append( + MessageOut( + id=canonical.id, + channel_id=canonical.channel_id, + channel_vertical=ch.vertical if ch else None, + channel_section_slug=ch_slug, + tg_message_id=canonical.tg_message_id, + grouped_id=canonical.grouped_id, + group_size=len(group_rows), + date=min(m.date for m in group_rows), + text=text, + sender_id=canonical.sender_id, + sender_username=canonical.sender_username, + sender_name=canonical.sender_name, + post_url=url, + has_media=any(m.has_media for m in group_rows), + media_files=media or None, + extracted=next( + (m.extracted for m in group_rows if m.extracted), None + ), + views=max((m.views for m in group_rows if m.views is not None), default=None), + forwards=max( + (m.forwards for m in group_rows if m.forwards is not None), default=None + ), + fetched_at=canonical.fetched_at, + ) + ) + return out + + +@router.get("/messages/{message_id}", response_model=MessageOut) +async def get_message( + message_id: int, + request: Request, + vertical: Vertical | None = Query(None, description="scope: 404 if mismatched"), + section: str | None = Query(None, description="scope: 404 if mismatched"), + session: AsyncSession = Depends(get_session), +) -> Message: + await _require_scope_access(request, session, vertical, section) + msg = await session.get(Message, message_id) + if msg is None: + raise HTTPException(status_code=404) + if vertical is not None or section is not None: + await _get_channel_in_scope(session, msg.channel_id, vertical, section) + return msg + + +# --- Stats -------------------------------------------------------------- + + +@router.get("/stats", response_model=GlobalStats) +async def global_stats( + request: Request, + vertical: Vertical = Query(..., description="required: 
real_estate | hr"), + section: str | None = Query(None, description="optional section slug"), + session: AsyncSession = Depends(get_session), +) -> GlobalStats: + await _require_scope_access(request, session, vertical, section) + base_channel_where = [Channel.vertical == vertical] + section_join_needed = section is not None + if section_join_needed: + base_channel_where.append(Section.slug == section) + + def _channel_query(*extra): + stmt = select(func.count(Channel.id)).where(*base_channel_where, *extra) + if section_join_needed: + stmt = stmt.join(Section, Section.id == Channel.section_id) + return stmt + + def _message_query(*extra): + stmt = ( + select(func.count(Message.id)) + .join(Channel, Channel.id == Message.channel_id) + .where(*base_channel_where, *extra) + ) + if section_join_needed: + stmt = stmt.join(Section, Section.id == Channel.section_id) + return stmt + + channels_total = (await session.execute(_channel_query())).scalar_one() + channels_active = ( + await session.execute(_channel_query(Channel.is_active.is_(True))) + ).scalar_one() + messages_total = (await session.execute(_message_query())).scalar_one() + + since = datetime.now(timezone.utc) - timedelta(hours=24) + messages_24h = ( + await session.execute(_message_query(Message.fetched_at >= since)) + ).scalar_one() + + last_poll_stmt = select(func.max(Channel.last_polled_at)).where(*base_channel_where) + if section_join_needed: + last_poll_stmt = last_poll_stmt.join(Section, Section.id == Channel.section_id) + last_poll = (await session.execute(last_poll_stmt)).scalar_one() + + if vertical == "hr": + lead_clause = Message.extracted["hr_lead"]["is_lead"].astext == "true" + else: + lead_clause = Message.extracted["lead"]["is_listing"].astext == "true" + + leads_total = (await session.execute(_message_query(lead_clause))).scalar_one() + leads_24h = ( + await session.execute(_message_query(lead_clause, Message.fetched_at >= since)) + ).scalar_one() + + return GlobalStats( + vertical=vertical, + 
section_slug=section, + channels_total=channels_total, + channels_active=channels_active, + messages_total=messages_total, + messages_last_24h=messages_24h, + leads_total=leads_total or 0, + leads_last_24h=leads_24h or 0, + poll_interval_seconds=settings.poll_interval_seconds, + last_poll_at=last_poll, + ) + + +# --- LLM ---------------------------------------------------------------- + + +@router.get("/llm/status") +async def llm_status() -> dict[str, Any]: + """Whether the local LLM (Ollama) is reachable and the configured model is loaded.""" + ready = await llm_client.is_ready() + return { + "enabled": settings.llm_enabled, + "ready": ready, + "base_url": settings.llm_base_url, + "model": settings.llm_model, + } + + +@router.get("/llm/queue") +async def llm_queue( + request: Request, + vertical: Vertical = Query(..., description="required: real_estate | hr"), + section: str | None = Query(None, description="optional section slug"), + session: AsyncSession = Depends(get_session), +) -> dict[str, int]: + """Pending classifications restricted to the vertical (+ section if given).""" + await _require_scope_access(request, session, vertical, section) + return {"pending": await pending_llm_count(vertical, section)} + + +@router.get("/llm/prompt") +async def llm_prompt_get( + request: Request, + vertical: Vertical = Query(..., description="required: real_estate | hr"), + section: str | None = Query( + None, description="optional section slug — return that level's prompt" + ), + session: AsyncSession = Depends(get_session), +) -> dict[str, Any]: + """Active LLM system prompt for the (vertical, section) level + the source. + + `source` is one of `section` / `vertical` / `default` and tells the UI + whether the override is at the requested level or just inherited. 
+ """ + await _require_scope_access(request, session, vertical, section) + default = llm_client.default_prompt(vertical) + text, source = await prompt_store.get(vertical, section, default) + # Also return whether THIS exact level has its own override (for UI). + overridden_here = await prompt_store.is_overridden(vertical, section) + return { + "vertical": vertical, + "section": section, + "prompt": text, + "default": default, + "source": source, + "is_overridden_here": overridden_here, + } + + +@router.put("/llm/prompt", dependencies=[Depends(require_admin)]) +async def llm_prompt_put( + payload: dict, + vertical: Vertical = Query(..., description="required: real_estate | hr"), + section: str | None = Query( + None, description="optional section slug — save at section level" + ), +) -> dict[str, Any]: + text = payload.get("prompt") + if not isinstance(text, str) or not text.strip(): + raise HTTPException(status_code=400, detail="prompt must be a non-empty string") + if len(text) > 30000: + raise HTTPException( + status_code=400, detail="prompt is too long (max 30000 chars)" + ) + try: + await prompt_store.set_prompt(vertical, section, text) + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) + return {"saved": True, "vertical": vertical, "section": section, "length": len(text)} + + +@router.delete("/llm/prompt", status_code=204, dependencies=[Depends(require_admin)]) +async def llm_prompt_reset( + vertical: Vertical = Query(..., description="required: real_estate | hr"), + section: str | None = Query( + None, description="optional section slug — reset that level" + ), +) -> None: + await prompt_store.reset(vertical, section) + + +# --- Settings & batch poll --------------------------------------------- + + +@router.get("/settings", dependencies=[Depends(require_admin)]) +async def get_settings_view() -> dict[str, str | int]: + """Read-only view of runtime config — managed via .env, not editable from UI.""" + return { + 
"poll_interval_seconds": settings.poll_interval_seconds, + "poll_history_limit": settings.poll_history_limit, + "tg_session_path": settings.tg_session_path, + "postgres_host": settings.postgres_host, + "postgres_port": settings.postgres_port, + "postgres_db": settings.postgres_db, + "api_host": settings.api_host, + "api_port": settings.api_port, + } + + +async def _poll_all_in_background(channel_ids: list[int]) -> None: + for cid in channel_ids: + try: + await poll_channel(cid) + except Exception: + continue + + +@router.post("/poll", dependencies=[Depends(require_admin)]) +async def trigger_poll_all( + background: BackgroundTasks, + vertical: Vertical = Query(..., description="required: real_estate | hr"), + section: str | None = Query(None, description="optional section slug"), + session: AsyncSession = Depends(get_session), +) -> dict[str, int]: + """Queue a poll of every active channel in this vertical (+ section if given).""" + stmt = select(Channel.id).where( + Channel.is_active.is_(True), Channel.vertical == vertical + ) + if section is not None: + stmt = stmt.join(Section, Section.id == Channel.section_id).where( + Section.slug == section + ) + result = await session.execute(stmt) + ids = [row[0] for row in result.all()] + background.add_task(_poll_all_in_background, ids) + return {"queued": len(ids), "inserted": 0} diff --git a/src/parser_bot/api/schemas.py b/src/parser_bot/api/schemas.py new file mode 100644 index 0000000..efe471e --- /dev/null +++ b/src/parser_bot/api/schemas.py @@ -0,0 +1,231 @@ +import re +from datetime import datetime +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field, field_validator + +Vertical = Literal["real_estate", "hr"] + +# Section slugs are used as URL segments — keep them URL-safe. 
+_SLUG_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,62}[a-z0-9]$|^[a-z0-9]$") + + +class SectionCreate(BaseModel): + vertical: Vertical + slug: str = Field(..., min_length=1, max_length=64) + title: str = Field(..., min_length=1, max_length=255) + emoji: str | None = Field(None, max_length=8) + description: str | None = None + access_code: str = Field(..., min_length=3, max_length=255) + + @field_validator("slug") + @classmethod + def _check_slug(cls, v: str) -> str: + if not _SLUG_RE.match(v): + raise ValueError( + "slug must be lowercase letters/digits with '-' or '_' separators" + ) + return v + + +class SectionUpdate(BaseModel): + title: str | None = Field(None, min_length=1, max_length=255) + emoji: str | None = Field(None, max_length=8) + description: str | None = None + access_code: str | None = Field(None, min_length=3, max_length=255) + + +class SectionOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: int + vertical: Vertical + slug: str + title: str + emoji: str | None + description: str | None + access_code: str | None = None + created_at: datetime + + +class SectionWithStats(SectionOut): + """Section payload enriched with rollup counts for the section chooser page.""" + + channels_total: int = 0 + channels_active: int = 0 + messages_total: int = 0 + leads_total: int = 0 + + +class ChannelCreate(BaseModel): + identifier: str = Field(..., min_length=1, max_length=255, description="@username or t.me link") + vertical: Vertical = "real_estate" + section: str = Field( + ..., min_length=1, max_length=64, + description="Slug of the section inside the vertical (e.g. 
'dubai')", + ) + + +class ChannelUpdate(BaseModel): + is_active: bool | None = None + vertical: Vertical | None = None + section: str | None = Field( + None, min_length=1, max_length=64, + description="Move the channel to another section in the same vertical", + ) + + +class ChannelOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: int + tg_id: int | None + identifier: str + title: str | None + vertical: Vertical + section_id: int + section_slug: str | None = None + is_active: bool + last_message_id: int | None + last_polled_at: datetime | None + created_at: datetime + + +class ChannelStats(BaseModel): + channel_id: int + identifier: str + title: str | None + vertical: Vertical + section_slug: str | None = None + is_active: bool + last_polled_at: datetime | None + message_count: int + last_message_at: datetime | None + + +class MediaFile(BaseModel): + kind: str # photo | video | document | audio | sticker | unknown + url: str | None = None + mime: str | None = None + size: int | None = None + skipped: str | None = None # set when not downloaded (e.g. 
"too_large") + + +class RealEstate(BaseModel): + kind: str | None = None + property_type: str | None = None + rooms: str | None = None + area_m2: float | None = None + price: str | None = None + + +class Lead(BaseModel): + is_listing: bool + kind: str | None = None # sale | rent | purchase + property_type: str | None = None + rooms: str | None = None + area_m2: float | None = None + price_text: str | None = None + price_value: float | None = None + currency: str | None = None # RUB | USD | EUR | AED | GBP | CNY | TRY | KZT | BYN | UAH + location: str | None = None + contact_phone: str | None = None + contact_name: str | None = None + summary: str | None = None + confidence: float = 0.0 + + +class HrLead(BaseModel): + """LLM verdict for HR-vertical messages (jobs / resumes / bare contacts).""" + + is_lead: bool + kind: str | None = None # vacancy | resume | contact + title: str | None = None + company: str | None = None + candidate_name: str | None = None + experience_years: float | None = None + skills: list[str] = [] + location: str | None = None + remote: bool | None = None + employment_type: str | None = None + salary_text: str | None = None + salary_value: float | None = None + currency: str | None = None + contact_phone: str | None = None + contact_name: str | None = None + summary: str | None = None + confidence: float = 0.0 + + +class Extracted(BaseModel): + phones: list[str] = [] + names: list[str] = [] + tg_handles: list[str] = [] + real_estate: RealEstate | None = None + lead: Lead | None = None + hr_lead: HrLead | None = None + + +class MessageOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: int + channel_id: int + channel_vertical: Vertical | None = None + channel_section_slug: str | None = None + tg_message_id: int + grouped_id: int | None = None + group_size: int = 1 + date: datetime + text: str | None + sender_id: int | None + has_media: bool + media_files: list[MediaFile] | None = None + extracted: Extracted | None = None + 
sender_username: str | None = None + sender_name: str | None = None + post_url: str | None = None + views: int | None + forwards: int | None + fetched_at: datetime + + +class GlobalStats(BaseModel): + vertical: Vertical + section_slug: str | None = None + channels_total: int + channels_active: int + messages_total: int + messages_last_24h: int + leads_total: int = 0 + leads_last_24h: int = 0 + poll_interval_seconds: int + last_poll_at: datetime | None + + +class AuthStatus(BaseModel): + authorized: bool + username: str | None = None + phone: str | None = None + + +class AuthCode(BaseModel): + code: str = Field(..., min_length=3, max_length=12) + + +class AuthPassword(BaseModel): + password: str = Field(..., min_length=1) + + +class AuthCodeResult(BaseModel): + needs_password: bool + + +class AdminLogin(BaseModel): + password: str = Field(..., min_length=1) + + +class SectionLogin(BaseModel): + vertical: Vertical + section: str = Field(..., min_length=1, max_length=64) + code: str = Field(..., min_length=1, max_length=255) diff --git a/src/parser_bot/auth.py b/src/parser_bot/auth.py new file mode 100644 index 0000000..8783aca --- /dev/null +++ b/src/parser_bot/auth.py @@ -0,0 +1,51 @@ +"""Interactive Telethon login. Run once on a dev machine, copy the printed +TG_SESSION_STRING into your .env / k8s Secret, then deploy without ever +touching auth again. + +Usage: + docker compose run --rm -it app python -m parser_bot.auth + +Telegram requires interactive code entry only for the very first login; +the resulting StringSession can be reused on any host until you log out +or someone invalidates the session in Telegram settings. +""" +import asyncio +import sys + +from telethon import TelegramClient +from telethon.sessions import StringSession + +from parser_bot.config import settings + + +async def main() -> int: + if not sys.stdin.isatty(): + print( + "ERROR: not a TTY. 
Re-run with: " + "docker compose run --rm -it app python -m parser_bot.auth", + file=sys.stderr, + ) + return 2 + + client = TelegramClient(StringSession(), settings.tg_api_id, settings.tg_api_hash) + await client.start(phone=settings.tg_phone) + me = await client.get_me() + session_str = client.session.save() + await client.disconnect() + + print() + print(f"authorized as {me.username or me.id}") + print() + print("Add this line to your .env (or k8s Secret) and never share it:") + print() + print(f"TG_SESSION_STRING={session_str}") + print() + print( + "After saving, no further interactive auth is needed. Restarts, rebuilds," + " redeploys all reuse this string." + ) + return 0 + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) diff --git a/src/parser_bot/config.py b/src/parser_bot/config.py new file mode 100644 index 0000000..3a54710 --- /dev/null +++ b/src/parser_bot/config.py @@ -0,0 +1,64 @@ +from pydantic import Field +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore") + + tg_api_id: int = Field(..., alias="TG_API_ID") + tg_api_hash: str = Field(..., alias="TG_API_HASH") + tg_phone: str = Field(..., alias="TG_PHONE") + tg_session_path: str = Field("/data/session/parser.session", alias="TG_SESSION_PATH") + # Preferred for prod / k8s: opaque base64-ish string from `python -m parser_bot.auth`. + # If set, takes priority over file-based session. 
+ tg_session_string: str | None = Field(None, alias="TG_SESSION_STRING") + + postgres_user: str = Field("parser", alias="POSTGRES_USER") + postgres_password: str = Field("parser", alias="POSTGRES_PASSWORD") + postgres_db: str = Field("parser", alias="POSTGRES_DB") + postgres_host: str = Field("db", alias="POSTGRES_HOST") + postgres_port: int = Field(5432, alias="POSTGRES_PORT") + + poll_interval_seconds: int = Field(60, alias="POLL_INTERVAL_SECONDS") + poll_history_limit: int = Field(50, alias="POLL_HISTORY_LIMIT") + + api_host: str = Field("0.0.0.0", alias="API_HOST") + api_port: int = Field(8000, alias="API_PORT") + public_base_path: str = Field("", alias="PUBLIC_BASE_PATH") + + media_dir: str = Field("/data/media", alias="MEDIA_DIR") + media_max_bytes: int = Field(20 * 1024 * 1024, alias="MEDIA_MAX_BYTES") + + # Local LLM via Ollama for lead classification & extraction + llm_enabled: bool = Field(True, alias="LLM_ENABLED") + llm_base_url: str = Field("http://ollama:11434", alias="LLM_BASE_URL") + llm_model: str = Field("qwen2.5:7b-instruct-q4_K_M", alias="LLM_MODEL") + llm_timeout_seconds: int = Field(120, alias="LLM_TIMEOUT_SECONDS") + llm_min_text_length: int = Field(20, alias="LLM_MIN_TEXT_LENGTH") + llm_classify_interval_seconds: int = Field(20, alias="LLM_CLASSIFY_INTERVAL_SECONDS") + llm_classify_batch_size: int = Field(5, alias="LLM_CLASSIFY_BATCH_SIZE") + + # Admin allowlist for /auth.html, /docs, /openapi.json, /redoc and the + # /auth/* API endpoints. Comma-separated IPv4/IPv6. Empty (default) means + # no restriction — convenient for local dev. Set explicitly in prod. + admin_allowed_ips: str = Field("", alias="ADMIN_ALLOWED_IPS") + # Optional second factor for admin-only UI/API operations. Empty keeps the + # previous IP-only behavior for local/dev deployments. 
+ admin_password: str = Field("", alias="ADMIN_PASSWORD") + # When true, honor X-Forwarded-For / X-Real-IP set by a reverse proxy + # in front of uvicorn (Docker port-forward, nginx, traefik, etc). + trust_proxy_headers: bool = Field(True, alias="TRUST_PROXY_HEADERS") + + @property + def admin_ip_set(self) -> set[str]: + return {s.strip() for s in self.admin_allowed_ips.split(",") if s.strip()} + + @property + def database_url(self) -> str: + return ( + f"postgresql+asyncpg://{self.postgres_user}:{self.postgres_password}" + f"@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}" + ) + + +settings = Settings() diff --git a/src/parser_bot/db/__init__.py b/src/parser_bot/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/parser_bot/db/models.py b/src/parser_bot/db/models.py new file mode 100644 index 0000000..fc75505 --- /dev/null +++ b/src/parser_bot/db/models.py @@ -0,0 +1,119 @@ +from datetime import datetime + +from sqlalchemy import ( + BigInteger, + DateTime, + ForeignKey, + Index, + String, + Text, + UniqueConstraint, + func, +) +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship + + +class Base(DeclarativeBase): + pass + + +class Section(Base): + """A sub-section inside a vertical, e.g. ('real_estate', 'dubai'). + + The pair (vertical, slug) is unique and identifies a section in URLs + and API calls. A channel belongs to exactly one section, the section + knows its vertical, and the LLM prompt store can hold a per-section + override that falls back to the vertical-level prompt. 
+ """ + + __tablename__ = "sections" + __table_args__ = ( + UniqueConstraint("vertical", "slug", name="uq_section_vertical_slug"), + Index("ix_sections_vertical", "vertical"), + ) + + id: Mapped[int] = mapped_column(primary_key=True) + vertical: Mapped[str] = mapped_column(String(32)) + slug: Mapped[str] = mapped_column(String(64)) + title: Mapped[str] = mapped_column(String(255)) + emoji: Mapped[str | None] = mapped_column(String(8), nullable=True) + description: Mapped[str | None] = mapped_column(Text, nullable=True) + access_code: Mapped[str | None] = mapped_column(String(255), nullable=True) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now() + ) + + channels: Mapped[list["Channel"]] = relationship(back_populates="section") + + +class Channel(Base): + __tablename__ = "channels" + + id: Mapped[int] = mapped_column(primary_key=True) + # Telegram numeric channel id (peer id), nullable until first resolve + tg_id: Mapped[int | None] = mapped_column(BigInteger, unique=True, nullable=True) + # Username or t.me/joinchat link supplied by user + identifier: Mapped[str] = mapped_column(String(255), unique=True) + title: Mapped[str | None] = mapped_column(String(512), nullable=True) + # 'real_estate' or 'hr' — picks which LLM prompt and lead schema is used + vertical: Mapped[str] = mapped_column( + String(32), default="real_estate", server_default="real_estate", index=True + ) + section_id: Mapped[int] = mapped_column( + ForeignKey("sections.id", ondelete="RESTRICT"), index=True + ) + is_active: Mapped[bool] = mapped_column(default=True, server_default="true") + last_message_id: Mapped[int | None] = mapped_column(BigInteger, nullable=True) + last_polled_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now() + ) + + section: Mapped[Section] = relationship(back_populates="channels") + messages: 
Mapped[list["Message"]] = relationship( + back_populates="channel", + cascade="all, delete-orphan", + passive_deletes=True, + ) + + +class Message(Base): + __tablename__ = "messages" + __table_args__ = ( + UniqueConstraint("channel_id", "tg_message_id", name="uq_channel_message"), + Index("ix_messages_channel_date", "channel_id", "date"), + ) + + id: Mapped[int] = mapped_column(primary_key=True) + channel_id: Mapped[int] = mapped_column(ForeignKey("channels.id", ondelete="CASCADE")) + tg_message_id: Mapped[int] = mapped_column(BigInteger) + date: Mapped[datetime] = mapped_column(DateTime(timezone=True)) + text: Mapped[str | None] = mapped_column(Text, nullable=True) + sender_id: Mapped[int | None] = mapped_column(BigInteger, nullable=True) + sender_username: Mapped[str | None] = mapped_column(String(64), nullable=True) + sender_name: Mapped[str | None] = mapped_column(String(255), nullable=True) + grouped_id: Mapped[int | None] = mapped_column(BigInteger, nullable=True) + has_media: Mapped[bool] = mapped_column(default=False, server_default="false") + views: Mapped[int | None] = mapped_column(nullable=True) + forwards: Mapped[int | None] = mapped_column(nullable=True) + raw: Mapped[dict | None] = mapped_column(JSONB, nullable=True) + media_files: Mapped[list | None] = mapped_column(JSONB, nullable=True) + extracted: Mapped[dict | None] = mapped_column(JSONB, nullable=True) + fetched_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now() + ) + + channel: Mapped[Channel] = relationship(back_populates="messages") + + +class AppSetting(Base): + """Runtime-editable settings, edited from the UI without a restart.""" + + __tablename__ = "app_settings" + + key: Mapped[str] = mapped_column(String(128), primary_key=True) + value: Mapped[dict | str | int | bool | None] = mapped_column(JSONB, nullable=False) + updated_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now() + ) diff --git 
a/src/parser_bot/db/session.py b/src/parser_bot/db/session.py new file mode 100644 index 0000000..3db9620 --- /dev/null +++ b/src/parser_bot/db/session.py @@ -0,0 +1,25 @@ +from collections.abc import AsyncIterator +from contextlib import asynccontextmanager + +from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine + +from parser_bot.config import settings + +engine = create_async_engine(settings.database_url, pool_pre_ping=True) +SessionFactory = async_sessionmaker(engine, expire_on_commit=False, class_=AsyncSession) + + +@asynccontextmanager +async def session_scope() -> AsyncIterator[AsyncSession]: + async with SessionFactory() as session: + try: + yield session + await session.commit() + except Exception: + await session.rollback() + raise + + +async def get_session() -> AsyncIterator[AsyncSession]: + async with SessionFactory() as session: + yield session diff --git a/src/parser_bot/extractors.py b/src/parser_bot/extractors.py new file mode 100644 index 0000000..9d12716 --- /dev/null +++ b/src/parser_bot/extractors.py @@ -0,0 +1,334 @@ +"""Heuristic extractors for Telegram message text. + +Russian-first, regex/keyword based, no ML deps. Goal is to surface signals for +the UI: phone numbers, person names (FIO), and real-estate intent (sale/rent/ +purchase). False positives are tolerable — operator triages in the UI. + +Output shape (used as JSONB in messages.extracted): +{ + "phones": ["+79123456789", ...], + "names": ["Иван Петров", ...], + "real_estate": { + "kind": "sale" | "rent" | "purchase" | null, + "property_type": str | null, # квартира, дом, ... + "rooms": str | null, # "2-к" + "area_m2": float | null, + "price": str | null, # raw matched string + } | null +} +""" +from __future__ import annotations + +import re +from typing import Any + +# --- Telegram @handles --------------------------------------------------- + +# Plain @username — Telegram allows 5–32 chars, letters/digits/_, no leading digit. 
+_TG_HANDLE_RE = re.compile(r"(? list[str]: + if not text: + return [] + out: list[str] = [] + seen: set[str] = set() + for h in _TG_HANDLE_RE.findall(text): + key = h.lower() + if key in seen: + continue + seen.add(key) + out.append("@" + h) + for h in _TG_LINK_RE.findall(text): + key = h.lower() + if key in seen: + continue + seen.add(key) + out.append("@" + h) + return out + + +# --- Phones -------------------------------------------------------------- + +# Russian-format: starts with +7, 7, or 8 (no plus), 11 digits total. +_PHONE_RU_RE = re.compile( + r"(?` then 7–14 more digits +# with optional separators. Catches +971 (UAE), +1 (US), +44 (UK), etc. +_PHONE_INTL_RE = re.compile( + r"(? list[str]: + if not text: + return [] + out: list[str] = [] + seen: set[str] = set() + + # Pass 1: Russian-style. Normalize to +7XXXXXXXXXX. + for raw in _PHONE_RU_RE.findall(text): + digits = re.sub(r"\D", "", raw) + if len(digits) == 11 and digits[0] in "78": + normalized = "+7" + digits[1:] + elif len(digits) == 10: + normalized = "+7" + digits + else: + continue + if normalized not in seen: + seen.add(normalized) + out.append(normalized) + + # Pass 2: international "+...". Keep raw plus-prefix; just + # collapse separators so the result is +. + for raw in _PHONE_INTL_RE.findall(text): + digits = re.sub(r"\D", "", raw) + if not (8 <= len(digits) <= 15): + continue + normalized = "+" + digits + # If it normalized to something we already captured (e.g. +7 number + # picked up by both passes), skip. + if normalized in seen: + continue + seen.add(normalized) + out.append(normalized) + return out + + +# --- Names (ФИО) --------------------------------------------------------- + +# Two or three capitalized Cyrillic tokens in a row. Allows hyphens (Иванов-Петров). +_NAME_RE = re.compile( + r"\b([А-ЯЁ][а-яё]+(?:\-[А-ЯЁ][а-яё]+)?(?:\s+[А-ЯЁ][а-яё]+(?:\-[А-ЯЁ][а-яё]+)?){1,2})\b" +) + +# Common false positives — geo/places/orgs/etc. Skip exact matches. 
_NAME_BLOCKLIST = {
    "Российская Федерация",
    "Санкт Петербург",
    "Санкт-Петербург",
    "Нижний Новгород",
    "Великий Новгород",
    "Ростов На Дону",
    "Ростов-На-Дону",
    "Москва Сити",
    "Красная Площадь",
    "Чёрное Море",
    "Чёрного Моря",
    "Без Депозита",
    "Без Залога",
    "Без Комиссии",
    "Сдам Квартиру",
    "Продам Квартиру",
    "Куплю Квартиру",
    "Сдам Студию",
    "Продам Студию",
}

# Words that look like names but rarely are (months, weekdays, common nouns).
_NAME_TOKEN_BLOCK = {
    "Январь", "Февраль", "Март", "Апрель", "Май", "Июнь",
    "Июль", "Август", "Сентябрь", "Октябрь", "Ноябрь", "Декабрь",
    "Понедельник", "Вторник", "Среда", "Четверг", "Пятница", "Суббота", "Воскресенье",
    "Москва", "Питер", "Россия", "Кремль", "Метро",
}


def extract_names(text: str | None) -> list[str]:
    """Return candidate person names found in *text*, in first-seen order.

    A ``_NAME_RE`` match is dropped when it is an exact ``_NAME_BLOCKLIST``
    entry, when any of its tokens is listed in ``_NAME_TOKEN_BLOCK``, or when
    none of its tokens is at least four characters long. Duplicates are
    reported once. Empty or ``None`` input yields an empty list.
    """
    if not text:
        return []
    out: list[str] = []
    seen: set[str] = set()
    for match in _NAME_RE.findall(text):
        candidate = match.strip()
        if candidate in _NAME_BLOCKLIST:
            continue
        # Hyphenated surnames are split too, so each half is checked on its own.
        tokens = re.split(r"[\s\-]+", candidate)
        if any(t in _NAME_TOKEN_BLOCK for t in tokens):
            continue
        # Heuristic: at least one token must have len >= 4 (rules out "Ул.")
        if not any(len(t) >= 4 for t in tokens):
            continue
        if candidate not in seen:
            seen.add(candidate)
            out.append(candidate)
    return out


# --- Real estate ---------------------------------------------------------

# Deal kind -> lowercase phrases that signal it (Russian first, then English).
_DEAL_KEYWORDS: dict[str, tuple[str, ...]] = {
    "rent": (
        # ru
        "сдаётся", "сдается", "сдаю", "сдадим", "сдам", "сдаём",
        "аренда", "арендую", "арендуем", "снять",
        "посуточно", "помесячно",
        # en
        "for rent", "to let", "rental", "renting", "lease", "leasing",
        "per year", "per month", "/year", "/month", "/mo",
    ),
    "sale": (
        # ru
        "продаётся", "продается", "продаю", "продадим", "продам", "продаём",
        "продажа", "к продаже",
        # en
        "for sale", "#forsale", "selling", "selling price", "sale price",
    ),
    "purchase": (
        # ru
        "куплю", "купим", "покупаю", "покупка", "ищу квартиру",
        "ищу дом", "ищем квартиру", "рассматриваю покупку",
        # en
        "looking for", "want to buy", "wanted", "requirement", "wtb",
    ),
}

# (lowercase stem, Russian label) pairs.
_PROPERTY_TYPES: tuple[tuple[str, str], ...] = (
    # ru
    ("квартир", "квартира"),
    ("студи", "студия"),
    ("апартамент", "апартаменты"),
    ("комнат", "комната"),
    ("таунхаус", "таунхаус"),
    ("коттедж", "коттедж"),
    ("дача", "дача"),
    ("дом", "дом"),
    ("офис", "офис"),
    ("склад", "склад"),
    ("помещен", "помещение"),
    ("земельн", "земельный участок"),
    ("участок", "участок"),
    ("гараж", "гараж"),
    ("машиномест", "машиноместо"),
    # en — kept as Russian labels for UI consistency
    ("villa", "дом"),
    ("townhouse", "таунхаус"),
    ("penthouse", "апартаменты"),
    ("apartment", "квартира"),
    ("studio", "студия"),
    ("plot", "участок"),
    (" land ", "участок"),
    ("office", "офис"),
    ("warehouse", "склад"),
    ("retail", "помещение"),
    ("garage", "гараж"),
)

# Group 1 is the raw number. The optional `\.\d+` tail keeps a decimal point
# attached to the digits on both sides of it, so "45.5 м²" captures "45.5"
# rather than only the trailing "5". A period followed by a space is not
# absorbed, so a sentence end before the number does not glue two numbers.
_AREA_M2_RE = re.compile(
    r"(\d[\d\s,]*\d(?:\.\d+)?|\d(?:\.\d+)?)\s*(?:м[²2]|кв\.?\s*м|кв\.\s*метр)",
    re.IGNORECASE,
)
_AREA_SQFT_RE = re.compile(
    r"(\d[\d\s,]*\d(?:\.\d+)?|\d(?:\.\d+)?)\s*(?:sqft|sq\.?\s*ft|sq\s+ft|square\s+feet)",
    re.IGNORECASE,
)

# Comma-grouped thousands, e.g. "1,200" or "1,200,000.5".
_GROUPED_THOUSANDS_RE = re.compile(r"\d{1,3}(?:,\d{3})+(?:\.\d+)?")


def _parse_number(s: str) -> float | None:
    """Convert a captured numeric string to ``float``; ``None`` if it is not a number.

    All whitespace is removed first, including non-breaking spaces, because
    the area patterns capture any ``\\s`` between digits. Commas are then
    interpreted as follows:

    * groups of exactly three digits ("1,200", "1,200,000") are thousands
      separators and are dropped;
    * otherwise a single comma with no decimal point present is a decimal
      comma ("45,5" -> 45.5), the usual Russian notation;
    * anything else falls back to dropping every comma.
    """
    compact = "".join(s.split())
    is_decimal_comma = (
        compact.count(",") == 1
        and "." not in compact
        and _GROUPED_THOUSANDS_RE.fullmatch(compact) is None
    )
    compact = compact.replace(",", "." if is_decimal_comma else "")
    try:
        return float(compact)
    except ValueError:
        return None


# Group 1 is a single digit followed by a Russian or English rooms/bedrooms marker.
_ROOMS_RE = re.compile(
    r"\b(\d)[\-\s]*(?:к\b|комн|комнатн|-комнат|br\b|bed\b|bedroom|-bed)",
    re.IGNORECASE,
)
# Studio is a special-case "0 rooms" indicator; not extracted as rooms count.
+_PRICE_RE = re.compile( + r"(\d[\d\s.,]*\d|\d)\s*(млн|млрд|тыс|тысяч|миллионов?|миллиардов?|руб(?:лей)?|₽|р/мес|/мес|р\b)", + re.IGNORECASE, +) + + +def _detect_kind(low: str) -> str | None: + for kind, words in _DEAL_KEYWORDS.items(): + for w in words: + if w in low: + return kind + return None + + +def _detect_property_type(low: str) -> str | None: + for stem, label in _PROPERTY_TYPES: + if stem in low: + return label + return None + + +def extract_real_estate(text: str | None) -> dict[str, Any] | None: + if not text: + return None + low = text.lower() + kind = _detect_kind(low) + prop = _detect_property_type(low) + if kind is None and prop is None: + return None + + rooms_m = _ROOMS_RE.search(low) + rooms = f"{rooms_m.group(1)}-к" if rooms_m else None + if rooms is None and ("студи" in low or "studio" in low): + rooms = "студия" + + area: float | None = None + area_m = _AREA_M2_RE.search(text) + if area_m: + area = _parse_number(area_m.group(1)) + if area is None: + sqft_m = _AREA_SQFT_RE.search(text) + if sqft_m: + sqft = _parse_number(sqft_m.group(1)) + if sqft is not None: + area = round(sqft * 0.0929, 1) + + price_m = _PRICE_RE.search(text) + price = price_m.group(0).strip() if price_m else None + + return { + "kind": kind, + "property_type": prop, + "rooms": rooms, + "area_m2": area, + "price": price, + } + + +# --- Top-level analyzer -------------------------------------------------- + + +def analyze(text: str | None) -> dict[str, Any]: + """Synchronous regex-only analysis. Cheap and runs at insert time.""" + return { + "phones": extract_phones(text), + "names": extract_names(text), + "tg_handles": extract_tg_handles(text), + "real_estate": extract_real_estate(text), + } + + +async def analyze_with_llm( + text: str | None, + vertical: str = "real_estate", + section_slug: str | None = None, +) -> dict[str, Any]: + """Regex extraction + local LLM lead classification, routed by vertical. 
+ + `section_slug` lets the classifier pick a section-specific system prompt + (e.g. Dubai-focused for `real_estate:dubai`) with fallback to the + vertical-default prompt. The LLM verdict goes under `lead` for RE and + under `hr_lead` for HR. Falls back to regex-only if Ollama is unavailable. + """ + base = analyze(text) + # Lazy import to avoid hard dep on httpx in environments where LLM is off. + from parser_bot.llm import classify + + verdict = await classify(text, vertical, section_slug) # type: ignore[arg-type] + if verdict is not None: + base["hr_lead" if vertical == "hr" else "lead"] = verdict + return base diff --git a/src/parser_bot/links.py b/src/parser_bot/links.py new file mode 100644 index 0000000..687351e --- /dev/null +++ b/src/parser_bot/links.py @@ -0,0 +1,44 @@ +"""Build Telegram URLs from stored channel metadata.""" +from __future__ import annotations + +import re + +_USERNAME_RE = re.compile(r"^@?([A-Za-z][A-Za-z0-9_]{4,31})$") +_TME_URL_RE = re.compile( + r"^(?:https?://)?(?:t|telegram)\.me/(?:s/)?([A-Za-z][A-Za-z0-9_]{4,31})(?:/.*)?$" +) + + +def channel_username(identifier: str | None) -> str | None: + """Extract the public username from a channel identifier if any. + + Returns None for private channels (joinchat, +invite, raw IDs). + """ + if not identifier: + return None + s = identifier.strip() + m = _USERNAME_RE.match(s) + if m: + return m.group(1) + m = _TME_URL_RE.match(s) + if m: + return m.group(1) + return None + + +def post_url(identifier: str | None, tg_id: int | None, tg_message_id: int) -> str | None: + """Build a deep link to a Telegram post. + + Public channel: https://t.me// + Private channel (no public username, only tg_id): https://t.me/c// + where is the absolute id with the leading -100 stripped. 
+ """ + username = channel_username(identifier) + if username: + return f"https://t.me/{username}/{tg_message_id}" + if tg_id is None: + return None + raw = abs(tg_id) + s = str(raw) + short = s[3:] if s.startswith("100") and len(s) > 3 else s + return f"https://t.me/c/{short}/{tg_message_id}" diff --git a/src/parser_bot/llm.py b/src/parser_bot/llm.py new file mode 100644 index 0000000..fd12d15 --- /dev/null +++ b/src/parser_bot/llm.py @@ -0,0 +1,363 @@ +"""Local LLM (Ollama) client for lead classification & extraction. + +Two verticals share one model and one process: + - real_estate: high recall on listings (sale/rent/purchase), + - hr: vacancies, resumes, bare contact leads. + +The system prompt and JSON schema differ per vertical; the rest of the +plumbing (timeouts, single-lock concurrency, JSON-mode parsing) is shared. +On any error returns `None` and the caller falls back to regex-only extraction. + +The model runs on CPU via Ollama (Qwen2.5 7B Q4_K_M). Each call ~3–6s on +i5-12400. Concurrency is 1 (Ollama already saturates CPU per call). +""" +from __future__ import annotations + +import asyncio +import json +from typing import Any, Literal + +import httpx +import structlog + +from parser_bot.config import settings + +log = structlog.get_logger() + + +# Single shared lock so we never run two LLM requests at once on the GPU — +# they would just thrash VRAM and finish slower than sequential. +_lock = asyncio.Lock() + + +Vertical = Literal["real_estate", "hr"] + + +DEFAULT_RE_SYSTEM_PROMPT = """\ +Ты — аналитик объявлений о недвижимости. Тебе дают текст из Telegram-канала. +Сообщение МОЖЕТ БЫТЬ НА ЛЮБОМ ЯЗЫКЕ — русский, английский, арабский, любой +другой. Обрабатывай его одинаково независимо от языка. 
+ +Задача: определить, является ли это РЕАЛЬНЫМ объявлением о покупке, продаже +или аренде НЕДВИЖИМОСТИ (квартира, дом/villa, студия/studio, апартаменты, +комната, таунхаус/townhouse, дача, коттедж, пентхаус/penthouse, офис, +склад, помещение, земельный участок/plot/land, гараж, машиноместо). +Учитывай намёки и нечёткие формулировки — лучше отметить сомнительный лид +как `is_listing=true` с низкой confidence, чем пропустить. + +Сигналы что это ОБЪЯВЛЕНИЕ (kind): +— продажа/sale: «продаётся», «продаю», «продажа», «for sale», «#forsale», + «selling price», «selling», «price», «AED 33M», ценник в любой валюте. +— аренда/rent: «сдаётся», «сдаю», «аренда», «for rent», «to let», «rental», + «per year», «per month», «AED ... /year». +— покупка/purchase: «куплю», «куплю в», «looking for», «want to buy», + «wanted», «requirement». + +ОДНО сообщение может быть и про продажу, И про аренду одновременно +(«FOR SALE | RENT» / «продажа или аренда»). В таком случае выбирай +основное намерение по самому тексту; если равноценно — `kind="sale"` +и упомяни аренду в summary. + +НЕ объявления (is_listing=false): +— общие новости / статьи / аналитика рынка; +— воспоминания и истории («когда-то продавал квартиру»); +— шутки, мемы, цитаты; +— реклама услуг агентств без конкретного объекта; +— чужие пересланные объявления без контактов и явного предложения от автора. 
+ +Отвечай СТРОГО валидным JSON по схеме (никаких комментариев, никакого markdown): +{ + "is_listing": boolean, + "kind": "sale" | "rent" | "purchase" | null, + "property_type": "квартира" | "дом" | "студия" | "апартаменты" | "комната" | "таунхаус" | "дача" | "коттедж" | "офис" | "склад" | "помещение" | "участок" | "гараж" | "машиноместо" | null, + "rooms": "студия" | "1-к" | "2-к" | "3-к" | "4-к" | "5+к" | null, + "area_m2": number | null, + "price_text": string | null, + "price_value": number | null, + "currency": "RUB" | "USD" | "EUR" | "AED" | "GBP" | "CNY" | "TRY" | "KZT" | "BYN" | "UAH" | null, + "location": string | null, + "contact_phone": string | null, + "contact_name": string | null, + "summary": string, + "confidence": number +} + +Поля: +- summary — ОДНО короткое предложение НА РУССКОМ языке (даже если исходный + текст на английском или другом). Это нужно для единообразного UI. +- property_type — пиши значение по-русски (villa→дом, apartment→квартира, + townhouse→таунхаус, plot/land→участок, studio→студия, penthouse→апартаменты, + house→дом, office→офис, warehouse→склад, retail→помещение). +- rooms — для англоязычного «3BR», «3 BR», «3 bed», «3-bedroom» возвращай + «3-к»; для «studio» → «студия». +- area_m2 — площадь В КВАДРАТНЫХ МЕТРАХ. Если в тексте sqft / sq.ft / sq ft / + square feet — переведи: m² = sqft × 0.0929. Округляй до целого. +- confidence ∈ [0, 1]: 0.9+ если явное объявление с ценой/контактом, + 0.5–0.8 если правдоподобно, 0.2–0.4 если намёк. +- price_text — точная цитата из текста («2.5 млн ₽», «AED 850 000», «$320k», + «300 тыс. дирхам», «د.إ 1.2M», «70,000,000 AED», «AED 4.3M», «AED 1.75M»). +- price_value — числовая величина цены В УКАЗАННОЙ ВАЛЮТЕ (не конвертируй). + Раскрывай сокращения: «AED 4.3M» → 4300000, «$320k» → 320000. 
+- currency — определяй гибко: ₽/руб/р/RUB/рублей → RUB; $/USD/долл/бакс → USD; + €/EUR/евро → EUR; AED/дирхам/дирхамов/дирхама/dh/dhs/د.إ/Dirhams → AED; + ₺/TRY/лир/лира → TRY; ¥/CNY/юань → CNY; ₸/KZT/тенге → KZT; + Br/BYN/бел.руб → BYN; ₴/UAH/грн → UAH. Если не уверен — null. +- contact_phone — любой номер телефона в тексте (с + или без, российский, + ОАЭ, любой международный). +""" + + +DEFAULT_HR_SYSTEM_PROMPT = """\ +Ты — аналитик HR-объявлений. Тебе дают текст из Telegram-канала. Сообщение +МОЖЕТ БЫТЬ НА ЛЮБОМ ЯЗЫКЕ — обрабатывай одинаково. + +Задача: определить, относится ли сообщение к рынку труда, и какого типа лид +это. Допускаются три типа (`kind`): +— vacancy — компания/наниматель ищет сотрудника («ищем разработчика», + «hiring backend engineer», «требуется бухгалтер», «we are looking for»); +— resume — соискатель ищет работу («ищу работу», «open to work», «available + for hire», «рассматриваю предложения», «my CV», «резюме»); +— contact — короткое сообщение с именем/контактом и намёком на профессию, + без явной вакансии/резюме («Иван Петров, Python, +7…», «@nick — UI/UX, + Дубай»). Используй, когда vacancy и resume не подходят, но из текста ясно, + что это HR-контакт. + +Лучше отметить сомнительный случай `is_lead=true` с низкой confidence, +чем пропустить. НО полностью исключай: +— общие новости и аналитика рынка труда без конкретной вакансии/резюме; +— реклама курсов, школ, маркетплейсов услуг (Profi.ru и т.п.); +— чужие пересланные посты без контактов и без явного предложения от автора; +— объявления о продаже/аренде недвижимости, услуг и товаров; +— мемы, шутки, цитаты. 
+ +Отвечай СТРОГО валидным JSON по схеме (никаких комментариев, никакого markdown): +{ + "is_lead": boolean, + "kind": "vacancy" | "resume" | "contact" | null, + "title": string | null, + "company": string | null, + "candidate_name": string | null, + "experience_years": number | null, + "skills": string[], + "location": string | null, + "remote": boolean | null, + "employment_type": "full-time" | "part-time" | "contract" | "internship" | null, + "salary_text": string | null, + "salary_value": number | null, + "currency": "RUB" | "USD" | "EUR" | "AED" | "GBP" | "CNY" | "TRY" | "KZT" | "BYN" | "UAH" | null, + "contact_phone": string | null, + "contact_name": string | null, + "summary": string, + "confidence": number +} + +Поля: +- title — должность/роль ОДНОЙ строкой («Senior Python Developer», «Бухгалтер», + «UI/UX-дизайнер»). Для resume — желаемая роль. Для contact — то, что заявлено. +- company — название компании-нанимателя, если оно явно указано (vacancy). +- candidate_name — ФИО или ник кандидата (resume / contact). +- experience_years — стаж в годах числом. «5+ years» → 5. Если не указан — null. +- skills — короткий массив ключевых навыков/технологий (до ~10 элементов). +- remote — true для «удалёнка / remote / WFH / hybrid: remote», false для + «офис / on-site», null если не указано. +- employment_type — full-time для «полная занятость / full-time», part-time + для «частичная / part-time», contract для «договор/контракт/freelance», + internship для «стажировка/internship». Иначе null. +- salary_text — точная цитата с зарплатой («200–300k ₽», «$5k/mo», «AED 18,000 per month»). +- salary_value — число В УКАЗАННОЙ ВАЛЮТЕ. Если диапазон — нижняя граница. + Раскрывай сокращения: «200k» → 200000, «1.5M» → 1500000. +- currency — определяй гибко: ₽/руб/RUB → RUB; $/USD/долл → USD; €/EUR/евро → EUR; + AED/дирхам/dh/dhs → AED; ₺/TRY/лир → TRY; ¥/CNY/юань → CNY; ₸/KZT/тенге → KZT; + Br/BYN/бел.руб → BYN; ₴/UAH/грн → UAH. Если не уверен — null. 
+- contact_phone — любой номер телефона (RU / международный, с + или без). +- contact_name — имя контактного лица (рекрутер / соискатель / автор). +- summary — ОДНО короткое предложение НА РУССКОМ языке. +- confidence ∈ [0, 1]: 0.9+ если явная вакансия/резюме с деталями, 0.5–0.8 + если правдоподобно, 0.2–0.4 если намёк. +""" + + +# Back-compat alias — older imports referenced DEFAULT_SYSTEM_PROMPT. +DEFAULT_SYSTEM_PROMPT = DEFAULT_RE_SYSTEM_PROMPT + + +def _build_user_prompt(text: str) -> str: + return f"Текст сообщения:\n```\n{text}\n```\nВерни JSON." + + +_VALID_CURRENCIES = { + "RUB", "USD", "EUR", "AED", "GBP", "CNY", "TRY", "KZT", "BYN", "UAH" +} + + +def _coerce_real_estate(payload: Any) -> dict | None: + if not isinstance(payload, dict): + return None + is_listing = bool(payload.get("is_listing")) + currency = payload.get("currency") + if currency is not None: + currency = str(currency).upper() + if currency not in _VALID_CURRENCIES: + currency = None + return { + "is_listing": is_listing, + "kind": payload.get("kind") if payload.get("kind") in ("sale", "rent", "purchase") else None, + "property_type": payload.get("property_type") or None, + "rooms": payload.get("rooms") or None, + "area_m2": _as_float(payload.get("area_m2")), + "price_text": payload.get("price_text") or None, + "price_value": _as_float(payload.get("price_value")), + "currency": currency, + "location": payload.get("location") or None, + "contact_phone": payload.get("contact_phone") or None, + "contact_name": payload.get("contact_name") or None, + "summary": (payload.get("summary") or "")[:300], + "confidence": max(0.0, min(1.0, _as_float(payload.get("confidence")) or 0.0)), + } + + +def _coerce_hr(payload: Any) -> dict | None: + if not isinstance(payload, dict): + return None + is_lead = bool(payload.get("is_lead")) + currency = payload.get("currency") + if currency is not None: + currency = str(currency).upper() + if currency not in _VALID_CURRENCIES: + currency = None + skills_raw = 
payload.get("skills") or [] + if isinstance(skills_raw, str): + skills = [s.strip() for s in skills_raw.split(",") if s.strip()] + elif isinstance(skills_raw, list): + skills = [str(s).strip() for s in skills_raw if str(s).strip()] + else: + skills = [] + skills = skills[:15] + employment = payload.get("employment_type") + if employment is not None and employment not in ( + "full-time", "part-time", "contract", "internship" + ): + employment = None + remote_raw = payload.get("remote") + remote = bool(remote_raw) if isinstance(remote_raw, bool) else None + return { + "is_lead": is_lead, + "kind": payload.get("kind") if payload.get("kind") in ("vacancy", "resume", "contact") else None, + "title": payload.get("title") or None, + "company": payload.get("company") or None, + "candidate_name": payload.get("candidate_name") or None, + "experience_years": _as_float(payload.get("experience_years")), + "skills": skills, + "location": payload.get("location") or None, + "remote": remote, + "employment_type": employment, + "salary_text": payload.get("salary_text") or None, + "salary_value": _as_float(payload.get("salary_value")), + "currency": currency, + "contact_phone": payload.get("contact_phone") or None, + "contact_name": payload.get("contact_name") or None, + "summary": (payload.get("summary") or "")[:300], + "confidence": max(0.0, min(1.0, _as_float(payload.get("confidence")) or 0.0)), + } + + +def _as_float(v: Any) -> float | None: + if v is None or isinstance(v, bool): + return None + try: + return float(v) + except (TypeError, ValueError): + return None + + +async def is_ready() -> bool: + """Check that Ollama is up and the configured model is pulled.""" + try: + async with httpx.AsyncClient(timeout=5) as client: + r = await client.get(f"{settings.llm_base_url}/api/tags") + r.raise_for_status() + tags = {m.get("name") for m in r.json().get("models", [])} + return any(t.startswith(settings.llm_model.split(":")[0]) for t in tags) + except Exception: + return False + + 
+def default_prompt(vertical: Vertical) -> str: + return DEFAULT_HR_SYSTEM_PROMPT if vertical == "hr" else DEFAULT_RE_SYSTEM_PROMPT + + +async def classify( + text: str | None, + vertical: Vertical = "real_estate", + section_slug: str | None = None, +) -> dict | None: + """Classify a message text under the given vertical/section. + + The system prompt is resolved with `section → vertical → built-in` fallback, + so a per-section prompt can fine-tune extraction (e.g. AED/sqft for Dubai) + while unconfigured sections keep using the vertical-wide prompt. + Returns a vertical-specific structured dict or None on error / short text. + """ + if not settings.llm_enabled: + return None + if not text or len(text.strip()) < settings.llm_min_text_length: + return None + + # Lazy import to avoid a circular: prompt_store -> db.session -> config. + from parser_bot import prompt_store + + system = await prompt_store.resolve(vertical, section_slug, default_prompt(vertical)) + payload = { + "model": settings.llm_model, + "prompt": _build_user_prompt(text), + "system": system, + "format": "json", + "stream": False, + "options": {"temperature": 0.1, "num_ctx": 4096, "num_predict": 600}, + } + async with _lock: + try: + async with httpx.AsyncClient(timeout=settings.llm_timeout_seconds) as client: + r = await client.post( + f"{settings.llm_base_url}/api/generate", json=payload + ) + if r.status_code != 200: + # Surface the actual server message — most useful one is + # `model '...' not found`, which otherwise would just look + # like a generic HTTP error and leave the worker to spin. 
+ log.warning( + "llm_request_failed", + status=r.status_code, + model=settings.llm_model, + vertical=vertical, + section=section_slug, + body=r.text[:300], + ) + return None + data = r.json() + except Exception as exc: + log.warning( + "llm_request_failed", error=str(exc), model=settings.llm_model, vertical=vertical + ) + return None + + raw = (data.get("response") or "").strip() + if not raw: + return None + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + # Best effort: extract first {...} block. + start, end = raw.find("{"), raw.rfind("}") + if start == -1 or end == -1: + log.warning("llm_invalid_json", raw=raw[:200], vertical=vertical) + return None + try: + parsed = json.loads(raw[start : end + 1]) + except json.JSONDecodeError: + log.warning("llm_invalid_json", raw=raw[:200], vertical=vertical) + return None + + if vertical == "hr": + return _coerce_hr(parsed) + return _coerce_real_estate(parsed) diff --git a/src/parser_bot/main.py b/src/parser_bot/main.py new file mode 100644 index 0000000..184de91 --- /dev/null +++ b/src/parser_bot/main.py @@ -0,0 +1,205 @@ +from contextlib import asynccontextmanager +from pathlib import Path + +import structlog +import uvicorn +from fastapi import Depends, FastAPI, HTTPException +from fastapi.openapi.docs import get_redoc_html, get_swagger_ui_html +from fastapi.openapi.utils import get_openapi +from fastapi.responses import FileResponse, JSONResponse +from fastapi.staticfiles import StaticFiles +from starlette.types import Scope + +from parser_bot.access import require_admin, require_admin_network +from parser_bot.api.routes import router +from parser_bot.config import settings +from parser_bot.scheduler.poller import build_scheduler +from parser_bot.telegram.client import is_authorized, start_client, stop_client + +structlog.configure( + processors=[ + structlog.processors.TimeStamper(fmt="iso"), + structlog.processors.add_log_level, + structlog.processors.JSONRenderer(), + ] +) +log = 
structlog.get_logger() + +STATIC_DIR = Path(__file__).parent / "web" / "static" +NOCACHE = {"Cache-Control": "no-cache, must-revalidate"} + + +class NoCacheStaticFiles(StaticFiles): + """StaticFiles with Cache-Control: no-cache. + + The browser still gets to validate via ETag/Last-Modified (304 is fine), + but it will not silently serve a stale JS bundle after a deploy. + """ + + async def get_response(self, path: str, scope: Scope): + response = await super().get_response(path, scope) + response.headers["Cache-Control"] = "no-cache, must-revalidate" + return response + + +@asynccontextmanager +async def lifespan(app: FastAPI): + await start_client() + scheduler = build_scheduler() + scheduler.start() + authorized = await is_authorized() + log.info( + "startup", poll_interval=settings.poll_interval_seconds, authorized=authorized + ) + if not authorized: + log.warning("not_authorized", action="open /auth.html to log in") + try: + yield + finally: + scheduler.shutdown(wait=False) + await stop_client() + log.info("shutdown") + + +def _serve_section_template(vertical_dir: str, page: str) -> FileResponse: + """Resolve a section-scoped URL to a single shared template. + + Sections are dynamic (created via UI), so `/real-estate/dubai/channels.html` + can't be a real file. We serve `web/static//section/` + for any section slug — the section name is read from the URL by JS. + """ + target_name = page if page else "index.html" + if "/" in target_name or target_name.startswith(".."): + raise HTTPException(404) + target = STATIC_DIR / vertical_dir / "section" / target_name + if not target.is_file(): + raise HTTPException(404) + return FileResponse(target, headers=NOCACHE) + + +def create_app() -> FastAPI: + public_base = settings.public_base_path.rstrip("/") + # Disable the default /docs, /redoc and /openapi.json — we serve our own + # admin-gated versions below. 
+ app = FastAPI( + title="parser-tg-bot", + lifespan=lifespan, + docs_url=None, + redoc_url=None, + openapi_url=None, + ) + app.include_router(router, prefix="/api/v1") + + @app.get("/healthz") + async def healthz() -> dict[str, str]: + return {"status": "ok"} + + @app.get("/", include_in_schema=False) + async def index() -> FileResponse: + return FileResponse(STATIC_DIR / "index.html", headers=NOCACHE) + + # Admin-only: Telegram login page. Registered BEFORE the static catch-all + # so the static mount can't accidentally serve it to non-admin visitors. + @app.get( + "/admin.html", + include_in_schema=False, + dependencies=[Depends(require_admin_network)], + ) + async def admin_page() -> FileResponse: + return FileResponse(STATIC_DIR / "admin.html", headers=NOCACHE) + + @app.get( + "/auth.html", + include_in_schema=False, + dependencies=[Depends(require_admin)], + ) + async def auth_page() -> FileResponse: + return FileResponse(STATIC_DIR / "auth.html", headers=NOCACHE) + + # Admin-only: OpenAPI surface. Custom routes so we can wrap them in + # `require_admin`; the auto-generated ones from FastAPI bypass it. 
+ @app.get( + "/openapi.json", + include_in_schema=False, + dependencies=[Depends(require_admin)], + ) + async def openapi_json() -> JSONResponse: + return JSONResponse( + get_openapi( + title=app.title, + version=app.version, + openapi_version=app.openapi_version, + description=app.description, + routes=app.routes, + ) + ) + + @app.get( + "/docs", + include_in_schema=False, + dependencies=[Depends(require_admin)], + ) + async def docs() -> FileResponse: + return get_swagger_ui_html( + openapi_url=f"{public_base}/openapi.json" if public_base else "/openapi.json", + title=app.title + " — docs", + ) + + @app.get( + "/redoc", + include_in_schema=False, + dependencies=[Depends(require_admin)], + ) + async def redoc() -> FileResponse: + return get_redoc_html( + openapi_url=f"{public_base}/openapi.json" if public_base else "/openapi.json", + title=app.title + " — redoc", + ) + + # IMPORTANT: register /static and /media mounts BEFORE the dynamic + # vertical/section routes. Starlette matches routes in registration order, + # and a generic /{v}/{s}/{page} pattern would otherwise eat /static/*. + app.mount("/static", NoCacheStaticFiles(directory=STATIC_DIR), name="static") + media_dir = Path(settings.media_dir) + media_dir.mkdir(parents=True, exist_ok=True) + # /media is fine to cache — file names are content-stable. + app.mount("/media", StaticFiles(directory=media_dir), name="media") + + # Section-templated dynamic routes, explicit per vertical so /static/*, + # /api/*, /media/* (and any future top-level path) can't be captured. 
+ @app.get("/real-estate/{section}/", include_in_schema=False) + async def re_section_root(section: str) -> FileResponse: + return _serve_section_template("real-estate", "index.html") + + @app.get("/real-estate/{section}/{page}", include_in_schema=False) + async def re_section_page(section: str, page: str) -> FileResponse: + return _serve_section_template("real-estate", page) + + @app.get("/hr/{section}/", include_in_schema=False) + async def hr_section_root(section: str) -> FileResponse: + return _serve_section_template("hr", "index.html") + + @app.get("/hr/{section}/{page}", include_in_schema=False) + async def hr_section_page(section: str, page: str) -> FileResponse: + return _serve_section_template("hr", page) + + # Catch-all for top-level static pages (chooser, css, etc.). auth.html is + # already handled above, so the static catch-all can't bypass the guard. + app.mount("/", NoCacheStaticFiles(directory=STATIC_DIR, html=True), name="pages") + return app + + +app = create_app() + + +def main() -> None: + uvicorn.run( + "parser_bot.main:app", + host=settings.api_host, + port=settings.api_port, + log_config=None, + ) + + +if __name__ == "__main__": + main() diff --git a/src/parser_bot/prompt_store.py b/src/parser_bot/prompt_store.py new file mode 100644 index 0000000..2da3b3f --- /dev/null +++ b/src/parser_bot/prompt_store.py @@ -0,0 +1,130 @@ +"""Runtime-editable LLM system prompts, persisted in app_settings. + +Three resolution levels with fallback (more specific → less specific): + 1. `llm_system_prompt::` — section override + 2. `llm_system_prompt:` — vertical override + 3. built-in DEFAULT_RE_SYSTEM_PROMPT / DEFAULT_HR_SYSTEM_PROMPT + +The prompt is read on every classification call but cached for a short +window so the DB isn't hit per-message. Edits via the API invalidate the +cache for that level, so a save in the UI takes effect within seconds. 
+""" +from __future__ import annotations + +import time +from typing import Literal + +from sqlalchemy import select +from sqlalchemy.dialects.postgresql import insert as pg_insert + +from parser_bot.db.models import AppSetting +from parser_bot.db.session import session_scope + +Vertical = Literal["real_estate", "hr"] + +_KEY_PREFIX = "llm_system_prompt:" +_CACHE_TTL_S = 5.0 +_cache: dict[str, tuple[float, str | None]] = {} + + +def _key(vertical: Vertical, section_slug: str | None = None) -> str: + if section_slug: + return f"{_KEY_PREFIX}{vertical}:{section_slug}" + return f"{_KEY_PREFIX}{vertical}" + + +async def _load(key: str) -> str | None: + """Read a stored prompt by exact key. None if missing or empty.""" + now = time.monotonic() + cached_at, cached_value = _cache.get(key, (0.0, None)) + if now - cached_at < _CACHE_TTL_S: + return cached_value + + async with session_scope() as session: + row = await session.execute( + select(AppSetting.value).where(AppSetting.key == key) + ) + value = row.scalar_one_or_none() + + text = value if isinstance(value, str) and value.strip() else None + _cache[key] = (now, text) + return text + + +async def resolve( + vertical: Vertical, section_slug: str | None, default: str +) -> str: + """Pick the most specific prompt available, falling back to `default`. + + Always consults section-level → vertical-level → default. This is what + the classifier uses for every message. + """ + if section_slug: + text = await _load(_key(vertical, section_slug)) + if text is not None: + return text + text = await _load(_key(vertical)) + if text is not None: + return text + return default + + +async def get( + vertical: Vertical, section_slug: str | None, default: str +) -> tuple[str, str]: + """For the settings UI: return (text, source) where source is one of + 'section' | 'vertical' | 'default'. Lets the editor show which override + is currently active without a second round-trip. 
async def set_prompt(
    vertical: Vertical, section_slug: str | None, text: str
) -> None:
    """Store ``text`` as the prompt override for a section or a vertical.

    The row is upserted under the key built by ``_key``; afterwards the
    cached entry for that key is dropped.

    Raises:
        ValueError: if ``text`` is not a string or contains only whitespace.
    """
    is_blank = not isinstance(text, str) or not text.strip()
    if is_blank:
        raise ValueError("prompt must be a non-empty string")

    storage_key = _key(vertical, section_slug)
    upsert = pg_insert(AppSetting).values(key=storage_key, value=text)
    upsert = upsert.on_conflict_do_update(
        index_elements=["key"], set_={"value": text}
    )
    async with session_scope() as session:
        await session.execute(upsert)
    # Drop the cached value once the session block has exited.
    invalidate(storage_key)
def _verdict_key(vertical: str) -> str:
    """Name the key inside ``extracted`` that holds the LLM verdict.

    The ``"hr"`` vertical stores its verdict under ``"hr_lead"``; every other
    value maps to ``"lead"``.
    """
    if vertical == "hr":
        return "hr_lead"
    return "lead"
async def poll_all() -> None:
    """Poll every active channel once.

    Does nothing (beyond a debug log line) while the Telegram client is not
    authorized. A failure in one channel is logged and does not stop the
    remaining channels from being polled.
    """
    authorized = await is_authorized()
    if not authorized:
        log.debug("poll_skipped_not_authorized")
        return

    active_ids_q = select(Channel.id).where(Channel.is_active.is_(True))
    async with session_scope() as session:
        active_ids = (await session.execute(active_ids_q)).scalars().all()

    for channel_id in active_ids:
        try:
            await poll_channel(channel_id)
        except Exception as exc:
            log.error("poll_failed", channel_id=channel_id, error=str(exc))
+ """ + if not await is_authorized(): + raise RuntimeError("not authorized") + + async with session_scope() as session: + channel = await session.get(Channel, channel_id) + if channel is None: + raise RuntimeError("channel not found") + + pending_q = select(func.count(Message.id)).where( + Message.channel_id == channel_id, + Message.has_media.is_(True), + Message.media_files.is_(None), + ) + pending_total = (await session.execute(pending_q)).scalar_one() + + rows = ( + await session.execute( + select(Message.id, Message.tg_message_id) + .where( + Message.channel_id == channel_id, + Message.has_media.is_(True), + Message.media_files.is_(None), + ) + .order_by(Message.tg_message_id.asc()) + .limit(batch_size) + ) + ).all() + if not rows: + return {"updated": 0, "pending": 0} + + tg_ids = [r.tg_message_id for r in rows] + results = await fetch_specific_messages_with_media( + channel.identifier, tg_ids, channel_id + ) + + updated = 0 + for db_id, tg_id in rows: + files = results.get(tg_id) + if not files: + continue + msg = await session.get(Message, db_id) + if msg is None: + continue + msg.media_files = files + updated += 1 + + log.info( + "backfill_media", + channel_id=channel_id, + updated=updated, + remaining=max(0, pending_total - updated), + ) + return {"updated": updated, "pending": max(0, pending_total - updated)} + + +async def reanalyze_channel(channel_id: int, batch_size: int = 5) -> dict[str, int]: + """Re-run extractors (regex + LLM) over messages missing this channel's verdict. + + Picks the vertical AND section from the channel row so the right LLM + prompt is used. Only reanalyzes rows where the corresponding verdict key + is missing. Newest first so fresh leads surface during long backfills. 
async def pending_llm_count(
    vertical: str | None = None, section_slug: str | None = None
) -> int:
    """Count text messages that still lack an LLM verdict.

    Args:
        vertical: When given, only messages from channels of this vertical
            are counted and only that vertical's verdict key is checked.
            When ``None``, a message counts as pending if it has no verdict
            under either key.
        section_slug: When given, only messages from channels belonging to
            the section with this slug are counted. May be combined with
            ``vertical`` or used on its own.

    Returns:
        Number of pending messages; always 0 when ``settings.llm_enabled``
        is false.
    """
    if not settings.llm_enabled:
        return 0

    stmt = select(func.count(Message.id)).where(
        Message.text.is_not(None),
        _needs_work_clause(vertical),
    )
    # Both optional filters hang off the channel row, so join Channel
    # whenever either one is requested. Joining it only for `vertical` left
    # the section filter without a Channel table to join through.
    if vertical is not None or section_slug is not None:
        stmt = stmt.join(Channel, Channel.id == Message.channel_id)
    if vertical is not None:
        stmt = stmt.where(Channel.vertical == vertical)
    if section_slug is not None:
        stmt = stmt.join(Section, Section.id == Channel.section_id).where(
            Section.slug == section_slug
        )

    async with session_scope() as session:
        return (await session.execute(stmt)).scalar_one()
def _json_safe(value: Any) -> Any:
    """Round-trip ``value`` through JSON so only JSON primitives remain.

    Anything the encoder cannot serialise natively is replaced by its
    ``str()`` form; used on Telethon's ``to_dict()`` output before it is
    stored.
    """
    encoded = json.dumps(value, default=str)
    return json.loads(encoded)
def _sender_info(msg: TgMessage) -> tuple[str | None, str | None]:
    """Return ``(username, display_name)`` for a message, best effort.

    The display name is the sender's first/last name when either is set,
    otherwise the sender's title. If that still yields nothing, the message's
    ``post_author`` signature is used. Either element may be ``None``.
    """
    sender = msg.sender
    username: str | None = None
    display: str | None = None

    if sender is not None:
        username = getattr(sender, "username", None)
        person_parts = [
            part
            for part in (
                getattr(sender, "first_name", None),
                getattr(sender, "last_name", None),
            )
            if part
        ]
        if person_parts:
            display = " ".join(person_parts)
        else:
            # Channels and groups expose a title instead of a personal name.
            display = getattr(sender, "title", None) or None

    if not display:
        # Signed channel posts carry the author's signature here.
        display = getattr(msg, "post_author", None) or display
    return username, display
async def _download_message_media(
    client: TelegramClient, msg: TgMessage, channel_id: int
) -> list[dict]:
    """Download one message's media under ``settings.media_dir/<channel_id>/``.

    Returns an empty list when the message has no media, otherwise a
    one-element list with a dict of ``kind``, ``mime``, ``size`` plus either
    ``url`` (downloaded) or ``skipped`` (reason: ``too_large``,
    ``download_error`` or ``no_file``). Skipped items keep their metadata so
    callers still know media existed.
    """
    media = msg.media
    if media is None:
        return []

    kind = _media_kind(media)
    size = _media_size(media)
    mime = _media_mime(media)
    entry: dict = {"kind": kind, "mime": mime, "size": size}

    def _skipped(reason: str) -> list[dict]:
        # Record why no file was stored and return the metadata-only entry.
        entry["skipped"] = reason
        return [entry]

    too_large = size is not None and size > settings.media_max_bytes
    if too_large:
        return _skipped("too_large")

    channel_dir = Path(settings.media_dir) / str(channel_id)
    channel_dir.mkdir(parents=True, exist_ok=True)
    # The message id is passed as the target path prefix; the final file
    # name is taken from whatever path download_media returns.
    destination = str(channel_dir / f"{msg.id}")
    try:
        saved = await client.download_media(msg, file=destination)
    except Exception as exc:
        log.warning("media_download_failed", msg_id=msg.id, error=str(exc))
        return _skipped("download_error")
    if saved is None:
        return _skipped("no_file")

    filename = Path(saved).name
    public_base = settings.public_base_path.rstrip("/")
    entry["url"] = f"{public_base}/media/{channel_id}/{filename}"
    return [entry]
async def current_username() -> str | None:
    """Return a label for the logged-in Telegram account.

    Yields the account's username, or its numeric id as a string when no
    username is set. Returns ``None`` when the client is not authorized or
    ``get_me()`` returns nothing.
    """
    tg = await start_client()
    authorized = await tg.is_user_authorized()
    account = await tg.get_me() if authorized else None
    if account is None:
        return None
    handle = account.username
    return handle if handle else str(account.id)
async def resolve_channel(identifier: str) -> ResolvedChannel:
    """Look up ``identifier`` on Telegram and return its id and title.

    The title falls back to ``identifier`` when the entity has none.

    Raises:
        ValueError: if the resolved entity is not a channel.
    """
    tg = await start_client()
    peer = await tg.get_entity(identifier)
    if isinstance(peer, TgChannel):
        display_title = peer.title or identifier
        return ResolvedChannel(tg_id=peer.id, title=display_title)
    raise ValueError(f"{identifier!r} is not a channel")
async def fetch_new_messages(
    identifier: str,
    min_id: int | None,
    limit: int,
    download_media_for_channel_id: int | None = None,
) -> list[FetchedMessage]:
    """Fetch up to ``limit`` messages of a channel, sorted by ascending id.

    Args:
        identifier: Channel identifier passed to ``get_entity``.
        min_id: Id of the last message already stored. ``None`` (or 0) means
            no watermark yet: the newest ``limit`` messages are returned.
            With a positive watermark the *oldest* ``limit`` messages after
            it are returned, so a backlog larger than ``limit`` is drained
            over successive calls instead of being skipped.
        limit: Maximum number of messages to fetch.
        download_media_for_channel_id: When set, media of each message is
            downloaded into that channel's media directory.

    Returns:
        The fetched messages, ordered by ``tg_message_id`` ascending.
    """
    client = await start_client()
    entity = await client.get_entity(identifier)
    kwargs: dict[str, Any] = {"limit": limit}
    if min_id is not None:
        kwargs["min_id"] = min_id
        if min_id > 0:
            # Default iteration is newest-first, so with more than `limit`
            # unseen messages only the newest ones came back and the caller,
            # which advances its watermark to the highest id returned, never
            # saw the older ones. Iterate oldest-first from the watermark.
            kwargs["reverse"] = True

    out: list[FetchedMessage] = []
    async for msg in client.iter_messages(entity, **kwargs):
        if not isinstance(msg, TgMessage):
            continue
        media_files: list[dict] = []
        if msg.media is not None and download_media_for_channel_id is not None:
            media_files = await _download_message_media(
                client, msg, download_media_for_channel_id
            )
        sender_username, sender_name = _sender_info(msg)
        raw_sender = msg.sender_id
        # sender_id may be a plain int or an object carrying `user_id`.
        sender_id = (
            None if raw_sender is None else getattr(raw_sender, "user_id", raw_sender)
        )
        out.append(
            FetchedMessage(
                tg_message_id=msg.id,
                date=msg.date,
                text=msg.message,
                sender_id=sender_id,
                sender_username=sender_username,
                sender_name=sender_name,
                grouped_id=getattr(msg, "grouped_id", None),
                has_media=msg.media is not None,
                views=msg.views,
                forwards=msg.forwards,
                raw=_json_safe(msg.to_dict()),
                media_files=media_files,
            )
        )
    # Callers rely on ascending order regardless of iteration direction.
    out.sort(key=lambda m: m.tg_message_id)
    return out
+

parser-tg-bot

+ +
+
+

Админ-доступ

+ +
+
Проверка...
+
+ + +
+
+ +
+
+
+ + + diff --git a/src/parser_bot/web/static/auth.html b/src/parser_bot/web/static/auth.html new file mode 100644 index 0000000..1fb0a33 --- /dev/null +++ b/src/parser_bot/web/static/auth.html @@ -0,0 +1,85 @@ + + + + + Авторизация — parser-tg-bot + + + + +
+

parser-tg-bot

+ +
+
+

Авторизация Telegram

+ +
+
+
Проверка статуса...
+
+ + + + + + + + +
+ +
+

Прод-вариант (без UI)

+

+ Для деплоя в k8s удобнее заранее получить опаковую строку сессии и положить её + в Secret — тогда поды поднимаются без интерактива: +

+
docker compose run --rm -it app python -m parser_bot.auth
+

+ Скрипт напечатает TG_SESSION_STRING=... — вставить + в .env или Secret и забыть про авторизацию. +

+
+
+ + + diff --git a/src/parser_bot/web/static/css/app.css b/src/parser_bot/web/static/css/app.css new file mode 100644 index 0000000..5044d9c --- /dev/null +++ b/src/parser_bot/web/static/css/app.css @@ -0,0 +1,241 @@ +:root { + --bg: #0f1115; + --panel: #161a22; + --panel-2: #1d222c; + --border: #262c38; + --text: #e6e8ec; + --muted: #8a93a3; + --accent: #4f8cff; + --accent-hover: #6aa0ff; + --danger: #ff6464; + --ok: #2ecc71; + --warn: #f1c40f; +} + +* { box-sizing: border-box; } + +body { + margin: 0; + font: 14px/1.45 -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; + background: var(--bg); + color: var(--text); +} + +a { color: var(--accent); text-decoration: none; } +a:hover { color: var(--accent-hover); } + +header { + display: flex; + align-items: center; + gap: 24px; + padding: 14px 24px; + background: var(--panel); + border-bottom: 1px solid var(--border); +} +header h1 { + font-size: 16px; + margin: 0; + font-weight: 600; +} +nav { display: flex; gap: 6px; } +nav a { + padding: 6px 12px; + border-radius: 6px; + color: var(--muted); +} +nav a.active, nav a:hover { + color: var(--text); + background: var(--panel-2); +} + +main { padding: 24px; max-width: 1200px; margin: 0 auto; } +h2 { font-size: 18px; margin: 0 0 16px; } +h3 { font-size: 14px; margin: 24px 0 12px; color: var(--muted); font-weight: 500; text-transform: uppercase; letter-spacing: 0.05em; } + +.row { display: flex; gap: 12px; align-items: center; flex-wrap: wrap; } +.spacer { flex: 1; } + +.card { + background: var(--panel); + border: 1px solid var(--border); + border-radius: 8px; + padding: 16px; +} + +.stats-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); + gap: 12px; + margin-bottom: 24px; +} +.stat .label { color: var(--muted); font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em; } +.stat .value { font-size: 24px; font-weight: 600; margin-top: 4px; } + +input, select, textarea, button { + font: inherit; + color: 
var(--text); + background: var(--panel-2); + border: 1px solid var(--border); + border-radius: 6px; + padding: 8px 10px; + outline: none; +} +input:focus, select:focus { border-color: var(--accent); } + +button { + cursor: pointer; + background: var(--accent); + border-color: var(--accent); + color: white; +} +button:hover { background: var(--accent-hover); border-color: var(--accent-hover); } +button.secondary { background: var(--panel-2); color: var(--text); } +button.secondary:hover { background: var(--border); } +button.danger { background: transparent; color: var(--danger); border-color: var(--border); } +button.danger:hover { background: rgba(255, 100, 100, 0.1); } +button:disabled { opacity: 0.5; cursor: not-allowed; } + +table { width: 100%; border-collapse: collapse; } +th, td { padding: 10px 12px; text-align: left; border-bottom: 1px solid var(--border); } +th { color: var(--muted); font-weight: 500; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em; } +tr:hover td { background: var(--panel-2); } + +.badge { + display: inline-block; + padding: 2px 8px; + border-radius: 999px; + font-size: 11px; + background: var(--panel-2); + color: var(--muted); + border: 1px solid var(--border); +} +.badge.ok { color: var(--ok); border-color: rgba(46, 204, 113, 0.4); } +.badge.off { color: var(--muted); } +.badge.warn { color: var(--warn); border-color: rgba(241, 196, 15, 0.4); } + +.muted { color: var(--muted); } +.mono { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; } + +.message { + padding: 12px 16px; + border-bottom: 1px solid var(--border); +} +.message:last-child { border-bottom: none; } +.message-meta { display: flex; gap: 12px; color: var(--muted); font-size: 12px; margin-bottom: 6px; } +.message-text { white-space: pre-wrap; word-break: break-word; } + +.message-tags { + display: flex; flex-wrap: wrap; gap: 6px; + margin-top: 8px; +} +.message-tags .badge.re { color: #2ecc71; border-color: rgba(46, 204, 113, 0.4); } 
+.message-tags .badge.phone { color: #4f8cff; border-color: rgba(79, 140, 255, 0.4); } +.message-tags .badge.name { color: #f1c40f; border-color: rgba(241, 196, 15, 0.4); } +.message-tags .badge.tg { color: #4f8cff; border-color: rgba(79, 140, 255, 0.4); } +.message-tags .badge.tg-link { color: #fff; background: rgba(79, 140, 255, 0.2); border-color: rgba(79, 140, 255, 0.6); } +.message-tags .badge.tg-link:hover { background: rgba(79, 140, 255, 0.35); } + +.lead-card { + margin-top: 10px; + padding: 10px 14px; + border-radius: 8px; + border: 1px solid var(--border); + background: rgba(46, 204, 113, 0.05); +} +.lead-card.lead-strong { border-color: rgba(46, 204, 113, 0.6); background: rgba(46, 204, 113, 0.1); } +.lead-card.lead-medium { border-color: rgba(241, 196, 15, 0.5); background: rgba(241, 196, 15, 0.06); } +.lead-card.lead-weak { border-color: rgba(138, 147, 163, 0.4); background: rgba(138, 147, 163, 0.05); } +.lead-head { display: flex; flex-wrap: wrap; align-items: center; gap: 10px; } +.lead-facts { color: var(--text); font-weight: 500; } +.lead-summary { margin-top: 4px; color: var(--muted); font-size: 13px; } +.lead-confidence { + margin-left: auto; padding: 2px 8px; border-radius: 999px; + background: var(--panel-2); border: 1px solid var(--border); + font-size: 11px; color: var(--muted); font-variant-numeric: tabular-nums; +} +.badge.lead { color: #2ecc71; border-color: rgba(46, 204, 113, 0.5); font-weight: 600; } + +.message-media { + display: flex; flex-wrap: wrap; gap: 8px; + margin-top: 10px; +} +.media-thumb { + max-width: 240px; max-height: 240px; + border-radius: 6px; cursor: zoom-in; + background: var(--panel-2); +} +.media-video { max-width: 360px; max-height: 240px; border-radius: 6px; background: black; } +.media-doc { + display: inline-flex; align-items: center; gap: 8px; + padding: 8px 12px; background: var(--panel-2); + border: 1px solid var(--border); border-radius: 6px; + color: var(--text); +} +.media-doc:hover { border-color: 
var(--accent); } +.media-skipped { + display: inline-flex; align-items: center; gap: 8px; + padding: 6px 10px; background: var(--panel-2); + border-radius: 6px; font-size: 12px; +} + +#lightbox { + position: fixed; inset: 0; z-index: 2000; + background: rgba(0,0,0,0.85); + display: flex; align-items: center; justify-content: center; + cursor: zoom-out; +} +#lightbox img { max-width: 95vw; max-height: 95vh; border-radius: 4px; } + +.toolbar { display: flex; gap: 8px; align-items: center; margin-bottom: 16px; flex-wrap: wrap; } +.toolbar input[type="search"], .toolbar select { min-width: 200px; } + +.toast { + position: fixed; + bottom: 20px; + right: 20px; + background: var(--panel); + border: 1px solid var(--border); + border-radius: 8px; + padding: 10px 16px; + box-shadow: 0 6px 24px rgba(0,0,0,0.4); + animation: slideIn 0.18s ease-out; + z-index: 1000; + max-width: 360px; +} +.toast.error { border-color: var(--danger); } +.toast.success { border-color: var(--ok); } +@keyframes slideIn { from { transform: translateY(8px); opacity: 0; } to { transform: none; opacity: 1; } } + +.empty { padding: 32px; text-align: center; color: var(--muted); } + +.sections-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(260px, 1fr)); + gap: 16px; + margin-top: 16px; +} +.section-tile { padding: 16px; } +.section-tile-link { display: block; color: var(--text); } +.section-tile-link:hover { color: var(--text); } +.section-tile-head { display: flex; align-items: center; gap: 10px; margin-bottom: 8px; } +.section-emoji { font-size: 28px; } +.section-title { font-size: 16px; font-weight: 600; } +.section-stats { display: flex; flex-wrap: wrap; gap: 12px; color: var(--muted); font-size: 13px; } +.section-stats b { color: var(--text); } +.section-desc { margin-top: 8px; font-size: 13px; } +.section-code { margin-top: 8px; color: var(--warn); font-size: 12px; } +.section-slug { margin-top: 8px; font-size: 11px; } +.pagination { display: flex; gap: 8px; 
justify-content: center; margin-top: 16px; } + +dialog { + background: var(--panel); + color: var(--text); + border: 1px solid var(--border); + border-radius: 8px; + padding: 20px; + min-width: 400px; + max-width: 80vw; + max-height: 80vh; +} +dialog::backdrop { background: rgba(0,0,0,0.6); } +pre { background: var(--bg); padding: 12px; border-radius: 6px; overflow: auto; font-size: 12px; max-height: 60vh; } diff --git a/src/parser_bot/web/static/hr/index.html b/src/parser_bot/web/static/hr/index.html new file mode 100644 index 0000000..794c0ca --- /dev/null +++ b/src/parser_bot/web/static/hr/index.html @@ -0,0 +1,99 @@ + + + + + 👥 HR — подразделы + + + + +
+

parser-tg-bot · 👥 HR / Кадры

+ +
+
+
+

Подразделы HR

+
+ +
+

+ Каждый подраздел — это собственный набор каналов, своя статистика и свой + LLM-промпт (с фоллбэком на промпт вертикали). Например: IT, продажи, + маркетинг, рабочие специальности. +

+ +
+
+ + +

Новый подраздел

+
+ +
+ URL-адрес + /hr/(введите название)/ +
+ изменить вручную +
+ + + + +
+ + +
+
+
+ + +

Редактировать подраздел

+
+ + + + + +
+ + +
+
+
+ + + + + + diff --git a/src/parser_bot/web/static/hr/section/channels.html b/src/parser_bot/web/static/hr/section/channels.html new file mode 100644 index 0000000..4b72110 --- /dev/null +++ b/src/parser_bot/web/static/hr/section/channels.html @@ -0,0 +1,48 @@ + + + + + 👥 HR · Каналы — parser-tg-bot + + + + +
+

parser-tg-bot

+ +
+
+

Каналы подраздела

+ +
+
+ + +
+
+ Канал будет привязан к текущему подразделу. +
+
+ +
+ + + + + + + + + + + + + +
IDКаналTelegram IDСообщ.Последний опросСтатус
+
+
+ + + + + diff --git a/src/parser_bot/web/static/hr/section/index.html b/src/parser_bot/web/static/hr/section/index.html new file mode 100644 index 0000000..ff47ca9 --- /dev/null +++ b/src/parser_bot/web/static/hr/section/index.html @@ -0,0 +1,43 @@ + + + + + 👥 HR · Дашборд — parser-tg-bot + + + + +
+

parser-tg-bot

+ +
+
+
+

Дашборд

+
+ +
+ +
+ +

Каналы подраздела

+
+ + + + + + + + + + + +
КаналСообщенийПоследнее сообщениеПоследний опросСтатус
+
+
+ + + + + diff --git a/src/parser_bot/web/static/hr/section/messages.html b/src/parser_bot/web/static/hr/section/messages.html new file mode 100644 index 0000000..03fc60b --- /dev/null +++ b/src/parser_bot/web/static/hr/section/messages.html @@ -0,0 +1,78 @@ + + + + + 👥 HR · Сообщения — parser-tg-bot + + + + +
+

parser-tg-bot

+ +
+
+

Сообщения подраздела

+ +
+ + + + + + + +
+ + +
+ +
+ + +
+ + +

Сообщение

+

+    
+ +
+
+ + + + + + diff --git a/src/parser_bot/web/static/hr/section/settings.html b/src/parser_bot/web/static/hr/section/settings.html new file mode 100644 index 0000000..83d62e1 --- /dev/null +++ b/src/parser_bot/web/static/hr/section/settings.html @@ -0,0 +1,66 @@ + + + + + 👥 HR · Настройки — parser-tg-bot + + + + +
+

parser-tg-bot

+ +
+
+

Настройки подраздела

+ +
+

Текущая конфигурация

+ + + + +
Загрузка...
+
+ Параметры задаются через переменные окружения (.env). + Для изменения отредактируйте .env и перезапустите контейнер: + docker compose restart app. +
+
+ +
+

Действия

+
+ + OpenAPI / Swagger + Health check +
+
+ +
+

🤖 Промпт ИИ

+
+ + +
+ + + +
+ +
+ Каскад: section → vertical → default. Если промпта на + уровне подраздела нет, используется промпт вертикали; если и его нет — + встроенный по умолчанию. Сохранение применится в течение ~5 сек. +
+
+
+ + + + + diff --git a/src/parser_bot/web/static/index.html b/src/parser_bot/web/static/index.html new file mode 100644 index 0000000..dfb6709 --- /dev/null +++ b/src/parser_bot/web/static/index.html @@ -0,0 +1,76 @@ + + + + + parser-tg-bot — выбор раздела + + + + + +
+

parser-tg-bot

+ +
+ +
+

Выберите вертикаль

+

+ У каждой вертикали — свои подразделы (например, «Дубай», «Москва» + внутри Недвижимости, или «IT», «Продажи» внутри HR). Канал привязан + к одному подразделу одной вертикали. +

+ + +
+ + diff --git a/src/parser_bot/web/static/js/access.js b/src/parser_bot/web/static/js/access.js new file mode 100644 index 0000000..6324253 --- /dev/null +++ b/src/parser_bot/web/static/js/access.js @@ -0,0 +1,41 @@ +// Ask the backend whether this client is on the admin allowlist and hide +// admin-only nav links if not. The backend independently enforces the +// allowlist on every admin endpoint, so this is purely cosmetic — it just +// removes dead controls from the UI for non-admin visitors. + +let _adminPromise = null; +export function isAdmin() { + if (!_adminPromise) { + _adminPromise = fetch("/api/monitoring-tg/api/v1/access/me") + .then(r => r.ok ? r.json() : { is_admin: false }) + .then(d => !!d.is_admin) + .catch(() => false); + } + return _adminPromise; +} + +export function adminStatus() { + return fetch("/api/monitoring-tg/api/v1/access/me") + .then(r => r.ok ? r.json() : { is_admin: false, admin_ip_allowed: false }) + .catch(() => ({ is_admin: false, admin_ip_allowed: false })); +} + +adminStatus().then(status => { + const admin = !!status.is_admin; + const canOpenAdmin = !!status.admin_ip_allowed; + if (admin) return; + // Remove any `.admin-link` from the DOM. Works for both server-rendered + // navs (auth.html, chooser pages) and JS-built navs (nav.js fires before + // its own write, but DOMContentLoaded ordering means the elements appear + // after — handle via a MutationObserver for late insertions). 
+ const hide = () => { + document.querySelectorAll(".admin-link").forEach(el => el.remove()); + document.querySelectorAll(".admin-only").forEach(el => el.remove()); + if (!canOpenAdmin) { + document.querySelectorAll(".admin-login-link").forEach(el => el.remove()); + } + }; + hide(); + const mo = new MutationObserver(hide); + mo.observe(document.body, { childList: true, subtree: true }); +}); diff --git a/src/parser_bot/web/static/js/admin.js b/src/parser_bot/web/static/js/admin.js new file mode 100644 index 0000000..0d5b882 --- /dev/null +++ b/src/parser_bot/web/static/js/admin.js @@ -0,0 +1,49 @@ +import { api, toast } from "/api/monitoring-tg/static/js/api.js"; +import "/api/monitoring-tg/static/js/access.js"; + +const form = document.getElementById("admin-form"); +const password = document.getElementById("admin-password"); +const statusEl = document.getElementById("admin-status"); +const logoutBtn = document.getElementById("admin-logout"); + +function returnUrl() { + const params = new URLSearchParams(location.search); + return params.get("return") || "/"; +} + +async function refresh() { + const status = await api.accessMe(); + if (status.is_admin) { + statusEl.textContent = "Админ-доступ активен."; + form.hidden = true; + logoutBtn.hidden = false; + } else if (!status.admin_password_enabled) { + statusEl.textContent = "Админ пароль не задан. 
Доступ управляется IP-allowlist."; + form.hidden = true; + logoutBtn.hidden = true; + } else { + statusEl.textContent = "Введите админ пароль, чтобы открыть админские функции."; + form.hidden = false; + logoutBtn.hidden = true; + setTimeout(() => password.focus(), 30); + } +} + +form.addEventListener("submit", async (e) => { + e.preventDefault(); + try { + await api.adminLogin(password.value); + password.value = ""; + toast("Админ-доступ открыт", "success"); + location.href = returnUrl(); + } catch (err) { + toast(err.message, "error"); + } +}); + +logoutBtn.addEventListener("click", async () => { + await api.adminLogout(); + location.reload(); +}); + +refresh().catch(err => toast(err.message, "error")); diff --git a/src/parser_bot/web/static/js/api.js b/src/parser_bot/web/static/js/api.js new file mode 100644 index 0000000..0c3561c --- /dev/null +++ b/src/parser_bot/web/static/js/api.js @@ -0,0 +1,192 @@ +import { getVertical, getSection } from "/api/monitoring-tg/static/js/vertical.js"; + +const BASE = "/api/monitoring-tg/api/v1"; +let sectionLoginPromise = null; + +async function unlockCurrentSection() { + if (sectionLoginPromise) return sectionLoginPromise; + sectionLoginPromise = (async () => { + const vertical = getVertical(); + const section = getSection(); + if (!section) return false; + const code = prompt(`Введите код подраздела "${section}"`); + if (!code) return false; + await request("/access/section-login", { + method: "POST", + body: JSON.stringify({ vertical, section, code }), + sectionRetry: false, + }); + return true; + })(); + try { + return await sectionLoginPromise; + } finally { + sectionLoginPromise = null; + } +} + +async function request(path, options = {}) { + const { sectionRetry = true, ...fetchOptions } = options; + const res = await fetch(BASE + path, { + headers: { "Content-Type": "application/json" }, + ...fetchOptions, + }); + if (!res.ok) { + let detail = res.statusText; + try { detail = (await res.json()).detail || detail; } catch 
{} + if (res.status === 401 && detail === "section code required" && sectionRetry) { + if (await unlockCurrentSection()) { + return request(path, { ...options, sectionRetry: false }); + } + } + throw new Error(`${res.status}: ${detail}`); + } + if (res.status === 204) return null; + return res.json(); +} + +// Build a query string scoped to the current (vertical, section). The +// section is intentionally optional — pages at // (chooser) +// pass null so they see all sections, while pages inside a section +// always carry their section slug. +function qs(extra = {}, { vertical, section } = {}) { + const params = new URLSearchParams(); + params.set("vertical", vertical ?? getVertical()); + const s = section === undefined ? getSection() : section; + if (s) params.set("section", s); + for (const [k, v] of Object.entries(extra)) { + if (v == null || v === false) continue; + params.set(k, String(v)); + } + return params.toString(); +} + +export const api = { + accessMe: () => request("/access/me"), + adminLogin: (password) => + request("/access/admin-login", { + method: "POST", + body: JSON.stringify({ password }), + sectionRetry: false, + }), + adminLogout: () => + request("/access/admin-logout", { method: "POST", sectionRetry: false }), + sectionLogin: ({ vertical, section, code }) => + request("/access/section-login", { + method: "POST", + body: JSON.stringify({ vertical, section, code }), + sectionRetry: false, + }), + + // Auth — section-agnostic. + authStatus: () => request("/auth/status"), + authSendCode: () => request("/auth/send-code", { method: "POST" }), + authSubmitCode: (code) => + request("/auth/submit-code", { method: "POST", body: JSON.stringify({ code }) }), + authSubmitPassword: (password) => + request("/auth/submit-password", { method: "POST", body: JSON.stringify({ password }) }), + authLogout: () => request("/auth/logout", { method: "POST" }), + + // Sections (sub-sections within a vertical). 
+ listSections: (vertical) => request(`/sections?${qs({}, { vertical, section: null })}`), + createSection: ({ vertical, slug, title, emoji, description, accessCode }) => + request("/sections", { + method: "POST", + body: JSON.stringify({ + vertical: vertical ?? getVertical(), + slug, title, emoji, description, access_code: accessCode, + }), + }), + updateSection: (vertical, slug, patch) => + request(`/sections/${encodeURIComponent(vertical)}/${encodeURIComponent(slug)}`, { + method: "PATCH", + body: JSON.stringify(patch), + }), + deleteSection: (vertical, slug) => + request(`/sections/${encodeURIComponent(vertical)}/${encodeURIComponent(slug)}`, { + method: "DELETE", + }), + + // Scoped reads: implicit (vertical, section) from URL. + globalStats: (scope) => request(`/stats?${qs({}, scope)}`), + + listChannels: (scope) => request(`/channels?${qs({}, scope)}`), + getChannel: (id, scope) => request(`/channels/${id}?${qs({}, scope)}`), + channelStats: (id, scope) => request(`/channels/${id}/stats?${qs({}, scope)}`), + addChannel: (identifier, scope = {}) => { + const vertical = scope.vertical ?? getVertical(); + const section = scope.section === undefined ? 
getSection() : scope.section; + if (!section) { + throw new Error("addChannel requires a section context"); + } + return request("/channels", { + method: "POST", + body: JSON.stringify({ identifier, vertical, section }), + }); + }, + updateChannel: (id, patch, scope) => + request(`/channels/${id}?${qs({}, scope)}`, { + method: "PATCH", body: JSON.stringify(patch), + }), + deleteChannel: (id, scope) => + request(`/channels/${id}?${qs({}, scope)}`, { method: "DELETE" }), + pollChannel: (id, scope) => + request(`/channels/${id}/poll?${qs({}, scope)}`, { method: "POST" }), + backfillMedia: (id, batch = 50, scope) => + request(`/channels/${id}/backfill-media?${qs({ batch }, scope)}`, { method: "POST" }), + reanalyze: (id, batch = 500, scope) => + request(`/channels/${id}/reanalyze?${qs({ batch }, scope)}`, { method: "POST" }), + + pollAll: (scope) => request(`/poll?${qs({}, scope)}`, { method: "POST" }), + + listMessages: ({ channelId, q, realEstate, hrKind, hasPhone, leadsOnly, + minConfidence, limit = 50, offset = 0, + vertical, section } = {}) => { + const extra = { limit, offset }; + if (channelId) extra.channel_id = channelId; + if (q) extra.q = q; + if (realEstate) extra.real_estate = realEstate; + if (hrKind) extra.hr_kind = hrKind; + if (hasPhone) extra.has_phone = "true"; + if (leadsOnly) { + extra.leads_only = "true"; + if (minConfidence != null) extra.min_confidence = minConfidence; + } + return request(`/messages?${qs(extra, { vertical, section })}`); + }, + getMessage: (id, scope) => request(`/messages/${id}?${qs({}, scope)}`), + + llmStatus: () => request("/llm/status"), + llmQueue: (scope) => request(`/llm/queue?${qs({}, scope)}`), + llmPromptGet: (scope) => request(`/llm/prompt?${qs({}, scope)}`), + llmPromptSave: (prompt, scope) => + request(`/llm/prompt?${qs({}, scope)}`, { + method: "PUT", body: JSON.stringify({ prompt }), + }), + llmPromptReset: (scope) => + request(`/llm/prompt?${qs({}, scope)}`, { method: "DELETE" }), +}; + +export function 
toast(message, type = "info") { + const el = document.createElement("div"); + el.className = `toast ${type}`; + el.textContent = message; + document.body.appendChild(el); + setTimeout(() => el.remove(), 3500); +} + +export function fmtDate(iso) { + if (!iso) return "—"; + const d = new Date(iso); + return d.toLocaleString(); +} + +export function fmtRelative(iso) { + if (!iso) return "—"; + const d = new Date(iso); + const diff = (Date.now() - d.getTime()) / 1000; + if (diff < 60) return `${Math.floor(diff)}s ago`; + if (diff < 3600) return `${Math.floor(diff / 60)}m ago`; + if (diff < 86400) return `${Math.floor(diff / 3600)}h ago`; + return `${Math.floor(diff / 86400)}d ago`; +} diff --git a/src/parser_bot/web/static/js/auth.js b/src/parser_bot/web/static/js/auth.js new file mode 100644 index 0000000..835d7bc --- /dev/null +++ b/src/parser_bot/web/static/js/auth.js @@ -0,0 +1,120 @@ +import { api, toast } from "/api/monitoring-tg/static/js/api.js"; + +const returnTo = (() => { + const raw = new URLSearchParams(location.search).get("return"); + // Only allow same-origin relative paths to avoid open-redirect via ?return= + if (raw && raw.startsWith("/") && !raw.startsWith("//")) return raw; + return null; +})(); +const returnLink = document.getElementById("return-link"); +if (returnLink && returnTo) { + returnLink.href = returnTo; + returnLink.querySelector("button").textContent = "← Вернуться"; +} + +const steps = ["idle", "code", "password", "done"]; +function show(step) { + steps.forEach(s => { + document.getElementById(`step-${s}`).hidden = s !== step; + }); +} + +function setStatus(html) { + document.getElementById("status-block").innerHTML = html; +} + +async function refresh() { + const status = await api.authStatus(); + document.getElementById("phone").textContent = status.phone || "—"; + document.getElementById("phone-2").textContent = status.phone || "—"; + + if (status.authorized) { + setStatus(`
Авторизовано
`); + document.getElementById("username").textContent = status.username || "(unnamed)"; + show("done"); + } else { + setStatus(`
Не авторизовано
`); + show("idle"); + } +} + +document.getElementById("btn-send").addEventListener("click", async (e) => { + e.target.disabled = true; + try { + await api.authSendCode(); + toast("Код отправлен в Telegram", "success"); + show("code"); + document.getElementById("code").focus(); + } catch (err) { + toast(err.message, "error"); + } finally { + e.target.disabled = false; + } +}); + +document.getElementById("btn-resend").addEventListener("click", async (e) => { + e.target.disabled = true; + try { + await api.authSendCode(); + toast("Новый код отправлен", "success"); + } catch (err) { + toast(err.message, "error"); + } finally { + e.target.disabled = false; + } +}); + +document.getElementById("form-code").addEventListener("submit", async (e) => { + e.preventDefault(); + const code = document.getElementById("code").value.trim(); + const btn = e.target.querySelector("button"); + btn.disabled = true; + try { + const res = await api.authSubmitCode(code); + if (res.needs_password) { + toast("Введи 2FA-пароль", "success"); + show("password"); + document.getElementById("password").focus(); + } else { + toast("Готово", "success"); + await refresh(); + } + } catch (err) { + toast(err.message, "error"); + } finally { + btn.disabled = false; + } +}); + +document.getElementById("form-password").addEventListener("submit", async (e) => { + e.preventDefault(); + const password = document.getElementById("password").value; + const btn = e.target.querySelector("button"); + btn.disabled = true; + try { + await api.authSubmitPassword(password); + toast("Авторизовано", "success"); + document.getElementById("password").value = ""; + await refresh(); + } catch (err) { + toast(err.message, "error"); + } finally { + btn.disabled = false; + } +}); + +document.getElementById("btn-logout").addEventListener("click", async (e) => { + if (!confirm("Выйти из Telegram-сессии?")) return; + e.target.disabled = true; + try { + await api.authLogout(); + toast("Сессия завершена", "success"); + await 
refresh(); + } catch (err) { + toast(err.message, "error"); + } finally { + e.target.disabled = false; + } +}); + +refresh().catch(err => toast(err.message, "error")); diff --git a/src/parser_bot/web/static/js/channels.js b/src/parser_bot/web/static/js/channels.js new file mode 100644 index 0000000..7032e98 --- /dev/null +++ b/src/parser_bot/web/static/js/channels.js @@ -0,0 +1,132 @@ +import { api, toast, fmtRelative } from "/api/monitoring-tg/static/js/api.js"; +import { isAdmin } from "/api/monitoring-tg/static/js/access.js"; +import { getVertical, getSection, sectionBase, VERTICAL_META } from "/api/monitoring-tg/static/js/vertical.js"; + +const V = getVertical(); +const section = getSection(); +const sBase = sectionBase(); +const meta = VERTICAL_META[V]; + +function escape(s) { + if (s == null) return ""; + return String(s).replace(/[&<>"']/g, c => ({"&":"&","<":"<",">":">",'"':""","'":"'"}[c])); +} + +async function load() { + const admin = await isAdmin(); + const channels = await api.listChannels(); + const tbody = document.getElementById("tbody"); + if (!channels.length) { + tbody.innerHTML = `Каналов пока нет`; + return; + } + const stats = await Promise.all(channels.map(c => api.channelStats(c.id).catch(() => null))); + tbody.innerHTML = channels.map((c, i) => { + const s = stats[i] || {}; + return ` + + ${c.id} + +
${escape(c.title || "—")}
+
${escape(c.identifier)}
+ + ${c.tg_id ?? "—"} + ${(s.message_count ?? 0).toLocaleString()} + ${fmtRelative(c.last_polled_at)} + + + + +
+ сообщения + ${admin ? ` + + + + + ` : ""} +
+ + `; + }).join(""); +} + +document.getElementById("add-form").addEventListener("submit", async (e) => { + e.preventDefault(); + const input = document.getElementById("identifier"); + const id = input.value.trim(); + if (!id) return; + const btn = e.target.querySelector("button"); + btn.disabled = true; + try { + await api.addChannel(id); + const where = section ? `${meta.short} / ${section}` : meta.short; + toast(`Канал добавлен в "${where}"`, "success"); + input.value = ""; + await load(); + } catch (err) { + toast(err.message, "error"); + } finally { + btn.disabled = false; + } +}); + +document.getElementById("tbody").addEventListener("click", async (e) => { + const btn = e.target.closest("[data-action]"); + if (!btn) return; + const tr = btn.closest("tr"); + const id = Number(tr.dataset.id); + const action = btn.dataset.action; + try { + if (action === "delete") { + if (!confirm("Удалить канал и все его сообщения?")) return; + await api.deleteChannel(id); + toast("Удалено", "success"); + await load(); + } else if (action === "poll") { + btn.disabled = true; + const res = await api.pollChannel(id); + toast(`Добавлено ${res.inserted} сообщений`, "success"); + await load(); + } else if (action === "backfill-media") { + btn.disabled = true; + let totalUpdated = 0; + let pending = Infinity; + while (pending > 0) { + btn.textContent = `Качаю... (готово: ${totalUpdated})`; + const res = await api.backfillMedia(id, 50); + totalUpdated += res.updated; + pending = res.pending; + if (res.updated === 0) break; + } + btn.textContent = "Подкачать медиа"; + toast(`Подкачано ${totalUpdated}, осталось ${pending}`, "success"); + } else if (action === "reanalyze") { + btn.disabled = true; + let total = 0; + let pending = Infinity; + while (pending > 0) { + btn.textContent = `Анализирую... 
(${total})`; + const res = await api.reanalyze(id, 500); + total += res.updated; + pending = res.pending; + if (res.updated === 0) break; + } + btn.textContent = "Переанализировать"; + toast(`Проанализировано ${total} сообщений, осталось ${pending}`, "success"); + } else if (action === "toggle") { + const isActive = btn.checked; + await api.updateChannel(id, { is_active: isActive }); + toast(isActive ? "Канал включён" : "Канал выключен", "success"); + await load(); + } + } catch (err) { + toast(err.message, "error"); + await load(); + } +}); + +load().catch(err => toast(err.message, "error")); diff --git a/src/parser_bot/web/static/js/dashboard.js b/src/parser_bot/web/static/js/dashboard.js new file mode 100644 index 0000000..c5e1523 --- /dev/null +++ b/src/parser_bot/web/static/js/dashboard.js @@ -0,0 +1,87 @@ +import { api, toast, fmtRelative } from "/api/monitoring-tg/static/js/api.js"; +import { isAdmin } from "/api/monitoring-tg/static/js/access.js"; +import { getVertical, getSection, sectionBase, VERTICAL_META } from "/api/monitoring-tg/static/js/vertical.js"; + +const V = getVertical(); +const section = getSection(); +const sBase = sectionBase(); +const meta = VERTICAL_META[V]; + +function escape(s) { + if (s == null) return ""; + return String(s).replace(/[&<>"']/g, c => ({"&":"&","<":"<",">":">",'"':""","'":"'"}[c])); +} + +async function loadStats() { + const [stats, llm, queue] = await Promise.all([ + api.globalStats(), + api.llmStatus().catch(() => ({ enabled: false, ready: false, model: "—" })), + api.llmQueue().catch(() => ({ pending: null })), + ]); + const grid = document.getElementById("stats"); + const llmBadge = llm.enabled + ? (llm.ready ? `ready` : `загружается`) + : `off`; + const queueValue = queue.pending == null ? "—" : queue.pending.toLocaleString(); + grid.innerHTML = ` +
Каналы
${stats.channels_active} / ${stats.channels_total}
+
Сообщений всего
${stats.messages_total.toLocaleString()}
+
Сообщений за 24ч
${stats.messages_last_24h.toLocaleString()}
+
🎯 Лидов всего
${(stats.leads_total ?? 0).toLocaleString()}
+ +
⏳ В очереди ИИ
${queueValue}
+
Период опроса
${stats.poll_interval_seconds}s
+
Последний опрос
${fmtRelative(stats.last_poll_at)}
+
Локальный ИИ
${llmBadge}
${escape(llm.model || "")}
+ `; +} + +async function loadChannels() { + const channels = await api.listChannels(); + const tbody = document.getElementById("channels-tbody"); + if (!channels.length) { + tbody.innerHTML = `Каналов в этом подразделе пока нет — добавьте их на странице Каналы`; + return; + } + const stats = await Promise.all(channels.map(c => api.channelStats(c.id).catch(() => null))); + tbody.innerHTML = channels.map((c, i) => { + const s = stats[i] || {}; + return ` + + + +
${escape(c.identifier)}
+ + ${(s.message_count ?? 0).toLocaleString()} + ${fmtRelative(s.last_message_at)} + ${fmtRelative(c.last_polled_at)} + ${c.is_active ? 'on' : 'off'} + `; + }).join(""); +} + +document.getElementById("poll-all").addEventListener("click", async (e) => { + e.target.disabled = true; + try { + const res = await api.pollAll(); + const scope = section ? `${meta.short} / ${section}` : meta.short; + toast(`В очереди ${res.queued ?? 0} каналов (${scope}) — опрос идёт в фоне`, "success"); + await loadAll(); + } catch (err) { + toast(err.message, "error"); + } finally { + e.target.disabled = false; + } +}); + +async function loadAll() { + try { + document.getElementById("poll-all").hidden = !(await isAdmin()); + await Promise.all([loadStats(), loadChannels()]); + } catch (err) { + toast(err.message, "error"); + } +} + +loadAll(); +setInterval(loadAll, 15000); diff --git a/src/parser_bot/web/static/js/messages.js b/src/parser_bot/web/static/js/messages.js new file mode 100644 index 0000000..2ed7642 --- /dev/null +++ b/src/parser_bot/web/static/js/messages.js @@ -0,0 +1,433 @@ +import { api, toast, fmtDate } from "/api/monitoring-tg/static/js/api.js"; +import { getVertical, getSection, VERTICAL_META } from "/api/monitoring-tg/static/js/vertical.js"; + +const V = getVertical(); +const section = getSection(); +const meta = VERTICAL_META[V]; + +const state = { + offset: 0, + limit: 50, + channelId: null, + q: "", + realEstate: "", + hrKind: "", + hasPhone: false, + leadsOnly: false, + minConfidence: 0.5, + channels: [], + autorefresh: false, + timer: null, +}; + +function escape(s) { + if (s == null) return ""; + return String(s).replace(/[&<>"']/g, c => ({"&":"&","<":"<",">":">",'"':""","'":"'"}[c])); +} + +function highlight(text, q) { + if (!q || !text) return escape(text); + const escaped = escape(text); + const re = new RegExp(escape(q).replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "gi"); + return escaped.replace(re, m => `${m}`); +} + +function channelTitle(id) { + const c = 
state.channels.find(c => c.id === id); + return c ? (c.title || c.identifier) : `#${id}`; +} + +function fmtSize(bytes) { + if (bytes == null) return ""; + if (bytes < 1024) return `${bytes}B`; + if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)}KB`; + return `${(bytes / (1024 * 1024)).toFixed(1)}MB`; +} + +const REAL_ESTATE_LABELS = { sale: "продажа", rent: "аренда", purchase: "покупка" }; +const HR_KIND_LABELS = { vacancy: "вакансия", resume: "резюме", contact: "контакт" }; + +function senderContacts(m) { + const contacts = []; + if (m && m.post_url) { + contacts.push(`📬 Открыть в Telegram`); + } + if (m && m.sender_username) { + const u = m.sender_username.startsWith("@") ? m.sender_username : "@" + m.sender_username; + contacts.push(`✉️ ${escape(u)}`); + } else if (m && m.sender_name) { + contacts.push(`✍️ ${escape(m.sender_name)}`); + } + const handles = (m && m.extracted && m.extracted.tg_handles) || []; + for (const h of handles) { + const bare = h.replace(/^@/, ""); + contacts.push(`✉️ ${escape(h)}`); + } + return contacts; +} + +function renderReLead(lead, m) { + if (!lead || !lead.is_listing) return ""; + const tone = + lead.confidence >= 0.7 ? "lead-strong" : + lead.confidence >= 0.4 ? "lead-medium" : "lead-weak"; + const bits = []; + if (lead.kind) bits.push(REAL_ESTATE_LABELS[lead.kind] || lead.kind); + if (lead.property_type) bits.push(lead.property_type); + if (lead.rooms) bits.push(lead.rooms); + if (lead.area_m2) bits.push(`${lead.area_m2} м²`); + const priceBit = lead.price_text + || (lead.price_value != null + ? `${lead.price_value.toLocaleString()}${lead.currency ? " " + lead.currency : ""}` + : null); + if (priceBit) bits.push(priceBit); + else if (lead.currency) bits.push(lead.currency); + if (lead.location) bits.push(lead.location); + const facts = bits.length + ? `
${escape(bits.join(" · "))}
` : ""; + const summary = lead.summary + ? `
${escape(lead.summary)}
` : ""; + const contacts = []; + if (lead.contact_phone) { + contacts.push(`📞 ${escape(lead.contact_phone)}`); + } + if (lead.contact_name) { + contacts.push(`👤 ${escape(lead.contact_name)}`); + } + contacts.push(...senderContacts(m)); + return ` +
+
+ 🎯 ЛИД · 🏠 + ${facts} + ${(lead.confidence * 100).toFixed(0)}% +
+ ${summary} + ${contacts.length ? `
${contacts.join(" ")}
` : ""} +
`; +} + +function renderHrLead(lead, m) { + if (!lead || !lead.is_lead) return ""; + const tone = + lead.confidence >= 0.7 ? "lead-strong" : + lead.confidence >= 0.4 ? "lead-medium" : "lead-weak"; + const bits = []; + if (lead.kind) bits.push(HR_KIND_LABELS[lead.kind] || lead.kind); + if (lead.title) bits.push(lead.title); + if (lead.company) bits.push(lead.company); + if (lead.candidate_name) bits.push(lead.candidate_name); + if (lead.experience_years != null) bits.push(`${lead.experience_years}+ лет опыта`); + if (lead.employment_type) bits.push(lead.employment_type); + if (lead.remote === true) bits.push("удалёнка"); + else if (lead.remote === false) bits.push("офис"); + if (lead.location) bits.push(lead.location); + const salaryBit = lead.salary_text + || (lead.salary_value != null + ? `${lead.salary_value.toLocaleString()}${lead.currency ? " " + lead.currency : ""}` + : null); + if (salaryBit) bits.push(salaryBit); + else if (lead.currency) bits.push(lead.currency); + const facts = bits.length + ? `
${escape(bits.join(" · "))}
` : ""; + const summary = lead.summary + ? `
${escape(lead.summary)}
` : ""; + const skills = (lead.skills || []).slice(0, 12); + const skillsBlock = skills.length + ? `
${skills.map(s => `${escape(s)}`).join(" ")}
` + : ""; + const contacts = []; + if (lead.contact_phone) { + contacts.push(`📞 ${escape(lead.contact_phone)}`); + } + if (lead.contact_name) { + contacts.push(`👤 ${escape(lead.contact_name)}`); + } + contacts.push(...senderContacts(m)); + return ` +
+
+ 🎯 ЛИД · 👥 + ${facts} + ${(lead.confidence * 100).toFixed(0)}% +
+ ${summary} + ${skillsBlock} + ${contacts.length ? `
${contacts.join(" ")}
` : ""} +
`; +} + +function renderExtracted(ex) { + if (!ex) return ""; + const parts = []; + const re = ex.real_estate; + const showRegexRE = + V === "real_estate" && re && !(ex.lead && ex.lead.is_listing); + if (showRegexRE) { + const bits = []; + if (re.kind) bits.push(REAL_ESTATE_LABELS[re.kind] || re.kind); + if (re.property_type) bits.push(re.property_type); + if (re.rooms) bits.push(re.rooms); + if (re.area_m2) bits.push(`${re.area_m2} м²`); + if (re.price) bits.push(re.price); + if (bits.length) parts.push(`🏠 regex: ${escape(bits.join(" · "))}`); + } + // Phones/names from regex are still useful even when there's a lead — show + // only those that aren't already inside the lead card. + const inLead = new Set(); + const activeLead = V === "hr" ? ex.hr_lead : ex.lead; + if (activeLead) { + if (activeLead.contact_phone) inLead.add(activeLead.contact_phone); + if (activeLead.contact_name) inLead.add(activeLead.contact_name); + } + for (const p of ex.phones || []) { + if (inLead.has(p)) continue; + parts.push(`📞 ${escape(p)}`); + } + for (const n of (ex.names || []).slice(0, 3)) { + if (inLead.has(n)) continue; + parts.push(`👤 ${escape(n)}`); + } + if ((ex.names || []).length > 3) { + parts.push(`+${ex.names.length - 3}`); + } + const leadShown = (V === "hr" && ex.hr_lead && ex.hr_lead.is_lead) || + (V === "real_estate" && ex.lead && ex.lead.is_listing); + if (!leadShown) { + for (const h of (ex.tg_handles || [])) { + const bare = h.replace(/^@/, ""); + parts.push(`✉️ ${escape(h)}`); + } + } + const tags = parts.length ? `
${parts.join(" ")}
` : ""; + return tags; +} + +function renderMedia(files) { + if (!files || !files.length) return ""; + return `
${files.map(f => { + if (f.skipped) { + const why = f.skipped === "too_large" ? "слишком большой" : f.skipped; + return `
${escape(f.kind)} + ${why}${f.size ? `, ${fmtSize(f.size)}` : ""}
`; + } + if (!f.url) return ""; + if (f.kind === "photo" || f.kind === "sticker") { + return ` + + `; + } + if (f.kind === "video") { + return ``; + } + if (f.kind === "audio") { + return ``; + } + return ` + ${escape(f.kind)} + ${escape(f.mime || "файл")} + ${fmtSize(f.size)} + `; + }).join("")}
`; +} + +function readUrl() { + const params = new URLSearchParams(location.search); + if (params.has("channel_id")) state.channelId = Number(params.get("channel_id")); + if (params.has("q")) state.q = params.get("q"); + if (params.has("real_estate")) state.realEstate = params.get("real_estate"); + if (params.has("hr_kind")) state.hrKind = params.get("hr_kind"); + if (params.get("has_phone") === "true") state.hasPhone = true; + if (params.get("leads_only") === "true") state.leadsOnly = true; + if (params.has("min_confidence")) state.minConfidence = Number(params.get("min_confidence")); +} + +function syncControls() { + document.getElementById("channel-filter").value = state.channelId ?? ""; + document.getElementById("search").value = state.q; + const reSel = document.getElementById("real-estate"); + if (reSel) reSel.value = state.realEstate; + const hrSel = document.getElementById("hr-kind"); + if (hrSel) hrSel.value = state.hrKind; + document.getElementById("has-phone").checked = state.hasPhone; + document.getElementById("leads-only").checked = state.leadsOnly; + document.getElementById("min-confidence").value = String(state.minConfidence); + document.getElementById("limit").value = state.limit; +} + +async function loadChannels() { + state.channels = await api.listChannels(); + const sel = document.getElementById("channel-filter"); + sel.innerHTML = `` + state.channels.map(c => + `` + ).join(""); + syncControls(); +} + +async function loadMessages() { + const list = document.getElementById("list"); + list.innerHTML = `
Загрузка...
`; + try { + const msgs = await api.listMessages({ + channelId: state.channelId, + q: state.q || undefined, + realEstate: state.realEstate || undefined, + hrKind: state.hrKind || undefined, + hasPhone: state.hasPhone || undefined, + leadsOnly: state.leadsOnly || undefined, + minConfidence: state.leadsOnly ? state.minConfidence : undefined, + limit: state.limit, + offset: state.offset, + }); + if (!msgs.length) { + list.innerHTML = `
Сообщений нет
`; + } else { + list.innerHTML = msgs.map(m => ` +
+
+ ${escape(channelTitle(m.channel_id))} + · + ${fmtDate(m.date)} + · + #${m.tg_message_id} + ${m.group_size > 1 ? `альбом · ${m.group_size}` : (m.has_media ? 'media' : '')} + ${m.views != null ? `👁 ${m.views}` : ''} + ${m.forwards ? `↗ ${m.forwards}` : ''} +
+ json +
+
${m.text ? highlight(m.text, state.q) : '(без текста)'}
+ ${V === "hr" + ? renderHrLead(m.extracted && m.extracted.hr_lead, m) + : renderReLead(m.extracted && m.extracted.lead, m)} + ${renderExtracted(m.extracted)} + ${renderMedia(m.media_files)} +
+ `).join(""); + } + document.getElementById("page-info").textContent = + `${state.offset + 1}–${state.offset + msgs.length}`; + document.getElementById("prev").disabled = state.offset === 0; + document.getElementById("next").disabled = msgs.length < state.limit; + } catch (err) { + toast(err.message, "error"); + list.innerHTML = `
Ошибка: ${escape(err.message)}
`; + } +} + +document.getElementById("channel-filter").addEventListener("change", (e) => { + state.channelId = e.target.value ? Number(e.target.value) : null; + state.offset = 0; + loadMessages(); +}); + +let searchTimer; +document.getElementById("search").addEventListener("input", (e) => { + clearTimeout(searchTimer); + searchTimer = setTimeout(() => { + state.q = e.target.value.trim(); + state.offset = 0; + loadMessages(); + }, 250); +}); + +document.getElementById("limit").addEventListener("change", (e) => { + state.limit = Number(e.target.value); + state.offset = 0; + loadMessages(); +}); + +const reSelEl = document.getElementById("real-estate"); +if (reSelEl) { + reSelEl.addEventListener("change", (e) => { + state.realEstate = e.target.value; + state.offset = 0; + loadMessages(); + }); +} + +const hrSelEl = document.getElementById("hr-kind"); +if (hrSelEl) { + hrSelEl.addEventListener("change", (e) => { + state.hrKind = e.target.value; + state.offset = 0; + loadMessages(); + }); +} + +document.getElementById("has-phone").addEventListener("change", (e) => { + state.hasPhone = e.target.checked; + state.offset = 0; + loadMessages(); +}); + +document.getElementById("leads-only").addEventListener("change", (e) => { + state.leadsOnly = e.target.checked; + state.offset = 0; + loadMessages(); +}); + +document.getElementById("min-confidence").addEventListener("change", (e) => { + state.minConfidence = Number(e.target.value); + if (state.leadsOnly) { + state.offset = 0; + loadMessages(); + } +}); + +document.getElementById("refresh").addEventListener("click", loadMessages); + +document.getElementById("prev").addEventListener("click", () => { + state.offset = Math.max(0, state.offset - state.limit); + loadMessages(); +}); +document.getElementById("next").addEventListener("click", () => { + state.offset += state.limit; + loadMessages(); +}); + +document.getElementById("autorefresh").addEventListener("change", (e) => { + state.autorefresh = e.target.checked; + if 
(state.timer) { clearInterval(state.timer); state.timer = null; } + if (state.autorefresh) state.timer = setInterval(loadMessages, 10000); +}); + +document.getElementById("list").addEventListener("click", async (e) => { + const lightbox = e.target.closest("[data-action='lightbox']"); + if (lightbox) { + e.preventDefault(); + openLightbox(lightbox.dataset.url); + return; + } + const a = e.target.closest("[data-action='raw']"); + if (!a) return; + e.preventDefault(); + const id = Number(a.closest(".message").dataset.id); + try { + const msg = await api.getMessage(id); + document.getElementById("raw-content").textContent = JSON.stringify(msg, null, 2); + document.getElementById("raw-dialog").showModal(); + } catch (err) { + toast(err.message, "error"); + } +}); + +function openLightbox(url) { + let lb = document.getElementById("lightbox"); + if (!lb) { + lb = document.createElement("div"); + lb.id = "lightbox"; + lb.addEventListener("click", () => lb.remove()); + document.body.appendChild(lb); + } + lb.innerHTML = ``; +} +document.getElementById("raw-close").addEventListener("click", () => { + document.getElementById("raw-dialog").close(); +}); + +readUrl(); +(async () => { + await loadChannels(); + await loadMessages(); +})(); diff --git a/src/parser_bot/web/static/js/nav-status.js b/src/parser_bot/web/static/js/nav-status.js new file mode 100644 index 0000000..01538ea --- /dev/null +++ b/src/parser_bot/web/static/js/nav-status.js @@ -0,0 +1,25 @@ +import { api } from "/api/monitoring-tg/static/js/api.js"; +import { isAdmin } from "/api/monitoring-tg/static/js/access.js"; +import { appBase } from "/api/monitoring-tg/static/js/vertical.js"; + +// "Telegram not authorized" banner. Only useful for admins — non-admin +// visitors can't open /auth.html anyway, so showing the banner would be +// noise (and the /auth/status call itself 404s for non-admins). 
+(async () => { + if (!(await isAdmin())) return; + try { + const status = await api.authStatus(); + if (status.authorized) return; + const banner = document.createElement("div"); + banner.className = "card"; + banner.style.cssText = + "border-color: rgba(241, 196, 15, 0.5); background: rgba(241, 196, 15, 0.08); margin-bottom: 16px;"; + banner.innerHTML = ` + Telegram не авторизован. + Парсер не сможет ходить за сообщениями, пока вы не залогинитесь. + Открыть страницу авторизации → + `; + const main = document.querySelector("main"); + if (main) main.insertBefore(banner, main.firstChild); + } catch {} +})(); diff --git a/src/parser_bot/web/static/js/nav.js b/src/parser_bot/web/static/js/nav.js new file mode 100644 index 0000000..af51806 --- /dev/null +++ b/src/parser_bot/web/static/js/nav.js @@ -0,0 +1,71 @@ +import { api } from "/api/monitoring-tg/static/js/api.js"; +// Import for side-effect: access.js hides .admin-link elements for non-admins. +import "/api/monitoring-tg/static/js/access.js"; +import { + VERTICAL_META, + appBase, + getVertical, + getSection, + verticalBase, + sectionBase, +} from "/api/monitoring-tg/static/js/vertical.js"; + +const V = getVertical(); +const section = getSection(); +const meta = VERTICAL_META[V]; + +const titleEl = document.getElementById("page-title"); +if (titleEl) { + titleEl.textContent = section + ? `parser-tg-bot · ${meta.emoji} ${meta.short} · ${section}` + : `parser-tg-bot · ${meta.emoji} ${meta.short}`; +} + +const navEl = document.getElementById("nav-section"); +if (navEl) { + const here = location.pathname; + const active = (href) => here === href ? "active" : ""; + const links = []; + + // Up-link: chooser if we are inside a section, vertical-list otherwise. 
+ if (section) { + links.push(`← ${meta.short} (подразделы)`); + } else { + links.push(`← Разделы`); + } + + if (section) { + const sBase = sectionBase(); + links.push( + `Дашборд`, + `Каналы`, + `Сообщения`, + `Настройки`, + ); + } + + links.push( + ``, + `Авторизация`, + `API`, + ); + navEl.innerHTML = links.join(""); +} + +// Best-effort: resolve section's display title from the API and update the +// page heading. Falls back to the raw slug if the network call fails. +const headingEl = document.getElementById("page-heading"); +if (headingEl && section) { + api.listSections(V) + .then(sections => { + const s = sections.find(x => x.slug === section); + if (s) { + const baseText = headingEl.dataset.base || headingEl.textContent; + headingEl.dataset.base = baseText; + headingEl.textContent = `${baseText} · ${s.emoji ? s.emoji + " " : ""}${s.title}`; + } + }) + .catch(() => {}); +} + +export { section, V, meta }; diff --git a/src/parser_bot/web/static/js/sections-list.js b/src/parser_bot/web/static/js/sections-list.js new file mode 100644 index 0000000..a920073 --- /dev/null +++ b/src/parser_bot/web/static/js/sections-list.js @@ -0,0 +1,202 @@ +import { api, toast } from "/api/monitoring-tg/static/js/api.js"; +import { isAdmin } from "/api/monitoring-tg/static/js/access.js"; +import { getVertical, verticalBase, VERTICAL_META } from "/api/monitoring-tg/static/js/vertical.js"; +import { slugify } from "/api/monitoring-tg/static/js/slugify.js"; + +const V = getVertical(); +const base = verticalBase(V); +const meta = VERTICAL_META[V]; +let sectionsBySlug = new Map(); + +function escape(s) { + if (s == null) return ""; + return String(s).replace(/[&<>"']/g, c => ({"&":"&","<":"<",">":">",'"':""","'":"'"}[c])); +} + +async function render() { + const grid = document.getElementById("sections-grid"); + grid.innerHTML = `
Загрузка...
`; + try { + const admin = await isAdmin(); + const sections = await api.listSections(V); + sectionsBySlug = new Map(sections.map(s => [s.slug, s])); + if (!sections.length) { + grid.innerHTML = `
Подразделов пока нет — нажми «+ Новый подраздел»
`; + return; + } + grid.innerHTML = ``; + } catch (err) { + toast(err.message, "error"); + grid.innerHTML = `
Ошибка: ${escape(err.message)}
`; + } +} + +// --- Create-section dialog with auto-slug ------------------------------- + +const titleInput = document.getElementById("new-title"); +const slugInput = document.getElementById("new-slug"); +const slugPreview = document.getElementById("new-slug-preview"); +const slugManualToggle = document.getElementById("new-slug-manual"); + +// Track whether the user has taken manual control of the slug. As soon as +// they touch the slug field directly, stop auto-syncing it. +let slugIsAuto = true; + +function syncSlugFromTitle() { + if (!slugIsAuto) return; + const proposed = slugify(titleInput.value); + slugInput.value = proposed; + if (slugPreview) { + slugPreview.textContent = proposed || "(введите название)"; + } +} + +if (titleInput) { + titleInput.addEventListener("input", syncSlugFromTitle); +} +if (slugInput) { + slugInput.addEventListener("input", () => { slugIsAuto = false; }); +} +if (slugManualToggle) { + slugManualToggle.addEventListener("click", (e) => { + e.preventDefault(); + const hidden = slugInput.closest(".slug-row"); + if (hidden) hidden.hidden = !hidden.hidden; + slugInput.focus(); + }); +} + +function resetForm() { + document.getElementById("create-form").reset(); + slugIsAuto = true; + if (slugPreview) slugPreview.textContent = "(введите название)"; + if (slugInput) slugInput.value = ""; + const hidden = slugInput?.closest(".slug-row"); + if (hidden) hidden.hidden = true; +} + +document.getElementById("open-create").addEventListener("click", () => { + resetForm(); + document.getElementById("create-dialog").showModal(); + setTimeout(() => titleInput?.focus(), 50); +}); + +document.getElementById("create-cancel").addEventListener("click", () => { + document.getElementById("create-dialog").close(); +}); + +document.getElementById("edit-cancel").addEventListener("click", () => { + document.getElementById("edit-dialog").close(); +}); + +document.getElementById("create-form").addEventListener("submit", async (e) => { + e.preventDefault(); + 
const title = titleInput.value.trim(); + if (!title) return; + // Re-sync once more in case `input` didn't fire before submit (autofill). + if (slugIsAuto) syncSlugFromTitle(); + const slug = slugInput.value.trim() || slugify(title); + if (!slug) { + toast("Не удалось сформировать slug — введите его вручную", "error"); + return; + } + const emoji = document.getElementById("new-emoji").value.trim() || null; + const accessCode = document.getElementById("new-access-code").value.trim(); + if (accessCode.length < 3) { + toast("Код доступа должен быть не короче 3 символов", "error"); + return; + } + const description = document.getElementById("new-description").value.trim() || null; + try { + await api.createSection({ vertical: V, slug, title, emoji, description, accessCode }); + toast(`Подраздел "${title}" создан`, "success"); + document.getElementById("create-dialog").close(); + resetForm(); + await render(); + } catch (err) { + toast(err.message, "error"); + } +}); + +document.getElementById("sections-grid").addEventListener("click", async (e) => { + const btn = e.target.closest("[data-action]"); + if (!btn) return; + const tile = btn.closest(".section-tile"); + const slug = tile.dataset.slug; + const action = btn.dataset.action; + if (action === "edit") { + const section = sectionsBySlug.get(slug); + if (!section) return; + document.getElementById("edit-slug").value = slug; + document.getElementById("edit-title").value = section.title || ""; + document.getElementById("edit-emoji").value = section.emoji || ""; + document.getElementById("edit-access-code").value = section.access_code || ""; + document.getElementById("edit-description").value = section.description || ""; + document.getElementById("edit-dialog").showModal(); + setTimeout(() => document.getElementById("edit-title").focus(), 50); + return; + } + if (action !== "delete") return; + if (!confirm(`Удалить подраздел "${slug}"? 
Удалить можно только пустой подраздел (без каналов).`)) { + return; + } + try { + await api.deleteSection(V, slug); + toast(`Подраздел "${slug}" удалён`, "success"); + await render(); + } catch (err) { + toast(err.message, "error"); + } +}); + +document.getElementById("edit-form").addEventListener("submit", async (e) => { + e.preventDefault(); + const slug = document.getElementById("edit-slug").value; + const title = document.getElementById("edit-title").value.trim(); + const emoji = document.getElementById("edit-emoji").value.trim() || null; + const accessCode = document.getElementById("edit-access-code").value.trim(); + const description = document.getElementById("edit-description").value.trim() || null; + if (!title) return; + if (accessCode.length < 3) { + toast("Код доступа должен быть не короче 3 символов", "error"); + return; + } + try { + await api.updateSection(V, slug, { + title, + emoji, + description, + access_code: accessCode, + }); + toast(`Подраздел "${title}" сохранён`, "success"); + document.getElementById("edit-dialog").close(); + await render(); + } catch (err) { + toast(err.message, "error"); + } +}); + +render(); diff --git a/src/parser_bot/web/static/js/settings.js b/src/parser_bot/web/static/js/settings.js new file mode 100644 index 0000000..6aea43f --- /dev/null +++ b/src/parser_bot/web/static/js/settings.js @@ -0,0 +1,118 @@ +import { api, toast, fmtDate } from "/api/monitoring-tg/static/js/api.js"; +import { getVertical, getSection, VERTICAL_META } from "/api/monitoring-tg/static/js/vertical.js"; + +const V = getVertical(); +const section = getSection(); +const meta = VERTICAL_META[V]; + +// `level` decides which override layer the editor edits/saves/resets. +// "section" → store key llm_system_prompt:: +// "vertical" → store key llm_system_prompt: +// Effective resolution always goes section → vertical → default. +let level = section ? 
"section" : "vertical"; + +const levelEl = document.getElementById("prompt-level"); +if (levelEl) { + if (!section) { + levelEl.value = "vertical"; + levelEl.disabled = true; + } else { + levelEl.value = "section"; + levelEl.addEventListener("change", async (e) => { + level = e.target.value; + await loadPrompt(); + }); + } +} + +function levelScope() { + return level === "section" + ? { vertical: V, section } + : { vertical: V, section: null }; +} + +async function loadConfig() { + const res = await fetch("/api/monitoring-tg/api/v1/settings"); + if (!res.ok) throw new Error(res.statusText); + const cfg = await res.json(); + const stats = await api.globalStats(); + + const scopeLabel = section ? `${meta.short} / ${section}` : meta.short; + const rows = [ + ["Раздел", `${meta.emoji} ${scopeLabel}`], + ["Период опроса", `${cfg.poll_interval_seconds}s`], + ["Лимит истории за опрос", cfg.poll_history_limit], + ["Telethon session", cfg.tg_session_path], + ["Postgres host", `${cfg.postgres_host}:${cfg.postgres_port}/${cfg.postgres_db}`], + ["API host", `${cfg.api_host}:${cfg.api_port}`], + [`Каналов в ${scopeLabel}`, `${stats.channels_active} активных / ${stats.channels_total}`], + [`Сообщений в ${scopeLabel}`, stats.messages_total.toLocaleString()], + ["Последний опрос (scope)", fmtDate(stats.last_poll_at)], + ]; + document.getElementById("config-tbody").innerHTML = rows.map(([k, v]) => + `${k}${v ?? "—"}` + ).join(""); +} + +document.getElementById("poll-all").addEventListener("click", async (e) => { + e.target.disabled = true; + try { + const res = await api.pollAll(); + toast(`В очереди ${res.queued ?? 
0} каналов — опрос идёт в фоне`, "success"); + } catch (err) { + toast(err.message, "error"); + } finally { + e.target.disabled = false; + } +}); + +async function loadPrompt() { + const data = await api.llmPromptGet(levelScope()); + const editor = document.getElementById("prompt-editor"); + editor.value = data.prompt || ""; + const status = document.getElementById("prompt-status"); + const lengthEl = document.getElementById("prompt-length"); + + const map = { + section: ["override · подраздел", "ok"], + vertical: ["override · вертикаль", "ok"], + default: ["встроенный по умолчанию", "off"], + }; + const [label, cls] = map[data.source] || ["—", "off"]; + status.textContent = label; + status.className = `badge ${cls}`; + lengthEl.textContent = `${(data.prompt || "").length.toLocaleString()} символов`; +} + +document.getElementById("prompt-save").addEventListener("click", async (e) => { + const text = document.getElementById("prompt-editor").value; + e.target.disabled = true; + try { + await api.llmPromptSave(text, levelScope()); + const where = level === "section" ? `${meta.short} / ${section}` : meta.short; + toast(`Промпт ${where} сохранён, применится в течение 5 секунд`, "success"); + await loadPrompt(); + } catch (err) { + toast(err.message, "error"); + } finally { + e.target.disabled = false; + } +}); + +document.getElementById("prompt-reset").addEventListener("click", async (e) => { + const where = level === "section" ? 
`подраздела "${section}"` : `вертикали "${meta.short}"`; + if (!confirm(`Сбросить пользовательский промпт ${where} и вернуться к фоллбэку?`)) return; + e.target.disabled = true; + try { + await api.llmPromptReset(levelScope()); + toast(`Промпт ${where} сброшен`, "success"); + await loadPrompt(); + } catch (err) { + toast(err.message, "error"); + } finally { + e.target.disabled = false; + } +}); + +loadConfig().catch(err => toast(err.message, "error")); +loadPrompt().catch(err => toast(err.message, "error")); diff --git a/src/parser_bot/web/static/js/slugify.js b/src/parser_bot/web/static/js/slugify.js new file mode 100644 index 0000000..6ee1edb --- /dev/null +++ b/src/parser_bot/web/static/js/slugify.js @@ -0,0 +1,22 @@ +// URL-safe slug from arbitrary text. Cyrillic → Latin so titles like +// "Дубай Marina" become "dubai-marina" without forcing the user to type +// a slug by hand. + +const RU_TO_LAT = { + а: "a", б: "b", в: "v", г: "g", д: "d", е: "e", ё: "yo", ж: "zh", + з: "z", и: "i", й: "y", к: "k", л: "l", м: "m", н: "n", о: "o", + п: "p", р: "r", с: "s", т: "t", у: "u", ф: "f", х: "h", ц: "ts", + ч: "ch", ш: "sh", щ: "sch", ъ: "", ы: "y", ь: "", э: "e", ю: "yu", + я: "ya", +}; + +export function slugify(text) { + return (text || "") + .toLowerCase() + .split("") + .map(c => RU_TO_LAT[c] ?? c) + .join("") + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, "") + .slice(0, 64); +} diff --git a/src/parser_bot/web/static/js/vertical.js b/src/parser_bot/web/static/js/vertical.js new file mode 100644 index 0000000..319c549 --- /dev/null +++ b/src/parser_bot/web/static/js/vertical.js @@ -0,0 +1,76 @@ +const APP_BASE = "/api/monitoring-tg"; + +// Detect the current scope from the URL path. 
+// Paths below are relative to APP_BASE (_segments() strips that prefix):
+//   /                                → vertical slug=null (getVertical() defaults to real_estate), section=null
+//   /real-estate/                    → vertical=real_estate, section=null (section chooser)
+//   /real-estate/dubai/              → vertical=real_estate, section=dubai
+//   /real-estate/dubai/channels.html → same
+//   /hr/                             → vertical=hr, section=null
+//   /hr/it/settings.html             → vertical=hr, section=it
+//
+// The section slug is the segment right after the vertical (segs[1] in
+// getSection()); it is opaque (created via UI) and only lowercased here. Per
+// the original note, the backend resolves slug→Section row at query time.
+
+function _segments() { // non-empty path segments, minus a leading APP_BASE prefix
+  const segments = location.pathname.split("/").filter(Boolean);
+  const base = APP_BASE.split("/").filter(Boolean);
+  if (base.every((part, idx) => segments[idx] === part)) {
+    return segments.slice(base.length);
+  }
+  return segments; // path is not under APP_BASE: use it unchanged
+}
+
+export function getVerticalSlug() { // URL form of the vertical, or null when the first segment is not one
+  const seg = (_segments()[0] || "").toLowerCase();
+  if (seg === "hr") return "hr";
+  if (seg === "real-estate") return "real-estate";
+  return null;
+}
+
+export function getVertical() { // VERTICAL_META key for the current page ("hr" or "real_estate")
+  const slug = getVerticalSlug();
+  if (slug === "hr") return "hr";
+  if (slug === "real-estate") return "real_estate";
+  return "real_estate"; // harmless default for section-less pages
+}
+
+export function getSection() { // lowercased section slug from the URL, or null
+  const segs = _segments();
+  // Only treat segment[1] as a section slug when segment[0] is a known vertical.
+  if (!getVerticalSlug()) return null;
+  const candidate = segs[1];
+  if (!candidate || candidate.endsWith(".html")) return null; // a page file directly under the vertical is not a section
+  return candidate.toLowerCase();
+}
+
+export const VERTICAL_META = { // keyed by the value getVertical() returns; `slug` is the URL segment
+  real_estate: {
+    slug: "real-estate",
+    title: "Недвижимость",
+    short: "Недвижимость",
+    emoji: "🏠",
+    leadLabel: "Объявление",
+  },
+  hr: {
+    slug: "hr",
+    title: "HR / Кадры",
+    short: "HR",
+    emoji: "👥",
+    leadLabel: "HR-лид",
+  },
+};
+
+export function appBase() { // path prefix the app is served under
+  return APP_BASE;
+}
+
+export function verticalBase(vertical = getVertical()) { // `vertical` must be a VERTICAL_META key, otherwise the lookup throws
+  return `${APP_BASE}/${VERTICAL_META[vertical].slug}`;
+}
+
+export function sectionBase(vertical = getVertical(), section = getSection()) { // section path, or the vertical path when section is falsy
+  const v = verticalBase(vertical);
+  return section ? `${v}/${section}` : v;
+}
diff --git a/src/parser_bot/web/static/real-estate/index.html b/src/parser_bot/web/static/real-estate/index.html
new file mode 100644
index 0000000..254202e
--- /dev/null
+++ b/src/parser_bot/web/static/real-estate/index.html
@@ -0,0 +1,99 @@ + + + + + 🏠 Недвижимость — подразделы + + + + +
+

parser-tg-bot · 🏠 Недвижимость

+ +
+
+
+

Подразделы недвижимости

+
+ +
+

+ Каждый подраздел — это собственный набор каналов, своя статистика и свой + LLM-промпт (с фоллбэком на промпт вертикали). Например: Дубай, Москва, + Сочи, коммерческая недвижимость. +

+ +
+
+ + +

Новый подраздел

+
+ +
+ URL-адрес + /real-estate/(введите название)/ +
+ изменить вручную +
+ + + + +
+ + +
+
+
+ + +

Редактировать подраздел

+
+ + + + + +
+ + +
+
+
+ + + + + + diff --git a/src/parser_bot/web/static/real-estate/section/channels.html b/src/parser_bot/web/static/real-estate/section/channels.html new file mode 100644 index 0000000..3e4de16 --- /dev/null +++ b/src/parser_bot/web/static/real-estate/section/channels.html @@ -0,0 +1,48 @@ + + + + + 🏠 Недвижимость · Каналы — parser-tg-bot + + + + +
+

parser-tg-bot

+ +
+
+

Каналы подраздела

+ +
+
+ + +
+
+ Канал будет привязан к текущему подразделу. +
+
+ +
+ + + + + + + + + + + + + +
IDКаналTelegram IDСообщ.Последний опросСтатус
+
+
+ + + + + diff --git a/src/parser_bot/web/static/real-estate/section/index.html b/src/parser_bot/web/static/real-estate/section/index.html new file mode 100644 index 0000000..29911aa --- /dev/null +++ b/src/parser_bot/web/static/real-estate/section/index.html @@ -0,0 +1,43 @@ + + + + + 🏠 Недвижимость · Дашборд — parser-tg-bot + + + + +
+

parser-tg-bot

+ +
+
+
+

Дашборд

+
+ +
+ +
+ +

Каналы подраздела

+
+ + + + + + + + + + + +
КаналСообщенийПоследнее сообщениеПоследний опросСтатус
+
+
+ + + + + diff --git a/src/parser_bot/web/static/real-estate/section/messages.html b/src/parser_bot/web/static/real-estate/section/messages.html new file mode 100644 index 0000000..0a5b844 --- /dev/null +++ b/src/parser_bot/web/static/real-estate/section/messages.html @@ -0,0 +1,78 @@ + + + + + 🏠 Недвижимость · Сообщения — parser-tg-bot + + + + +
+

parser-tg-bot

+ +
+
+

Сообщения подраздела

+ +
+ + + + + + + +
+ + +
+ +
+ + +
+ + +

Сообщение

+

+    
+ +
+
+ + + + + + diff --git a/src/parser_bot/web/static/real-estate/section/settings.html b/src/parser_bot/web/static/real-estate/section/settings.html new file mode 100644 index 0000000..bb5a359 --- /dev/null +++ b/src/parser_bot/web/static/real-estate/section/settings.html @@ -0,0 +1,66 @@ + + + + + 🏠 Недвижимость · Настройки — parser-tg-bot + + + + +
+

parser-tg-bot

+ +
+
+

Настройки подраздела

+ +
+

Текущая конфигурация

+ + + + +
Загрузка...
+
+ Параметры задаются через переменные окружения (.env). + Для изменения отредактируйте .env и перезапустите контейнер: + docker compose restart app. +
+
+ +
+

Действия

+
+ + OpenAPI / Swagger + Health check +
+
+ +
+

🤖 Промпт ИИ

+
+ + +
+ + + +
+ +
+ Каскад: section → vertical → default. Если промпта на + уровне подраздела нет, используется промпт вертикали; если и его нет — + встроенный по умолчанию. Сохранение применится в течение ~5 сек. +
+
+
+ + + + +