Add audio model API keys

Enable Qwen audio endpoint
Add single WhisperX load balancer config
2026-06-09 13:28:53 +03:00 · 2026-06-09 13:23:21 +03:00 · 2026-06-09 13:20:17 +03:00 · 2026-06-09 13:17:05 +03:00 · 2026-06-09 12:52:13 +03:00 · 2026-06-09 12:50:56 +03:00
12 changed files with 754 additions and 57 deletions
--- a/README.md
+++ b/README.md
@@ -46,6 +46,34 @@ or compact `system` / `user` fields. The completed job result contains
 domain metadata fields in `input`, but the worker only reads chat fields such as
 `system`, `user`, `messages`, `max_tokens` and `response_format`.
 `transcription` jobs can run several transcription providers in order for
 temporary A/B comparison. The main `segments` field remains compatible with
 telephony and contains the first successful provider result. The full comparison
 is stored in `attempts` with `provider`, `model`, `status`, `text`, `segments`,
 `duration_ms` and `error`.
 Recommended comparison order:
 1. `whisperx`
 2. `qwen2-audio` (`Qwen/Qwen2-Audio-7B-Instruct`)
 3. `voxtral-small` (`mistralai/Voxtral-Small-24B-2507`)
 Qwen2-Audio and Voxtral are called through an OpenAI-compatible
 `/v1/chat/completions` endpoint with `input_audio`; set their endpoint URLs only
 after the models are actually exposed on the AI server.
 AI-server compose snippets for these temporary comparison endpoints live in
 `deploy/ai-server/docker-compose.audio.yml`. They are profile-gated because the
 single GPU cannot keep the production text vLLM, two WhisperX instances, Qwen2
 Audio and Voxtral loaded at the same time:
 - Qwen2-Audio endpoint: `http://10.2.3.5:8003`
 - Voxtral endpoint: `http://10.2.3.5:8004`
 - Start Qwen only:
  `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile qwen-audio up -d qwen-audio`
 - Start Voxtral only:
  `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small`
 ## API
 - `POST /api/v1/jobs` creates one job.
@@ -83,7 +111,19 @@ for Kubernetes probes.
 - `LLM_API_KEY`, primary LLM API key
 - `LLM_MODEL`, default `qwen2.5-14b`
 - `LLM_TIMEOUT`, default `5m`
 - `TRANSCRIPTION_PROVIDERS`, default `whisperx`, comma-separated ordered list:
  `whisperx,qwen2-audio,voxtral-small`
 - `WHISPERX_URL`, WhisperX endpoint for transcription jobs
 - `QWEN_AUDIO_BASE_URL`, OpenAI-compatible endpoint for Qwen2-Audio
 - `QWEN_AUDIO_MODEL`, default `Qwen/Qwen2-Audio-7B-Instruct`
 - `QWEN_AUDIO_API_KEY`, optional bearer token for Qwen2-Audio; falls back to
  `AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
 - `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral
 - `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507`
 - `VOXTRAL_API_KEY`, optional bearer token for Voxtral; falls back to
  `AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
 - `AUDIO_LLM_PROMPT`, transcription instruction for audio LLM providers
 - `AUDIO_LLM_MAX_TOKENS`, default `4096`
 - `WORKER_ID`, default hostname
 - `WORKER_HTTP_HOST`, default `0.0.0.0`
 - `WORKER_HTTP_PORT`, default `8081`
--- a/cmd/worker/main.go
+++ b/cmd/worker/main.go
@@ -48,14 +48,31 @@ func main() {
 	}
 	llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout)
-	transcriber := transcription.New(cfg.WhisperXURL, cfg.WhisperXTimeout, cfg.FfmpegPath, cfg.WhisperXLeadSilence)
+	transcriber := transcription.NewWithOptions(transcription.Options{
 		Providers:         cfg.TranscriptionProviders,
 		WhisperXURL:       cfg.WhisperXURL,
 		WhisperXTimeout:   cfg.WhisperXTimeout,
 		FfmpegPath:        cfg.FfmpegPath,
 		LeadSilence:       cfg.WhisperXLeadSilence,
 		QwenAudioBaseURL:  cfg.QwenAudioBaseURL,
 		QwenAudioAPIKey:   cfg.QwenAudioAPIKey,
 		QwenAudioModel:    cfg.QwenAudioModel,
 		QwenAudioTimeout:  cfg.QwenAudioTimeout,
 		VoxtralBaseURL:    cfg.VoxtralBaseURL,
 		VoxtralAPIKey:     cfg.VoxtralAPIKey,
 		VoxtralModel:      cfg.VoxtralModel,
 		VoxtralTimeout:    cfg.VoxtralTimeout,
 		AudioLLMPrompt:    cfg.AudioLLMPrompt,
 		AudioLLMMaxTokens: cfg.AudioLLMMaxTokens,
 	})
 	w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerTaskTypes, cfg.WorkerModelProfiles, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit)
 	healthSrv := startHealthServer(ctx, db, cfg)
 	slog.Info("ai_worker_started",
 		"worker_id", cfg.WorkerID,
 		"model", cfg.LLMModel,
-		"whisperx_enabled", transcriber != nil,
+		"transcription_enabled", transcriber != nil,
 		"transcription_providers", cfg.TranscriptionProviders,
 		"whisperx_lead_silence", cfg.WhisperXLeadSilence.String(),
 		"task_types", cfg.WorkerTaskTypes,
 		"model_profiles", cfg.WorkerModelProfiles,
@@ -123,13 +140,14 @@ func (h workerHealth) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 		})
 	case r.Method == http.MethodGet && path == "/worker/status":
 		writeWorkerJSON(w, http.StatusOK, map[string]any{
-			"status":         "running",
+			"status":                  "running",
-			"worker_id":      h.cfg.WorkerID,
+			"worker_id":               h.cfg.WorkerID,
-			"task_types":     h.cfg.WorkerTaskTypes,
+			"task_types":              h.cfg.WorkerTaskTypes,
-			"model_profiles": h.cfg.WorkerModelProfiles,
+			"model_profiles":          h.cfg.WorkerModelProfiles,
-			"claim_limit":    h.cfg.WorkerClaimLimit,
+			"transcription_providers": h.cfg.TranscriptionProviders,
-			"poll_interval":  h.cfg.WorkerPollInterval.String(),
+			"claim_limit":             h.cfg.WorkerClaimLimit,
-			"lease_timeout":  h.cfg.WorkerLeaseTimeout.String(),
+			"poll_interval":           h.cfg.WorkerPollInterval.String(),
 			"lease_timeout":           h.cfg.WorkerLeaseTimeout.String(),
 		})
 	default:
 		writeWorkerJSON(w, http.StatusNotFound, map[string]any{"error": "not found"})
--- a/deploy/ai-server/docker-compose.audio.yml
+++ b/deploy/ai-server/docker-compose.audio.yml
@@ -0,0 +1,118 @@
 services:
  qwen-audio:
    image: vllm/vllm-openai:latest
    container_name: qwen-audio
    profiles:
      - qwen-audio
      - audio-compare
    restart: unless-stopped
    ipc: host
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
      VLLM_API_KEY: ${VLLM_API_KEY}
      HF_HOME: /cache
    volumes:
      - ./data/vllm-cache:/cache
    networks:
      - audio-models
    ports:
      - "10.2.3.5:8003:8000"
    command:
      - "--model"
      - "Qwen/Qwen2-Audio-7B-Instruct"
      - "--served-model-name"
      - "Qwen/Qwen2-Audio-7B-Instruct"
      - "--trust-remote-code"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      - "--max-model-len"
      - "8192"
      - "--gpu-memory-utilization"
      - "0.25"
      - "--api-key"
      - "${VLLM_API_KEY}"
      - "--max-num-seqs"
      - "4"
      - "--max-num-batched-tokens"
      - "4096"
    healthcheck:
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 5s
      retries: 5
      start_period: 900s
  voxtral-small:
    image: vllm/vllm-openai:latest
    container_name: voxtral-small
    profiles:
      - voxtral-small
      - audio-compare
    restart: unless-stopped
    ipc: host
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
      VLLM_API_KEY: ${VLLM_API_KEY}
      HF_HOME: /cache
    volumes:
      - ./data/vllm-cache:/cache
    networks:
      - audio-models
    ports:
      - "10.2.3.5:8004:8000"
    command:
      - "--model"
      - "mistralai/Voxtral-Small-24B-2507"
      - "--served-model-name"
      - "mistralai/Voxtral-Small-24B-2507"
      - "--tokenizer-mode"
      - "mistral"
      - "--config-format"
      - "mistral"
      - "--load-format"
      - "mistral"
      - "--tool-call-parser"
      - "mistral"
      - "--enable-auto-tool-choice"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      - "--max-model-len"
      - "32768"
      - "--gpu-memory-utilization"
      - "0.62"
      - "--api-key"
      - "${VLLM_API_KEY}"
      - "--max-num-seqs"
      - "2"
      - "--max-num-batched-tokens"
      - "8192"
    healthcheck:
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 5s
      retries: 5
      start_period: 1200s
 networks:
  audio-models:
    driver: bridge
--- a/deploy/ai-server/whisperx-lb.single.conf
+++ b/deploy/ai-server/whisperx-lb.single.conf
@@ -0,0 +1,20 @@
 upstream whisperx_upstream {
    server whisperx-1:8000 max_fails=3 fail_timeout=30s;
 }
 server {
    listen 80 default_server;
    client_max_body_size 200m;
    location / {
        proxy_pass http://whisperx_upstream;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_connect_timeout 30s;
        proxy_send_timeout    10m;
        proxy_read_timeout    10m;
        proxy_request_buffering off;
        proxy_buffering         off;
    }
 }
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -14,16 +14,27 @@ type Config struct {
 	MigrateOnStart bool
 	APIAuthToken   string
-	LLMBaseURL          string
+	LLMBaseURL             string
-	LLMAPIKey           string
+	LLMAPIKey              string
-	LLMModel            string
+	LLMModel               string
-	LLMTimeout          time.Duration
+	LLMTimeout             time.Duration
-	WhisperXURL         string
+	TranscriptionProviders []string
-	WhisperXTimeout     time.Duration
+	WhisperXURL            string
-	WhisperXLeadSilence time.Duration
+	WhisperXTimeout        time.Duration
-	FfmpegPath          string
+	WhisperXLeadSilence    time.Duration
-	AIStatsSidecarURL   string
+	QwenAudioBaseURL       string
-	AIStatsTimeout      time.Duration
+	QwenAudioAPIKey        string
 	QwenAudioModel         string
 	QwenAudioTimeout       time.Duration
 	VoxtralBaseURL         string
 	VoxtralAPIKey          string
 	VoxtralModel           string
 	VoxtralTimeout         time.Duration
 	AudioLLMMaxTokens      int
 	AudioLLMPrompt         string
 	FfmpegPath             string
 	AIStatsSidecarURL      string
 	AIStatsTimeout         time.Duration
 	WorkerID            string
 	WorkerHTTPHost      string
@@ -43,16 +54,27 @@ func Load() Config {
 		MigrateOnStart: envBool("MIGRATE_ON_START", true),
 		APIAuthToken:   envString("AI_SERVICE_TOKEN", ""),
-		LLMBaseURL:          envString("LLM_BASE_URL", ""),
+		LLMBaseURL:             envString("LLM_BASE_URL", ""),
-		LLMAPIKey:           envString("LLM_API_KEY", ""),
+		LLMAPIKey:              envString("LLM_API_KEY", ""),
-		LLMModel:            envString("LLM_MODEL", "qwen2.5-14b"),
+		LLMModel:               envString("LLM_MODEL", "qwen2.5-14b"),
-		LLMTimeout:          envDuration("LLM_TIMEOUT", 5*time.Minute),
+		LLMTimeout:             envDuration("LLM_TIMEOUT", 5*time.Minute),
-		WhisperXURL:         envString("WHISPERX_URL", ""),
+		TranscriptionProviders: envCSVDefault("TRANSCRIPTION_PROVIDERS", []string{"whisperx"}),
-		WhisperXTimeout:     envDuration("WHISPERX_TIMEOUT", 10*time.Minute),
+		WhisperXURL:            envString("WHISPERX_URL", ""),
-		WhisperXLeadSilence: envDuration("WHISPERX_LEAD_SILENCE", 800*time.Millisecond),
+		WhisperXTimeout:        envDuration("WHISPERX_TIMEOUT", 10*time.Minute),
-		FfmpegPath:          envString("FFMPEG_PATH", "/usr/bin/ffmpeg"),
+		WhisperXLeadSilence:    envDuration("WHISPERX_LEAD_SILENCE", 800*time.Millisecond),
-		AIStatsSidecarURL:   envString("AI_STATS_SIDECAR_URL", ""),
+		QwenAudioBaseURL:       envString("QWEN_AUDIO_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
-		AIStatsTimeout:      envDuration("AI_STATS_TIMEOUT", 8*time.Second),
+		QwenAudioAPIKey:        envString("QWEN_AUDIO_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
 		QwenAudioModel:         envString("QWEN_AUDIO_MODEL", "Qwen/Qwen2-Audio-7B-Instruct"),
 		QwenAudioTimeout:       envDuration("QWEN_AUDIO_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
 		VoxtralBaseURL:         envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
 		VoxtralAPIKey:          envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
 		VoxtralModel:           envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"),
 		VoxtralTimeout:         envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
 		AudioLLMMaxTokens:      envInt("AUDIO_LLM_MAX_TOKENS", 4096),
 		AudioLLMPrompt:         envString("AUDIO_LLM_PROMPT", defaultAudioLLMPrompt()),
 		FfmpegPath:             envString("FFMPEG_PATH", "/usr/bin/ffmpeg"),
 		AIStatsSidecarURL:      envString("AI_STATS_SIDECAR_URL", ""),
 		AIStatsTimeout:         envDuration("AI_STATS_TIMEOUT", 8*time.Second),
 		WorkerID:            envString("WORKER_ID", hostname()),
 		WorkerHTTPHost:      envString("WORKER_HTTP_HOST", "0.0.0.0"),
@@ -123,6 +145,17 @@ func envCSV(key string) []string {
 	return out
 }
 func envCSVDefault(key string, fallback []string) []string {
 	if values := envCSV(key); len(values) > 0 {
 		return values
 	}
 	return fallback
 }
 func defaultAudioLLMPrompt() string {
 	return "Расшифруй речь из аудио максимально точно. Сохрани русский язык, имена, телефоны, суммы и смысловые паузы. Не добавляй комментарии, анализ, Markdown или JSON. Верни только чистый текст расшифровки."
 }
 func hostname() string {
 	h, err := os.Hostname()
 	if err != nil || h == "" {
--- a/internal/httpapi/dashboard.go
+++ b/internal/httpapi/dashboard.go
@@ -53,6 +53,8 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) {
 			Providers: []providerStatus{
 				s.checkLLM(ctx),
 				s.checkWhisperX(ctx),
 				s.checkAudioLLM(ctx, "qwen2-audio", s.cfg.QwenAudioBaseURL, s.cfg.QwenAudioAPIKey, s.cfg.QwenAudioModel, s.cfg.QwenAudioTimeout),
 				s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
 			},
 		},
 		Infra: loadInfraSnapshot(r, s.cfg),
--- a/internal/httpapi/providers.go
+++ b/internal/httpapi/providers.go
@@ -43,11 +43,59 @@ func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) {
 		Providers: []providerStatus{
 			s.checkLLM(ctx),
 			s.checkWhisperX(ctx),
 			s.checkAudioLLM(ctx, "qwen2-audio", s.cfg.QwenAudioBaseURL, s.cfg.QwenAudioAPIKey, s.cfg.QwenAudioModel, s.cfg.QwenAudioTimeout),
 			s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
 		},
 	}
 	writeJSON(w, http.StatusOK, resp)
 }
 func (s *Server) checkAudioLLM(ctx context.Context, name, baseURL, apiKey, model string, timeout time.Duration) providerStatus {
 	baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/")
 	st := providerStatus{
 		Name:       name,
 		Configured: baseURL != "",
 		URL:        baseURL,
 		Model:      model,
 	}
 	if !st.Configured {
 		return st
 	}
 	if timeout <= 0 {
 		timeout = 10 * time.Minute
 	}
 	start := time.Now()
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, st.URL+"/v1/models", nil)
 	if err != nil {
 		st.Error = err.Error()
 		return st
 	}
 	if apiKey != "" {
 		req.Header.Set("Authorization", "Bearer "+apiKey)
 	}
 	res, err := (&http.Client{Timeout: minDuration(timeout, 3*time.Second)}).Do(req)
 	st.LatencyMS = time.Since(start).Milliseconds()
 	if err != nil {
 		st.Error = err.Error()
 		return s.withStaleProviderOK(name, st)
 	}
 	defer res.Body.Close()
 	if res.StatusCode >= 300 {
 		st.Error = fmt.Sprintf("http %d: %s", res.StatusCode, readSmallBody(res.Body))
 		return s.withStaleProviderOK(name, st)
 	}
 	st.OK = true
 	s.rememberProviderOK(name, st.LatencyMS)
 	return st
 }
 func minDuration(a, b time.Duration) time.Duration {
 	if a < b {
 		return a
 	}
 	return b
 }
 func (s *Server) checkLLM(ctx context.Context) providerStatus {
 	st := providerStatus{
 		Name:       "llm",
--- a/internal/transcription/client.go
+++ b/internal/transcription/client.go
@@ -3,6 +3,7 @@ package transcription
 import (
 	"bytes"
 	"context"
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -16,12 +17,47 @@ import (
 )
 type Client struct {
-	baseURL     string
+	providers   []ProviderConfig
 	http        *http.Client
 	ffmpegPath  string
 	leadSilence time.Duration
 }
 const (
 	ProviderWhisperX  = "whisperx"
 	ProviderQwenAudio = "qwen2-audio"
 	ProviderVoxtral   = "voxtral-small"
 )
 type Options struct {
 	Providers         []string
 	WhisperXURL       string
 	WhisperXTimeout   time.Duration
 	FfmpegPath        string
 	LeadSilence       time.Duration
 	QwenAudioBaseURL  string
 	QwenAudioAPIKey   string
 	QwenAudioModel    string
 	QwenAudioTimeout  time.Duration
 	VoxtralBaseURL    string
 	VoxtralAPIKey     string
 	VoxtralModel      string
 	VoxtralTimeout    time.Duration
 	AudioLLMPrompt    string
 	AudioLLMMaxTokens int
 }
 type ProviderConfig struct {
 	Name      string
 	Kind      string
 	BaseURL   string
 	APIKey    string
 	Model     string
 	Timeout   time.Duration
 	MaxTokens int
 	Prompt    string
 }
 type Input struct {
 	AudioURL    string `json:"audio_url"`
 	Filename    string `json:"filename,omitempty"`
@@ -39,6 +75,9 @@ type Segment struct {
 }
 type Result struct {
 	Provider     string    `json:"provider,omitempty"`
 	Model        string    `json:"model,omitempty"`
 	Attempts     []Attempt `json:"attempts,omitempty"`
 	Language     string    `json:"language"`
 	Segments     []Segment `json:"segments"`
 	DiarizeError *string   `json:"diarize_error,omitempty"`
@@ -46,6 +85,16 @@ type Result struct {
 	DurationMS   int64     `json:"duration_ms"`
 }
 type Attempt struct {
 	Provider   string    `json:"provider"`
 	Model      string    `json:"model,omitempty"`
 	Status     string    `json:"status"`
 	Error      string    `json:"error,omitempty"`
 	Text       string    `json:"text,omitempty"`
 	Segments   []Segment `json:"segments,omitempty"`
 	DurationMS int64     `json:"duration_ms,omitempty"`
 }
 type whisperResponse struct {
 	Language     string    `json:"language"`
 	Segments     []Segment `json:"segments"`
@@ -53,35 +102,188 @@ type whisperResponse struct {
 	AlignError   *string   `json:"align_error,omitempty"`
 }
 type audioLLMResponse struct {
 	Text  string
 	Model string
 }
 type audioLLMChatRequest struct {
 	Model       string                `json:"model"`
 	Messages    []audioLLMChatMessage `json:"messages"`
 	MaxTokens   int                   `json:"max_tokens,omitempty"`
 	Temperature float64               `json:"temperature"`
 }
 type audioLLMChatMessage struct {
 	Role    string                `json:"role"`
 	Content []audioLLMContentPart `json:"content"`
 }
 type audioLLMContentPart struct {
 	Type       string         `json:"type"`
 	Text       string         `json:"text,omitempty"`
 	InputAudio *audioLLMAudio `json:"input_audio,omitempty"`
 }
 type audioLLMAudio struct {
 	Data   string `json:"data"`
 	Format string `json:"format,omitempty"`
 }
 type audioLLMChatResponse struct {
 	Model   string `json:"model,omitempty"`
 	Choices []struct {
 		Message struct {
 			Content string `json:"content"`
 		} `json:"message"`
 	} `json:"choices"`
 	Error *struct {
 		Message string `json:"message"`
 	} `json:"error,omitempty"`
 }
 func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client {
-	baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/")
+	return NewWithOptions(Options{
-	if baseURL == "" {
+		Providers:       []string{ProviderWhisperX},
-		return nil
+		WhisperXURL:     baseURL,
-	}
+		WhisperXTimeout: timeout,
-	if timeout <= 0 {
+		FfmpegPath:      ffmpegPath,
-		timeout = 10 * time.Minute
+		LeadSilence:     leadSilence,
-	}
+	})
 }
 func NewWithOptions(opts Options) *Client {
 	leadSilence := opts.LeadSilence
 	if leadSilence < 0 {
 		leadSilence = 0
 	}
 	if leadSilence > 5*time.Second {
 		leadSilence = 5 * time.Second
 	}
-	ffmpegPath = strings.TrimSpace(ffmpegPath)
+	ffmpegPath := strings.TrimSpace(opts.FfmpegPath)
 	if ffmpegPath == "" {
 		ffmpegPath = "ffmpeg"
 	}
 	maxTokens := opts.AudioLLMMaxTokens
 	if maxTokens <= 0 {
 		maxTokens = 4096
 	}
 	audioLLMPrompt := strings.TrimSpace(opts.AudioLLMPrompt)
 	if audioLLMPrompt == "" {
 		audioLLMPrompt = "Transcribe the audio exactly. Return only the transcript text."
 	}
 	providers := buildProviders(opts, audioLLMPrompt, maxTokens)
 	if len(providers) == 0 {
 		return nil
 	}
 	return &Client{
-		baseURL:     baseURL,
+		providers:   providers,
-		http:        &http.Client{Timeout: timeout},
+		http:        &http.Client{Timeout: maxProviderTimeout(providers)},
 		ffmpegPath:  ffmpegPath,
 		leadSilence: leadSilence,
 	}
 }
 func buildProviders(opts Options, prompt string, maxTokens int) []ProviderConfig {
 	order := normalizeProviderOrder(opts.Providers)
 	out := make([]ProviderConfig, 0, len(order))
 	for _, name := range order {
 		switch name {
 		case ProviderWhisperX:
 			baseURL := strings.TrimRight(strings.TrimSpace(opts.WhisperXURL), "/")
 			if baseURL == "" {
 				continue
 			}
 			out = append(out, ProviderConfig{
 				Name:    ProviderWhisperX,
 				Kind:    ProviderWhisperX,
 				BaseURL: baseURL,
 				Model:   ProviderWhisperX,
 				Timeout: defaultDuration(opts.WhisperXTimeout, 10*time.Minute),
 			})
 		case ProviderQwenAudio:
 			baseURL := strings.TrimRight(strings.TrimSpace(opts.QwenAudioBaseURL), "/")
 			if baseURL == "" {
 				continue
 			}
 			model := firstNonEmpty(opts.QwenAudioModel, "Qwen/Qwen2-Audio-7B-Instruct")
 			out = append(out, ProviderConfig{
 				Name:      ProviderQwenAudio,
 				Kind:      "audio_llm",
 				BaseURL:   baseURL,
 				APIKey:    strings.TrimSpace(opts.QwenAudioAPIKey),
 				Model:     model,
 				Timeout:   defaultDuration(opts.QwenAudioTimeout, 10*time.Minute),
 				MaxTokens: maxTokens,
 				Prompt:    prompt,
 			})
 		case ProviderVoxtral:
 			baseURL := strings.TrimRight(strings.TrimSpace(opts.VoxtralBaseURL), "/")
 			if baseURL == "" {
 				continue
 			}
 			model := firstNonEmpty(opts.VoxtralModel, "mistralai/Voxtral-Small-24B-2507")
 			out = append(out, ProviderConfig{
 				Name:      ProviderVoxtral,
 				Kind:      "audio_llm",
 				BaseURL:   baseURL,
 				APIKey:    strings.TrimSpace(opts.VoxtralAPIKey),
 				Model:     model,
 				Timeout:   defaultDuration(opts.VoxtralTimeout, 10*time.Minute),
 				MaxTokens: maxTokens,
 				Prompt:    prompt,
 			})
 		}
 	}
 	return out
 }
 func normalizeProviderOrder(in []string) []string {
 	if len(in) == 0 {
 		return []string{ProviderWhisperX}
 	}
 	out := make([]string, 0, len(in))
 	seen := map[string]bool{}
 	for _, raw := range in {
 		name := strings.ToLower(strings.TrimSpace(raw))
 		switch name {
 		case "whisper", "whisperx":
 			name = ProviderWhisperX
 		case "qwen", "qwen-audio", "qwen2-audio", "qwen2-audio-7b-instruct":
 			name = ProviderQwenAudio
 		case "voxtral", "voxtral-small", "voxtral-small-24b-2507":
 			name = ProviderVoxtral
 		default:
 			continue
 		}
 		if !seen[name] {
 			out = append(out, name)
 			seen[name] = true
 		}
 	}
 	return out
 }
 func maxProviderTimeout(providers []ProviderConfig) time.Duration {
 	maxTimeout := 10 * time.Minute
 	for _, provider := range providers {
 		if provider.Timeout > maxTimeout {
 			maxTimeout = provider.Timeout
 		}
 	}
 	return maxTimeout
 }
 func defaultDuration(v, fallback time.Duration) time.Duration {
 	if v <= 0 {
 		return fallback
 	}
 	return v
 }
 func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
-	if c == nil || c.baseURL == "" {
+	if c == nil || len(c.providers) == 0 {
-		return nil, fmt.Errorf("whisperx not configured")
+		return nil, fmt.Errorf("transcription providers not configured")
 	}
 	if strings.TrimSpace(in.AudioURL) == "" {
 		return nil, fmt.Errorf("audio_url is required")
@@ -96,18 +298,91 @@ func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
 			return nil, err
 		}
 	}
-	resp, duration, err := c.transcribeAudio(ctx, audio, filename, in)
+	var attempts []Attempt
-	if err != nil {
+	var winner *Result
-		return nil, err
+	var errors []string
 	for _, provider := range c.providers {
 		result, attempt, err := c.transcribeWithProvider(ctx, provider, audio, filename, in)
 		attempts = append(attempts, attempt)
 		if err != nil {
 			errors = append(errors, provider.Name+": "+err.Error())
 			continue
 		}
 		if winner == nil {
 			winner = result
 		}
 	}
-	segments := adjustLeadSilence(resp.Segments, c.leadSilence)
+	if winner == nil {
-	return &Result{
+		return nil, fmt.Errorf("all transcription providers failed: %s", strings.Join(errors, "; "))
-		Language:     resp.Language,
+	}
-		Segments:     segments,
+	winner.Attempts = attempts
-		DiarizeError: resp.DiarizeError,
+	return winner, nil
-		AlignError:   resp.AlignError,
+}
-		DurationMS:   duration.Milliseconds(),
+
-	}, nil
+func (c *Client) transcribeWithProvider(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*Result, Attempt, error) {
 	providerCtx := ctx
 	cancel := func() {}
 	if provider.Timeout > 0 {
 		providerCtx, cancel = context.WithTimeout(ctx, provider.Timeout)
 	}
 	defer cancel()
 	attempt := Attempt{
 		Provider: provider.Name,
 		Model:    provider.Model,
 		Status:   "failed",
 	}
 	switch provider.Kind {
 	case ProviderWhisperX:
 		resp, duration, err := c.transcribeAudio(providerCtx, provider, audio, filename, in)
 		attempt.DurationMS = duration.Milliseconds()
 		if err != nil {
 			attempt.Error = err.Error()
 			return nil, attempt, err
 		}
 		segments := adjustLeadSilence(resp.Segments, c.leadSilence)
 		attempt.Status = "ok"
 		attempt.Segments = segments
 		attempt.Text = segmentsText(segments)
 		return &Result{
 			Provider:     provider.Name,
 			Model:        provider.Model,
 			Language:     resp.Language,
 			Segments:     segments,
 			DiarizeError: resp.DiarizeError,
 			AlignError:   resp.AlignError,
 			DurationMS:   duration.Milliseconds(),
 		}, attempt, nil
 	default:
 		resp, duration, err := c.transcribeAudioLLM(providerCtx, provider, audio, filename, in)
 		attempt.DurationMS = duration.Milliseconds()
 		if err != nil {
 			attempt.Error = err.Error()
 			return nil, attempt, err
 		}
 		text := strings.TrimSpace(resp.Text)
 		segments := []Segment{{Start: 0, End: 0, Text: text}}
 		attempt.Status = "ok"
 		attempt.Model = resp.Model
 		attempt.Text = text
 		attempt.Segments = segments
 		return &Result{
 			Provider:   provider.Name,
 			Model:      resp.Model,
 			Language:   firstNonEmpty(in.Language, "unknown"),
 			Segments:   segments,
 			DurationMS: duration.Milliseconds(),
 		}, attempt, nil
 	}
 }
 func segmentsText(segments []Segment) string {
 	parts := make([]string, 0, len(segments))
 	for _, segment := range segments {
 		if text := strings.TrimSpace(segment.Text); text != "" {
 			parts = append(parts, text)
 		}
 	}
 	return strings.Join(parts, "\n")
 }
 func (c *Client) downloadAudio(ctx context.Context, in Input) ([]byte, string, error) {
@@ -222,7 +497,7 @@ func clampTime(v float64) float64 {
 	return v
 }
-func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename string, in Input) (*whisperResponse, time.Duration, error) {
+func (c *Client) transcribeAudio(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*whisperResponse, time.Duration, error) {
 	body := &bytes.Buffer{}
 	mw := multipart.NewWriter(body)
 	fw, err := mw.CreateFormFile("file", filename)
@@ -250,7 +525,7 @@ func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename str
 		return nil, 0, fmt.Errorf("close form: %w", err)
 	}
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/transcribe", body)
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, provider.BaseURL+"/transcribe", body)
 	if err != nil {
 		return nil, 0, fmt.Errorf("whisperx request: %w", err)
 	}
@@ -273,3 +548,97 @@ func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename str
 	}
 	return &out, duration, nil
 }
 func (c *Client) transcribeAudioLLM(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*audioLLMResponse, time.Duration, error) {
 	prompt := provider.Prompt
 	if in.Language != "" {
 		prompt += "\nЯзык аудио: " + in.Language + "."
 	}
 	if in.Diarize {
 		prompt += "\nЕсли слышны разные говорящие, разделяй реплики с короткими пометками Спикер 1/Спикер 2."
 	}
 	reqBody := audioLLMChatRequest{
 		Model:       provider.Model,
 		MaxTokens:   provider.MaxTokens,
 		Temperature: 0,
 		Messages: []audioLLMChatMessage{
 			{
 				Role: "user",
 				Content: []audioLLMContentPart{
 					{Type: "text", Text: prompt},
 					{
 						Type: "input_audio",
 						InputAudio: &audioLLMAudio{
 							Data:   base64.StdEncoding.EncodeToString(audio),
 							Format: audioFormat(filename),
 						},
 					},
 				},
 			},
 		},
 	}
 	body, err := json.Marshal(reqBody)
 	if err != nil {
 		return nil, 0, fmt.Errorf("audio llm marshal: %w", err)
 	}
 	req, err := http.NewRequestWithContext(ctx, http.MethodPost, provider.BaseURL+"/v1/chat/completions", bytes.NewReader(body))
 	if err != nil {
 		return nil, 0, fmt.Errorf("audio llm request: %w", err)
 	}
 	req.Header.Set("Content-Type", "application/json")
 	if provider.APIKey != "" {
 		req.Header.Set("Authorization", "Bearer "+provider.APIKey)
 	}
 	start := time.Now()
 	resp, err := c.http.Do(req)
 	duration := time.Since(start)
 	if err != nil {
 		return nil, duration, fmt.Errorf("audio llm do: %w", err)
 	}
 	defer resp.Body.Close()
 	raw, err := io.ReadAll(io.LimitReader(resp.Body, 4<<20))
 	if err != nil {
 		return nil, duration, fmt.Errorf("audio llm read: %w", err)
 	}
 	if resp.StatusCode >= 300 {
 		return nil, duration, fmt.Errorf("audio llm HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(raw)))
 	}
 	var out audioLLMChatResponse
 	if err := json.Unmarshal(raw, &out); err != nil {
 		return nil, duration, fmt.Errorf("audio llm decode: %w", err)
 	}
 	if out.Error != nil {
 		return nil, duration, fmt.Errorf("audio llm error: %s", out.Error.Message)
 	}
 	if len(out.Choices) == 0 {
 		return nil, duration, fmt.Errorf("audio llm: empty choices")
 	}
 	modelName := out.Model
 	if modelName == "" {
 		modelName = provider.Model
 	}
 	return &audioLLMResponse{
 		Text:  strings.TrimSpace(out.Choices[0].Message.Content),
 		Model: modelName,
 	}, duration, nil
 }
 func audioFormat(filename string) string {
 	ext := strings.TrimPrefix(strings.ToLower(filepath.Ext(filename)), ".")
 	switch ext {
 	case "wav", "mp3", "flac", "m4a", "ogg", "opus", "webm":
 		return ext
 	default:
 		return "mp3"
 	}
 }
 func firstNonEmpty(values ...string) string {
 	for _, value := range values {
 		if strings.TrimSpace(value) != "" {
 			return value
 		}
 	}
 	return ""
 }
--- a/internal/transcription/client_test.go
+++ b/internal/transcription/client_test.go
@@ -23,6 +23,41 @@ func TestAdjustLeadSilence(t *testing.T) {
 	}
 }
 func TestNormalizeProviderOrder(t *testing.T) {
 	got := normalizeProviderOrder([]string{"whisperx", "qwen", "voxtral", "qwen2-audio"})
 	want := []string{ProviderWhisperX, ProviderQwenAudio, ProviderVoxtral}
 	if len(got) != len(want) {
 		t.Fatalf("providers = %#v, want %#v", got, want)
 	}
 	for i := range want {
 		if got[i] != want[i] {
 			t.Fatalf("providers = %#v, want %#v", got, want)
 		}
 	}
 }
 func TestNewWithOptionsBuildsComparisonProviders(t *testing.T) {
 	client := NewWithOptions(Options{
 		Providers:        []string{"whisperx", "qwen2-audio", "voxtral-small"},
 		WhisperXURL:      "http://whisperx",
 		QwenAudioBaseURL: "http://qwen",
 		VoxtralBaseURL:   "http://voxtral",
 	})
 	if client == nil {
 		t.Fatal("client is nil")
 	}
 	got := make([]string, 0, len(client.providers))
 	for _, provider := range client.providers {
 		got = append(got, provider.Name)
 	}
 	want := []string{ProviderWhisperX, ProviderQwenAudio, ProviderVoxtral}
 	for i := range want {
 		if got[i] != want[i] {
 			t.Fatalf("providers = %#v, want %#v", got, want)
 		}
 	}
 }
 func near(got, want float64) bool {
 	return math.Abs(got-want) < 0.000001
 }
--- a/internal/worker/worker.go
+++ b/internal/worker/worker.go
@@ -139,7 +139,7 @@ func (w *Worker) process(ctx context.Context, job *model.Job) {
 func (w *Worker) processTranscription(ctx context.Context, job *model.Job) {
 	if w.transcriber == nil {
-		w.fail(ctx, job, "provider_unavailable", "whisperx not configured")
+		w.fail(ctx, job, "provider_unavailable", "transcription providers not configured")
 		return
 	}
 	var input transcription.Input
@@ -186,8 +186,10 @@ func classifyTranscriptionError(err error) string {
 		return "storage_error"
 	case strings.Contains(s, "whisperx http 4") || strings.Contains(s, "ffmpeg") || strings.Contains(s, "invalid data") || strings.Contains(s, "could not decode"):
 		return "bad_audio"
-	case strings.Contains(s, "whisperx http 5") || strings.Contains(s, "whisperx do") || strings.Contains(s, "connection refused") || strings.Contains(s, "connection reset") || strings.Contains(s, "closed network connection"):
+	case strings.Contains(s, "whisperx http 5") || strings.Contains(s, "whisperx do") || strings.Contains(s, "audio llm http 5") || strings.Contains(s, "audio llm do") || strings.Contains(s, "connection refused") || strings.Contains(s, "connection reset") || strings.Contains(s, "closed network connection"):
 		return "provider_unavailable"
 	case strings.Contains(s, "audio llm http 4"):
 		return "bad_input"
 	case strings.Contains(s, "decode"):
 		return "bad_response"
 	default:
--- a/k8s/configmap.yaml
+++ b/k8s/configmap.yaml
@@ -11,9 +11,19 @@ data:
  LLM_BASE_URL: "http://10.2.3.5:8002"
  LLM_MODEL: "qwen2.5-14b"
  LLM_TIMEOUT: "5m"
  TRANSCRIPTION_PROVIDERS: "whisperx,qwen2-audio,voxtral-small"
  WHISPERX_URL: "http://10.2.3.5:8001"
  WHISPERX_TIMEOUT: "10m"
  WHISPERX_LEAD_SILENCE: "800ms"
  # Fill these after Qwen2-Audio and Voxtral are exposed as OpenAI-compatible
  # chat-completions endpoints on the AI server.
  QWEN_AUDIO_BASE_URL: "http://10.2.3.5:8003"
  QWEN_AUDIO_MODEL: "Qwen/Qwen2-Audio-7B-Instruct"
  QWEN_AUDIO_TIMEOUT: "10m"
  VOXTRAL_BASE_URL: ""
  VOXTRAL_MODEL: "mistralai/Voxtral-Small-24B-2507"
  VOXTRAL_TIMEOUT: "10m"
  AUDIO_LLM_MAX_TOKENS: "4096"
  FFMPEG_PATH: "/usr/bin/ffmpeg"
  AI_STATS_SIDECAR_URL: "http://10.2.3.5:9090"
  AI_STATS_TIMEOUT: "8s"
--- a/k8s/secrets.yaml
+++ b/k8s/secrets.yaml
@@ -18,4 +18,6 @@ type: Opaque
 stringData:
  DATABASE_URL: "postgres://ai_service:ai_service@postgres:5432/ai_service?sslmode=disable"
  LLM_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
  QWEN_AUDIO_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
  VOXTRAL_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
  AI_SERVICE_TOKEN: "d18bcacf9e02bae1806ee6b6eeda62b95be6a915c0a22936d9a700128b275442"
Author	SHA1	Message	Date
Grendgi	76ac9b8896	Add audio model API keys All checks were successful CI / test (push) Successful in 14s Details Build and Deploy / build-and-deploy (push) Successful in 25s Details	2026-06-09 13:28:53 +03:00
Grendgi	c31dcb891c	Enable Qwen audio endpoint	2026-06-09 13:23:21 +03:00
Grendgi	ee6e948d2e	Add single WhisperX load balancer config	2026-06-09 13:20:17 +03:00
Grendgi	e132634c65	Isolate audio model compose network	2026-06-09 13:17:05 +03:00
Grendgi	cac8d89e64	Tune audio model GPU profiles	2026-06-09 12:52:13 +03:00
Grendgi	f49ba7abd5	Add AI server audio model profiles	2026-06-09 12:50:56 +03:00
Grendgi	aaecbb1bed	Add transcription provider comparison chain	2026-06-09 12:34:08 +03:00