Add transcription provider comparison chain

2026-06-09 12:34:08 +03:00
parent 562fad6f87
commit aaecbb1bed
9 changed files with 600 additions and 57 deletions
--- a/README.md
+++ b/README.md
@@ -46,6 +46,22 @@ or compact `system` / `user` fields. The completed job result contains
 domain metadata fields in `input`, but the worker only reads chat fields such as
 `system`, `user`, `messages`, `max_tokens` and `response_format`.

+`transcription` jobs can run several transcription providers in order for
+temporary A/B comparison. The main `segments` field remains compatible with
+telephony and contains the first successful provider result. The full comparison
+is stored in `attempts` with `provider`, `model`, `status`, `text`, `segments`,
+`duration_ms` and `error`.
+
+Recommended comparison order:
+
+1. `whisperx`
+2. `qwen2-audio` (`Qwen/Qwen2-Audio-7B-Instruct`)
+3. `voxtral-small` (`mistralai/Voxtral-Small-24B-2507`)
+
+Qwen2-Audio and Voxtral are called through an OpenAI-compatible
+`/v1/chat/completions` endpoint with `input_audio`; set their endpoint URLs only
+after the models are actually exposed on the AI server.
+
 ## API

 - `POST /api/v1/jobs` creates one job.
@@ -83,7 +99,17 @@ for Kubernetes probes.
 - `LLM_API_KEY`, primary LLM API key
 - `LLM_MODEL`, default `qwen2.5-14b`
 - `LLM_TIMEOUT`, default `5m`
+- `TRANSCRIPTION_PROVIDERS`, default `whisperx`, comma-separated ordered list:
+  `whisperx,qwen2-audio,voxtral-small`
 - `WHISPERX_URL`, WhisperX endpoint for transcription jobs
+- `QWEN_AUDIO_BASE_URL`, OpenAI-compatible endpoint for Qwen2-Audio
+- `QWEN_AUDIO_MODEL`, default `Qwen/Qwen2-Audio-7B-Instruct`
+- `QWEN_AUDIO_API_KEY`, optional bearer token for Qwen2-Audio
+- `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral
+- `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507`
+- `VOXTRAL_API_KEY`, optional bearer token for Voxtral
+- `AUDIO_LLM_PROMPT`, transcription instruction for audio LLM providers
+- `AUDIO_LLM_MAX_TOKENS`, default `4096`
 - `WORKER_ID`, default hostname
 - `WORKER_HTTP_HOST`, default `0.0.0.0`
 - `WORKER_HTTP_PORT`, default `8081`
--- a/cmd/worker/main.go
+++ b/cmd/worker/main.go
@@ -48,14 +48,31 @@ func main() {
 	}

 	llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout)
-	transcriber := transcription.New(cfg.WhisperXURL, cfg.WhisperXTimeout, cfg.FfmpegPath, cfg.WhisperXLeadSilence)
+	transcriber := transcription.NewWithOptions(transcription.Options{
+		Providers:         cfg.TranscriptionProviders,
+		WhisperXURL:       cfg.WhisperXURL,
+		WhisperXTimeout:   cfg.WhisperXTimeout,
+		FfmpegPath:        cfg.FfmpegPath,
+		LeadSilence:       cfg.WhisperXLeadSilence,
+		QwenAudioBaseURL:  cfg.QwenAudioBaseURL,
+		QwenAudioAPIKey:   cfg.QwenAudioAPIKey,
+		QwenAudioModel:    cfg.QwenAudioModel,
+		QwenAudioTimeout:  cfg.QwenAudioTimeout,
+		VoxtralBaseURL:    cfg.VoxtralBaseURL,
+		VoxtralAPIKey:     cfg.VoxtralAPIKey,
+		VoxtralModel:      cfg.VoxtralModel,
+		VoxtralTimeout:    cfg.VoxtralTimeout,
+		AudioLLMPrompt:    cfg.AudioLLMPrompt,
+		AudioLLMMaxTokens: cfg.AudioLLMMaxTokens,
+	})
 	w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerTaskTypes, cfg.WorkerModelProfiles, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit)
 	healthSrv := startHealthServer(ctx, db, cfg)

 	slog.Info("ai_worker_started",
 		"worker_id", cfg.WorkerID,
 		"model", cfg.LLMModel,
-		"whisperx_enabled", transcriber != nil,
+		"transcription_enabled", transcriber != nil,
+		"transcription_providers", cfg.TranscriptionProviders,
 		"whisperx_lead_silence", cfg.WhisperXLeadSilence.String(),
 		"task_types", cfg.WorkerTaskTypes,
 		"model_profiles", cfg.WorkerModelProfiles,
@@ -127,6 +144,7 @@ func (h workerHealth) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 			"worker_id":               h.cfg.WorkerID,
 			"task_types":              h.cfg.WorkerTaskTypes,
 			"model_profiles":          h.cfg.WorkerModelProfiles,
+			"transcription_providers": h.cfg.TranscriptionProviders,
 			"claim_limit":             h.cfg.WorkerClaimLimit,
 			"poll_interval":           h.cfg.WorkerPollInterval.String(),
 			"lease_timeout":           h.cfg.WorkerLeaseTimeout.String(),
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -18,9 +18,20 @@ type Config struct {
 	LLMAPIKey              string
 	LLMModel               string
 	LLMTimeout             time.Duration
+	TranscriptionProviders []string
 	WhisperXURL            string
 	WhisperXTimeout        time.Duration
 	WhisperXLeadSilence    time.Duration
+	QwenAudioBaseURL       string
+	QwenAudioAPIKey        string
+	QwenAudioModel         string
+	QwenAudioTimeout       time.Duration
+	VoxtralBaseURL         string
+	VoxtralAPIKey          string
+	VoxtralModel           string
+	VoxtralTimeout         time.Duration
+	AudioLLMMaxTokens      int
+	AudioLLMPrompt         string
 	FfmpegPath             string
 	AIStatsSidecarURL      string
 	AIStatsTimeout         time.Duration
@@ -47,9 +58,20 @@ func Load() Config {
 		LLMAPIKey:              envString("LLM_API_KEY", ""),
 		LLMModel:               envString("LLM_MODEL", "qwen2.5-14b"),
 		LLMTimeout:             envDuration("LLM_TIMEOUT", 5*time.Minute),
+		TranscriptionProviders: envCSVDefault("TRANSCRIPTION_PROVIDERS", []string{"whisperx"}),
 		WhisperXURL:            envString("WHISPERX_URL", ""),
 		WhisperXTimeout:        envDuration("WHISPERX_TIMEOUT", 10*time.Minute),
 		WhisperXLeadSilence:    envDuration("WHISPERX_LEAD_SILENCE", 800*time.Millisecond),
+		QwenAudioBaseURL:       envString("QWEN_AUDIO_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
+		QwenAudioAPIKey:        envString("QWEN_AUDIO_API_KEY", envString("AUDIO_LLM_API_KEY", "")),
+		QwenAudioModel:         envString("QWEN_AUDIO_MODEL", "Qwen/Qwen2-Audio-7B-Instruct"),
+		QwenAudioTimeout:       envDuration("QWEN_AUDIO_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
+		VoxtralBaseURL:         envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
+		VoxtralAPIKey:          envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", "")),
+		VoxtralModel:           envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"),
+		VoxtralTimeout:         envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
+		AudioLLMMaxTokens:      envInt("AUDIO_LLM_MAX_TOKENS", 4096),
+		AudioLLMPrompt:         envString("AUDIO_LLM_PROMPT", defaultAudioLLMPrompt()),
 		FfmpegPath:             envString("FFMPEG_PATH", "/usr/bin/ffmpeg"),
 		AIStatsSidecarURL:      envString("AI_STATS_SIDECAR_URL", ""),
 		AIStatsTimeout:         envDuration("AI_STATS_TIMEOUT", 8*time.Second),
@@ -123,6 +145,17 @@ func envCSV(key string) []string {
 	return out
 }

+func envCSVDefault(key string, fallback []string) []string {
+	if values := envCSV(key); len(values) > 0 {
+		return values
+	}
+	return fallback
+}
+
+func defaultAudioLLMPrompt() string {
+	return "Расшифруй речь из аудио максимально точно. Сохрани русский язык, имена, телефоны, суммы и смысловые паузы. Не добавляй комментарии, анализ, Markdown или JSON. Верни только чистый текст расшифровки."
+}
+
 func hostname() string {
 	h, err := os.Hostname()
 	if err != nil || h == "" {
--- a/internal/httpapi/dashboard.go
+++ b/internal/httpapi/dashboard.go
@@ -53,6 +53,8 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) {
 			Providers: []providerStatus{
 				s.checkLLM(ctx),
 				s.checkWhisperX(ctx),
+				s.checkAudioLLM(ctx, "qwen2-audio", s.cfg.QwenAudioBaseURL, s.cfg.QwenAudioAPIKey, s.cfg.QwenAudioModel, s.cfg.QwenAudioTimeout),
+				s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
 			},
 		},
 		Infra: loadInfraSnapshot(r, s.cfg),
--- a/internal/httpapi/providers.go
+++ b/internal/httpapi/providers.go
@@ -43,11 +43,59 @@ func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) {
 		Providers: []providerStatus{
 			s.checkLLM(ctx),
 			s.checkWhisperX(ctx),
+			s.checkAudioLLM(ctx, "qwen2-audio", s.cfg.QwenAudioBaseURL, s.cfg.QwenAudioAPIKey, s.cfg.QwenAudioModel, s.cfg.QwenAudioTimeout),
+			s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
 		},
 	}
 	writeJSON(w, http.StatusOK, resp)
 }

+func (s *Server) checkAudioLLM(ctx context.Context, name, baseURL, apiKey, model string, timeout time.Duration) providerStatus {
+	baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/")
+	st := providerStatus{
+		Name:       name,
+		Configured: baseURL != "",
+		URL:        baseURL,
+		Model:      model,
+	}
+	if !st.Configured {
+		return st
+	}
+	if timeout <= 0 {
+		timeout = 10 * time.Minute
+	}
+	start := time.Now()
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, st.URL+"/v1/models", nil)
+	if err != nil {
+		st.Error = err.Error()
+		return st
+	}
+	if apiKey != "" {
+		req.Header.Set("Authorization", "Bearer "+apiKey)
+	}
+	res, err := (&http.Client{Timeout: minDuration(timeout, 3*time.Second)}).Do(req)
+	st.LatencyMS = time.Since(start).Milliseconds()
+	if err != nil {
+		st.Error = err.Error()
+		return s.withStaleProviderOK(name, st)
+	}
+	defer res.Body.Close()
+	if res.StatusCode >= 300 {
+		st.Error = fmt.Sprintf("http %d: %s", res.StatusCode, readSmallBody(res.Body))
+		return s.withStaleProviderOK(name, st)
+	}
+	st.OK = true
+	s.rememberProviderOK(name, st.LatencyMS)
+	return st
+}
+
+func minDuration(a, b time.Duration) time.Duration {
+	if a < b {
+		return a
+	}
+	return b
+}
+
 func (s *Server) checkLLM(ctx context.Context) providerStatus {
 	st := providerStatus{
 		Name:       "llm",
--- a/internal/transcription/client.go
+++ b/internal/transcription/client.go
@@ -3,6 +3,7 @@ package transcription
 import (
 	"bytes"
 	"context"
+	"encoding/base64"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -16,12 +17,47 @@ import (
 )

 type Client struct {
-	baseURL     string
+	providers   []ProviderConfig
 	http        *http.Client
 	ffmpegPath  string
 	leadSilence time.Duration
 }

+const (
+	ProviderWhisperX  = "whisperx"
+	ProviderQwenAudio = "qwen2-audio"
+	ProviderVoxtral   = "voxtral-small"
+)
+
+type Options struct {
+	Providers         []string
+	WhisperXURL       string
+	WhisperXTimeout   time.Duration
+	FfmpegPath        string
+	LeadSilence       time.Duration
+	QwenAudioBaseURL  string
+	QwenAudioAPIKey   string
+	QwenAudioModel    string
+	QwenAudioTimeout  time.Duration
+	VoxtralBaseURL    string
+	VoxtralAPIKey     string
+	VoxtralModel      string
+	VoxtralTimeout    time.Duration
+	AudioLLMPrompt    string
+	AudioLLMMaxTokens int
+}
+
+type ProviderConfig struct {
+	Name      string
+	Kind      string
+	BaseURL   string
+	APIKey    string
+	Model     string
+	Timeout   time.Duration
+	MaxTokens int
+	Prompt    string
+}
+
 type Input struct {
 	AudioURL    string `json:"audio_url"`
 	Filename    string `json:"filename,omitempty"`
@@ -39,6 +75,9 @@ type Segment struct {
 }

 type Result struct {
+	Provider     string    `json:"provider,omitempty"`
+	Model        string    `json:"model,omitempty"`
+	Attempts     []Attempt `json:"attempts,omitempty"`
 	Language     string    `json:"language"`
 	Segments     []Segment `json:"segments"`
 	DiarizeError *string   `json:"diarize_error,omitempty"`
@@ -46,6 +85,16 @@ type Result struct {
 	DurationMS   int64     `json:"duration_ms"`
 }

+type Attempt struct {
+	Provider   string    `json:"provider"`
+	Model      string    `json:"model,omitempty"`
+	Status     string    `json:"status"`
+	Error      string    `json:"error,omitempty"`
+	Text       string    `json:"text,omitempty"`
+	Segments   []Segment `json:"segments,omitempty"`
+	DurationMS int64     `json:"duration_ms,omitempty"`
+}
+
 type whisperResponse struct {
 	Language     string    `json:"language"`
 	Segments     []Segment `json:"segments"`
@@ -53,35 +102,188 @@ type whisperResponse struct {
 	AlignError   *string   `json:"align_error,omitempty"`
 }

+type audioLLMResponse struct {
+	Text  string
+	Model string
+}
+
+type audioLLMChatRequest struct {
+	Model       string                `json:"model"`
+	Messages    []audioLLMChatMessage `json:"messages"`
+	MaxTokens   int                   `json:"max_tokens,omitempty"`
+	Temperature float64               `json:"temperature"`
+}
+
+type audioLLMChatMessage struct {
+	Role    string                `json:"role"`
+	Content []audioLLMContentPart `json:"content"`
+}
+
+type audioLLMContentPart struct {
+	Type       string         `json:"type"`
+	Text       string         `json:"text,omitempty"`
+	InputAudio *audioLLMAudio `json:"input_audio,omitempty"`
+}
+
+type audioLLMAudio struct {
+	Data   string `json:"data"`
+	Format string `json:"format,omitempty"`
+}
+
+type audioLLMChatResponse struct {
+	Model   string `json:"model,omitempty"`
+	Choices []struct {
+		Message struct {
+			Content string `json:"content"`
+		} `json:"message"`
+	} `json:"choices"`
+	Error *struct {
+		Message string `json:"message"`
+	} `json:"error,omitempty"`
+}
+
 func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client {
-	baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/")
-	if baseURL == "" {
-		return nil
-	}
-	if timeout <= 0 {
-		timeout = 10 * time.Minute
+	return NewWithOptions(Options{
+		Providers:       []string{ProviderWhisperX},
+		WhisperXURL:     baseURL,
+		WhisperXTimeout: timeout,
+		FfmpegPath:      ffmpegPath,
+		LeadSilence:     leadSilence,
+	})
 }
+
+func NewWithOptions(opts Options) *Client {
+	leadSilence := opts.LeadSilence
 	if leadSilence < 0 {
 		leadSilence = 0
 	}
 	if leadSilence > 5*time.Second {
 		leadSilence = 5 * time.Second
 	}
-	ffmpegPath = strings.TrimSpace(ffmpegPath)
+	ffmpegPath := strings.TrimSpace(opts.FfmpegPath)
 	if ffmpegPath == "" {
 		ffmpegPath = "ffmpeg"
 	}
+	maxTokens := opts.AudioLLMMaxTokens
+	if maxTokens <= 0 {
+		maxTokens = 4096
+	}
+	audioLLMPrompt := strings.TrimSpace(opts.AudioLLMPrompt)
+	if audioLLMPrompt == "" {
+		audioLLMPrompt = "Transcribe the audio exactly. Return only the transcript text."
+	}
+	providers := buildProviders(opts, audioLLMPrompt, maxTokens)
+	if len(providers) == 0 {
+		return nil
+	}
 	return &Client{
-		baseURL:     baseURL,
-		http:        &http.Client{Timeout: timeout},
+		providers:   providers,
+		http:        &http.Client{Timeout: maxProviderTimeout(providers)},
 		ffmpegPath:  ffmpegPath,
 		leadSilence: leadSilence,
 	}
 }

+func buildProviders(opts Options, prompt string, maxTokens int) []ProviderConfig {
+	order := normalizeProviderOrder(opts.Providers)
+	out := make([]ProviderConfig, 0, len(order))
+	for _, name := range order {
+		switch name {
+		case ProviderWhisperX:
+			baseURL := strings.TrimRight(strings.TrimSpace(opts.WhisperXURL), "/")
+			if baseURL == "" {
+				continue
+			}
+			out = append(out, ProviderConfig{
+				Name:    ProviderWhisperX,
+				Kind:    ProviderWhisperX,
+				BaseURL: baseURL,
+				Model:   ProviderWhisperX,
+				Timeout: defaultDuration(opts.WhisperXTimeout, 10*time.Minute),
+			})
+		case ProviderQwenAudio:
+			baseURL := strings.TrimRight(strings.TrimSpace(opts.QwenAudioBaseURL), "/")
+			if baseURL == "" {
+				continue
+			}
+			model := firstNonEmpty(opts.QwenAudioModel, "Qwen/Qwen2-Audio-7B-Instruct")
+			out = append(out, ProviderConfig{
+				Name:      ProviderQwenAudio,
+				Kind:      "audio_llm",
+				BaseURL:   baseURL,
+				APIKey:    strings.TrimSpace(opts.QwenAudioAPIKey),
+				Model:     model,
+				Timeout:   defaultDuration(opts.QwenAudioTimeout, 10*time.Minute),
+				MaxTokens: maxTokens,
+				Prompt:    prompt,
+			})
+		case ProviderVoxtral:
+			baseURL := strings.TrimRight(strings.TrimSpace(opts.VoxtralBaseURL), "/")
+			if baseURL == "" {
+				continue
+			}
+			model := firstNonEmpty(opts.VoxtralModel, "mistralai/Voxtral-Small-24B-2507")
+			out = append(out, ProviderConfig{
+				Name:      ProviderVoxtral,
+				Kind:      "audio_llm",
+				BaseURL:   baseURL,
+				APIKey:    strings.TrimSpace(opts.VoxtralAPIKey),
+				Model:     model,
+				Timeout:   defaultDuration(opts.VoxtralTimeout, 10*time.Minute),
+				MaxTokens: maxTokens,
+				Prompt:    prompt,
+			})
+		}
+	}
+	return out
+}
+
+func normalizeProviderOrder(in []string) []string {
+	if len(in) == 0 {
+		return []string{ProviderWhisperX}
+	}
+	out := make([]string, 0, len(in))
+	seen := map[string]bool{}
+	for _, raw := range in {
+		name := strings.ToLower(strings.TrimSpace(raw))
+		switch name {
+		case "whisper", "whisperx":
+			name = ProviderWhisperX
+		case "qwen", "qwen-audio", "qwen2-audio", "qwen2-audio-7b-instruct":
+			name = ProviderQwenAudio
+		case "voxtral", "voxtral-small", "voxtral-small-24b-2507":
+			name = ProviderVoxtral
+		default:
+			continue
+		}
+		if !seen[name] {
+			out = append(out, name)
+			seen[name] = true
+		}
+	}
+	return out
+}
+
+func maxProviderTimeout(providers []ProviderConfig) time.Duration {
+	maxTimeout := 10 * time.Minute
+	for _, provider := range providers {
+		if provider.Timeout > maxTimeout {
+			maxTimeout = provider.Timeout
+		}
+	}
+	return maxTimeout
+}
+
+func defaultDuration(v, fallback time.Duration) time.Duration {
+	if v <= 0 {
+		return fallback
+	}
+	return v
+}
+
 func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
-	if c == nil || c.baseURL == "" {
-		return nil, fmt.Errorf("whisperx not configured")
+	if c == nil || len(c.providers) == 0 {
+		return nil, fmt.Errorf("transcription providers not configured")
 	}
 	if strings.TrimSpace(in.AudioURL) == "" {
 		return nil, fmt.Errorf("audio_url is required")
@@ -96,18 +298,91 @@ func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
 			return nil, err
 		}
 	}
-	resp, duration, err := c.transcribeAudio(ctx, audio, filename, in)
+	var attempts []Attempt
+	var winner *Result
+	var errors []string
+	for _, provider := range c.providers {
+		result, attempt, err := c.transcribeWithProvider(ctx, provider, audio, filename, in)
+		attempts = append(attempts, attempt)
 		if err != nil {
-		return nil, err
+			errors = append(errors, provider.Name+": "+err.Error())
+			continue
+		}
+		if winner == nil {
+			winner = result
+		}
+	}
+	if winner == nil {
+		return nil, fmt.Errorf("all transcription providers failed: %s", strings.Join(errors, "; "))
+	}
+	winner.Attempts = attempts
+	return winner, nil
+}
+
+func (c *Client) transcribeWithProvider(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*Result, Attempt, error) {
+	providerCtx := ctx
+	cancel := func() {}
+	if provider.Timeout > 0 {
+		providerCtx, cancel = context.WithTimeout(ctx, provider.Timeout)
+	}
+	defer cancel()
+	attempt := Attempt{
+		Provider: provider.Name,
+		Model:    provider.Model,
+		Status:   "failed",
+	}
+	switch provider.Kind {
+	case ProviderWhisperX:
+		resp, duration, err := c.transcribeAudio(providerCtx, provider, audio, filename, in)
+		attempt.DurationMS = duration.Milliseconds()
+		if err != nil {
+			attempt.Error = err.Error()
+			return nil, attempt, err
 		}
 		segments := adjustLeadSilence(resp.Segments, c.leadSilence)
+		attempt.Status = "ok"
+		attempt.Segments = segments
+		attempt.Text = segmentsText(segments)
 		return &Result{
+			Provider:     provider.Name,
+			Model:        provider.Model,
 			Language:     resp.Language,
 			Segments:     segments,
 			DiarizeError: resp.DiarizeError,
 			AlignError:   resp.AlignError,
 			DurationMS:   duration.Milliseconds(),
-	}, nil
+		}, attempt, nil
+	default:
+		resp, duration, err := c.transcribeAudioLLM(providerCtx, provider, audio, filename, in)
+		attempt.DurationMS = duration.Milliseconds()
+		if err != nil {
+			attempt.Error = err.Error()
+			return nil, attempt, err
+		}
+		text := strings.TrimSpace(resp.Text)
+		segments := []Segment{{Start: 0, End: 0, Text: text}}
+		attempt.Status = "ok"
+		attempt.Model = resp.Model
+		attempt.Text = text
+		attempt.Segments = segments
+		return &Result{
+			Provider:   provider.Name,
+			Model:      resp.Model,
+			Language:   firstNonEmpty(in.Language, "unknown"),
+			Segments:   segments,
+			DurationMS: duration.Milliseconds(),
+		}, attempt, nil
+	}
+}
+
+func segmentsText(segments []Segment) string {
+	parts := make([]string, 0, len(segments))
+	for _, segment := range segments {
+		if text := strings.TrimSpace(segment.Text); text != "" {
+			parts = append(parts, text)
+		}
+	}
+	return strings.Join(parts, "\n")
 }

 func (c *Client) downloadAudio(ctx context.Context, in Input) ([]byte, string, error) {
@@ -222,7 +497,7 @@ func clampTime(v float64) float64 {
 	return v
 }

-func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename string, in Input) (*whisperResponse, time.Duration, error) {
+func (c *Client) transcribeAudio(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*whisperResponse, time.Duration, error) {
 	body := &bytes.Buffer{}
 	mw := multipart.NewWriter(body)
 	fw, err := mw.CreateFormFile("file", filename)
@@ -250,7 +525,7 @@ func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename str
 		return nil, 0, fmt.Errorf("close form: %w", err)
 	}

-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/transcribe", body)
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, provider.BaseURL+"/transcribe", body)
 	if err != nil {
 		return nil, 0, fmt.Errorf("whisperx request: %w", err)
 	}
@@ -273,3 +548,97 @@ func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename str
 	}
 	return &out, duration, nil
 }
+
+func (c *Client) transcribeAudioLLM(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*audioLLMResponse, time.Duration, error) {
+	prompt := provider.Prompt
+	if in.Language != "" {
+		prompt += "\nЯзык аудио: " + in.Language + "."
+	}
+	if in.Diarize {
+		prompt += "\nЕсли слышны разные говорящие, разделяй реплики с короткими пометками Спикер 1/Спикер 2."
+	}
+	reqBody := audioLLMChatRequest{
+		Model:       provider.Model,
+		MaxTokens:   provider.MaxTokens,
+		Temperature: 0,
+		Messages: []audioLLMChatMessage{
+			{
+				Role: "user",
+				Content: []audioLLMContentPart{
+					{Type: "text", Text: prompt},
+					{
+						Type: "input_audio",
+						InputAudio: &audioLLMAudio{
+							Data:   base64.StdEncoding.EncodeToString(audio),
+							Format: audioFormat(filename),
+						},
+					},
+				},
+			},
+		},
+	}
+	body, err := json.Marshal(reqBody)
+	if err != nil {
+		return nil, 0, fmt.Errorf("audio llm marshal: %w", err)
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, provider.BaseURL+"/v1/chat/completions", bytes.NewReader(body))
+	if err != nil {
+		return nil, 0, fmt.Errorf("audio llm request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	if provider.APIKey != "" {
+		req.Header.Set("Authorization", "Bearer "+provider.APIKey)
+	}
+
+	start := time.Now()
+	resp, err := c.http.Do(req)
+	duration := time.Since(start)
+	if err != nil {
+		return nil, duration, fmt.Errorf("audio llm do: %w", err)
+	}
+	defer resp.Body.Close()
+	raw, err := io.ReadAll(io.LimitReader(resp.Body, 4<<20))
+	if err != nil {
+		return nil, duration, fmt.Errorf("audio llm read: %w", err)
+	}
+	if resp.StatusCode >= 300 {
+		return nil, duration, fmt.Errorf("audio llm HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(raw)))
+	}
+	var out audioLLMChatResponse
+	if err := json.Unmarshal(raw, &out); err != nil {
+		return nil, duration, fmt.Errorf("audio llm decode: %w", err)
+	}
+	if out.Error != nil {
+		return nil, duration, fmt.Errorf("audio llm error: %s", out.Error.Message)
+	}
+	if len(out.Choices) == 0 {
+		return nil, duration, fmt.Errorf("audio llm: empty choices")
+	}
+	modelName := out.Model
+	if modelName == "" {
+		modelName = provider.Model
+	}
+	return &audioLLMResponse{
+		Text:  strings.TrimSpace(out.Choices[0].Message.Content),
+		Model: modelName,
+	}, duration, nil
+}
+
+func audioFormat(filename string) string {
+	ext := strings.TrimPrefix(strings.ToLower(filepath.Ext(filename)), ".")
+	switch ext {
+	case "wav", "mp3", "flac", "m4a", "ogg", "opus", "webm":
+		return ext
+	default:
+		return "mp3"
+	}
+}
+
+func firstNonEmpty(values ...string) string {
+	for _, value := range values {
+		if strings.TrimSpace(value) != "" {
+			return value
+		}
+	}
+	return ""
+}
--- a/internal/transcription/client_test.go
+++ b/internal/transcription/client_test.go
@@ -23,6 +23,41 @@ func TestAdjustLeadSilence(t *testing.T) {
 	}
 }

+func TestNormalizeProviderOrder(t *testing.T) {
+	got := normalizeProviderOrder([]string{"whisperx", "qwen", "voxtral", "qwen2-audio"})
+	want := []string{ProviderWhisperX, ProviderQwenAudio, ProviderVoxtral}
+	if len(got) != len(want) {
+		t.Fatalf("providers = %#v, want %#v", got, want)
+	}
+	for i := range want {
+		if got[i] != want[i] {
+			t.Fatalf("providers = %#v, want %#v", got, want)
+		}
+	}
+}
+
+func TestNewWithOptionsBuildsComparisonProviders(t *testing.T) {
+	client := NewWithOptions(Options{
+		Providers:        []string{"whisperx", "qwen2-audio", "voxtral-small"},
+		WhisperXURL:      "http://whisperx",
+		QwenAudioBaseURL: "http://qwen",
+		VoxtralBaseURL:   "http://voxtral",
+	})
+	if client == nil {
+		t.Fatal("client is nil")
+	}
+	got := make([]string, 0, len(client.providers))
+	for _, provider := range client.providers {
+		got = append(got, provider.Name)
+	}
+	want := []string{ProviderWhisperX, ProviderQwenAudio, ProviderVoxtral}
+	for i := range want {
+		if got[i] != want[i] {
+			t.Fatalf("providers = %#v, want %#v", got, want)
+		}
+	}
+}
+
 func near(got, want float64) bool {
 	return math.Abs(got-want) < 0.000001
 }
--- a/internal/worker/worker.go
+++ b/internal/worker/worker.go
@@ -139,7 +139,7 @@ func (w *Worker) process(ctx context.Context, job *model.Job) {

 func (w *Worker) processTranscription(ctx context.Context, job *model.Job) {
 	if w.transcriber == nil {
-		w.fail(ctx, job, "provider_unavailable", "whisperx not configured")
+		w.fail(ctx, job, "provider_unavailable", "transcription providers not configured")
 		return
 	}
 	var input transcription.Input
@@ -186,8 +186,10 @@ func classifyTranscriptionError(err error) string {
 		return "storage_error"
 	case strings.Contains(s, "whisperx http 4") || strings.Contains(s, "ffmpeg") || strings.Contains(s, "invalid data") || strings.Contains(s, "could not decode"):
 		return "bad_audio"
-	case strings.Contains(s, "whisperx http 5") || strings.Contains(s, "whisperx do") || strings.Contains(s, "connection refused") || strings.Contains(s, "connection reset") || strings.Contains(s, "closed network connection"):
+	case strings.Contains(s, "whisperx http 5") || strings.Contains(s, "whisperx do") || strings.Contains(s, "audio llm http 5") || strings.Contains(s, "audio llm do") || strings.Contains(s, "connection refused") || strings.Contains(s, "connection reset") || strings.Contains(s, "closed network connection"):
 		return "provider_unavailable"
+	case strings.Contains(s, "audio llm http 4"):
+		return "bad_input"
 	case strings.Contains(s, "decode"):
 		return "bad_response"
 	default:
--- a/k8s/configmap.yaml
+++ b/k8s/configmap.yaml
@@ -11,9 +11,19 @@ data:
  LLM_BASE_URL: "http://10.2.3.5:8002"
  LLM_MODEL: "qwen2.5-14b"
  LLM_TIMEOUT: "5m"
+  TRANSCRIPTION_PROVIDERS: "whisperx,qwen2-audio,voxtral-small"
  WHISPERX_URL: "http://10.2.3.5:8001"
  WHISPERX_TIMEOUT: "10m"
  WHISPERX_LEAD_SILENCE: "800ms"
+  # Fill these after Qwen2-Audio and Voxtral are exposed as OpenAI-compatible
+  # chat-completions endpoints on the AI server.
+  QWEN_AUDIO_BASE_URL: ""
+  QWEN_AUDIO_MODEL: "Qwen/Qwen2-Audio-7B-Instruct"
+  QWEN_AUDIO_TIMEOUT: "10m"
+  VOXTRAL_BASE_URL: ""
+  VOXTRAL_MODEL: "mistralai/Voxtral-Small-24B-2507"
+  VOXTRAL_TIMEOUT: "10m"
+  AUDIO_LLM_MAX_TOKENS: "4096"
  FFMPEG_PATH: "/usr/bin/ffmpeg"
  AI_STATS_SIDECAR_URL: "http://10.2.3.5:9090"
  AI_STATS_TIMEOUT: "8s"