Add transcription provider comparison chain

This commit is contained in:
Grendgi
2026-06-09 12:34:08 +03:00
parent 562fad6f87
commit aaecbb1bed
9 changed files with 600 additions and 57 deletions

View File

@@ -46,6 +46,22 @@ or compact `system` / `user` fields. The completed job result contains
domain metadata fields in `input`, but the worker only reads chat fields such as domain metadata fields in `input`, but the worker only reads chat fields such as
`system`, `user`, `messages`, `max_tokens` and `response_format`. `system`, `user`, `messages`, `max_tokens` and `response_format`.
`transcription` jobs can run several transcription providers in order for
temporary A/B comparison. The main `segments` field remains compatible with
telephony and contains the first successful provider result. The full comparison
is stored in `attempts` with `provider`, `model`, `status`, `text`, `segments`,
`duration_ms` and `error`.
Recommended comparison order:
1. `whisperx`
2. `qwen2-audio` (`Qwen/Qwen2-Audio-7B-Instruct`)
3. `voxtral-small` (`mistralai/Voxtral-Small-24B-2507`)
Qwen2-Audio and Voxtral are called through an OpenAI-compatible
`/v1/chat/completions` endpoint with `input_audio`; set their endpoint URLs only
after the models are actually exposed on the AI server.
## API ## API
- `POST /api/v1/jobs` creates one job. - `POST /api/v1/jobs` creates one job.
@@ -83,7 +99,17 @@ for Kubernetes probes.
- `LLM_API_KEY`, primary LLM API key - `LLM_API_KEY`, primary LLM API key
- `LLM_MODEL`, default `qwen2.5-14b` - `LLM_MODEL`, default `qwen2.5-14b`
- `LLM_TIMEOUT`, default `5m` - `LLM_TIMEOUT`, default `5m`
- `TRANSCRIPTION_PROVIDERS`, default `whisperx`, comma-separated ordered list:
`whisperx,qwen2-audio,voxtral-small`
- `WHISPERX_URL`, WhisperX endpoint for transcription jobs - `WHISPERX_URL`, WhisperX endpoint for transcription jobs
- `QWEN_AUDIO_BASE_URL`, OpenAI-compatible endpoint for Qwen2-Audio
- `QWEN_AUDIO_MODEL`, default `Qwen/Qwen2-Audio-7B-Instruct`
- `QWEN_AUDIO_API_KEY`, optional bearer token for Qwen2-Audio
- `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral
- `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507`
- `VOXTRAL_API_KEY`, optional bearer token for Voxtral
- `AUDIO_LLM_PROMPT`, transcription instruction for audio LLM providers
- `AUDIO_LLM_MAX_TOKENS`, default `4096`
- `WORKER_ID`, default hostname - `WORKER_ID`, default hostname
- `WORKER_HTTP_HOST`, default `0.0.0.0` - `WORKER_HTTP_HOST`, default `0.0.0.0`
- `WORKER_HTTP_PORT`, default `8081` - `WORKER_HTTP_PORT`, default `8081`

View File

@@ -48,14 +48,31 @@ func main() {
} }
llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout) llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout)
transcriber := transcription.New(cfg.WhisperXURL, cfg.WhisperXTimeout, cfg.FfmpegPath, cfg.WhisperXLeadSilence) transcriber := transcription.NewWithOptions(transcription.Options{
Providers: cfg.TranscriptionProviders,
WhisperXURL: cfg.WhisperXURL,
WhisperXTimeout: cfg.WhisperXTimeout,
FfmpegPath: cfg.FfmpegPath,
LeadSilence: cfg.WhisperXLeadSilence,
QwenAudioBaseURL: cfg.QwenAudioBaseURL,
QwenAudioAPIKey: cfg.QwenAudioAPIKey,
QwenAudioModel: cfg.QwenAudioModel,
QwenAudioTimeout: cfg.QwenAudioTimeout,
VoxtralBaseURL: cfg.VoxtralBaseURL,
VoxtralAPIKey: cfg.VoxtralAPIKey,
VoxtralModel: cfg.VoxtralModel,
VoxtralTimeout: cfg.VoxtralTimeout,
AudioLLMPrompt: cfg.AudioLLMPrompt,
AudioLLMMaxTokens: cfg.AudioLLMMaxTokens,
})
w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerTaskTypes, cfg.WorkerModelProfiles, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit) w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerTaskTypes, cfg.WorkerModelProfiles, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit)
healthSrv := startHealthServer(ctx, db, cfg) healthSrv := startHealthServer(ctx, db, cfg)
slog.Info("ai_worker_started", slog.Info("ai_worker_started",
"worker_id", cfg.WorkerID, "worker_id", cfg.WorkerID,
"model", cfg.LLMModel, "model", cfg.LLMModel,
"whisperx_enabled", transcriber != nil, "transcription_enabled", transcriber != nil,
"transcription_providers", cfg.TranscriptionProviders,
"whisperx_lead_silence", cfg.WhisperXLeadSilence.String(), "whisperx_lead_silence", cfg.WhisperXLeadSilence.String(),
"task_types", cfg.WorkerTaskTypes, "task_types", cfg.WorkerTaskTypes,
"model_profiles", cfg.WorkerModelProfiles, "model_profiles", cfg.WorkerModelProfiles,
@@ -127,6 +144,7 @@ func (h workerHealth) ServeHTTP(w http.ResponseWriter, r *http.Request) {
"worker_id": h.cfg.WorkerID, "worker_id": h.cfg.WorkerID,
"task_types": h.cfg.WorkerTaskTypes, "task_types": h.cfg.WorkerTaskTypes,
"model_profiles": h.cfg.WorkerModelProfiles, "model_profiles": h.cfg.WorkerModelProfiles,
"transcription_providers": h.cfg.TranscriptionProviders,
"claim_limit": h.cfg.WorkerClaimLimit, "claim_limit": h.cfg.WorkerClaimLimit,
"poll_interval": h.cfg.WorkerPollInterval.String(), "poll_interval": h.cfg.WorkerPollInterval.String(),
"lease_timeout": h.cfg.WorkerLeaseTimeout.String(), "lease_timeout": h.cfg.WorkerLeaseTimeout.String(),

View File

@@ -18,9 +18,20 @@ type Config struct {
LLMAPIKey string LLMAPIKey string
LLMModel string LLMModel string
LLMTimeout time.Duration LLMTimeout time.Duration
TranscriptionProviders []string
WhisperXURL string WhisperXURL string
WhisperXTimeout time.Duration WhisperXTimeout time.Duration
WhisperXLeadSilence time.Duration WhisperXLeadSilence time.Duration
QwenAudioBaseURL string
QwenAudioAPIKey string
QwenAudioModel string
QwenAudioTimeout time.Duration
VoxtralBaseURL string
VoxtralAPIKey string
VoxtralModel string
VoxtralTimeout time.Duration
AudioLLMMaxTokens int
AudioLLMPrompt string
FfmpegPath string FfmpegPath string
AIStatsSidecarURL string AIStatsSidecarURL string
AIStatsTimeout time.Duration AIStatsTimeout time.Duration
@@ -47,9 +58,20 @@ func Load() Config {
LLMAPIKey: envString("LLM_API_KEY", ""), LLMAPIKey: envString("LLM_API_KEY", ""),
LLMModel: envString("LLM_MODEL", "qwen2.5-14b"), LLMModel: envString("LLM_MODEL", "qwen2.5-14b"),
LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute), LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute),
TranscriptionProviders: envCSVDefault("TRANSCRIPTION_PROVIDERS", []string{"whisperx"}),
WhisperXURL: envString("WHISPERX_URL", ""), WhisperXURL: envString("WHISPERX_URL", ""),
WhisperXTimeout: envDuration("WHISPERX_TIMEOUT", 10*time.Minute), WhisperXTimeout: envDuration("WHISPERX_TIMEOUT", 10*time.Minute),
WhisperXLeadSilence: envDuration("WHISPERX_LEAD_SILENCE", 800*time.Millisecond), WhisperXLeadSilence: envDuration("WHISPERX_LEAD_SILENCE", 800*time.Millisecond),
QwenAudioBaseURL: envString("QWEN_AUDIO_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
QwenAudioAPIKey: envString("QWEN_AUDIO_API_KEY", envString("AUDIO_LLM_API_KEY", "")),
QwenAudioModel: envString("QWEN_AUDIO_MODEL", "Qwen/Qwen2-Audio-7B-Instruct"),
QwenAudioTimeout: envDuration("QWEN_AUDIO_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
VoxtralBaseURL: envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
VoxtralAPIKey: envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", "")),
VoxtralModel: envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"),
VoxtralTimeout: envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
AudioLLMMaxTokens: envInt("AUDIO_LLM_MAX_TOKENS", 4096),
AudioLLMPrompt: envString("AUDIO_LLM_PROMPT", defaultAudioLLMPrompt()),
FfmpegPath: envString("FFMPEG_PATH", "/usr/bin/ffmpeg"), FfmpegPath: envString("FFMPEG_PATH", "/usr/bin/ffmpeg"),
AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""), AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""),
AIStatsTimeout: envDuration("AI_STATS_TIMEOUT", 8*time.Second), AIStatsTimeout: envDuration("AI_STATS_TIMEOUT", 8*time.Second),
@@ -123,6 +145,17 @@ func envCSV(key string) []string {
return out return out
} }
func envCSVDefault(key string, fallback []string) []string {
if values := envCSV(key); len(values) > 0 {
return values
}
return fallback
}
func defaultAudioLLMPrompt() string {
return "Расшифруй речь из аудио максимально точно. Сохрани русский язык, имена, телефоны, суммы и смысловые паузы. Не добавляй комментарии, анализ, Markdown или JSON. Верни только чистый текст расшифровки."
}
func hostname() string { func hostname() string {
h, err := os.Hostname() h, err := os.Hostname()
if err != nil || h == "" { if err != nil || h == "" {

View File

@@ -53,6 +53,8 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) {
Providers: []providerStatus{ Providers: []providerStatus{
s.checkLLM(ctx), s.checkLLM(ctx),
s.checkWhisperX(ctx), s.checkWhisperX(ctx),
s.checkAudioLLM(ctx, "qwen2-audio", s.cfg.QwenAudioBaseURL, s.cfg.QwenAudioAPIKey, s.cfg.QwenAudioModel, s.cfg.QwenAudioTimeout),
s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
}, },
}, },
Infra: loadInfraSnapshot(r, s.cfg), Infra: loadInfraSnapshot(r, s.cfg),

View File

@@ -43,11 +43,59 @@ func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) {
Providers: []providerStatus{ Providers: []providerStatus{
s.checkLLM(ctx), s.checkLLM(ctx),
s.checkWhisperX(ctx), s.checkWhisperX(ctx),
s.checkAudioLLM(ctx, "qwen2-audio", s.cfg.QwenAudioBaseURL, s.cfg.QwenAudioAPIKey, s.cfg.QwenAudioModel, s.cfg.QwenAudioTimeout),
s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
}, },
} }
writeJSON(w, http.StatusOK, resp) writeJSON(w, http.StatusOK, resp)
} }
func (s *Server) checkAudioLLM(ctx context.Context, name, baseURL, apiKey, model string, timeout time.Duration) providerStatus {
baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/")
st := providerStatus{
Name: name,
Configured: baseURL != "",
URL: baseURL,
Model: model,
}
if !st.Configured {
return st
}
if timeout <= 0 {
timeout = 10 * time.Minute
}
start := time.Now()
req, err := http.NewRequestWithContext(ctx, http.MethodGet, st.URL+"/v1/models", nil)
if err != nil {
st.Error = err.Error()
return st
}
if apiKey != "" {
req.Header.Set("Authorization", "Bearer "+apiKey)
}
res, err := (&http.Client{Timeout: minDuration(timeout, 3*time.Second)}).Do(req)
st.LatencyMS = time.Since(start).Milliseconds()
if err != nil {
st.Error = err.Error()
return s.withStaleProviderOK(name, st)
}
defer res.Body.Close()
if res.StatusCode >= 300 {
st.Error = fmt.Sprintf("http %d: %s", res.StatusCode, readSmallBody(res.Body))
return s.withStaleProviderOK(name, st)
}
st.OK = true
s.rememberProviderOK(name, st.LatencyMS)
return st
}
func minDuration(a, b time.Duration) time.Duration {
if a < b {
return a
}
return b
}
func (s *Server) checkLLM(ctx context.Context) providerStatus { func (s *Server) checkLLM(ctx context.Context) providerStatus {
st := providerStatus{ st := providerStatus{
Name: "llm", Name: "llm",

View File

@@ -3,6 +3,7 @@ package transcription
import ( import (
"bytes" "bytes"
"context" "context"
"encoding/base64"
"encoding/json" "encoding/json"
"fmt" "fmt"
"io" "io"
@@ -16,12 +17,47 @@ import (
) )
type Client struct { type Client struct {
baseURL string providers []ProviderConfig
http *http.Client http *http.Client
ffmpegPath string ffmpegPath string
leadSilence time.Duration leadSilence time.Duration
} }
const (
ProviderWhisperX = "whisperx"
ProviderQwenAudio = "qwen2-audio"
ProviderVoxtral = "voxtral-small"
)
type Options struct {
Providers []string
WhisperXURL string
WhisperXTimeout time.Duration
FfmpegPath string
LeadSilence time.Duration
QwenAudioBaseURL string
QwenAudioAPIKey string
QwenAudioModel string
QwenAudioTimeout time.Duration
VoxtralBaseURL string
VoxtralAPIKey string
VoxtralModel string
VoxtralTimeout time.Duration
AudioLLMPrompt string
AudioLLMMaxTokens int
}
type ProviderConfig struct {
Name string
Kind string
BaseURL string
APIKey string
Model string
Timeout time.Duration
MaxTokens int
Prompt string
}
type Input struct { type Input struct {
AudioURL string `json:"audio_url"` AudioURL string `json:"audio_url"`
Filename string `json:"filename,omitempty"` Filename string `json:"filename,omitempty"`
@@ -39,6 +75,9 @@ type Segment struct {
} }
type Result struct { type Result struct {
Provider string `json:"provider,omitempty"`
Model string `json:"model,omitempty"`
Attempts []Attempt `json:"attempts,omitempty"`
Language string `json:"language"` Language string `json:"language"`
Segments []Segment `json:"segments"` Segments []Segment `json:"segments"`
DiarizeError *string `json:"diarize_error,omitempty"` DiarizeError *string `json:"diarize_error,omitempty"`
@@ -46,6 +85,16 @@ type Result struct {
DurationMS int64 `json:"duration_ms"` DurationMS int64 `json:"duration_ms"`
} }
type Attempt struct {
Provider string `json:"provider"`
Model string `json:"model,omitempty"`
Status string `json:"status"`
Error string `json:"error,omitempty"`
Text string `json:"text,omitempty"`
Segments []Segment `json:"segments,omitempty"`
DurationMS int64 `json:"duration_ms,omitempty"`
}
type whisperResponse struct { type whisperResponse struct {
Language string `json:"language"` Language string `json:"language"`
Segments []Segment `json:"segments"` Segments []Segment `json:"segments"`
@@ -53,35 +102,188 @@ type whisperResponse struct {
AlignError *string `json:"align_error,omitempty"` AlignError *string `json:"align_error,omitempty"`
} }
type audioLLMResponse struct {
Text string
Model string
}
type audioLLMChatRequest struct {
Model string `json:"model"`
Messages []audioLLMChatMessage `json:"messages"`
MaxTokens int `json:"max_tokens,omitempty"`
Temperature float64 `json:"temperature"`
}
type audioLLMChatMessage struct {
Role string `json:"role"`
Content []audioLLMContentPart `json:"content"`
}
type audioLLMContentPart struct {
Type string `json:"type"`
Text string `json:"text,omitempty"`
InputAudio *audioLLMAudio `json:"input_audio,omitempty"`
}
type audioLLMAudio struct {
Data string `json:"data"`
Format string `json:"format,omitempty"`
}
type audioLLMChatResponse struct {
Model string `json:"model,omitempty"`
Choices []struct {
Message struct {
Content string `json:"content"`
} `json:"message"`
} `json:"choices"`
Error *struct {
Message string `json:"message"`
} `json:"error,omitempty"`
}
func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client { func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client {
baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/") return NewWithOptions(Options{
if baseURL == "" { Providers: []string{ProviderWhisperX},
return nil WhisperXURL: baseURL,
} WhisperXTimeout: timeout,
if timeout <= 0 { FfmpegPath: ffmpegPath,
timeout = 10 * time.Minute LeadSilence: leadSilence,
})
} }
func NewWithOptions(opts Options) *Client {
leadSilence := opts.LeadSilence
if leadSilence < 0 { if leadSilence < 0 {
leadSilence = 0 leadSilence = 0
} }
if leadSilence > 5*time.Second { if leadSilence > 5*time.Second {
leadSilence = 5 * time.Second leadSilence = 5 * time.Second
} }
ffmpegPath = strings.TrimSpace(ffmpegPath) ffmpegPath := strings.TrimSpace(opts.FfmpegPath)
if ffmpegPath == "" { if ffmpegPath == "" {
ffmpegPath = "ffmpeg" ffmpegPath = "ffmpeg"
} }
maxTokens := opts.AudioLLMMaxTokens
if maxTokens <= 0 {
maxTokens = 4096
}
audioLLMPrompt := strings.TrimSpace(opts.AudioLLMPrompt)
if audioLLMPrompt == "" {
audioLLMPrompt = "Transcribe the audio exactly. Return only the transcript text."
}
providers := buildProviders(opts, audioLLMPrompt, maxTokens)
if len(providers) == 0 {
return nil
}
return &Client{ return &Client{
baseURL: baseURL, providers: providers,
http: &http.Client{Timeout: timeout}, http: &http.Client{Timeout: maxProviderTimeout(providers)},
ffmpegPath: ffmpegPath, ffmpegPath: ffmpegPath,
leadSilence: leadSilence, leadSilence: leadSilence,
} }
} }
func buildProviders(opts Options, prompt string, maxTokens int) []ProviderConfig {
order := normalizeProviderOrder(opts.Providers)
out := make([]ProviderConfig, 0, len(order))
for _, name := range order {
switch name {
case ProviderWhisperX:
baseURL := strings.TrimRight(strings.TrimSpace(opts.WhisperXURL), "/")
if baseURL == "" {
continue
}
out = append(out, ProviderConfig{
Name: ProviderWhisperX,
Kind: ProviderWhisperX,
BaseURL: baseURL,
Model: ProviderWhisperX,
Timeout: defaultDuration(opts.WhisperXTimeout, 10*time.Minute),
})
case ProviderQwenAudio:
baseURL := strings.TrimRight(strings.TrimSpace(opts.QwenAudioBaseURL), "/")
if baseURL == "" {
continue
}
model := firstNonEmpty(opts.QwenAudioModel, "Qwen/Qwen2-Audio-7B-Instruct")
out = append(out, ProviderConfig{
Name: ProviderQwenAudio,
Kind: "audio_llm",
BaseURL: baseURL,
APIKey: strings.TrimSpace(opts.QwenAudioAPIKey),
Model: model,
Timeout: defaultDuration(opts.QwenAudioTimeout, 10*time.Minute),
MaxTokens: maxTokens,
Prompt: prompt,
})
case ProviderVoxtral:
baseURL := strings.TrimRight(strings.TrimSpace(opts.VoxtralBaseURL), "/")
if baseURL == "" {
continue
}
model := firstNonEmpty(opts.VoxtralModel, "mistralai/Voxtral-Small-24B-2507")
out = append(out, ProviderConfig{
Name: ProviderVoxtral,
Kind: "audio_llm",
BaseURL: baseURL,
APIKey: strings.TrimSpace(opts.VoxtralAPIKey),
Model: model,
Timeout: defaultDuration(opts.VoxtralTimeout, 10*time.Minute),
MaxTokens: maxTokens,
Prompt: prompt,
})
}
}
return out
}
func normalizeProviderOrder(in []string) []string {
if len(in) == 0 {
return []string{ProviderWhisperX}
}
out := make([]string, 0, len(in))
seen := map[string]bool{}
for _, raw := range in {
name := strings.ToLower(strings.TrimSpace(raw))
switch name {
case "whisper", "whisperx":
name = ProviderWhisperX
case "qwen", "qwen-audio", "qwen2-audio", "qwen2-audio-7b-instruct":
name = ProviderQwenAudio
case "voxtral", "voxtral-small", "voxtral-small-24b-2507":
name = ProviderVoxtral
default:
continue
}
if !seen[name] {
out = append(out, name)
seen[name] = true
}
}
return out
}
func maxProviderTimeout(providers []ProviderConfig) time.Duration {
maxTimeout := 10 * time.Minute
for _, provider := range providers {
if provider.Timeout > maxTimeout {
maxTimeout = provider.Timeout
}
}
return maxTimeout
}
func defaultDuration(v, fallback time.Duration) time.Duration {
if v <= 0 {
return fallback
}
return v
}
func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) { func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
if c == nil || c.baseURL == "" { if c == nil || len(c.providers) == 0 {
return nil, fmt.Errorf("whisperx not configured") return nil, fmt.Errorf("transcription providers not configured")
} }
if strings.TrimSpace(in.AudioURL) == "" { if strings.TrimSpace(in.AudioURL) == "" {
return nil, fmt.Errorf("audio_url is required") return nil, fmt.Errorf("audio_url is required")
@@ -96,18 +298,91 @@ func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
return nil, err return nil, err
} }
} }
resp, duration, err := c.transcribeAudio(ctx, audio, filename, in) var attempts []Attempt
var winner *Result
var errors []string
for _, provider := range c.providers {
result, attempt, err := c.transcribeWithProvider(ctx, provider, audio, filename, in)
attempts = append(attempts, attempt)
if err != nil { if err != nil {
return nil, err errors = append(errors, provider.Name+": "+err.Error())
continue
}
if winner == nil {
winner = result
}
}
if winner == nil {
return nil, fmt.Errorf("all transcription providers failed: %s", strings.Join(errors, "; "))
}
winner.Attempts = attempts
return winner, nil
}
func (c *Client) transcribeWithProvider(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*Result, Attempt, error) {
providerCtx := ctx
cancel := func() {}
if provider.Timeout > 0 {
providerCtx, cancel = context.WithTimeout(ctx, provider.Timeout)
}
defer cancel()
attempt := Attempt{
Provider: provider.Name,
Model: provider.Model,
Status: "failed",
}
switch provider.Kind {
case ProviderWhisperX:
resp, duration, err := c.transcribeAudio(providerCtx, provider, audio, filename, in)
attempt.DurationMS = duration.Milliseconds()
if err != nil {
attempt.Error = err.Error()
return nil, attempt, err
} }
segments := adjustLeadSilence(resp.Segments, c.leadSilence) segments := adjustLeadSilence(resp.Segments, c.leadSilence)
attempt.Status = "ok"
attempt.Segments = segments
attempt.Text = segmentsText(segments)
return &Result{ return &Result{
Provider: provider.Name,
Model: provider.Model,
Language: resp.Language, Language: resp.Language,
Segments: segments, Segments: segments,
DiarizeError: resp.DiarizeError, DiarizeError: resp.DiarizeError,
AlignError: resp.AlignError, AlignError: resp.AlignError,
DurationMS: duration.Milliseconds(), DurationMS: duration.Milliseconds(),
}, nil }, attempt, nil
default:
resp, duration, err := c.transcribeAudioLLM(providerCtx, provider, audio, filename, in)
attempt.DurationMS = duration.Milliseconds()
if err != nil {
attempt.Error = err.Error()
return nil, attempt, err
}
text := strings.TrimSpace(resp.Text)
segments := []Segment{{Start: 0, End: 0, Text: text}}
attempt.Status = "ok"
attempt.Model = resp.Model
attempt.Text = text
attempt.Segments = segments
return &Result{
Provider: provider.Name,
Model: resp.Model,
Language: firstNonEmpty(in.Language, "unknown"),
Segments: segments,
DurationMS: duration.Milliseconds(),
}, attempt, nil
}
}
func segmentsText(segments []Segment) string {
parts := make([]string, 0, len(segments))
for _, segment := range segments {
if text := strings.TrimSpace(segment.Text); text != "" {
parts = append(parts, text)
}
}
return strings.Join(parts, "\n")
} }
func (c *Client) downloadAudio(ctx context.Context, in Input) ([]byte, string, error) { func (c *Client) downloadAudio(ctx context.Context, in Input) ([]byte, string, error) {
@@ -222,7 +497,7 @@ func clampTime(v float64) float64 {
return v return v
} }
func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename string, in Input) (*whisperResponse, time.Duration, error) { func (c *Client) transcribeAudio(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*whisperResponse, time.Duration, error) {
body := &bytes.Buffer{} body := &bytes.Buffer{}
mw := multipart.NewWriter(body) mw := multipart.NewWriter(body)
fw, err := mw.CreateFormFile("file", filename) fw, err := mw.CreateFormFile("file", filename)
@@ -250,7 +525,7 @@ func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename str
return nil, 0, fmt.Errorf("close form: %w", err) return nil, 0, fmt.Errorf("close form: %w", err)
} }
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/transcribe", body) req, err := http.NewRequestWithContext(ctx, http.MethodPost, provider.BaseURL+"/transcribe", body)
if err != nil { if err != nil {
return nil, 0, fmt.Errorf("whisperx request: %w", err) return nil, 0, fmt.Errorf("whisperx request: %w", err)
} }
@@ -273,3 +548,97 @@ func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename str
} }
return &out, duration, nil return &out, duration, nil
} }
func (c *Client) transcribeAudioLLM(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*audioLLMResponse, time.Duration, error) {
prompt := provider.Prompt
if in.Language != "" {
prompt += "\nЯзык аудио: " + in.Language + "."
}
if in.Diarize {
prompt += "\nЕсли слышны разные говорящие, разделяй реплики с короткими пометками Спикер 1/Спикер 2."
}
reqBody := audioLLMChatRequest{
Model: provider.Model,
MaxTokens: provider.MaxTokens,
Temperature: 0,
Messages: []audioLLMChatMessage{
{
Role: "user",
Content: []audioLLMContentPart{
{Type: "text", Text: prompt},
{
Type: "input_audio",
InputAudio: &audioLLMAudio{
Data: base64.StdEncoding.EncodeToString(audio),
Format: audioFormat(filename),
},
},
},
},
},
}
body, err := json.Marshal(reqBody)
if err != nil {
return nil, 0, fmt.Errorf("audio llm marshal: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, provider.BaseURL+"/v1/chat/completions", bytes.NewReader(body))
if err != nil {
return nil, 0, fmt.Errorf("audio llm request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
if provider.APIKey != "" {
req.Header.Set("Authorization", "Bearer "+provider.APIKey)
}
start := time.Now()
resp, err := c.http.Do(req)
duration := time.Since(start)
if err != nil {
return nil, duration, fmt.Errorf("audio llm do: %w", err)
}
defer resp.Body.Close()
raw, err := io.ReadAll(io.LimitReader(resp.Body, 4<<20))
if err != nil {
return nil, duration, fmt.Errorf("audio llm read: %w", err)
}
if resp.StatusCode >= 300 {
return nil, duration, fmt.Errorf("audio llm HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(raw)))
}
var out audioLLMChatResponse
if err := json.Unmarshal(raw, &out); err != nil {
return nil, duration, fmt.Errorf("audio llm decode: %w", err)
}
if out.Error != nil {
return nil, duration, fmt.Errorf("audio llm error: %s", out.Error.Message)
}
if len(out.Choices) == 0 {
return nil, duration, fmt.Errorf("audio llm: empty choices")
}
modelName := out.Model
if modelName == "" {
modelName = provider.Model
}
return &audioLLMResponse{
Text: strings.TrimSpace(out.Choices[0].Message.Content),
Model: modelName,
}, duration, nil
}
func audioFormat(filename string) string {
ext := strings.TrimPrefix(strings.ToLower(filepath.Ext(filename)), ".")
switch ext {
case "wav", "mp3", "flac", "m4a", "ogg", "opus", "webm":
return ext
default:
return "mp3"
}
}
func firstNonEmpty(values ...string) string {
for _, value := range values {
if strings.TrimSpace(value) != "" {
return value
}
}
return ""
}

View File

@@ -23,6 +23,41 @@ func TestAdjustLeadSilence(t *testing.T) {
} }
} }
func TestNormalizeProviderOrder(t *testing.T) {
got := normalizeProviderOrder([]string{"whisperx", "qwen", "voxtral", "qwen2-audio"})
want := []string{ProviderWhisperX, ProviderQwenAudio, ProviderVoxtral}
if len(got) != len(want) {
t.Fatalf("providers = %#v, want %#v", got, want)
}
for i := range want {
if got[i] != want[i] {
t.Fatalf("providers = %#v, want %#v", got, want)
}
}
}
func TestNewWithOptionsBuildsComparisonProviders(t *testing.T) {
client := NewWithOptions(Options{
Providers: []string{"whisperx", "qwen2-audio", "voxtral-small"},
WhisperXURL: "http://whisperx",
QwenAudioBaseURL: "http://qwen",
VoxtralBaseURL: "http://voxtral",
})
if client == nil {
t.Fatal("client is nil")
}
got := make([]string, 0, len(client.providers))
for _, provider := range client.providers {
got = append(got, provider.Name)
}
want := []string{ProviderWhisperX, ProviderQwenAudio, ProviderVoxtral}
for i := range want {
if got[i] != want[i] {
t.Fatalf("providers = %#v, want %#v", got, want)
}
}
}
func near(got, want float64) bool { func near(got, want float64) bool {
return math.Abs(got-want) < 0.000001 return math.Abs(got-want) < 0.000001
} }

View File

@@ -139,7 +139,7 @@ func (w *Worker) process(ctx context.Context, job *model.Job) {
func (w *Worker) processTranscription(ctx context.Context, job *model.Job) { func (w *Worker) processTranscription(ctx context.Context, job *model.Job) {
if w.transcriber == nil { if w.transcriber == nil {
w.fail(ctx, job, "provider_unavailable", "whisperx not configured") w.fail(ctx, job, "provider_unavailable", "transcription providers not configured")
return return
} }
var input transcription.Input var input transcription.Input
@@ -186,8 +186,10 @@ func classifyTranscriptionError(err error) string {
return "storage_error" return "storage_error"
case strings.Contains(s, "whisperx http 4") || strings.Contains(s, "ffmpeg") || strings.Contains(s, "invalid data") || strings.Contains(s, "could not decode"): case strings.Contains(s, "whisperx http 4") || strings.Contains(s, "ffmpeg") || strings.Contains(s, "invalid data") || strings.Contains(s, "could not decode"):
return "bad_audio" return "bad_audio"
case strings.Contains(s, "whisperx http 5") || strings.Contains(s, "whisperx do") || strings.Contains(s, "connection refused") || strings.Contains(s, "connection reset") || strings.Contains(s, "closed network connection"): case strings.Contains(s, "whisperx http 5") || strings.Contains(s, "whisperx do") || strings.Contains(s, "audio llm http 5") || strings.Contains(s, "audio llm do") || strings.Contains(s, "connection refused") || strings.Contains(s, "connection reset") || strings.Contains(s, "closed network connection"):
return "provider_unavailable" return "provider_unavailable"
case strings.Contains(s, "audio llm http 4"):
return "bad_input"
case strings.Contains(s, "decode"): case strings.Contains(s, "decode"):
return "bad_response" return "bad_response"
default: default:

View File

@@ -11,9 +11,19 @@ data:
LLM_BASE_URL: "http://10.2.3.5:8002" LLM_BASE_URL: "http://10.2.3.5:8002"
LLM_MODEL: "qwen2.5-14b" LLM_MODEL: "qwen2.5-14b"
LLM_TIMEOUT: "5m" LLM_TIMEOUT: "5m"
TRANSCRIPTION_PROVIDERS: "whisperx,qwen2-audio,voxtral-small"
WHISPERX_URL: "http://10.2.3.5:8001" WHISPERX_URL: "http://10.2.3.5:8001"
WHISPERX_TIMEOUT: "10m" WHISPERX_TIMEOUT: "10m"
WHISPERX_LEAD_SILENCE: "800ms" WHISPERX_LEAD_SILENCE: "800ms"
# Fill these after Qwen2-Audio and Voxtral are exposed as OpenAI-compatible
# chat-completions endpoints on the AI server.
QWEN_AUDIO_BASE_URL: ""
QWEN_AUDIO_MODEL: "Qwen/Qwen2-Audio-7B-Instruct"
QWEN_AUDIO_TIMEOUT: "10m"
VOXTRAL_BASE_URL: ""
VOXTRAL_MODEL: "mistralai/Voxtral-Small-24B-2507"
VOXTRAL_TIMEOUT: "10m"
AUDIO_LLM_MAX_TOKENS: "4096"
FFMPEG_PATH: "/usr/bin/ffmpeg" FFMPEG_PATH: "/usr/bin/ffmpeg"
AI_STATS_SIDECAR_URL: "http://10.2.3.5:9090" AI_STATS_SIDECAR_URL: "http://10.2.3.5:9090"
AI_STATS_TIMEOUT: "8s" AI_STATS_TIMEOUT: "8s"