Compare commits

..

7 Commits

Author SHA1 Message Date
Grendgi
76ac9b8896 Add audio model API keys
All checks were successful
CI / test (push) Successful in 14s
Build and Deploy / build-and-deploy (push) Successful in 25s
2026-06-09 13:28:53 +03:00
Grendgi
c31dcb891c Enable Qwen audio endpoint 2026-06-09 13:23:21 +03:00
Grendgi
ee6e948d2e Add single WhisperX load balancer config 2026-06-09 13:20:17 +03:00
Grendgi
e132634c65 Isolate audio model compose network 2026-06-09 13:17:05 +03:00
Grendgi
cac8d89e64 Tune audio model GPU profiles 2026-06-09 12:52:13 +03:00
Grendgi
f49ba7abd5 Add AI server audio model profiles 2026-06-09 12:50:56 +03:00
Grendgi
aaecbb1bed Add transcription provider comparison chain 2026-06-09 12:34:08 +03:00
12 changed files with 754 additions and 57 deletions

View File

@@ -46,6 +46,34 @@ or compact `system` / `user` fields. The completed job result contains
domain metadata fields in `input`, but the worker only reads chat fields such as
`system`, `user`, `messages`, `max_tokens` and `response_format`.
`transcription` jobs can run several transcription providers in order for
temporary A/B comparison. The main `segments` field remains compatible with
telephony and contains the first successful provider result. The full comparison
is stored in `attempts` with `provider`, `model`, `status`, `text`, `segments`,
`duration_ms` and `error`.
Recommended comparison order:
1. `whisperx`
2. `qwen2-audio` (`Qwen/Qwen2-Audio-7B-Instruct`)
3. `voxtral-small` (`mistralai/Voxtral-Small-24B-2507`)
Qwen2-Audio and Voxtral are called through an OpenAI-compatible
`/v1/chat/completions` endpoint with `input_audio`; set their endpoint URLs only
after the models are actually exposed on the AI server.
AI-server compose snippets for these temporary comparison endpoints live in
`deploy/ai-server/docker-compose.audio.yml`. They are profile-gated because the
single GPU cannot keep the production text vLLM, two WhisperX instances,
Qwen2-Audio and Voxtral loaded at the same time:
- Qwen2-Audio endpoint: `http://10.2.3.5:8003`
- Voxtral endpoint: `http://10.2.3.5:8004`
- Start Qwen only:
`docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile qwen-audio up -d qwen-audio`
- Start Voxtral only:
`docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small`
## API
- `POST /api/v1/jobs` creates one job.
@@ -83,7 +111,19 @@ for Kubernetes probes.
- `LLM_API_KEY`, primary LLM API key
- `LLM_MODEL`, default `qwen2.5-14b`
- `LLM_TIMEOUT`, default `5m`
- `TRANSCRIPTION_PROVIDERS`, default `whisperx`, comma-separated ordered list:
`whisperx,qwen2-audio,voxtral-small`
- `WHISPERX_URL`, WhisperX endpoint for transcription jobs
- `QWEN_AUDIO_BASE_URL`, OpenAI-compatible endpoint for Qwen2-Audio; falls back to
`AUDIO_LLM_BASE_URL`
- `QWEN_AUDIO_MODEL`, default `Qwen/Qwen2-Audio-7B-Instruct`
- `QWEN_AUDIO_API_KEY`, optional bearer token for Qwen2-Audio; falls back to
`AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
- `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral; falls back to
`AUDIO_LLM_BASE_URL`
- `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507`
- `VOXTRAL_API_KEY`, optional bearer token for Voxtral; falls back to
`AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
- `AUDIO_LLM_PROMPT`, transcription instruction for audio LLM providers
- `AUDIO_LLM_MAX_TOKENS`, default `4096`
- `WORKER_ID`, default hostname
- `WORKER_HTTP_HOST`, default `0.0.0.0`
- `WORKER_HTTP_PORT`, default `8081`

View File

@@ -48,14 +48,31 @@ func main() {
}
llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout)
transcriber := transcription.New(cfg.WhisperXURL, cfg.WhisperXTimeout, cfg.FfmpegPath, cfg.WhisperXLeadSilence)
transcriber := transcription.NewWithOptions(transcription.Options{
Providers: cfg.TranscriptionProviders,
WhisperXURL: cfg.WhisperXURL,
WhisperXTimeout: cfg.WhisperXTimeout,
FfmpegPath: cfg.FfmpegPath,
LeadSilence: cfg.WhisperXLeadSilence,
QwenAudioBaseURL: cfg.QwenAudioBaseURL,
QwenAudioAPIKey: cfg.QwenAudioAPIKey,
QwenAudioModel: cfg.QwenAudioModel,
QwenAudioTimeout: cfg.QwenAudioTimeout,
VoxtralBaseURL: cfg.VoxtralBaseURL,
VoxtralAPIKey: cfg.VoxtralAPIKey,
VoxtralModel: cfg.VoxtralModel,
VoxtralTimeout: cfg.VoxtralTimeout,
AudioLLMPrompt: cfg.AudioLLMPrompt,
AudioLLMMaxTokens: cfg.AudioLLMMaxTokens,
})
w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerTaskTypes, cfg.WorkerModelProfiles, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit)
healthSrv := startHealthServer(ctx, db, cfg)
slog.Info("ai_worker_started",
"worker_id", cfg.WorkerID,
"model", cfg.LLMModel,
"whisperx_enabled", transcriber != nil,
"transcription_enabled", transcriber != nil,
"transcription_providers", cfg.TranscriptionProviders,
"whisperx_lead_silence", cfg.WhisperXLeadSilence.String(),
"task_types", cfg.WorkerTaskTypes,
"model_profiles", cfg.WorkerModelProfiles,
@@ -127,6 +144,7 @@ func (h workerHealth) ServeHTTP(w http.ResponseWriter, r *http.Request) {
"worker_id": h.cfg.WorkerID,
"task_types": h.cfg.WorkerTaskTypes,
"model_profiles": h.cfg.WorkerModelProfiles,
"transcription_providers": h.cfg.TranscriptionProviders,
"claim_limit": h.cfg.WorkerClaimLimit,
"poll_interval": h.cfg.WorkerPollInterval.String(),
"lease_timeout": h.cfg.WorkerLeaseTimeout.String(),

View File

@@ -0,0 +1,118 @@
services:
qwen-audio:
image: vllm/vllm-openai:latest
container_name: qwen-audio
profiles:
- qwen-audio
- audio-compare
restart: unless-stopped
ipc: host
runtime: nvidia
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
environment:
HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
VLLM_API_KEY: ${VLLM_API_KEY}
HF_HOME: /cache
volumes:
- ./data/vllm-cache:/cache
networks:
- audio-models
ports:
- "10.2.3.5:8003:8000"
command:
- "--model"
- "Qwen/Qwen2-Audio-7B-Instruct"
- "--served-model-name"
- "Qwen/Qwen2-Audio-7B-Instruct"
- "--trust-remote-code"
- "--host"
- "0.0.0.0"
- "--port"
- "8000"
- "--max-model-len"
- "8192"
- "--gpu-memory-utilization"
- "0.25"
- "--api-key"
- "${VLLM_API_KEY}"
- "--max-num-seqs"
- "4"
- "--max-num-batched-tokens"
- "4096"
healthcheck:
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 30s
timeout: 5s
retries: 5
start_period: 900s
voxtral-small:
image: vllm/vllm-openai:latest
container_name: voxtral-small
profiles:
- voxtral-small
- audio-compare
restart: unless-stopped
ipc: host
runtime: nvidia
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
environment:
HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
VLLM_API_KEY: ${VLLM_API_KEY}
HF_HOME: /cache
volumes:
- ./data/vllm-cache:/cache
networks:
- audio-models
ports:
- "10.2.3.5:8004:8000"
command:
- "--model"
- "mistralai/Voxtral-Small-24B-2507"
- "--served-model-name"
- "mistralai/Voxtral-Small-24B-2507"
- "--tokenizer-mode"
- "mistral"
- "--config-format"
- "mistral"
- "--load-format"
- "mistral"
- "--tool-call-parser"
- "mistral"
- "--enable-auto-tool-choice"
- "--host"
- "0.0.0.0"
- "--port"
- "8000"
- "--max-model-len"
- "32768"
- "--gpu-memory-utilization"
- "0.62"
- "--api-key"
- "${VLLM_API_KEY}"
- "--max-num-seqs"
- "2"
- "--max-num-batched-tokens"
- "8192"
healthcheck:
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 30s
timeout: 5s
retries: 5
start_period: 1200s
networks:
audio-models:
driver: bridge

View File

@@ -0,0 +1,20 @@
upstream whisperx_upstream {
server whisperx-1:8000 max_fails=3 fail_timeout=30s;
}
server {
listen 80 default_server;
client_max_body_size 200m;
location / {
proxy_pass http://whisperx_upstream;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_connect_timeout 30s;
proxy_send_timeout 10m;
proxy_read_timeout 10m;
proxy_request_buffering off;
proxy_buffering off;
}
}

View File

@@ -18,9 +18,20 @@ type Config struct {
LLMAPIKey string
LLMModel string
LLMTimeout time.Duration
TranscriptionProviders []string
WhisperXURL string
WhisperXTimeout time.Duration
WhisperXLeadSilence time.Duration
QwenAudioBaseURL string
QwenAudioAPIKey string
QwenAudioModel string
QwenAudioTimeout time.Duration
VoxtralBaseURL string
VoxtralAPIKey string
VoxtralModel string
VoxtralTimeout time.Duration
AudioLLMMaxTokens int
AudioLLMPrompt string
FfmpegPath string
AIStatsSidecarURL string
AIStatsTimeout time.Duration
@@ -47,9 +58,20 @@ func Load() Config {
LLMAPIKey: envString("LLM_API_KEY", ""),
LLMModel: envString("LLM_MODEL", "qwen2.5-14b"),
LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute),
TranscriptionProviders: envCSVDefault("TRANSCRIPTION_PROVIDERS", []string{"whisperx"}),
WhisperXURL: envString("WHISPERX_URL", ""),
WhisperXTimeout: envDuration("WHISPERX_TIMEOUT", 10*time.Minute),
WhisperXLeadSilence: envDuration("WHISPERX_LEAD_SILENCE", 800*time.Millisecond),
QwenAudioBaseURL: envString("QWEN_AUDIO_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
QwenAudioAPIKey: envString("QWEN_AUDIO_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
QwenAudioModel: envString("QWEN_AUDIO_MODEL", "Qwen/Qwen2-Audio-7B-Instruct"),
QwenAudioTimeout: envDuration("QWEN_AUDIO_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
VoxtralBaseURL: envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
VoxtralAPIKey: envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
VoxtralModel: envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"),
VoxtralTimeout: envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
AudioLLMMaxTokens: envInt("AUDIO_LLM_MAX_TOKENS", 4096),
AudioLLMPrompt: envString("AUDIO_LLM_PROMPT", defaultAudioLLMPrompt()),
FfmpegPath: envString("FFMPEG_PATH", "/usr/bin/ffmpeg"),
AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""),
AIStatsTimeout: envDuration("AI_STATS_TIMEOUT", 8*time.Second),
@@ -123,6 +145,17 @@ func envCSV(key string) []string {
return out
}
// envCSVDefault returns the list that envCSV yields for key, or fallback when
// that list is empty.
func envCSVDefault(key string, fallback []string) []string {
	parsed := envCSV(key)
	if len(parsed) == 0 {
		return fallback
	}
	return parsed
}
// defaultAudioLLMPrompt returns the built-in Russian-language transcription
// instruction that serves as the fallback value for AUDIO_LLM_PROMPT.
func defaultAudioLLMPrompt() string {
	const prompt = "Расшифруй речь из аудио максимально точно. Сохрани русский язык, имена, телефоны, суммы и смысловые паузы. Не добавляй комментарии, анализ, Markdown или JSON. Верни только чистый текст расшифровки."
	return prompt
}
func hostname() string {
h, err := os.Hostname()
if err != nil || h == "" {

View File

@@ -53,6 +53,8 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) {
Providers: []providerStatus{
s.checkLLM(ctx),
s.checkWhisperX(ctx),
s.checkAudioLLM(ctx, "qwen2-audio", s.cfg.QwenAudioBaseURL, s.cfg.QwenAudioAPIKey, s.cfg.QwenAudioModel, s.cfg.QwenAudioTimeout),
s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
},
},
Infra: loadInfraSnapshot(r, s.cfg),

View File

@@ -43,11 +43,59 @@ func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) {
Providers: []providerStatus{
s.checkLLM(ctx),
s.checkWhisperX(ctx),
s.checkAudioLLM(ctx, "qwen2-audio", s.cfg.QwenAudioBaseURL, s.cfg.QwenAudioAPIKey, s.cfg.QwenAudioModel, s.cfg.QwenAudioTimeout),
s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
},
}
writeJSON(w, http.StatusOK, resp)
}
// checkAudioLLM probes an OpenAI-compatible audio LLM endpoint by requesting
// GET <baseURL>/v1/models and reports the outcome as a providerStatus.
//
// When baseURL is empty after trimming whitespace and trailing slashes, the
// status is returned with Configured=false and no request is made. A non-empty
// apiKey is sent as a bearer token. The HTTP client timeout is the smaller of
// timeout and 3 seconds; a non-positive timeout is replaced with 10 minutes
// before that cap is applied. Transport errors and responses with a status of
// 300 or above are passed through withStaleProviderOK; a successful probe is
// recorded with rememberProviderOK.
func (s *Server) checkAudioLLM(ctx context.Context, name, baseURL, apiKey, model string, timeout time.Duration) providerStatus {
	endpoint := strings.TrimRight(strings.TrimSpace(baseURL), "/")
	st := providerStatus{
		Name:       name,
		Configured: endpoint != "",
		URL:        endpoint,
		Model:      model,
	}
	if endpoint == "" {
		return st
	}

	effective := timeout
	if effective <= 0 {
		effective = 10 * time.Minute
	}

	// The latency clock starts before the request is built, as the result
	// reported in LatencyMS covers everything up to the end of the round trip.
	began := time.Now()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint+"/v1/models", nil)
	if err != nil {
		st.Error = err.Error()
		return st
	}
	if apiKey != "" {
		req.Header.Set("Authorization", "Bearer "+apiKey)
	}

	probe := &http.Client{Timeout: minDuration(effective, 3*time.Second)}
	res, err := probe.Do(req)
	st.LatencyMS = time.Since(began).Milliseconds()
	if err != nil {
		st.Error = err.Error()
		return s.withStaleProviderOK(name, st)
	}
	defer res.Body.Close()

	if res.StatusCode < 300 {
		st.OK = true
		s.rememberProviderOK(name, st.LatencyMS)
		return st
	}
	st.Error = fmt.Sprintf("http %d: %s", res.StatusCode, readSmallBody(res.Body))
	return s.withStaleProviderOK(name, st)
}
// minDuration returns the smaller of the two durations; when they are equal
// that common value is returned.
func minDuration(a, b time.Duration) time.Duration {
	if b <= a {
		return b
	}
	return a
}
func (s *Server) checkLLM(ctx context.Context) providerStatus {
st := providerStatus{
Name: "llm",

View File

@@ -3,6 +3,7 @@ package transcription
import (
"bytes"
"context"
"encoding/base64"
"encoding/json"
"fmt"
"io"
@@ -16,12 +17,47 @@ import (
)
type Client struct {
baseURL string
providers []ProviderConfig
http *http.Client
ffmpegPath string
leadSilence time.Duration
}
// Canonical transcription provider names. normalizeProviderOrder maps the
// accepted aliases onto these values and buildProviders switches on them.
const (
// ProviderWhisperX is the canonical name of the WhisperX provider.
ProviderWhisperX = "whisperx"
// ProviderQwenAudio is the canonical name of the Qwen2-Audio provider.
ProviderQwenAudio = "qwen2-audio"
// ProviderVoxtral is the canonical name of the Voxtral Small provider.
ProviderVoxtral = "voxtral-small"
)
// Options collects the settings NewWithOptions uses to assemble a Client.
type Options struct {
// Providers is the ordered list of provider names or aliases to try.
Providers []string
// WhisperXURL is the WhisperX base URL; the provider is skipped when empty.
WhisperXURL string
// WhisperXTimeout is the per-request WhisperX timeout; non-positive values
// fall back to 10 minutes.
WhisperXTimeout time.Duration
// FfmpegPath is the ffmpeg executable; NewWithOptions uses "ffmpeg" when it
// is blank.
FfmpegPath string
// LeadSilence is clamped by NewWithOptions to the range 0 to 5 seconds.
LeadSilence time.Duration
// QwenAudioBaseURL is the Qwen2-Audio base URL; the provider is skipped when
// empty.
QwenAudioBaseURL string
// QwenAudioAPIKey, when non-empty, is sent as a bearer token.
QwenAudioAPIKey string
// QwenAudioModel defaults to "Qwen/Qwen2-Audio-7B-Instruct" when blank.
QwenAudioModel string
// QwenAudioTimeout falls back to 10 minutes when non-positive.
QwenAudioTimeout time.Duration
// VoxtralBaseURL is the Voxtral base URL; the provider is skipped when empty.
VoxtralBaseURL string
// VoxtralAPIKey, when non-empty, is sent as a bearer token.
VoxtralAPIKey string
// VoxtralModel defaults to "mistralai/Voxtral-Small-24B-2507" when blank.
VoxtralModel string
// VoxtralTimeout falls back to 10 minutes when non-positive.
VoxtralTimeout time.Duration
// AudioLLMPrompt is the instruction sent to audio LLM providers; a built-in
// English prompt is used when it is blank.
AudioLLMPrompt string
// AudioLLMMaxTokens is the max_tokens value for audio LLM requests; values
// of zero or below are replaced with 4096.
AudioLLMMaxTokens int
}
// ProviderConfig is the resolved configuration of one transcription provider,
// as produced by buildProviders.
type ProviderConfig struct {
// Name is the canonical provider name (one of the Provider* constants).
Name string
// Kind selects the request protocol: ProviderWhisperX for the WhisperX
// /transcribe API, "audio_llm" for chat-completions based providers.
Kind string
// BaseURL is the endpoint root with surrounding whitespace and trailing
// slashes removed.
BaseURL string
// APIKey, when non-empty, is sent as a bearer token by audio LLM requests.
APIKey string
// Model is the model identifier sent to, or reported for, the provider.
Model string
// Timeout bounds a single transcription attempt with this provider.
Timeout time.Duration
// MaxTokens is the max_tokens value for audio LLM requests; buildProviders
// leaves it zero for WhisperX.
MaxTokens int
// Prompt is the base instruction for audio LLM requests; buildProviders
// leaves it empty for WhisperX.
Prompt string
}
type Input struct {
AudioURL string `json:"audio_url"`
Filename string `json:"filename,omitempty"`
@@ -39,6 +75,9 @@ type Segment struct {
}
type Result struct {
Provider string `json:"provider,omitempty"`
Model string `json:"model,omitempty"`
Attempts []Attempt `json:"attempts,omitempty"`
Language string `json:"language"`
Segments []Segment `json:"segments"`
DiarizeError *string `json:"diarize_error,omitempty"`
@@ -46,6 +85,16 @@ type Result struct {
DurationMS int64 `json:"duration_ms"`
}
// Attempt records the outcome of running one provider for a transcription job.
type Attempt struct {
// Provider is the canonical name of the provider that was tried.
Provider string `json:"provider"`
// Model is the configured model, or the model reported by an audio LLM
// response when the attempt succeeded.
Model string `json:"model,omitempty"`
// Status is "ok" on success and "failed" otherwise.
Status string `json:"status"`
// Error holds the error text of a failed attempt.
Error string `json:"error,omitempty"`
// Text is the transcript as plain text.
Text string `json:"text,omitempty"`
// Segments holds the transcript segments returned for this attempt.
Segments []Segment `json:"segments,omitempty"`
// DurationMS is the measured request duration in milliseconds.
DurationMS int64 `json:"duration_ms,omitempty"`
}
type whisperResponse struct {
Language string `json:"language"`
Segments []Segment `json:"segments"`
@@ -53,35 +102,188 @@ type whisperResponse struct {
AlignError *string `json:"align_error,omitempty"`
}
// audioLLMResponse is the internal result of a successful audio LLM call.
type audioLLMResponse struct {
// Text is the trimmed content of the first returned choice.
Text string
// Model is the model named in the response, or the configured model when the
// response does not name one.
Model string
}
// audioLLMChatRequest is the JSON body posted to /v1/chat/completions.
type audioLLMChatRequest struct {
Model string `json:"model"`
Messages []audioLLMChatMessage `json:"messages"`
// MaxTokens is omitted from the JSON when zero.
MaxTokens int `json:"max_tokens,omitempty"`
// Temperature has no omitempty, so a value of 0 is still serialized.
Temperature float64 `json:"temperature"`
}
// audioLLMChatMessage is one chat message whose content is a list of typed
// parts (text and/or audio).
type audioLLMChatMessage struct {
Role string `json:"role"`
Content []audioLLMContentPart `json:"content"`
}
// audioLLMContentPart is a single element of a chat message's content list.
type audioLLMContentPart struct {
// Type is the part discriminator; this package sends "text" and "input_audio".
Type string `json:"type"`
// Text carries the prompt for "text" parts.
Text string `json:"text,omitempty"`
// InputAudio carries the encoded audio for "input_audio" parts.
InputAudio *audioLLMAudio `json:"input_audio,omitempty"`
}
// audioLLMAudio is the payload of an "input_audio" content part.
type audioLLMAudio struct {
// Data is the audio encoded with standard base64.
Data string `json:"data"`
// Format is the audio format name derived from the file extension.
Format string `json:"format,omitempty"`
}
// audioLLMChatResponse is the subset of the chat-completions response that
// this package decodes.
type audioLLMChatResponse struct {
Model string `json:"model,omitempty"`
// Choices holds the returned completions; only the first one is used.
Choices []struct {
Message struct {
Content string `json:"content"`
} `json:"message"`
} `json:"choices"`
// Error, when present, is reported as a failure by the caller.
Error *struct {
Message string `json:"message"`
} `json:"error,omitempty"`
}
// New builds a Client that uses WhisperX as its only transcription provider.
// It returns nil when baseURL is empty after trimming whitespace and trailing
// slashes. A non-positive timeout is replaced with 10 minutes. The remaining
// arguments are handed to NewWithOptions unchanged.
func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client {
	endpoint := strings.TrimRight(strings.TrimSpace(baseURL), "/")
	if endpoint == "" {
		return nil
	}

	opts := Options{
		Providers:       []string{ProviderWhisperX},
		WhisperXURL:     endpoint,
		WhisperXTimeout: timeout,
		FfmpegPath:      ffmpegPath,
		LeadSilence:     leadSilence,
	}
	if opts.WhisperXTimeout <= 0 {
		opts.WhisperXTimeout = 10 * time.Minute
	}
	return NewWithOptions(opts)
}
func NewWithOptions(opts Options) *Client {
leadSilence := opts.LeadSilence
if leadSilence < 0 {
leadSilence = 0
}
if leadSilence > 5*time.Second {
leadSilence = 5 * time.Second
}
ffmpegPath = strings.TrimSpace(ffmpegPath)
ffmpegPath := strings.TrimSpace(opts.FfmpegPath)
if ffmpegPath == "" {
ffmpegPath = "ffmpeg"
}
maxTokens := opts.AudioLLMMaxTokens
if maxTokens <= 0 {
maxTokens = 4096
}
audioLLMPrompt := strings.TrimSpace(opts.AudioLLMPrompt)
if audioLLMPrompt == "" {
audioLLMPrompt = "Transcribe the audio exactly. Return only the transcript text."
}
providers := buildProviders(opts, audioLLMPrompt, maxTokens)
if len(providers) == 0 {
return nil
}
return &Client{
baseURL: baseURL,
http: &http.Client{Timeout: timeout},
providers: providers,
http: &http.Client{Timeout: maxProviderTimeout(providers)},
ffmpegPath: ffmpegPath,
leadSilence: leadSilence,
}
}
// buildProviders resolves opts into the ordered list of usable providers.
//
// The order comes from normalizeProviderOrder(opts.Providers). A provider whose
// base URL is empty after trimming whitespace and trailing slashes is left out.
// Timeouts that are zero or negative become 10 minutes. Audio LLM providers
// (Qwen2-Audio, Voxtral) get Kind "audio_llm" together with the shared prompt
// and maxTokens; WhisperX uses its own name as both Kind and Model.
func buildProviders(opts Options, prompt string, maxTokens int) []ProviderConfig {
	cleanURL := func(raw string) string {
		return strings.TrimRight(strings.TrimSpace(raw), "/")
	}
	// audioLLM assembles the config shared by the chat-completions providers and
	// reports false when the provider has no endpoint configured.
	audioLLM := func(name, rawURL, apiKey, model, defaultModel string, timeout time.Duration) (ProviderConfig, bool) {
		endpoint := cleanURL(rawURL)
		if endpoint == "" {
			return ProviderConfig{}, false
		}
		return ProviderConfig{
			Name:      name,
			Kind:      "audio_llm",
			BaseURL:   endpoint,
			APIKey:    strings.TrimSpace(apiKey),
			Model:     firstNonEmpty(model, defaultModel),
			Timeout:   defaultDuration(timeout, 10*time.Minute),
			MaxTokens: maxTokens,
			Prompt:    prompt,
		}, true
	}

	names := normalizeProviderOrder(opts.Providers)
	configs := make([]ProviderConfig, 0, len(names))
	for _, name := range names {
		var (
			cfg ProviderConfig
			ok  bool
		)
		switch name {
		case ProviderWhisperX:
			if endpoint := cleanURL(opts.WhisperXURL); endpoint != "" {
				cfg = ProviderConfig{
					Name:    ProviderWhisperX,
					Kind:    ProviderWhisperX,
					BaseURL: endpoint,
					Model:   ProviderWhisperX,
					Timeout: defaultDuration(opts.WhisperXTimeout, 10*time.Minute),
				}
				ok = true
			}
		case ProviderQwenAudio:
			cfg, ok = audioLLM(ProviderQwenAudio, opts.QwenAudioBaseURL, opts.QwenAudioAPIKey,
				opts.QwenAudioModel, "Qwen/Qwen2-Audio-7B-Instruct", opts.QwenAudioTimeout)
		case ProviderVoxtral:
			cfg, ok = audioLLM(ProviderVoxtral, opts.VoxtralBaseURL, opts.VoxtralAPIKey,
				opts.VoxtralModel, "mistralai/Voxtral-Small-24B-2507", opts.VoxtralTimeout)
		}
		if ok {
			configs = append(configs, cfg)
		}
	}
	return configs
}
// normalizeProviderOrder converts a user-supplied provider list into canonical
// provider names. Matching ignores case and surrounding whitespace, unknown
// names are dropped, and duplicates keep only their first position. An empty
// input yields the single default provider, WhisperX. A non-empty input made
// up entirely of unknown names yields an empty list.
func normalizeProviderOrder(in []string) []string {
	if len(in) == 0 {
		return []string{ProviderWhisperX}
	}

	aliases := map[string]string{
		"whisper":                 ProviderWhisperX,
		"whisperx":                ProviderWhisperX,
		"qwen":                    ProviderQwenAudio,
		"qwen-audio":              ProviderQwenAudio,
		"qwen2-audio":             ProviderQwenAudio,
		"qwen2-audio-7b-instruct": ProviderQwenAudio,
		"voxtral":                 ProviderVoxtral,
		"voxtral-small":           ProviderVoxtral,
		"voxtral-small-24b-2507":  ProviderVoxtral,
	}

	ordered := make([]string, 0, len(in))
	added := make(map[string]struct{}, len(in))
	for _, raw := range in {
		canonical, known := aliases[strings.ToLower(strings.TrimSpace(raw))]
		if !known {
			continue
		}
		if _, dup := added[canonical]; dup {
			continue
		}
		added[canonical] = struct{}{}
		ordered = append(ordered, canonical)
	}
	return ordered
}
// maxProviderTimeout returns the largest Timeout among providers, but never
// less than 10 minutes.
func maxProviderTimeout(providers []ProviderConfig) time.Duration {
	longest := 10 * time.Minute
	for i := range providers {
		if t := providers[i].Timeout; longest < t {
			longest = t
		}
	}
	return longest
}
// defaultDuration returns v when it is strictly positive and fallback
// otherwise.
func defaultDuration(v, fallback time.Duration) time.Duration {
	if v > 0 {
		return v
	}
	return fallback
}
func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
if c == nil || c.baseURL == "" {
return nil, fmt.Errorf("whisperx not configured")
if c == nil || len(c.providers) == 0 {
return nil, fmt.Errorf("transcription providers not configured")
}
if strings.TrimSpace(in.AudioURL) == "" {
return nil, fmt.Errorf("audio_url is required")
@@ -96,18 +298,91 @@ func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
return nil, err
}
}
resp, duration, err := c.transcribeAudio(ctx, audio, filename, in)
var attempts []Attempt
var winner *Result
var errors []string
for _, provider := range c.providers {
result, attempt, err := c.transcribeWithProvider(ctx, provider, audio, filename, in)
attempts = append(attempts, attempt)
if err != nil {
return nil, err
errors = append(errors, provider.Name+": "+err.Error())
continue
}
if winner == nil {
winner = result
}
}
if winner == nil {
return nil, fmt.Errorf("all transcription providers failed: %s", strings.Join(errors, "; "))
}
winner.Attempts = attempts
return winner, nil
}
func (c *Client) transcribeWithProvider(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*Result, Attempt, error) {
providerCtx := ctx
cancel := func() {}
if provider.Timeout > 0 {
providerCtx, cancel = context.WithTimeout(ctx, provider.Timeout)
}
defer cancel()
attempt := Attempt{
Provider: provider.Name,
Model: provider.Model,
Status: "failed",
}
switch provider.Kind {
case ProviderWhisperX:
resp, duration, err := c.transcribeAudio(providerCtx, provider, audio, filename, in)
attempt.DurationMS = duration.Milliseconds()
if err != nil {
attempt.Error = err.Error()
return nil, attempt, err
}
segments := adjustLeadSilence(resp.Segments, c.leadSilence)
attempt.Status = "ok"
attempt.Segments = segments
attempt.Text = segmentsText(segments)
return &Result{
Provider: provider.Name,
Model: provider.Model,
Language: resp.Language,
Segments: segments,
DiarizeError: resp.DiarizeError,
AlignError: resp.AlignError,
DurationMS: duration.Milliseconds(),
}, nil
}, attempt, nil
default:
resp, duration, err := c.transcribeAudioLLM(providerCtx, provider, audio, filename, in)
attempt.DurationMS = duration.Milliseconds()
if err != nil {
attempt.Error = err.Error()
return nil, attempt, err
}
text := strings.TrimSpace(resp.Text)
segments := []Segment{{Start: 0, End: 0, Text: text}}
attempt.Status = "ok"
attempt.Model = resp.Model
attempt.Text = text
attempt.Segments = segments
return &Result{
Provider: provider.Name,
Model: resp.Model,
Language: firstNonEmpty(in.Language, "unknown"),
Segments: segments,
DurationMS: duration.Milliseconds(),
}, attempt, nil
}
}
// segmentsText joins the trimmed, non-empty segment texts with newlines, in
// segment order.
func segmentsText(segments []Segment) string {
	lines := make([]string, 0, len(segments))
	for i := range segments {
		trimmed := strings.TrimSpace(segments[i].Text)
		if trimmed == "" {
			continue
		}
		lines = append(lines, trimmed)
	}
	return strings.Join(lines, "\n")
}
func (c *Client) downloadAudio(ctx context.Context, in Input) ([]byte, string, error) {
@@ -222,7 +497,7 @@ func clampTime(v float64) float64 {
return v
}
func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename string, in Input) (*whisperResponse, time.Duration, error) {
func (c *Client) transcribeAudio(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*whisperResponse, time.Duration, error) {
body := &bytes.Buffer{}
mw := multipart.NewWriter(body)
fw, err := mw.CreateFormFile("file", filename)
@@ -250,7 +525,7 @@ func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename str
return nil, 0, fmt.Errorf("close form: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/transcribe", body)
req, err := http.NewRequestWithContext(ctx, http.MethodPost, provider.BaseURL+"/transcribe", body)
if err != nil {
return nil, 0, fmt.Errorf("whisperx request: %w", err)
}
@@ -273,3 +548,97 @@ func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename str
}
return &out, duration, nil
}
// transcribeAudioLLM sends audio to an OpenAI-compatible chat-completions
// endpoint (provider.BaseURL + "/v1/chat/completions") and returns the trimmed
// content of the first choice.
//
// The request holds one user message with two parts: the provider prompt
// (extended with a language hint when in.Language is set and a speaker-labelling
// hint when in.Diarize is set) and the audio as standard base64 with a format
// derived from filename. provider.APIKey, when non-empty, is sent as a bearer
// token.
//
// The returned duration covers only the HTTP round trip and is zero when the
// failure happens before the request is sent. At most 4 MiB of the response
// body is read. Errors are returned for transport failures, HTTP status 300 or
// above, undecodable JSON, an error object in the response, and an empty
// choices list.
func (c *Client) transcribeAudioLLM(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*audioLLMResponse, time.Duration, error) {
	instruction := provider.Prompt
	if in.Language != "" {
		instruction += "\nЯзык аудио: " + in.Language + "."
	}
	if in.Diarize {
		instruction += "\nЕсли слышны разные говорящие, разделяй реплики с короткими пометками Спикер 1/Спикер 2."
	}

	textPart := audioLLMContentPart{Type: "text", Text: instruction}
	audioPart := audioLLMContentPart{
		Type: "input_audio",
		InputAudio: &audioLLMAudio{
			Data:   base64.StdEncoding.EncodeToString(audio),
			Format: audioFormat(filename),
		},
	}
	payload, err := json.Marshal(audioLLMChatRequest{
		Model:       provider.Model,
		MaxTokens:   provider.MaxTokens,
		Temperature: 0,
		Messages: []audioLLMChatMessage{
			{Role: "user", Content: []audioLLMContentPart{textPart, audioPart}},
		},
	})
	if err != nil {
		return nil, 0, fmt.Errorf("audio llm marshal: %w", err)
	}

	endpoint := provider.BaseURL + "/v1/chat/completions"
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(payload))
	if err != nil {
		return nil, 0, fmt.Errorf("audio llm request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	if provider.APIKey != "" {
		req.Header.Set("Authorization", "Bearer "+provider.APIKey)
	}

	sentAt := time.Now()
	resp, err := c.http.Do(req)
	elapsed := time.Since(sentAt)
	if err != nil {
		return nil, elapsed, fmt.Errorf("audio llm do: %w", err)
	}
	defer resp.Body.Close()

	// Cap the amount read so an oversized response cannot exhaust memory.
	raw, err := io.ReadAll(io.LimitReader(resp.Body, 4<<20))
	if err != nil {
		return nil, elapsed, fmt.Errorf("audio llm read: %w", err)
	}
	if resp.StatusCode >= 300 {
		return nil, elapsed, fmt.Errorf("audio llm HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(raw)))
	}

	var decoded audioLLMChatResponse
	if err := json.Unmarshal(raw, &decoded); err != nil {
		return nil, elapsed, fmt.Errorf("audio llm decode: %w", err)
	}
	switch {
	case decoded.Error != nil:
		return nil, elapsed, fmt.Errorf("audio llm error: %s", decoded.Error.Message)
	case len(decoded.Choices) == 0:
		return nil, elapsed, fmt.Errorf("audio llm: empty choices")
	}

	// Prefer the model name reported by the server; fall back to the configured one.
	servedModel := provider.Model
	if decoded.Model != "" {
		servedModel = decoded.Model
	}
	result := &audioLLMResponse{
		Text:  strings.TrimSpace(decoded.Choices[0].Message.Content),
		Model: servedModel,
	}
	return result, elapsed, nil
}
// audioFormat derives the audio format name from filename's extension
// (case-insensitive, without the leading dot). Extensions outside the known
// set, including a missing extension, map to "mp3".
func audioFormat(filename string) string {
	ext := strings.ToLower(filepath.Ext(filename))
	ext = strings.TrimPrefix(ext, ".")
	for _, known := range [...]string{"wav", "mp3", "flac", "m4a", "ogg", "opus", "webm"} {
		if ext == known {
			return known
		}
	}
	return "mp3"
}
// firstNonEmpty returns the first argument that is not blank (empty or
// whitespace only). The value is returned as given, without trimming. It
// returns "" when every argument is blank or none are supplied.
func firstNonEmpty(values ...string) string {
	for i := 0; i < len(values); i++ {
		if strings.TrimSpace(values[i]) == "" {
			continue
		}
		return values[i]
	}
	return ""
}

View File

@@ -23,6 +23,41 @@ func TestAdjustLeadSilence(t *testing.T) {
}
}
func TestNormalizeProviderOrder(t *testing.T) {
got := normalizeProviderOrder([]string{"whisperx", "qwen", "voxtral", "qwen2-audio"})
want := []string{ProviderWhisperX, ProviderQwenAudio, ProviderVoxtral}
if len(got) != len(want) {
t.Fatalf("providers = %#v, want %#v", got, want)
}
for i := range want {
if got[i] != want[i] {
t.Fatalf("providers = %#v, want %#v", got, want)
}
}
}
// TestNewWithOptionsBuildsComparisonProviders verifies that NewWithOptions
// keeps all three comparison providers, in the requested order, when each of
// them has a base URL configured.
func TestNewWithOptionsBuildsComparisonProviders(t *testing.T) {
	client := NewWithOptions(Options{
		Providers:        []string{"whisperx", "qwen2-audio", "voxtral-small"},
		WhisperXURL:      "http://whisperx",
		QwenAudioBaseURL: "http://qwen",
		VoxtralBaseURL:   "http://voxtral",
	})
	if client == nil {
		t.Fatal("client is nil")
	}
	got := make([]string, 0, len(client.providers))
	for _, provider := range client.providers {
		got = append(got, provider.Name)
	}
	want := []string{ProviderWhisperX, ProviderQwenAudio, ProviderVoxtral}
	// Compare lengths before indexing: a shorter list would otherwise panic
	// with an index-out-of-range error instead of failing the test, and a
	// longer list would go unnoticed.
	if len(got) != len(want) {
		t.Fatalf("providers = %#v, want %#v", got, want)
	}
	for i := range want {
		if got[i] != want[i] {
			t.Fatalf("providers = %#v, want %#v", got, want)
		}
	}
}
func near(got, want float64) bool {
return math.Abs(got-want) < 0.000001
}

View File

@@ -139,7 +139,7 @@ func (w *Worker) process(ctx context.Context, job *model.Job) {
func (w *Worker) processTranscription(ctx context.Context, job *model.Job) {
if w.transcriber == nil {
w.fail(ctx, job, "provider_unavailable", "whisperx not configured")
w.fail(ctx, job, "provider_unavailable", "transcription providers not configured")
return
}
var input transcription.Input
@@ -186,8 +186,10 @@ func classifyTranscriptionError(err error) string {
return "storage_error"
case strings.Contains(s, "whisperx http 4") || strings.Contains(s, "ffmpeg") || strings.Contains(s, "invalid data") || strings.Contains(s, "could not decode"):
return "bad_audio"
case strings.Contains(s, "whisperx http 5") || strings.Contains(s, "whisperx do") || strings.Contains(s, "connection refused") || strings.Contains(s, "connection reset") || strings.Contains(s, "closed network connection"):
case strings.Contains(s, "whisperx http 5") || strings.Contains(s, "whisperx do") || strings.Contains(s, "audio llm http 5") || strings.Contains(s, "audio llm do") || strings.Contains(s, "connection refused") || strings.Contains(s, "connection reset") || strings.Contains(s, "closed network connection"):
return "provider_unavailable"
case strings.Contains(s, "audio llm http 4"):
return "bad_input"
case strings.Contains(s, "decode"):
return "bad_response"
default:

View File

@@ -11,9 +11,19 @@ data:
LLM_BASE_URL: "http://10.2.3.5:8002"
LLM_MODEL: "qwen2.5-14b"
LLM_TIMEOUT: "5m"
TRANSCRIPTION_PROVIDERS: "whisperx,qwen2-audio,voxtral-small"
WHISPERX_URL: "http://10.2.3.5:8001"
WHISPERX_TIMEOUT: "10m"
WHISPERX_LEAD_SILENCE: "800ms"
# Qwen2-Audio is already exposed on the AI server. Set VOXTRAL_BASE_URL once
# Voxtral is exposed as an OpenAI-compatible chat-completions endpoint; a
# provider with an empty base URL is skipped by the worker.
QWEN_AUDIO_BASE_URL: "http://10.2.3.5:8003"
QWEN_AUDIO_MODEL: "Qwen/Qwen2-Audio-7B-Instruct"
QWEN_AUDIO_TIMEOUT: "10m"
VOXTRAL_BASE_URL: ""
VOXTRAL_MODEL: "mistralai/Voxtral-Small-24B-2507"
VOXTRAL_TIMEOUT: "10m"
AUDIO_LLM_MAX_TOKENS: "4096"
FFMPEG_PATH: "/usr/bin/ffmpeg"
AI_STATS_SIDECAR_URL: "http://10.2.3.5:9090"
AI_STATS_TIMEOUT: "8s"

View File

@@ -18,4 +18,6 @@ type: Opaque
stringData:
DATABASE_URL: "postgres://ai_service:ai_service@postgres:5432/ai_service?sslmode=disable"
LLM_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
QWEN_AUDIO_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
VOXTRAL_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
AI_SERVICE_TOKEN: "d18bcacf9e02bae1806ee6b6eeda62b95be6a915c0a22936d9a700128b275442"