Switch transcription to Whisper large v3
Some checks failed
CI / test (push) Failing after 10s
Build and Deploy / build-and-deploy (push) Successful in 24s

This commit is contained in:
Grendgi
2026-06-10 10:10:13 +03:00
parent 1b63dcdbf5
commit 8d6cd84403
12 changed files with 85 additions and 93 deletions

View File

@@ -15,7 +15,7 @@ The service is intentionally domain-agnostic:
`beeline/{call_id}` or `channel/{message_id}`.
- `task_type` describes the technical task class, for example
`transcription`, `call_analysis`, `tg_analysis`, `pf_competitor_analysis`.
- `model_profile` selects a runtime profile, for example `voxtral-small`,
- `model_profile` selects a runtime profile, for example `whisper-large-v3`,
`qwen2.5-14b`, `vision`, or a future provider profile.
- `input` and `result` are JSON payloads owned by the caller and worker.
@@ -46,23 +46,22 @@ or compact `system` / `user` fields. The completed job result contains
domain metadata fields in `input`, but the worker only reads chat fields such as
`system`, `user`, `messages`, `max_tokens` and `response_format`.
`transcription` jobs are processed only by Voxtral Small
(`mistralai/Voxtral-Small-24B-2507`) through an OpenAI-compatible
`transcription` jobs are processed only by Whisper Large v3
(`openai/whisper-large-v3`) through an OpenAI-compatible
`/v1/audio/transcriptions` endpoint. The returned `segments` field stays
compatible with telephony. If the provider returns one long segment, AI Service
splits it into smaller transcript segments and adds heuristic speaker labels
when diarization is requested.
splits it into smaller transcript segments without inventing speaker labels.
AI-server compose snippet for Voxtral lives in
AI-server compose snippet for Whisper Large v3 lives in
`deploy/ai-server/docker-compose.audio.yml`:
- Voxtral endpoint: `http://10.2.3.5:8004`
- Start Voxtral:
`docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small`
- Whisper endpoint: `http://10.2.3.5:8004`
- Start Whisper:
`docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile whisper-large-v3 up -d whisper-large-v3`
In Kubernetes the dedicated transcription worker may claim more than one
`voxtral-small` job at a time. This keeps download/upload/wait overhead from
serializing the queue while Voxtral/vLLM still controls the actual GPU
`whisper-large-v3` job at a time. This keeps download/upload/wait overhead from
serializing the queue while Whisper/vLLM still controls the actual GPU
scheduling.
## API
@@ -102,11 +101,11 @@ for Kubernetes probes.
- `LLM_API_KEY`, primary LLM API key
- `LLM_MODEL`, default `qwen2.5-14b`
- `LLM_TIMEOUT`, default `5m`
- `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral
- `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507`
- `VOXTRAL_API_KEY`, optional bearer token for Voxtral; falls back to
- `AUDIO_TRANSCRIPTION_BASE_URL`, OpenAI-compatible transcription endpoint; falls back to `AUDIO_LLM_BASE_URL`
- `AUDIO_TRANSCRIPTION_MODEL`, default `openai/whisper-large-v3`
- `AUDIO_TRANSCRIPTION_API_KEY`, optional bearer token; falls back to
`AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
- `AUDIO_LLM_PROMPT`, transcription instruction for Voxtral
- `AUDIO_TRANSCRIPTION_PROMPT`, transcription instruction; falls back to `AUDIO_LLM_PROMPT`
- `WORKER_ID`, default hostname
- `WORKER_HTTP_HOST`, default `0.0.0.0`
- `WORKER_HTTP_PORT`, default `8081`

View File

@@ -49,11 +49,11 @@ func main() {
llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout)
transcriber := transcription.NewWithOptions(transcription.Options{
VoxtralBaseURL: cfg.VoxtralBaseURL,
VoxtralAPIKey: cfg.VoxtralAPIKey,
VoxtralModel: cfg.VoxtralModel,
VoxtralTimeout: cfg.VoxtralTimeout,
AudioLLMPrompt: cfg.AudioLLMPrompt,
AudioBaseURL: cfg.AudioBaseURL,
AudioAPIKey: cfg.AudioAPIKey,
AudioModel: cfg.AudioModel,
AudioTimeout: cfg.AudioTimeout,
AudioPrompt: cfg.AudioPrompt,
})
w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerTaskTypes, cfg.WorkerModelProfiles, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit)
healthSrv := startHealthServer(ctx, db, cfg)
@@ -62,8 +62,8 @@ func main() {
"worker_id", cfg.WorkerID,
"model", cfg.LLMModel,
"transcription_enabled", transcriber != nil,
"transcription_provider", "voxtral-small",
"transcription_model", cfg.VoxtralModel,
"transcription_provider", transcription.ProviderWhisperLargeV3,
"transcription_model", cfg.AudioModel,
"task_types", cfg.WorkerTaskTypes,
"model_profiles", cfg.WorkerModelProfiles,
"poll_interval", cfg.WorkerPollInterval.String(),
@@ -134,8 +134,8 @@ func (h workerHealth) ServeHTTP(w http.ResponseWriter, r *http.Request) {
"worker_id": h.cfg.WorkerID,
"task_types": h.cfg.WorkerTaskTypes,
"model_profiles": h.cfg.WorkerModelProfiles,
"transcription_provider": "voxtral-small",
"transcription_model": h.cfg.VoxtralModel,
"transcription_provider": transcription.ProviderWhisperLargeV3,
"transcription_model": h.cfg.AudioModel,
"claim_limit": h.cfg.WorkerClaimLimit,
"poll_interval": h.cfg.WorkerPollInterval.String(),
"lease_timeout": h.cfg.WorkerLeaseTimeout.String(),

View File

@@ -1,12 +1,12 @@
services:
voxtral-small:
whisper-large-v3:
build:
context: .
dockerfile: vllm-audio.Dockerfile
image: vllm-audio:local
container_name: voxtral-small
container_name: whisper-large-v3
profiles:
- voxtral-small
- whisper-large-v3
restart: unless-stopped
ipc: host
runtime: nvidia
@@ -29,32 +29,19 @@ services:
- "10.2.3.5:8004:8000"
command:
- "--model"
- "mistralai/Voxtral-Small-24B-2507"
- "openai/whisper-large-v3"
- "--served-model-name"
- "mistralai/Voxtral-Small-24B-2507"
- "--tokenizer-mode"
- "mistral"
- "--config-format"
- "mistral"
- "--load-format"
- "mistral"
- "--tool-call-parser"
- "mistral"
- "--enable-auto-tool-choice"
- "openai/whisper-large-v3"
- "--task"
- "transcription"
- "--host"
- "0.0.0.0"
- "--port"
- "8000"
- "--max-model-len"
- "16384"
- "--gpu-memory-utilization"
- "0.55"
- "--api-key"
- "${VLLM_API_KEY}"
- "--max-num-seqs"
- "1"
- "--max-num-batched-tokens"
- "4096"
healthcheck:
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 30s

View File

@@ -18,11 +18,11 @@ type Config struct {
LLMAPIKey string
LLMModel string
LLMTimeout time.Duration
VoxtralBaseURL string
VoxtralAPIKey string
VoxtralModel string
VoxtralTimeout time.Duration
AudioLLMPrompt string
AudioBaseURL string
AudioAPIKey string
AudioModel string
AudioTimeout time.Duration
AudioPrompt string
AIStatsSidecarURL string
AIStatsTimeout time.Duration
@@ -48,11 +48,11 @@ func Load() Config {
LLMAPIKey: envString("LLM_API_KEY", ""),
LLMModel: envString("LLM_MODEL", "qwen2.5-14b"),
LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute),
VoxtralBaseURL: envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
VoxtralAPIKey: envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
VoxtralModel: envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"),
VoxtralTimeout: envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
AudioLLMPrompt: envString("AUDIO_LLM_PROMPT", defaultAudioLLMPrompt()),
AudioBaseURL: envString("AUDIO_TRANSCRIPTION_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
AudioAPIKey: envString("AUDIO_TRANSCRIPTION_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
AudioModel: envString("AUDIO_TRANSCRIPTION_MODEL", "openai/whisper-large-v3"),
AudioTimeout: envDuration("AUDIO_TRANSCRIPTION_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
AudioPrompt: envString("AUDIO_TRANSCRIPTION_PROMPT", envString("AUDIO_LLM_PROMPT", defaultAudioPrompt())),
AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""),
AIStatsTimeout: envDuration("AI_STATS_TIMEOUT", 8*time.Second),
@@ -132,7 +132,7 @@ func envCSVDefault(key string, fallback []string) []string {
return fallback
}
func defaultAudioLLMPrompt() string {
func defaultAudioPrompt() string {
return "Расшифруй речь из аудио максимально точно. Сохрани русский язык, имена, телефоны, суммы и смысловые паузы. Не добавляй комментарии, анализ, Markdown или JSON. Верни только чистый текст расшифровки."
}

View File

@@ -5,6 +5,7 @@ import (
"time"
"ai-service/internal/model"
"ai-service/internal/transcription"
)
type dashboardResponse struct {
@@ -51,7 +52,7 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) {
At: now,
Providers: []providerStatus{
s.checkLLM(ctx),
s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
},
},
Infra: loadInfraSnapshot(r, s.cfg),

View File

@@ -8,6 +8,8 @@ import (
"net/http"
"strings"
"time"
"ai-service/internal/transcription"
)
type providerStatus struct {
@@ -42,7 +44,7 @@ func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) {
At: time.Now().UTC(),
Providers: []providerStatus{
s.checkLLM(ctx),
s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
},
}
writeJSON(w, http.StatusOK, resp)

View File

@@ -19,16 +19,19 @@ type Client struct {
http *http.Client
}
const ProviderVoxtral = "voxtral-small"
const (
ProviderWhisperLargeV3 = "whisper-large-v3"
defaultWhisperModel = "openai/whisper-large-v3"
)
var speakerLabelPattern = regexp.MustCompile(`(?i)(?:^|[\n\r ]+)((?:speaker|спикер|говорящий)\s*\d+)\s*[:-]`)
type Options struct {
VoxtralBaseURL string
VoxtralAPIKey string
VoxtralModel string
VoxtralTimeout time.Duration
AudioLLMPrompt string
AudioBaseURL string
AudioAPIKey string
AudioModel string
AudioTimeout time.Duration
AudioPrompt string
}
type ProviderConfig struct {
@@ -102,17 +105,17 @@ type audioTranscriptionSegment struct {
func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client {
return NewWithOptions(Options{
VoxtralBaseURL: baseURL,
VoxtralTimeout: timeout,
AudioBaseURL: baseURL,
AudioTimeout: timeout,
})
}
func NewWithOptions(opts Options) *Client {
audioLLMPrompt := strings.TrimSpace(opts.AudioLLMPrompt)
if audioLLMPrompt == "" {
audioLLMPrompt = "Transcribe the audio exactly. Return only the transcript text."
audioPrompt := strings.TrimSpace(opts.AudioPrompt)
if audioPrompt == "" {
audioPrompt = "Transcribe the audio exactly. Return only the transcript text."
}
provider := buildVoxtralProvider(opts, audioLLMPrompt)
provider := buildAudioProvider(opts, audioPrompt)
if provider.BaseURL == "" {
return nil
}
@@ -122,18 +125,18 @@ func NewWithOptions(opts Options) *Client {
}
}
func buildVoxtralProvider(opts Options, prompt string) ProviderConfig {
baseURL := strings.TrimRight(strings.TrimSpace(opts.VoxtralBaseURL), "/")
func buildAudioProvider(opts Options, prompt string) ProviderConfig {
baseURL := strings.TrimRight(strings.TrimSpace(opts.AudioBaseURL), "/")
if baseURL == "" {
return ProviderConfig{}
}
model := firstNonEmpty(opts.VoxtralModel, "mistralai/Voxtral-Small-24B-2507")
model := firstNonEmpty(opts.AudioModel, defaultWhisperModel)
return ProviderConfig{
Name: ProviderVoxtral,
Name: ProviderWhisperLargeV3,
BaseURL: baseURL,
APIKey: strings.TrimSpace(opts.VoxtralAPIKey),
APIKey: strings.TrimSpace(opts.AudioAPIKey),
Model: model,
Timeout: defaultDuration(opts.VoxtralTimeout, 10*time.Minute),
Timeout: defaultDuration(opts.AudioTimeout, 10*time.Minute),
Prompt: prompt,
}
}
@@ -147,7 +150,7 @@ func defaultDuration(v, fallback time.Duration) time.Duration {
func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
if c == nil || c.provider.BaseURL == "" {
return nil, fmt.Errorf("voxtral transcription provider not configured")
return nil, fmt.Errorf("audio transcription provider not configured")
}
if strings.TrimSpace(in.AudioURL) == "" {
return nil, fmt.Errorf("audio_url is required")

View File

@@ -7,22 +7,22 @@ import (
"testing"
)
func TestNewWithOptionsBuildsVoxtralProvider(t *testing.T) {
func TestNewWithOptionsBuildsWhisperProvider(t *testing.T) {
client := NewWithOptions(Options{
VoxtralBaseURL: "http://voxtral",
AudioBaseURL: "http://whisper",
})
if client == nil {
t.Fatal("client is nil")
}
if client.provider.Name != ProviderVoxtral {
t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderVoxtral)
if client.provider.Name != ProviderWhisperLargeV3 {
t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderWhisperLargeV3)
}
if client.provider.Model != "mistralai/Voxtral-Small-24B-2507" {
if client.provider.Model != "openai/whisper-large-v3" {
t.Fatalf("model = %q", client.provider.Model)
}
}
func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
audioSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("fake audio"))
}))
@@ -50,8 +50,8 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
defer providerSrv.Close()
client := NewWithOptions(Options{
VoxtralBaseURL: providerSrv.URL,
VoxtralModel: "mistralai/Voxtral-Small-24B-2507",
AudioBaseURL: providerSrv.URL,
AudioModel: "openai/whisper-large-v3",
})
if client == nil {
t.Fatal("client is nil")
@@ -63,7 +63,7 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
if gotPath != "/v1/audio/transcriptions" {
t.Fatalf("path = %q, want /v1/audio/transcriptions", gotPath)
}
if gotModel != "mistralai/Voxtral-Small-24B-2507" {
if gotModel != "openai/whisper-large-v3" {
t.Fatalf("model = %q", gotModel)
}
if gotResponseFormat != "json" {

View File

@@ -20,7 +20,7 @@ const (
TaskCallAnalysis = "call_analysis"
TaskTranscription = "transcription"
TranscriptionProfile = "voxtral-small"
TranscriptionProfile = "whisper-large-v3"
)
type Worker struct {

View File

@@ -11,11 +11,11 @@ data:
LLM_BASE_URL: "http://10.2.3.5:8002"
LLM_MODEL: "qwen2.5-14b"
LLM_TIMEOUT: "5m"
# Voxtral Small is the only transcription provider. It is exposed on the AI
# server through an OpenAI-compatible /v1/audio/transcriptions endpoint.
VOXTRAL_BASE_URL: "http://10.2.3.5:8004"
VOXTRAL_MODEL: "mistralai/Voxtral-Small-24B-2507"
VOXTRAL_TIMEOUT: "30m"
# Whisper Large v3 is exposed on the AI server through an OpenAI-compatible
# /v1/audio/transcriptions endpoint.
AUDIO_TRANSCRIPTION_BASE_URL: "http://10.2.3.5:8004"
AUDIO_TRANSCRIPTION_MODEL: "openai/whisper-large-v3"
AUDIO_TRANSCRIPTION_TIMEOUT: "30m"
AI_STATS_SIDECAR_URL: "http://10.2.3.5:9090"
AI_STATS_TIMEOUT: "8s"
WORKER_POLL_INTERVAL: "2s"

View File

@@ -18,5 +18,5 @@ type: Opaque
stringData:
DATABASE_URL: "postgres://ai_service:ai_service@postgres:5432/ai_service?sslmode=disable"
LLM_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
VOXTRAL_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
AUDIO_TRANSCRIPTION_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
AI_SERVICE_TOKEN: "d18bcacf9e02bae1806ee6b6eeda62b95be6a915c0a22936d9a700128b275442"

View File

@@ -98,7 +98,7 @@ spec:
- name: WORKER_TASK_TYPES
value: "transcription"
- name: WORKER_MODEL_PROFILES
value: "voxtral-small"
value: "whisper-large-v3"
- name: WORKER_CLAIM_LIMIT
value: "2"
- name: WORKER_LEASE_TIMEOUT