Switch transcription to Whisper large v3
This commit is contained in:
29
README.md
29
README.md
@@ -15,7 +15,7 @@ The service is intentionally domain-agnostic:
|
||||
`beeline/{call_id}` or `channel/{message_id}`.
|
||||
- `task_type` describes the technical task class, for example
|
||||
`transcription`, `call_analysis`, `tg_analysis`, `pf_competitor_analysis`.
|
||||
- `model_profile` selects a runtime profile, for example `voxtral-small`,
|
||||
- `model_profile` selects a runtime profile, for example `whisper-large-v3`,
|
||||
`qwen2.5-14b`, `vision`, or a future provider profile.
|
||||
- `input` and `result` are JSON payloads owned by the caller and worker.
|
||||
|
||||
@@ -46,23 +46,22 @@ or compact `system` / `user` fields. The completed job result contains
|
||||
domain metadata fields in `input`, but the worker only reads chat fields such as
|
||||
`system`, `user`, `messages`, `max_tokens` and `response_format`.
|
||||
|
||||
`transcription` jobs are processed only by Voxtral Small
|
||||
(`mistralai/Voxtral-Small-24B-2507`) through an OpenAI-compatible
|
||||
`transcription` jobs are processed only by Whisper Large v3
|
||||
(`openai/whisper-large-v3`) through an OpenAI-compatible
|
||||
`/v1/audio/transcriptions` endpoint. The returned `segments` field stays
|
||||
compatible with telephony. If the provider returns one long segment, AI Service
|
||||
splits it into smaller transcript segments and adds heuristic speaker labels
|
||||
when diarization is requested.
|
||||
splits it into smaller transcript segments without inventing speaker labels.
|
||||
|
||||
AI-server compose snippet for Voxtral lives in
|
||||
AI-server compose snippet for Whisper Large v3 lives in
|
||||
`deploy/ai-server/docker-compose.audio.yml`:
|
||||
|
||||
- Voxtral endpoint: `http://10.2.3.5:8004`
|
||||
- Start Voxtral:
|
||||
`docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small`
|
||||
- Whisper endpoint: `http://10.2.3.5:8004`
|
||||
- Start Whisper:
|
||||
`docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile whisper-large-v3 up -d whisper-large-v3`
|
||||
|
||||
In Kubernetes the dedicated transcription worker may claim more than one
|
||||
`voxtral-small` job at a time. This keeps download/upload/wait overhead from
|
||||
serializing the queue while Voxtral/vLLM still controls the actual GPU
|
||||
`whisper-large-v3` job at a time. This keeps download/upload/wait overhead from
|
||||
serializing the queue while Whisper/vLLM still controls the actual GPU
|
||||
scheduling.
|
||||
|
||||
## API
|
||||
@@ -102,11 +101,11 @@ for Kubernetes probes.
|
||||
- `LLM_API_KEY`, primary LLM API key
|
||||
- `LLM_MODEL`, default `qwen2.5-14b`
|
||||
- `LLM_TIMEOUT`, default `5m`
|
||||
- `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral
|
||||
- `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507`
|
||||
- `VOXTRAL_API_KEY`, optional bearer token for Voxtral; falls back to
|
||||
- `AUDIO_TRANSCRIPTION_BASE_URL`, OpenAI-compatible transcription endpoint; falls back to `AUDIO_LLM_BASE_URL`
|
||||
- `AUDIO_TRANSCRIPTION_MODEL`, default `openai/whisper-large-v3`
|
||||
- `AUDIO_TRANSCRIPTION_API_KEY`, optional bearer token; falls back to
|
||||
`AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
|
||||
- `AUDIO_LLM_PROMPT`, transcription instruction for Voxtral
|
||||
- `AUDIO_TRANSCRIPTION_PROMPT`, transcription instruction; falls back to `AUDIO_LLM_PROMPT`
|
||||
- `WORKER_ID`, default hostname
|
||||
- `WORKER_HTTP_HOST`, default `0.0.0.0`
|
||||
- `WORKER_HTTP_PORT`, default `8081`
|
||||
|
||||
@@ -49,11 +49,11 @@ func main() {
|
||||
|
||||
llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout)
|
||||
transcriber := transcription.NewWithOptions(transcription.Options{
|
||||
VoxtralBaseURL: cfg.VoxtralBaseURL,
|
||||
VoxtralAPIKey: cfg.VoxtralAPIKey,
|
||||
VoxtralModel: cfg.VoxtralModel,
|
||||
VoxtralTimeout: cfg.VoxtralTimeout,
|
||||
AudioLLMPrompt: cfg.AudioLLMPrompt,
|
||||
AudioBaseURL: cfg.AudioBaseURL,
|
||||
AudioAPIKey: cfg.AudioAPIKey,
|
||||
AudioModel: cfg.AudioModel,
|
||||
AudioTimeout: cfg.AudioTimeout,
|
||||
AudioPrompt: cfg.AudioPrompt,
|
||||
})
|
||||
w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerTaskTypes, cfg.WorkerModelProfiles, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit)
|
||||
healthSrv := startHealthServer(ctx, db, cfg)
|
||||
@@ -62,8 +62,8 @@ func main() {
|
||||
"worker_id", cfg.WorkerID,
|
||||
"model", cfg.LLMModel,
|
||||
"transcription_enabled", transcriber != nil,
|
||||
"transcription_provider", "voxtral-small",
|
||||
"transcription_model", cfg.VoxtralModel,
|
||||
"transcription_provider", transcription.ProviderWhisperLargeV3,
|
||||
"transcription_model", cfg.AudioModel,
|
||||
"task_types", cfg.WorkerTaskTypes,
|
||||
"model_profiles", cfg.WorkerModelProfiles,
|
||||
"poll_interval", cfg.WorkerPollInterval.String(),
|
||||
@@ -134,8 +134,8 @@ func (h workerHealth) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
"worker_id": h.cfg.WorkerID,
|
||||
"task_types": h.cfg.WorkerTaskTypes,
|
||||
"model_profiles": h.cfg.WorkerModelProfiles,
|
||||
"transcription_provider": "voxtral-small",
|
||||
"transcription_model": h.cfg.VoxtralModel,
|
||||
"transcription_provider": transcription.ProviderWhisperLargeV3,
|
||||
"transcription_model": h.cfg.AudioModel,
|
||||
"claim_limit": h.cfg.WorkerClaimLimit,
|
||||
"poll_interval": h.cfg.WorkerPollInterval.String(),
|
||||
"lease_timeout": h.cfg.WorkerLeaseTimeout.String(),
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
services:
|
||||
voxtral-small:
|
||||
whisper-large-v3:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: vllm-audio.Dockerfile
|
||||
image: vllm-audio:local
|
||||
container_name: voxtral-small
|
||||
container_name: whisper-large-v3
|
||||
profiles:
|
||||
- voxtral-small
|
||||
- whisper-large-v3
|
||||
restart: unless-stopped
|
||||
ipc: host
|
||||
runtime: nvidia
|
||||
@@ -29,32 +29,19 @@ services:
|
||||
- "10.2.3.5:8004:8000"
|
||||
command:
|
||||
- "--model"
|
||||
- "mistralai/Voxtral-Small-24B-2507"
|
||||
- "openai/whisper-large-v3"
|
||||
- "--served-model-name"
|
||||
- "mistralai/Voxtral-Small-24B-2507"
|
||||
- "--tokenizer-mode"
|
||||
- "mistral"
|
||||
- "--config-format"
|
||||
- "mistral"
|
||||
- "--load-format"
|
||||
- "mistral"
|
||||
- "--tool-call-parser"
|
||||
- "mistral"
|
||||
- "--enable-auto-tool-choice"
|
||||
- "openai/whisper-large-v3"
|
||||
- "--task"
|
||||
- "transcription"
|
||||
- "--host"
|
||||
- "0.0.0.0"
|
||||
- "--port"
|
||||
- "8000"
|
||||
- "--max-model-len"
|
||||
- "16384"
|
||||
- "--gpu-memory-utilization"
|
||||
- "0.55"
|
||||
- "--api-key"
|
||||
- "${VLLM_API_KEY}"
|
||||
- "--max-num-seqs"
|
||||
- "1"
|
||||
- "--max-num-batched-tokens"
|
||||
- "4096"
|
||||
healthcheck:
|
||||
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
|
||||
interval: 30s
|
||||
|
||||
@@ -18,11 +18,11 @@ type Config struct {
|
||||
LLMAPIKey string
|
||||
LLMModel string
|
||||
LLMTimeout time.Duration
|
||||
VoxtralBaseURL string
|
||||
VoxtralAPIKey string
|
||||
VoxtralModel string
|
||||
VoxtralTimeout time.Duration
|
||||
AudioLLMPrompt string
|
||||
AudioBaseURL string
|
||||
AudioAPIKey string
|
||||
AudioModel string
|
||||
AudioTimeout time.Duration
|
||||
AudioPrompt string
|
||||
AIStatsSidecarURL string
|
||||
AIStatsTimeout time.Duration
|
||||
|
||||
@@ -48,11 +48,11 @@ func Load() Config {
|
||||
LLMAPIKey: envString("LLM_API_KEY", ""),
|
||||
LLMModel: envString("LLM_MODEL", "qwen2.5-14b"),
|
||||
LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute),
|
||||
VoxtralBaseURL: envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
|
||||
VoxtralAPIKey: envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
|
||||
VoxtralModel: envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"),
|
||||
VoxtralTimeout: envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
|
||||
AudioLLMPrompt: envString("AUDIO_LLM_PROMPT", defaultAudioLLMPrompt()),
|
||||
AudioBaseURL: envString("AUDIO_TRANSCRIPTION_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
|
||||
AudioAPIKey: envString("AUDIO_TRANSCRIPTION_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
|
||||
AudioModel: envString("AUDIO_TRANSCRIPTION_MODEL", "openai/whisper-large-v3"),
|
||||
AudioTimeout: envDuration("AUDIO_TRANSCRIPTION_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
|
||||
AudioPrompt: envString("AUDIO_TRANSCRIPTION_PROMPT", envString("AUDIO_LLM_PROMPT", defaultAudioPrompt())),
|
||||
AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""),
|
||||
AIStatsTimeout: envDuration("AI_STATS_TIMEOUT", 8*time.Second),
|
||||
|
||||
@@ -132,7 +132,7 @@ func envCSVDefault(key string, fallback []string) []string {
|
||||
return fallback
|
||||
}
|
||||
|
||||
func defaultAudioLLMPrompt() string {
|
||||
func defaultAudioPrompt() string {
|
||||
return "Расшифруй речь из аудио максимально точно. Сохрани русский язык, имена, телефоны, суммы и смысловые паузы. Не добавляй комментарии, анализ, Markdown или JSON. Верни только чистый текст расшифровки."
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"time"
|
||||
|
||||
"ai-service/internal/model"
|
||||
"ai-service/internal/transcription"
|
||||
)
|
||||
|
||||
type dashboardResponse struct {
|
||||
@@ -51,7 +52,7 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) {
|
||||
At: now,
|
||||
Providers: []providerStatus{
|
||||
s.checkLLM(ctx),
|
||||
s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
|
||||
s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
|
||||
},
|
||||
},
|
||||
Infra: loadInfraSnapshot(r, s.cfg),
|
||||
|
||||
@@ -8,6 +8,8 @@ import (
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"ai-service/internal/transcription"
|
||||
)
|
||||
|
||||
type providerStatus struct {
|
||||
@@ -42,7 +44,7 @@ func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) {
|
||||
At: time.Now().UTC(),
|
||||
Providers: []providerStatus{
|
||||
s.checkLLM(ctx),
|
||||
s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
|
||||
s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
|
||||
},
|
||||
}
|
||||
writeJSON(w, http.StatusOK, resp)
|
||||
|
||||
@@ -19,16 +19,19 @@ type Client struct {
|
||||
http *http.Client
|
||||
}
|
||||
|
||||
const ProviderVoxtral = "voxtral-small"
|
||||
const (
|
||||
ProviderWhisperLargeV3 = "whisper-large-v3"
|
||||
defaultWhisperModel = "openai/whisper-large-v3"
|
||||
)
|
||||
|
||||
var speakerLabelPattern = regexp.MustCompile(`(?i)(?:^|[\n\r ]+)((?:speaker|спикер|говорящий)\s*\d+)\s*[::-]`)
|
||||
|
||||
type Options struct {
|
||||
VoxtralBaseURL string
|
||||
VoxtralAPIKey string
|
||||
VoxtralModel string
|
||||
VoxtralTimeout time.Duration
|
||||
AudioLLMPrompt string
|
||||
AudioBaseURL string
|
||||
AudioAPIKey string
|
||||
AudioModel string
|
||||
AudioTimeout time.Duration
|
||||
AudioPrompt string
|
||||
}
|
||||
|
||||
type ProviderConfig struct {
|
||||
@@ -102,17 +105,17 @@ type audioTranscriptionSegment struct {
|
||||
|
||||
func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client {
|
||||
return NewWithOptions(Options{
|
||||
VoxtralBaseURL: baseURL,
|
||||
VoxtralTimeout: timeout,
|
||||
AudioBaseURL: baseURL,
|
||||
AudioTimeout: timeout,
|
||||
})
|
||||
}
|
||||
|
||||
func NewWithOptions(opts Options) *Client {
|
||||
audioLLMPrompt := strings.TrimSpace(opts.AudioLLMPrompt)
|
||||
if audioLLMPrompt == "" {
|
||||
audioLLMPrompt = "Transcribe the audio exactly. Return only the transcript text."
|
||||
audioPrompt := strings.TrimSpace(opts.AudioPrompt)
|
||||
if audioPrompt == "" {
|
||||
audioPrompt = "Transcribe the audio exactly. Return only the transcript text."
|
||||
}
|
||||
provider := buildVoxtralProvider(opts, audioLLMPrompt)
|
||||
provider := buildAudioProvider(opts, audioPrompt)
|
||||
if provider.BaseURL == "" {
|
||||
return nil
|
||||
}
|
||||
@@ -122,18 +125,18 @@ func NewWithOptions(opts Options) *Client {
|
||||
}
|
||||
}
|
||||
|
||||
func buildVoxtralProvider(opts Options, prompt string) ProviderConfig {
|
||||
baseURL := strings.TrimRight(strings.TrimSpace(opts.VoxtralBaseURL), "/")
|
||||
func buildAudioProvider(opts Options, prompt string) ProviderConfig {
|
||||
baseURL := strings.TrimRight(strings.TrimSpace(opts.AudioBaseURL), "/")
|
||||
if baseURL == "" {
|
||||
return ProviderConfig{}
|
||||
}
|
||||
model := firstNonEmpty(opts.VoxtralModel, "mistralai/Voxtral-Small-24B-2507")
|
||||
model := firstNonEmpty(opts.AudioModel, defaultWhisperModel)
|
||||
return ProviderConfig{
|
||||
Name: ProviderVoxtral,
|
||||
Name: ProviderWhisperLargeV3,
|
||||
BaseURL: baseURL,
|
||||
APIKey: strings.TrimSpace(opts.VoxtralAPIKey),
|
||||
APIKey: strings.TrimSpace(opts.AudioAPIKey),
|
||||
Model: model,
|
||||
Timeout: defaultDuration(opts.VoxtralTimeout, 10*time.Minute),
|
||||
Timeout: defaultDuration(opts.AudioTimeout, 10*time.Minute),
|
||||
Prompt: prompt,
|
||||
}
|
||||
}
|
||||
@@ -147,7 +150,7 @@ func defaultDuration(v, fallback time.Duration) time.Duration {
|
||||
|
||||
func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
|
||||
if c == nil || c.provider.BaseURL == "" {
|
||||
return nil, fmt.Errorf("voxtral transcription provider not configured")
|
||||
return nil, fmt.Errorf("audio transcription provider not configured")
|
||||
}
|
||||
if strings.TrimSpace(in.AudioURL) == "" {
|
||||
return nil, fmt.Errorf("audio_url is required")
|
||||
|
||||
@@ -7,22 +7,22 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestNewWithOptionsBuildsVoxtralProvider(t *testing.T) {
|
||||
func TestNewWithOptionsBuildsWhisperProvider(t *testing.T) {
|
||||
client := NewWithOptions(Options{
|
||||
VoxtralBaseURL: "http://voxtral",
|
||||
AudioBaseURL: "http://whisper",
|
||||
})
|
||||
if client == nil {
|
||||
t.Fatal("client is nil")
|
||||
}
|
||||
if client.provider.Name != ProviderVoxtral {
|
||||
t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderVoxtral)
|
||||
if client.provider.Name != ProviderWhisperLargeV3 {
|
||||
t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderWhisperLargeV3)
|
||||
}
|
||||
if client.provider.Model != "mistralai/Voxtral-Small-24B-2507" {
|
||||
if client.provider.Model != "openai/whisper-large-v3" {
|
||||
t.Fatalf("model = %q", client.provider.Model)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||
func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||
audioSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte("fake audio"))
|
||||
}))
|
||||
@@ -50,8 +50,8 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||
defer providerSrv.Close()
|
||||
|
||||
client := NewWithOptions(Options{
|
||||
VoxtralBaseURL: providerSrv.URL,
|
||||
VoxtralModel: "mistralai/Voxtral-Small-24B-2507",
|
||||
AudioBaseURL: providerSrv.URL,
|
||||
AudioModel: "openai/whisper-large-v3",
|
||||
})
|
||||
if client == nil {
|
||||
t.Fatal("client is nil")
|
||||
@@ -63,7 +63,7 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||
if gotPath != "/v1/audio/transcriptions" {
|
||||
t.Fatalf("path = %q, want /v1/audio/transcriptions", gotPath)
|
||||
}
|
||||
if gotModel != "mistralai/Voxtral-Small-24B-2507" {
|
||||
if gotModel != "openai/whisper-large-v3" {
|
||||
t.Fatalf("model = %q", gotModel)
|
||||
}
|
||||
if gotResponseFormat != "json" {
|
||||
|
||||
@@ -20,7 +20,7 @@ const (
|
||||
TaskCallAnalysis = "call_analysis"
|
||||
TaskTranscription = "transcription"
|
||||
|
||||
TranscriptionProfile = "voxtral-small"
|
||||
TranscriptionProfile = "whisper-large-v3"
|
||||
)
|
||||
|
||||
type Worker struct {
|
||||
|
||||
@@ -11,11 +11,11 @@ data:
|
||||
LLM_BASE_URL: "http://10.2.3.5:8002"
|
||||
LLM_MODEL: "qwen2.5-14b"
|
||||
LLM_TIMEOUT: "5m"
|
||||
# Voxtral Small is the only transcription provider. It is exposed on the AI
|
||||
# server through an OpenAI-compatible /v1/audio/transcriptions endpoint.
|
||||
VOXTRAL_BASE_URL: "http://10.2.3.5:8004"
|
||||
VOXTRAL_MODEL: "mistralai/Voxtral-Small-24B-2507"
|
||||
VOXTRAL_TIMEOUT: "30m"
|
||||
# Whisper Large v3 is exposed on the AI server through an OpenAI-compatible
|
||||
# /v1/audio/transcriptions endpoint.
|
||||
AUDIO_TRANSCRIPTION_BASE_URL: "http://10.2.3.5:8004"
|
||||
AUDIO_TRANSCRIPTION_MODEL: "openai/whisper-large-v3"
|
||||
AUDIO_TRANSCRIPTION_TIMEOUT: "30m"
|
||||
AI_STATS_SIDECAR_URL: "http://10.2.3.5:9090"
|
||||
AI_STATS_TIMEOUT: "8s"
|
||||
WORKER_POLL_INTERVAL: "2s"
|
||||
|
||||
@@ -18,5 +18,5 @@ type: Opaque
|
||||
stringData:
|
||||
DATABASE_URL: "postgres://ai_service:ai_service@postgres:5432/ai_service?sslmode=disable"
|
||||
LLM_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
|
||||
VOXTRAL_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
|
||||
AUDIO_TRANSCRIPTION_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
|
||||
AI_SERVICE_TOKEN: "d18bcacf9e02bae1806ee6b6eeda62b95be6a915c0a22936d9a700128b275442"
|
||||
|
||||
@@ -98,7 +98,7 @@ spec:
|
||||
- name: WORKER_TASK_TYPES
|
||||
value: "transcription"
|
||||
- name: WORKER_MODEL_PROFILES
|
||||
value: "voxtral-small"
|
||||
value: "whisper-large-v3"
|
||||
- name: WORKER_CLAIM_LIMIT
|
||||
value: "2"
|
||||
- name: WORKER_LEASE_TIMEOUT
|
||||
|
||||
Reference in New Issue
Block a user