diff --git a/README.md b/README.md index dbe5153..21a516b 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ The service is intentionally domain-agnostic: `beeline/{call_id}` or `channel/{message_id}`. - `task_type` describes the technical task class, for example `transcribe`, `call_analysis`, `tg_analysis`, `pf_competitor_analysis`. -- `model_profile` selects a runtime profile, for example `voxtral-small`, +- `model_profile` selects a runtime profile, for example `whisper-large-v3`, `qwen2.5-14b`, `vision`, or a future provider profile. - `input` and `result` are JSON payloads owned by the caller and worker. @@ -46,23 +46,22 @@ or compact `system` / `user` fields. The completed job result contains domain metadata fields in `input`, but the worker only reads chat fields such as `system`, `user`, `messages`, `max_tokens` and `response_format`. -`transcription` jobs are processed only by Voxtral Small -(`mistralai/Voxtral-Small-24B-2507`) through an OpenAI-compatible +`transcription` jobs are processed only by Whisper Large v3 +(`openai/whisper-large-v3`) through an OpenAI-compatible `/v1/audio/transcriptions` endpoint. The returned `segments` field stays compatible with telephony. If the provider returns one long segment, AI Service -splits it into smaller transcript segments and adds heuristic speaker labels -when diarization is requested. +splits it into smaller transcript segments without inventing speaker labels. 
-AI-server compose snippet for Voxtral lives in +AI-server compose snippet for Whisper Large v3 lives in `deploy/ai-server/docker-compose.audio.yml`: -- Voxtral endpoint: `http://10.2.3.5:8004` -- Start Voxtral: - `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small` +- Whisper endpoint: `http://10.2.3.5:8004` +- Start Whisper: + `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile whisper-large-v3 up -d whisper-large-v3` In Kubernetes the dedicated transcription worker may claim more than one -`voxtral-small` job at a time. This keeps download/upload/wait overhead from -serializing the queue while Voxtral/vLLM still controls the actual GPU +`whisper-large-v3` job at a time. This keeps download/upload/wait overhead from +serializing the queue while Whisper/vLLM still controls the actual GPU scheduling. ## API @@ -102,11 +101,11 @@ for Kubernetes probes. - `LLM_API_KEY`, primary LLM API key - `LLM_MODEL`, default `qwen2.5-14b` - `LLM_TIMEOUT`, default `5m` -- `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral -- `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507` -- `VOXTRAL_API_KEY`, optional bearer token for Voxtral; falls back to +- `AUDIO_TRANSCRIPTION_BASE_URL`, OpenAI-compatible transcription endpoint; falls back to `AUDIO_LLM_BASE_URL` +- `AUDIO_TRANSCRIPTION_MODEL`, default `openai/whisper-large-v3` +- `AUDIO_TRANSCRIPTION_API_KEY`, optional bearer token; falls back to `AUDIO_LLM_API_KEY`, then `LLM_API_KEY` -- `AUDIO_LLM_PROMPT`, transcription instruction for Voxtral +- `AUDIO_TRANSCRIPTION_PROMPT`, transcription instruction; falls back to `AUDIO_LLM_PROMPT` - `WORKER_ID`, default hostname - `WORKER_HTTP_HOST`, default `0.0.0.0` - `WORKER_HTTP_PORT`, default `8081` diff --git a/cmd/worker/main.go b/cmd/worker/main.go index 3dbc7e8..29f3a5e 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -49,11 +49,11 @@ func main() { llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout) transcriber := 
transcription.NewWithOptions(transcription.Options{ - VoxtralBaseURL: cfg.VoxtralBaseURL, - VoxtralAPIKey: cfg.VoxtralAPIKey, - VoxtralModel: cfg.VoxtralModel, - VoxtralTimeout: cfg.VoxtralTimeout, - AudioLLMPrompt: cfg.AudioLLMPrompt, + AudioBaseURL: cfg.AudioBaseURL, + AudioAPIKey: cfg.AudioAPIKey, + AudioModel: cfg.AudioModel, + AudioTimeout: cfg.AudioTimeout, + AudioPrompt: cfg.AudioPrompt, }) w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerTaskTypes, cfg.WorkerModelProfiles, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit) healthSrv := startHealthServer(ctx, db, cfg) @@ -62,8 +62,8 @@ func main() { "worker_id", cfg.WorkerID, "model", cfg.LLMModel, "transcription_enabled", transcriber != nil, - "transcription_provider", "voxtral-small", - "transcription_model", cfg.VoxtralModel, + "transcription_provider", transcription.ProviderWhisperLargeV3, + "transcription_model", cfg.AudioModel, "task_types", cfg.WorkerTaskTypes, "model_profiles", cfg.WorkerModelProfiles, "poll_interval", cfg.WorkerPollInterval.String(), @@ -134,8 +134,8 @@ func (h workerHealth) ServeHTTP(w http.ResponseWriter, r *http.Request) { "worker_id": h.cfg.WorkerID, "task_types": h.cfg.WorkerTaskTypes, "model_profiles": h.cfg.WorkerModelProfiles, - "transcription_provider": "voxtral-small", - "transcription_model": h.cfg.VoxtralModel, + "transcription_provider": transcription.ProviderWhisperLargeV3, + "transcription_model": h.cfg.AudioModel, "claim_limit": h.cfg.WorkerClaimLimit, "poll_interval": h.cfg.WorkerPollInterval.String(), "lease_timeout": h.cfg.WorkerLeaseTimeout.String(), diff --git a/deploy/ai-server/docker-compose.audio.yml b/deploy/ai-server/docker-compose.audio.yml index 285be9b..05bdabb 100644 --- a/deploy/ai-server/docker-compose.audio.yml +++ b/deploy/ai-server/docker-compose.audio.yml @@ -1,12 +1,12 @@ services: - voxtral-small: + whisper-large-v3: build: context: . 
dockerfile: vllm-audio.Dockerfile image: vllm-audio:local - container_name: voxtral-small + container_name: whisper-large-v3 profiles: - - voxtral-small + - whisper-large-v3 restart: unless-stopped ipc: host runtime: nvidia @@ -29,32 +29,19 @@ services: - "10.2.3.5:8004:8000" command: - "--model" - - "mistralai/Voxtral-Small-24B-2507" + - "openai/whisper-large-v3" - "--served-model-name" - - "mistralai/Voxtral-Small-24B-2507" - - "--tokenizer-mode" - - "mistral" - - "--config-format" - - "mistral" - - "--load-format" - - "mistral" - - "--tool-call-parser" - - "mistral" - - "--enable-auto-tool-choice" + - "openai/whisper-large-v3" + - "--task" + - "transcription" - "--host" - "0.0.0.0" - "--port" - "8000" - - "--max-model-len" - - "16384" - "--gpu-memory-utilization" - "0.55" - "--api-key" - "${VLLM_API_KEY}" - - "--max-num-seqs" - - "1" - - "--max-num-batched-tokens" - - "4096" healthcheck: test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"] interval: 30s diff --git a/internal/config/config.go b/internal/config/config.go index 552412d..b991372 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -18,11 +18,11 @@ type Config struct { LLMAPIKey string LLMModel string LLMTimeout time.Duration - VoxtralBaseURL string - VoxtralAPIKey string - VoxtralModel string - VoxtralTimeout time.Duration - AudioLLMPrompt string + AudioBaseURL string + AudioAPIKey string + AudioModel string + AudioTimeout time.Duration + AudioPrompt string AIStatsSidecarURL string AIStatsTimeout time.Duration @@ -48,11 +48,11 @@ func Load() Config { LLMAPIKey: envString("LLM_API_KEY", ""), LLMModel: envString("LLM_MODEL", "qwen2.5-14b"), LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute), - VoxtralBaseURL: envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")), - VoxtralAPIKey: envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))), - VoxtralModel: 
envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"), - VoxtralTimeout: envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)), - AudioLLMPrompt: envString("AUDIO_LLM_PROMPT", defaultAudioLLMPrompt()), + AudioBaseURL: envString("AUDIO_TRANSCRIPTION_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")), + AudioAPIKey: envString("AUDIO_TRANSCRIPTION_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))), + AudioModel: envString("AUDIO_TRANSCRIPTION_MODEL", "openai/whisper-large-v3"), + AudioTimeout: envDuration("AUDIO_TRANSCRIPTION_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)), + AudioPrompt: envString("AUDIO_TRANSCRIPTION_PROMPT", envString("AUDIO_LLM_PROMPT", defaultAudioPrompt())), AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""), AIStatsTimeout: envDuration("AI_STATS_TIMEOUT", 8*time.Second), @@ -132,7 +132,7 @@ func envCSVDefault(key string, fallback []string) []string { return fallback } -func defaultAudioLLMPrompt() string { +func defaultAudioPrompt() string { return "Расшифруй речь из аудио максимально точно. Сохрани русский язык, имена, телефоны, суммы и смысловые паузы. Не добавляй комментарии, анализ, Markdown или JSON. Верни только чистый текст расшифровки." 
} diff --git a/internal/httpapi/dashboard.go b/internal/httpapi/dashboard.go index d2a4a08..7feea99 100644 --- a/internal/httpapi/dashboard.go +++ b/internal/httpapi/dashboard.go @@ -5,6 +5,7 @@ import ( "time" "ai-service/internal/model" + "ai-service/internal/transcription" ) type dashboardResponse struct { @@ -51,7 +52,7 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) { At: now, Providers: []providerStatus{ s.checkLLM(ctx), - s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout), + s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout), }, }, Infra: loadInfraSnapshot(r, s.cfg), diff --git a/internal/httpapi/providers.go b/internal/httpapi/providers.go index fc6a1f5..0b01b34 100644 --- a/internal/httpapi/providers.go +++ b/internal/httpapi/providers.go @@ -8,6 +8,8 @@ import ( "net/http" "strings" "time" + + "ai-service/internal/transcription" ) type providerStatus struct { @@ -42,7 +44,7 @@ func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) { At: time.Now().UTC(), Providers: []providerStatus{ s.checkLLM(ctx), - s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout), + s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout), }, } writeJSON(w, http.StatusOK, resp) diff --git a/internal/transcription/client.go b/internal/transcription/client.go index 1ce6ca4..dcba17d 100644 --- a/internal/transcription/client.go +++ b/internal/transcription/client.go @@ -19,16 +19,19 @@ type Client struct { http *http.Client } -const ProviderVoxtral = "voxtral-small" +const ( + ProviderWhisperLargeV3 = "whisper-large-v3" + defaultWhisperModel = "openai/whisper-large-v3" +) var speakerLabelPattern = regexp.MustCompile(`(?i)(?:^|[\n\r 
]+)((?:speaker|спикер|говорящий)\s*\d+)\s*[::-]`) type Options struct { - VoxtralBaseURL string - VoxtralAPIKey string - VoxtralModel string - VoxtralTimeout time.Duration - AudioLLMPrompt string + AudioBaseURL string + AudioAPIKey string + AudioModel string + AudioTimeout time.Duration + AudioPrompt string } type ProviderConfig struct { @@ -102,17 +105,17 @@ type audioTranscriptionSegment struct { func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client { return NewWithOptions(Options{ - VoxtralBaseURL: baseURL, - VoxtralTimeout: timeout, + AudioBaseURL: baseURL, + AudioTimeout: timeout, }) } func NewWithOptions(opts Options) *Client { - audioLLMPrompt := strings.TrimSpace(opts.AudioLLMPrompt) - if audioLLMPrompt == "" { - audioLLMPrompt = "Transcribe the audio exactly. Return only the transcript text." + audioPrompt := strings.TrimSpace(opts.AudioPrompt) + if audioPrompt == "" { + audioPrompt = "Transcribe the audio exactly. Return only the transcript text." 
} - provider := buildVoxtralProvider(opts, audioLLMPrompt) + provider := buildAudioProvider(opts, audioPrompt) if provider.BaseURL == "" { return nil } @@ -122,18 +125,18 @@ func NewWithOptions(opts Options) *Client { } } -func buildVoxtralProvider(opts Options, prompt string) ProviderConfig { - baseURL := strings.TrimRight(strings.TrimSpace(opts.VoxtralBaseURL), "/") +func buildAudioProvider(opts Options, prompt string) ProviderConfig { + baseURL := strings.TrimRight(strings.TrimSpace(opts.AudioBaseURL), "/") if baseURL == "" { return ProviderConfig{} } - model := firstNonEmpty(opts.VoxtralModel, "mistralai/Voxtral-Small-24B-2507") + model := firstNonEmpty(opts.AudioModel, defaultWhisperModel) return ProviderConfig{ - Name: ProviderVoxtral, + Name: ProviderWhisperLargeV3, BaseURL: baseURL, - APIKey: strings.TrimSpace(opts.VoxtralAPIKey), + APIKey: strings.TrimSpace(opts.AudioAPIKey), Model: model, - Timeout: defaultDuration(opts.VoxtralTimeout, 10*time.Minute), + Timeout: defaultDuration(opts.AudioTimeout, 10*time.Minute), Prompt: prompt, } } @@ -147,7 +150,7 @@ func defaultDuration(v, fallback time.Duration) time.Duration { func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) { if c == nil || c.provider.BaseURL == "" { - return nil, fmt.Errorf("voxtral transcription provider not configured") + return nil, fmt.Errorf("audio transcription provider not configured") } if strings.TrimSpace(in.AudioURL) == "" { return nil, fmt.Errorf("audio_url is required") diff --git a/internal/transcription/client_test.go b/internal/transcription/client_test.go index fcc6049..21ab7f1 100644 --- a/internal/transcription/client_test.go +++ b/internal/transcription/client_test.go @@ -7,22 +7,22 @@ import ( "testing" ) -func TestNewWithOptionsBuildsVoxtralProvider(t *testing.T) { +func TestNewWithOptionsBuildsWhisperProvider(t *testing.T) { client := NewWithOptions(Options{ - VoxtralBaseURL: "http://voxtral", + AudioBaseURL: "http://whisper", }) if client == nil { 
t.Fatal("client is nil") } - if client.provider.Name != ProviderVoxtral { - t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderVoxtral) + if client.provider.Name != ProviderWhisperLargeV3 { + t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderWhisperLargeV3) } - if client.provider.Model != "mistralai/Voxtral-Small-24B-2507" { + if client.provider.Model != "openai/whisper-large-v3" { t.Fatalf("model = %q", client.provider.Model) } } -func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) { +func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) { audioSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { _, _ = w.Write([]byte("fake audio")) })) @@ -50,8 +50,8 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) { defer providerSrv.Close() client := NewWithOptions(Options{ - VoxtralBaseURL: providerSrv.URL, - VoxtralModel: "mistralai/Voxtral-Small-24B-2507", + AudioBaseURL: providerSrv.URL, + AudioModel: "openai/whisper-large-v3", }) if client == nil { t.Fatal("client is nil") @@ -63,7 +63,7 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) { if gotPath != "/v1/audio/transcriptions" { t.Fatalf("path = %q, want /v1/audio/transcriptions", gotPath) } - if gotModel != "mistralai/Voxtral-Small-24B-2507" { + if gotModel != "openai/whisper-large-v3" { t.Fatalf("model = %q", gotModel) } if gotResponseFormat != "json" { diff --git a/internal/worker/worker.go b/internal/worker/worker.go index 4529a37..86ff694 100644 --- a/internal/worker/worker.go +++ b/internal/worker/worker.go @@ -20,7 +20,7 @@ const ( TaskCallAnalysis = "call_analysis" TaskTranscription = "transcription" - TranscriptionProfile = "voxtral-small" + TranscriptionProfile = "whisper-large-v3" ) type Worker struct { diff --git a/k8s/configmap.yaml b/k8s/configmap.yaml index b83ecb2..0bc2ffa 100644 --- a/k8s/configmap.yaml +++ b/k8s/configmap.yaml @@ -11,11 +11,11 @@ data: LLM_BASE_URL: "http://10.2.3.5:8002" 
LLM_MODEL: "qwen2.5-14b" LLM_TIMEOUT: "5m" - # Voxtral Small is the only transcription provider. It is exposed on the AI - # server through an OpenAI-compatible /v1/audio/transcriptions endpoint. - VOXTRAL_BASE_URL: "http://10.2.3.5:8004" - VOXTRAL_MODEL: "mistralai/Voxtral-Small-24B-2507" - VOXTRAL_TIMEOUT: "30m" + # Whisper Large v3 is exposed on the AI server through an OpenAI-compatible + # /v1/audio/transcriptions endpoint. + AUDIO_TRANSCRIPTION_BASE_URL: "http://10.2.3.5:8004" + AUDIO_TRANSCRIPTION_MODEL: "openai/whisper-large-v3" + AUDIO_TRANSCRIPTION_TIMEOUT: "30m" AI_STATS_SIDECAR_URL: "http://10.2.3.5:9090" AI_STATS_TIMEOUT: "8s" WORKER_POLL_INTERVAL: "2s" diff --git a/k8s/secrets.yaml b/k8s/secrets.yaml index e797c47..9c80d4c 100644 --- a/k8s/secrets.yaml +++ b/k8s/secrets.yaml @@ -18,5 +18,5 @@ type: Opaque stringData: DATABASE_URL: "postgres://ai_service:ai_service@postgres:5432/ai_service?sslmode=disable" LLM_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32" - VOXTRAL_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32" + AUDIO_TRANSCRIPTION_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32" AI_SERVICE_TOKEN: "d18bcacf9e02bae1806ee6b6eeda62b95be6a915c0a22936d9a700128b275442" diff --git a/k8s/worker-deployment.yaml b/k8s/worker-deployment.yaml index b039527..455a31a 100644 --- a/k8s/worker-deployment.yaml +++ b/k8s/worker-deployment.yaml @@ -98,7 +98,7 @@ spec: - name: WORKER_TASK_TYPES value: "transcription" - name: WORKER_MODEL_PROFILES - value: "voxtral-small" + value: "whisper-large-v3" - name: WORKER_CLAIM_LIMIT value: "2" - name: WORKER_LEASE_TIMEOUT