Switch transcription to Whisper large v3
This commit is contained in:
29
README.md
29
README.md
@@ -15,7 +15,7 @@ The service is intentionally domain-agnostic:
|
|||||||
`beeline/{call_id}` or `channel/{message_id}`.
|
`beeline/{call_id}` or `channel/{message_id}`.
|
||||||
- `task_type` describes the technical task class, for example
|
- `task_type` describes the technical task class, for example
|
||||||
`transcription`, `call_analysis`, `tg_analysis`, `pf_competitor_analysis`.
|
`transcription`, `call_analysis`, `tg_analysis`, `pf_competitor_analysis`.
|
||||||
- `model_profile` selects a runtime profile, for example `voxtral-small`,
|
- `model_profile` selects a runtime profile, for example `whisper-large-v3`,
|
||||||
`qwen2.5-14b`, `vision`, or a future provider profile.
|
`qwen2.5-14b`, `vision`, or a future provider profile.
|
||||||
- `input` and `result` are JSON payloads owned by the caller and worker.
|
- `input` and `result` are JSON payloads owned by the caller and worker.
|
||||||
|
|
||||||
@@ -46,23 +46,22 @@ or compact `system` / `user` fields. The completed job result contains
|
|||||||
domain metadata fields in `input`, but the worker only reads chat fields such as
|
domain metadata fields in `input`, but the worker only reads chat fields such as
|
||||||
`system`, `user`, `messages`, `max_tokens` and `response_format`.
|
`system`, `user`, `messages`, `max_tokens` and `response_format`.
|
||||||
|
|
||||||
`transcription` jobs are processed only by Voxtral Small
|
`transcription` jobs are processed only by Whisper Large v3
|
||||||
(`mistralai/Voxtral-Small-24B-2507`) through an OpenAI-compatible
|
(`openai/whisper-large-v3`) through an OpenAI-compatible
|
||||||
`/v1/audio/transcriptions` endpoint. The returned `segments` field stays
|
`/v1/audio/transcriptions` endpoint. The returned `segments` field stays
|
||||||
compatible with telephony. If the provider returns one long segment, AI Service
|
compatible with telephony. If the provider returns one long segment, AI Service
|
||||||
splits it into smaller transcript segments and adds heuristic speaker labels
|
splits it into smaller transcript segments without inventing speaker labels.
|
||||||
when diarization is requested.
|
|
||||||
|
|
||||||
AI-server compose snippet for Voxtral lives in
|
AI-server compose snippet for Whisper Large v3 lives in
|
||||||
`deploy/ai-server/docker-compose.audio.yml`:
|
`deploy/ai-server/docker-compose.audio.yml`:
|
||||||
|
|
||||||
- Voxtral endpoint: `http://10.2.3.5:8004`
|
- Whisper endpoint: `http://10.2.3.5:8004`
|
||||||
- Start Voxtral:
|
- Start Whisper:
|
||||||
`docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small`
|
`docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile whisper-large-v3 up -d whisper-large-v3`
|
||||||
|
|
||||||
In Kubernetes the dedicated transcription worker may claim more than one
|
In Kubernetes the dedicated transcription worker may claim more than one
|
||||||
`voxtral-small` job at a time. This keeps download/upload/wait overhead from
|
`whisper-large-v3` job at a time. This keeps download/upload/wait overhead from
|
||||||
serializing the queue while Voxtral/vLLM still controls the actual GPU
|
serializing the queue while Whisper/vLLM still controls the actual GPU
|
||||||
scheduling.
|
scheduling.
|
||||||
|
|
||||||
## API
|
## API
|
||||||
@@ -102,11 +101,11 @@ for Kubernetes probes.
|
|||||||
- `LLM_API_KEY`, primary LLM API key
|
- `LLM_API_KEY`, primary LLM API key
|
||||||
- `LLM_MODEL`, default `qwen2.5-14b`
|
- `LLM_MODEL`, default `qwen2.5-14b`
|
||||||
- `LLM_TIMEOUT`, default `5m`
|
- `LLM_TIMEOUT`, default `5m`
|
||||||
- `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral
|
- `AUDIO_TRANSCRIPTION_BASE_URL`, OpenAI-compatible transcription endpoint; falls back to `AUDIO_LLM_BASE_URL`
|
||||||
- `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507`
|
- `AUDIO_TRANSCRIPTION_MODEL`, default `openai/whisper-large-v3`
|
||||||
- `VOXTRAL_API_KEY`, optional bearer token for Voxtral; falls back to
|
- `AUDIO_TRANSCRIPTION_API_KEY`, optional bearer token; falls back to
|
||||||
`AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
|
`AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
|
||||||
- `AUDIO_LLM_PROMPT`, transcription instruction for Voxtral
|
- `AUDIO_TRANSCRIPTION_PROMPT`, transcription instruction; falls back to `AUDIO_LLM_PROMPT`
|
||||||
- `WORKER_ID`, default hostname
|
- `WORKER_ID`, default hostname
|
||||||
- `WORKER_HTTP_HOST`, default `0.0.0.0`
|
- `WORKER_HTTP_HOST`, default `0.0.0.0`
|
||||||
- `WORKER_HTTP_PORT`, default `8081`
|
- `WORKER_HTTP_PORT`, default `8081`
|
||||||
|
|||||||
@@ -49,11 +49,11 @@ func main() {
|
|||||||
|
|
||||||
llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout)
|
llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout)
|
||||||
transcriber := transcription.NewWithOptions(transcription.Options{
|
transcriber := transcription.NewWithOptions(transcription.Options{
|
||||||
VoxtralBaseURL: cfg.VoxtralBaseURL,
|
AudioBaseURL: cfg.AudioBaseURL,
|
||||||
VoxtralAPIKey: cfg.VoxtralAPIKey,
|
AudioAPIKey: cfg.AudioAPIKey,
|
||||||
VoxtralModel: cfg.VoxtralModel,
|
AudioModel: cfg.AudioModel,
|
||||||
VoxtralTimeout: cfg.VoxtralTimeout,
|
AudioTimeout: cfg.AudioTimeout,
|
||||||
AudioLLMPrompt: cfg.AudioLLMPrompt,
|
AudioPrompt: cfg.AudioPrompt,
|
||||||
})
|
})
|
||||||
w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerTaskTypes, cfg.WorkerModelProfiles, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit)
|
w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerTaskTypes, cfg.WorkerModelProfiles, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit)
|
||||||
healthSrv := startHealthServer(ctx, db, cfg)
|
healthSrv := startHealthServer(ctx, db, cfg)
|
||||||
@@ -62,8 +62,8 @@ func main() {
|
|||||||
"worker_id", cfg.WorkerID,
|
"worker_id", cfg.WorkerID,
|
||||||
"model", cfg.LLMModel,
|
"model", cfg.LLMModel,
|
||||||
"transcription_enabled", transcriber != nil,
|
"transcription_enabled", transcriber != nil,
|
||||||
"transcription_provider", "voxtral-small",
|
"transcription_provider", transcription.ProviderWhisperLargeV3,
|
||||||
"transcription_model", cfg.VoxtralModel,
|
"transcription_model", cfg.AudioModel,
|
||||||
"task_types", cfg.WorkerTaskTypes,
|
"task_types", cfg.WorkerTaskTypes,
|
||||||
"model_profiles", cfg.WorkerModelProfiles,
|
"model_profiles", cfg.WorkerModelProfiles,
|
||||||
"poll_interval", cfg.WorkerPollInterval.String(),
|
"poll_interval", cfg.WorkerPollInterval.String(),
|
||||||
@@ -134,8 +134,8 @@ func (h workerHealth) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
|||||||
"worker_id": h.cfg.WorkerID,
|
"worker_id": h.cfg.WorkerID,
|
||||||
"task_types": h.cfg.WorkerTaskTypes,
|
"task_types": h.cfg.WorkerTaskTypes,
|
||||||
"model_profiles": h.cfg.WorkerModelProfiles,
|
"model_profiles": h.cfg.WorkerModelProfiles,
|
||||||
"transcription_provider": "voxtral-small",
|
"transcription_provider": transcription.ProviderWhisperLargeV3,
|
||||||
"transcription_model": h.cfg.VoxtralModel,
|
"transcription_model": h.cfg.AudioModel,
|
||||||
"claim_limit": h.cfg.WorkerClaimLimit,
|
"claim_limit": h.cfg.WorkerClaimLimit,
|
||||||
"poll_interval": h.cfg.WorkerPollInterval.String(),
|
"poll_interval": h.cfg.WorkerPollInterval.String(),
|
||||||
"lease_timeout": h.cfg.WorkerLeaseTimeout.String(),
|
"lease_timeout": h.cfg.WorkerLeaseTimeout.String(),
|
||||||
|
|||||||
@@ -1,12 +1,12 @@
|
|||||||
services:
|
services:
|
||||||
voxtral-small:
|
whisper-large-v3:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: .
|
||||||
dockerfile: vllm-audio.Dockerfile
|
dockerfile: vllm-audio.Dockerfile
|
||||||
image: vllm-audio:local
|
image: vllm-audio:local
|
||||||
container_name: voxtral-small
|
container_name: whisper-large-v3
|
||||||
profiles:
|
profiles:
|
||||||
- voxtral-small
|
- whisper-large-v3
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
ipc: host
|
ipc: host
|
||||||
runtime: nvidia
|
runtime: nvidia
|
||||||
@@ -29,32 +29,19 @@ services:
|
|||||||
- "10.2.3.5:8004:8000"
|
- "10.2.3.5:8004:8000"
|
||||||
command:
|
command:
|
||||||
- "--model"
|
- "--model"
|
||||||
- "mistralai/Voxtral-Small-24B-2507"
|
- "openai/whisper-large-v3"
|
||||||
- "--served-model-name"
|
- "--served-model-name"
|
||||||
- "mistralai/Voxtral-Small-24B-2507"
|
- "openai/whisper-large-v3"
|
||||||
- "--tokenizer-mode"
|
- "--task"
|
||||||
- "mistral"
|
- "transcription"
|
||||||
- "--config-format"
|
|
||||||
- "mistral"
|
|
||||||
- "--load-format"
|
|
||||||
- "mistral"
|
|
||||||
- "--tool-call-parser"
|
|
||||||
- "mistral"
|
|
||||||
- "--enable-auto-tool-choice"
|
|
||||||
- "--host"
|
- "--host"
|
||||||
- "0.0.0.0"
|
- "0.0.0.0"
|
||||||
- "--port"
|
- "--port"
|
||||||
- "8000"
|
- "8000"
|
||||||
- "--max-model-len"
|
|
||||||
- "16384"
|
|
||||||
- "--gpu-memory-utilization"
|
- "--gpu-memory-utilization"
|
||||||
- "0.55"
|
- "0.55"
|
||||||
- "--api-key"
|
- "--api-key"
|
||||||
- "${VLLM_API_KEY}"
|
- "${VLLM_API_KEY}"
|
||||||
- "--max-num-seqs"
|
|
||||||
- "1"
|
|
||||||
- "--max-num-batched-tokens"
|
|
||||||
- "4096"
|
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
|
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
|
|||||||
@@ -18,11 +18,11 @@ type Config struct {
|
|||||||
LLMAPIKey string
|
LLMAPIKey string
|
||||||
LLMModel string
|
LLMModel string
|
||||||
LLMTimeout time.Duration
|
LLMTimeout time.Duration
|
||||||
VoxtralBaseURL string
|
AudioBaseURL string
|
||||||
VoxtralAPIKey string
|
AudioAPIKey string
|
||||||
VoxtralModel string
|
AudioModel string
|
||||||
VoxtralTimeout time.Duration
|
AudioTimeout time.Duration
|
||||||
AudioLLMPrompt string
|
AudioPrompt string
|
||||||
AIStatsSidecarURL string
|
AIStatsSidecarURL string
|
||||||
AIStatsTimeout time.Duration
|
AIStatsTimeout time.Duration
|
||||||
|
|
||||||
@@ -48,11 +48,11 @@ func Load() Config {
|
|||||||
LLMAPIKey: envString("LLM_API_KEY", ""),
|
LLMAPIKey: envString("LLM_API_KEY", ""),
|
||||||
LLMModel: envString("LLM_MODEL", "qwen2.5-14b"),
|
LLMModel: envString("LLM_MODEL", "qwen2.5-14b"),
|
||||||
LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute),
|
LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute),
|
||||||
VoxtralBaseURL: envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
|
AudioBaseURL: envString("AUDIO_TRANSCRIPTION_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
|
||||||
VoxtralAPIKey: envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
|
AudioAPIKey: envString("AUDIO_TRANSCRIPTION_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
|
||||||
VoxtralModel: envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"),
|
AudioModel: envString("AUDIO_TRANSCRIPTION_MODEL", "openai/whisper-large-v3"),
|
||||||
VoxtralTimeout: envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
|
AudioTimeout: envDuration("AUDIO_TRANSCRIPTION_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
|
||||||
AudioLLMPrompt: envString("AUDIO_LLM_PROMPT", defaultAudioLLMPrompt()),
|
AudioPrompt: envString("AUDIO_TRANSCRIPTION_PROMPT", envString("AUDIO_LLM_PROMPT", defaultAudioPrompt())),
|
||||||
AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""),
|
AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""),
|
||||||
AIStatsTimeout: envDuration("AI_STATS_TIMEOUT", 8*time.Second),
|
AIStatsTimeout: envDuration("AI_STATS_TIMEOUT", 8*time.Second),
|
||||||
|
|
||||||
@@ -132,7 +132,7 @@ func envCSVDefault(key string, fallback []string) []string {
|
|||||||
return fallback
|
return fallback
|
||||||
}
|
}
|
||||||
|
|
||||||
func defaultAudioLLMPrompt() string {
|
func defaultAudioPrompt() string {
|
||||||
return "Расшифруй речь из аудио максимально точно. Сохрани русский язык, имена, телефоны, суммы и смысловые паузы. Не добавляй комментарии, анализ, Markdown или JSON. Верни только чистый текст расшифровки."
|
return "Расшифруй речь из аудио максимально точно. Сохрани русский язык, имена, телефоны, суммы и смысловые паузы. Не добавляй комментарии, анализ, Markdown или JSON. Верни только чистый текст расшифровки."
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"ai-service/internal/model"
|
"ai-service/internal/model"
|
||||||
|
"ai-service/internal/transcription"
|
||||||
)
|
)
|
||||||
|
|
||||||
type dashboardResponse struct {
|
type dashboardResponse struct {
|
||||||
@@ -51,7 +52,7 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) {
|
|||||||
At: now,
|
At: now,
|
||||||
Providers: []providerStatus{
|
Providers: []providerStatus{
|
||||||
s.checkLLM(ctx),
|
s.checkLLM(ctx),
|
||||||
s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
|
s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
Infra: loadInfraSnapshot(r, s.cfg),
|
Infra: loadInfraSnapshot(r, s.cfg),
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"ai-service/internal/transcription"
|
||||||
)
|
)
|
||||||
|
|
||||||
type providerStatus struct {
|
type providerStatus struct {
|
||||||
@@ -42,7 +44,7 @@ func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) {
|
|||||||
At: time.Now().UTC(),
|
At: time.Now().UTC(),
|
||||||
Providers: []providerStatus{
|
Providers: []providerStatus{
|
||||||
s.checkLLM(ctx),
|
s.checkLLM(ctx),
|
||||||
s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
|
s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
writeJSON(w, http.StatusOK, resp)
|
writeJSON(w, http.StatusOK, resp)
|
||||||
|
|||||||
@@ -19,16 +19,19 @@ type Client struct {
|
|||||||
http *http.Client
|
http *http.Client
|
||||||
}
|
}
|
||||||
|
|
||||||
const ProviderVoxtral = "voxtral-small"
|
const (
|
||||||
|
ProviderWhisperLargeV3 = "whisper-large-v3"
|
||||||
|
defaultWhisperModel = "openai/whisper-large-v3"
|
||||||
|
)
|
||||||
|
|
||||||
var speakerLabelPattern = regexp.MustCompile(`(?i)(?:^|[\n\r ]+)((?:speaker|спикер|говорящий)\s*\d+)\s*[::-]`)
|
var speakerLabelPattern = regexp.MustCompile(`(?i)(?:^|[\n\r ]+)((?:speaker|спикер|говорящий)\s*\d+)\s*[::-]`)
|
||||||
|
|
||||||
type Options struct {
|
type Options struct {
|
||||||
VoxtralBaseURL string
|
AudioBaseURL string
|
||||||
VoxtralAPIKey string
|
AudioAPIKey string
|
||||||
VoxtralModel string
|
AudioModel string
|
||||||
VoxtralTimeout time.Duration
|
AudioTimeout time.Duration
|
||||||
AudioLLMPrompt string
|
AudioPrompt string
|
||||||
}
|
}
|
||||||
|
|
||||||
type ProviderConfig struct {
|
type ProviderConfig struct {
|
||||||
@@ -102,17 +105,17 @@ type audioTranscriptionSegment struct {
|
|||||||
|
|
||||||
func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client {
|
func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client {
|
||||||
return NewWithOptions(Options{
|
return NewWithOptions(Options{
|
||||||
VoxtralBaseURL: baseURL,
|
AudioBaseURL: baseURL,
|
||||||
VoxtralTimeout: timeout,
|
AudioTimeout: timeout,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewWithOptions(opts Options) *Client {
|
func NewWithOptions(opts Options) *Client {
|
||||||
audioLLMPrompt := strings.TrimSpace(opts.AudioLLMPrompt)
|
audioPrompt := strings.TrimSpace(opts.AudioPrompt)
|
||||||
if audioLLMPrompt == "" {
|
if audioPrompt == "" {
|
||||||
audioLLMPrompt = "Transcribe the audio exactly. Return only the transcript text."
|
audioPrompt = "Transcribe the audio exactly. Return only the transcript text."
|
||||||
}
|
}
|
||||||
provider := buildVoxtralProvider(opts, audioLLMPrompt)
|
provider := buildAudioProvider(opts, audioPrompt)
|
||||||
if provider.BaseURL == "" {
|
if provider.BaseURL == "" {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -122,18 +125,18 @@ func NewWithOptions(opts Options) *Client {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildVoxtralProvider(opts Options, prompt string) ProviderConfig {
|
func buildAudioProvider(opts Options, prompt string) ProviderConfig {
|
||||||
baseURL := strings.TrimRight(strings.TrimSpace(opts.VoxtralBaseURL), "/")
|
baseURL := strings.TrimRight(strings.TrimSpace(opts.AudioBaseURL), "/")
|
||||||
if baseURL == "" {
|
if baseURL == "" {
|
||||||
return ProviderConfig{}
|
return ProviderConfig{}
|
||||||
}
|
}
|
||||||
model := firstNonEmpty(opts.VoxtralModel, "mistralai/Voxtral-Small-24B-2507")
|
model := firstNonEmpty(opts.AudioModel, defaultWhisperModel)
|
||||||
return ProviderConfig{
|
return ProviderConfig{
|
||||||
Name: ProviderVoxtral,
|
Name: ProviderWhisperLargeV3,
|
||||||
BaseURL: baseURL,
|
BaseURL: baseURL,
|
||||||
APIKey: strings.TrimSpace(opts.VoxtralAPIKey),
|
APIKey: strings.TrimSpace(opts.AudioAPIKey),
|
||||||
Model: model,
|
Model: model,
|
||||||
Timeout: defaultDuration(opts.VoxtralTimeout, 10*time.Minute),
|
Timeout: defaultDuration(opts.AudioTimeout, 10*time.Minute),
|
||||||
Prompt: prompt,
|
Prompt: prompt,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -147,7 +150,7 @@ func defaultDuration(v, fallback time.Duration) time.Duration {
|
|||||||
|
|
||||||
func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
|
func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
|
||||||
if c == nil || c.provider.BaseURL == "" {
|
if c == nil || c.provider.BaseURL == "" {
|
||||||
return nil, fmt.Errorf("voxtral transcription provider not configured")
|
return nil, fmt.Errorf("audio transcription provider not configured")
|
||||||
}
|
}
|
||||||
if strings.TrimSpace(in.AudioURL) == "" {
|
if strings.TrimSpace(in.AudioURL) == "" {
|
||||||
return nil, fmt.Errorf("audio_url is required")
|
return nil, fmt.Errorf("audio_url is required")
|
||||||
|
|||||||
@@ -7,22 +7,22 @@ import (
|
|||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestNewWithOptionsBuildsVoxtralProvider(t *testing.T) {
|
func TestNewWithOptionsBuildsWhisperProvider(t *testing.T) {
|
||||||
client := NewWithOptions(Options{
|
client := NewWithOptions(Options{
|
||||||
VoxtralBaseURL: "http://voxtral",
|
AudioBaseURL: "http://whisper",
|
||||||
})
|
})
|
||||||
if client == nil {
|
if client == nil {
|
||||||
t.Fatal("client is nil")
|
t.Fatal("client is nil")
|
||||||
}
|
}
|
||||||
if client.provider.Name != ProviderVoxtral {
|
if client.provider.Name != ProviderWhisperLargeV3 {
|
||||||
t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderVoxtral)
|
t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderWhisperLargeV3)
|
||||||
}
|
}
|
||||||
if client.provider.Model != "mistralai/Voxtral-Small-24B-2507" {
|
if client.provider.Model != "openai/whisper-large-v3" {
|
||||||
t.Fatalf("model = %q", client.provider.Model)
|
t.Fatalf("model = %q", client.provider.Model)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||||
audioSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
audioSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
_, _ = w.Write([]byte("fake audio"))
|
_, _ = w.Write([]byte("fake audio"))
|
||||||
}))
|
}))
|
||||||
@@ -50,8 +50,8 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
|||||||
defer providerSrv.Close()
|
defer providerSrv.Close()
|
||||||
|
|
||||||
client := NewWithOptions(Options{
|
client := NewWithOptions(Options{
|
||||||
VoxtralBaseURL: providerSrv.URL,
|
AudioBaseURL: providerSrv.URL,
|
||||||
VoxtralModel: "mistralai/Voxtral-Small-24B-2507",
|
AudioModel: "openai/whisper-large-v3",
|
||||||
})
|
})
|
||||||
if client == nil {
|
if client == nil {
|
||||||
t.Fatal("client is nil")
|
t.Fatal("client is nil")
|
||||||
@@ -63,7 +63,7 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
|||||||
if gotPath != "/v1/audio/transcriptions" {
|
if gotPath != "/v1/audio/transcriptions" {
|
||||||
t.Fatalf("path = %q, want /v1/audio/transcriptions", gotPath)
|
t.Fatalf("path = %q, want /v1/audio/transcriptions", gotPath)
|
||||||
}
|
}
|
||||||
if gotModel != "mistralai/Voxtral-Small-24B-2507" {
|
if gotModel != "openai/whisper-large-v3" {
|
||||||
t.Fatalf("model = %q", gotModel)
|
t.Fatalf("model = %q", gotModel)
|
||||||
}
|
}
|
||||||
if gotResponseFormat != "json" {
|
if gotResponseFormat != "json" {
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ const (
|
|||||||
TaskCallAnalysis = "call_analysis"
|
TaskCallAnalysis = "call_analysis"
|
||||||
TaskTranscription = "transcription"
|
TaskTranscription = "transcription"
|
||||||
|
|
||||||
TranscriptionProfile = "voxtral-small"
|
TranscriptionProfile = "whisper-large-v3"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Worker struct {
|
type Worker struct {
|
||||||
|
|||||||
@@ -11,11 +11,11 @@ data:
|
|||||||
LLM_BASE_URL: "http://10.2.3.5:8002"
|
LLM_BASE_URL: "http://10.2.3.5:8002"
|
||||||
LLM_MODEL: "qwen2.5-14b"
|
LLM_MODEL: "qwen2.5-14b"
|
||||||
LLM_TIMEOUT: "5m"
|
LLM_TIMEOUT: "5m"
|
||||||
# Voxtral Small is the only transcription provider. It is exposed on the AI
|
# Whisper Large v3 is exposed on the AI server through an OpenAI-compatible
|
||||||
# server through an OpenAI-compatible /v1/audio/transcriptions endpoint.
|
# /v1/audio/transcriptions endpoint.
|
||||||
VOXTRAL_BASE_URL: "http://10.2.3.5:8004"
|
AUDIO_TRANSCRIPTION_BASE_URL: "http://10.2.3.5:8004"
|
||||||
VOXTRAL_MODEL: "mistralai/Voxtral-Small-24B-2507"
|
AUDIO_TRANSCRIPTION_MODEL: "openai/whisper-large-v3"
|
||||||
VOXTRAL_TIMEOUT: "30m"
|
AUDIO_TRANSCRIPTION_TIMEOUT: "30m"
|
||||||
AI_STATS_SIDECAR_URL: "http://10.2.3.5:9090"
|
AI_STATS_SIDECAR_URL: "http://10.2.3.5:9090"
|
||||||
AI_STATS_TIMEOUT: "8s"
|
AI_STATS_TIMEOUT: "8s"
|
||||||
WORKER_POLL_INTERVAL: "2s"
|
WORKER_POLL_INTERVAL: "2s"
|
||||||
|
|||||||
@@ -18,5 +18,5 @@ type: Opaque
|
|||||||
stringData:
|
stringData:
|
||||||
DATABASE_URL: "postgres://ai_service:ai_service@postgres:5432/ai_service?sslmode=disable"
|
DATABASE_URL: "postgres://ai_service:ai_service@postgres:5432/ai_service?sslmode=disable"
|
||||||
LLM_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
|
LLM_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
|
||||||
VOXTRAL_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
|
AUDIO_TRANSCRIPTION_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
|
||||||
AI_SERVICE_TOKEN: "d18bcacf9e02bae1806ee6b6eeda62b95be6a915c0a22936d9a700128b275442"
|
AI_SERVICE_TOKEN: "d18bcacf9e02bae1806ee6b6eeda62b95be6a915c0a22936d9a700128b275442"
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ spec:
|
|||||||
- name: WORKER_TASK_TYPES
|
- name: WORKER_TASK_TYPES
|
||||||
value: "transcription"
|
value: "transcription"
|
||||||
- name: WORKER_MODEL_PROFILES
|
- name: WORKER_MODEL_PROFILES
|
||||||
value: "voxtral-small"
|
value: "whisper-large-v3"
|
||||||
- name: WORKER_CLAIM_LIMIT
|
- name: WORKER_CLAIM_LIMIT
|
||||||
value: "2"
|
value: "2"
|
||||||
- name: WORKER_LEASE_TIMEOUT
|
- name: WORKER_LEASE_TIMEOUT
|
||||||
|
|||||||
Reference in New Issue
Block a user