feat: expand ai retry policy
This commit is contained in:
16
README.md
16
README.md
@@ -85,6 +85,8 @@ scheduling.
|
||||
returning secrets.
|
||||
- `GET /api/v1/infra/status` returns AI-server sidecar telemetry
|
||||
(GPU, containers and vLLM live metrics) when configured.
|
||||
- `GET /health/detail` returns PostgreSQL, provider, queue, error, throughput
|
||||
and infra components for Portal `admin/health`.
|
||||
- `GET /healthz` returns process health.
|
||||
- `GET /readyz` checks PostgreSQL readiness.
|
||||
- Built-in workers expose open Kubernetes endpoints on `WORKER_HTTP_PORT`:
|
||||
@@ -94,6 +96,20 @@ All `/api/v1/*` endpoints require `Authorization: Bearer <AI_SERVICE_TOKEN>`
|
||||
when `AI_SERVICE_TOKEN` is configured. Health and readiness endpoints stay open
|
||||
for Kubernetes probes.
|
||||
|
||||
## Retry policy
|
||||
|
||||
Workers store a normalized `error_code` on failed jobs. AI Service requeues only
|
||||
explicitly retryable categories while attempts remain; any `error_code` missing from the table below is treated as non-retryable.
|
||||
|
||||
| Category | Retry | Delay |
|
||||
| --- | --- | --- |
|
||||
| `provider_unavailable`, `model_unavailable`, `provider_error`, `dependency_error`, `timeout`, `storage_error`, `stale_worker` | yes | 30s |
|
||||
| `bad_response`, `transcript_hallucination`, `transcript_incomplete`, `internal_error`, `unknown` | yes | 2m |
|
||||
| `bad_audio`, `bad_input`, `context_length`, `unsupported_task`, `cancelled` | no | - |
|
||||
|
||||
Domain services may still expose manual retry for terminal errors after the
|
||||
underlying data or prompt is corrected.
|
||||
|
||||
## Configuration
|
||||
|
||||
- `HTTP_HOST`, default `0.0.0.0`
|
||||
|
||||
@@ -12,9 +12,9 @@ type failRetryPolicy struct {
|
||||
|
||||
func retryPolicyForError(errorCode string) failRetryPolicy {
|
||||
switch strings.TrimSpace(errorCode) {
|
||||
case "provider_unavailable", "model_unavailable", "timeout", "storage_error", "stale_worker":
|
||||
case "provider_unavailable", "model_unavailable", "provider_error", "dependency_error", "timeout", "storage_error", "stale_worker":
|
||||
return failRetryPolicy{Retryable: true, Delay: 30 * time.Second}
|
||||
case "bad_response", "unknown":
|
||||
case "bad_response", "transcript_hallucination", "transcript_incomplete", "internal_error", "unknown":
|
||||
return failRetryPolicy{Retryable: true, Delay: 2 * time.Minute}
|
||||
default:
|
||||
return failRetryPolicy{}
|
||||
|
||||
@@ -14,13 +14,21 @@ func TestRetryPolicyForError(t *testing.T) {
|
||||
}{
|
||||
{name: "provider unavailable", code: "provider_unavailable", retryable: true, delay: 30 * time.Second},
|
||||
{name: "model unavailable", code: "model_unavailable", retryable: true, delay: 30 * time.Second},
|
||||
{name: "provider error", code: "provider_error", retryable: true, delay: 30 * time.Second},
|
||||
{name: "dependency error", code: "dependency_error", retryable: true, delay: 30 * time.Second},
|
||||
{name: "timeout", code: "timeout", retryable: true, delay: 30 * time.Second},
|
||||
{name: "storage", code: "storage_error", retryable: true, delay: 30 * time.Second},
|
||||
{name: "stale worker", code: "stale_worker", retryable: true, delay: 30 * time.Second},
|
||||
{name: "bad response", code: "bad_response", retryable: true, delay: 2 * time.Minute},
|
||||
{name: "transcript hallucination", code: "transcript_hallucination", retryable: true, delay: 2 * time.Minute},
|
||||
{name: "transcript incomplete", code: "transcript_incomplete", retryable: true, delay: 2 * time.Minute},
|
||||
{name: "internal error", code: "internal_error", retryable: true, delay: 2 * time.Minute},
|
||||
{name: "unknown", code: "unknown", retryable: true, delay: 2 * time.Minute},
|
||||
{name: "bad audio", code: "bad_audio"},
|
||||
{name: "bad input", code: "bad_input"},
|
||||
{name: "context length", code: "context_length"},
|
||||
{name: "unsupported task", code: "unsupported_task"},
|
||||
{name: "cancelled", code: "cancelled"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
|
||||
Reference in New Issue
Block a user