feat: expand ai retry policy
This commit is contained in:
16
README.md
16
README.md
@@ -85,6 +85,8 @@ scheduling.
|
|||||||
returning secrets.
|
returning secrets.
|
||||||
- `GET /api/v1/infra/status` returns AI-server sidecar telemetry
|
- `GET /api/v1/infra/status` returns AI-server sidecar telemetry
|
||||||
(GPU, containers and vLLM live metrics) when configured.
|
(GPU, containers and vLLM live metrics) when configured.
|
||||||
|
- `GET /health/detail` returns PostgreSQL, provider, queue, error, throughput
|
||||||
|
and infra components for Portal `admin/health`.
|
||||||
- `GET /healthz` returns process health.
|
- `GET /healthz` returns process health.
|
||||||
- `GET /readyz` checks PostgreSQL readiness.
|
- `GET /readyz` checks PostgreSQL readiness.
|
||||||
- Built-in workers expose open Kubernetes endpoints on `WORKER_HTTP_PORT`:
|
- Built-in workers expose open Kubernetes endpoints on `WORKER_HTTP_PORT`:
|
||||||
@@ -94,6 +96,20 @@ All `/api/v1/*` endpoints require `Authorization: Bearer <AI_SERVICE_TOKEN>`
|
|||||||
when `AI_SERVICE_TOKEN` is configured. Health and readiness endpoints stay open
|
when `AI_SERVICE_TOKEN` is configured. Health and readiness endpoints stay open
|
||||||
for Kubernetes probes.
|
for Kubernetes probes.
|
||||||
|
|
||||||
|
## Retry policy
|
||||||
|
|
||||||
|
Workers store a normalized `error_code` on failed jobs. AI Service requeues only
|
||||||
|
explicitly retryable categories while attempts remain; any error code not listed as retryable below is treated as terminal.
|
||||||
|
|
||||||
|
| Category | Retry | Delay |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `provider_unavailable`, `model_unavailable`, `provider_error`, `dependency_error`, `timeout`, `storage_error`, `stale_worker` | yes | 30s |
|
||||||
|
| `bad_response`, `transcript_hallucination`, `transcript_incomplete`, `internal_error`, `unknown` | yes | 2m |
|
||||||
|
| `bad_audio`, `bad_input`, `context_length`, `unsupported_task`, `cancelled` | no | - |
|
||||||
|
|
||||||
|
Domain services may still expose manual retry for terminal errors after the
|
||||||
|
underlying data or prompt is corrected.
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|
||||||
- `HTTP_HOST`, default `0.0.0.0`
|
- `HTTP_HOST`, default `0.0.0.0`
|
||||||
|
|||||||
@@ -12,9 +12,9 @@ type failRetryPolicy struct {
|
|||||||
|
|
||||||
func retryPolicyForError(errorCode string) failRetryPolicy {
|
func retryPolicyForError(errorCode string) failRetryPolicy {
|
||||||
switch strings.TrimSpace(errorCode) {
|
switch strings.TrimSpace(errorCode) {
|
||||||
case "provider_unavailable", "model_unavailable", "timeout", "storage_error", "stale_worker":
|
case "provider_unavailable", "model_unavailable", "provider_error", "dependency_error", "timeout", "storage_error", "stale_worker":
|
||||||
return failRetryPolicy{Retryable: true, Delay: 30 * time.Second}
|
return failRetryPolicy{Retryable: true, Delay: 30 * time.Second}
|
||||||
case "bad_response", "unknown":
|
case "bad_response", "transcript_hallucination", "transcript_incomplete", "internal_error", "unknown":
|
||||||
return failRetryPolicy{Retryable: true, Delay: 2 * time.Minute}
|
return failRetryPolicy{Retryable: true, Delay: 2 * time.Minute}
|
||||||
default:
|
default:
|
||||||
return failRetryPolicy{}
|
return failRetryPolicy{}
|
||||||
|
|||||||
@@ -14,13 +14,21 @@ func TestRetryPolicyForError(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{name: "provider unavailable", code: "provider_unavailable", retryable: true, delay: 30 * time.Second},
|
{name: "provider unavailable", code: "provider_unavailable", retryable: true, delay: 30 * time.Second},
|
||||||
{name: "model unavailable", code: "model_unavailable", retryable: true, delay: 30 * time.Second},
|
{name: "model unavailable", code: "model_unavailable", retryable: true, delay: 30 * time.Second},
|
||||||
|
{name: "provider error", code: "provider_error", retryable: true, delay: 30 * time.Second},
|
||||||
|
{name: "dependency error", code: "dependency_error", retryable: true, delay: 30 * time.Second},
|
||||||
{name: "timeout", code: "timeout", retryable: true, delay: 30 * time.Second},
|
{name: "timeout", code: "timeout", retryable: true, delay: 30 * time.Second},
|
||||||
{name: "storage", code: "storage_error", retryable: true, delay: 30 * time.Second},
|
{name: "storage", code: "storage_error", retryable: true, delay: 30 * time.Second},
|
||||||
|
{name: "stale worker", code: "stale_worker", retryable: true, delay: 30 * time.Second},
|
||||||
{name: "bad response", code: "bad_response", retryable: true, delay: 2 * time.Minute},
|
{name: "bad response", code: "bad_response", retryable: true, delay: 2 * time.Minute},
|
||||||
|
{name: "transcript hallucination", code: "transcript_hallucination", retryable: true, delay: 2 * time.Minute},
|
||||||
|
{name: "transcript incomplete", code: "transcript_incomplete", retryable: true, delay: 2 * time.Minute},
|
||||||
|
{name: "internal error", code: "internal_error", retryable: true, delay: 2 * time.Minute},
|
||||||
{name: "unknown", code: "unknown", retryable: true, delay: 2 * time.Minute},
|
{name: "unknown", code: "unknown", retryable: true, delay: 2 * time.Minute},
|
||||||
{name: "bad audio", code: "bad_audio"},
|
{name: "bad audio", code: "bad_audio"},
|
||||||
{name: "bad input", code: "bad_input"},
|
{name: "bad input", code: "bad_input"},
|
||||||
{name: "context length", code: "context_length"},
|
{name: "context length", code: "context_length"},
|
||||||
|
{name: "unsupported task", code: "unsupported_task"},
|
||||||
|
{name: "cancelled", code: "cancelled"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
|
|||||||
Reference in New Issue
Block a user