From f32265400b01146ea8b42373fbce5f5c98093326 Mon Sep 17 00:00:00 2001 From: Grendgi Date: Wed, 17 Jun 2026 16:39:58 +0300 Subject: [PATCH] feat: expand ai retry policy --- README.md | 16 ++++++++++++++++ internal/store/retry_policy.go | 4 ++-- internal/store/retry_policy_test.go | 8 ++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c310407..5128c42 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,8 @@ scheduling. returning secrets. - `GET /api/v1/infra/status` returns AI-server sidecar telemetry (GPU, containers and vLLM live metrics) when configured. +- `GET /health/detail` returns PostgreSQL, provider, queue, error, throughput + and infra components for Portal `admin/health`. - `GET /healthz` returns process health. - `GET /readyz` checks PostgreSQL readiness. - Built-in workers expose open Kubernetes endpoints on `WORKER_HTTP_PORT`: @@ -94,6 +96,20 @@ All `/api/v1/*` endpoints require `Authorization: Bearer <token>` when `AI_SERVICE_TOKEN` is configured. Health and readiness endpoints stay open for Kubernetes probes. +## Retry policy + +Workers store a normalized `error_code` on failed jobs. AI Service requeues only +explicitly retryable categories while attempts remain. + +| Category | Retry | Delay | +| --- | --- | --- | +| `provider_unavailable`, `model_unavailable`, `provider_error`, `dependency_error`, `timeout`, `storage_error`, `stale_worker` | yes | 30s | +| `bad_response`, `transcript_hallucination`, `transcript_incomplete`, `internal_error`, `unknown` | yes | 2m | +| `bad_audio`, `bad_input`, `context_length`, `unsupported_task`, `cancelled` | no | - | + +Domain services may still expose manual retry for terminal errors after the +underlying data or prompt is corrected. 
+ ## Configuration - `HTTP_HOST`, default `0.0.0.0` diff --git a/internal/store/retry_policy.go b/internal/store/retry_policy.go index 58a6a70..d89c743 100644 --- a/internal/store/retry_policy.go +++ b/internal/store/retry_policy.go @@ -12,9 +12,9 @@ type failRetryPolicy struct { func retryPolicyForError(errorCode string) failRetryPolicy { switch strings.TrimSpace(errorCode) { - case "provider_unavailable", "model_unavailable", "timeout", "storage_error", "stale_worker": + case "provider_unavailable", "model_unavailable", "provider_error", "dependency_error", "timeout", "storage_error", "stale_worker": return failRetryPolicy{Retryable: true, Delay: 30 * time.Second} - case "bad_response", "unknown": + case "bad_response", "transcript_hallucination", "transcript_incomplete", "internal_error", "unknown": return failRetryPolicy{Retryable: true, Delay: 2 * time.Minute} default: return failRetryPolicy{} diff --git a/internal/store/retry_policy_test.go b/internal/store/retry_policy_test.go index 429ba04..9f1d135 100644 --- a/internal/store/retry_policy_test.go +++ b/internal/store/retry_policy_test.go @@ -14,13 +14,21 @@ func TestRetryPolicyForError(t *testing.T) { }{ {name: "provider unavailable", code: "provider_unavailable", retryable: true, delay: 30 * time.Second}, {name: "model unavailable", code: "model_unavailable", retryable: true, delay: 30 * time.Second}, + {name: "provider error", code: "provider_error", retryable: true, delay: 30 * time.Second}, + {name: "dependency error", code: "dependency_error", retryable: true, delay: 30 * time.Second}, {name: "timeout", code: "timeout", retryable: true, delay: 30 * time.Second}, {name: "storage", code: "storage_error", retryable: true, delay: 30 * time.Second}, + {name: "stale worker", code: "stale_worker", retryable: true, delay: 30 * time.Second}, {name: "bad response", code: "bad_response", retryable: true, delay: 2 * time.Minute}, + {name: "transcript hallucination", code: "transcript_hallucination", retryable: 
true, delay: 2 * time.Minute}, + {name: "transcript incomplete", code: "transcript_incomplete", retryable: true, delay: 2 * time.Minute}, + {name: "internal error", code: "internal_error", retryable: true, delay: 2 * time.Minute}, {name: "unknown", code: "unknown", retryable: true, delay: 2 * time.Minute}, {name: "bad audio", code: "bad_audio"}, {name: "bad input", code: "bad_input"}, {name: "context length", code: "context_length"}, + {name: "unsupported task", code: "unsupported_task"}, + {name: "cancelled", code: "cancelled"}, } for _, tt := range tests {