Add automatic retry policy for AI jobs
This commit is contained in:
22
internal/store/retry_policy.go
Normal file
22
internal/store/retry_policy.go
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
package store
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
type failRetryPolicy struct {
|
||||||
|
Retryable bool
|
||||||
|
Delay time.Duration
|
||||||
|
}
|
||||||
|
|
||||||
|
func retryPolicyForError(errorCode string) failRetryPolicy {
|
||||||
|
switch strings.TrimSpace(errorCode) {
|
||||||
|
case "provider_unavailable", "model_unavailable", "timeout", "storage_error", "stale_worker":
|
||||||
|
return failRetryPolicy{Retryable: true, Delay: 30 * time.Second}
|
||||||
|
case "bad_response", "unknown":
|
||||||
|
return failRetryPolicy{Retryable: true, Delay: 2 * time.Minute}
|
||||||
|
default:
|
||||||
|
return failRetryPolicy{}
|
||||||
|
}
|
||||||
|
}
|
||||||
37
internal/store/retry_policy_test.go
Normal file
37
internal/store/retry_policy_test.go
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
package store
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestRetryPolicyForError(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
code string
|
||||||
|
retryable bool
|
||||||
|
delay time.Duration
|
||||||
|
}{
|
||||||
|
{name: "provider unavailable", code: "provider_unavailable", retryable: true, delay: 30 * time.Second},
|
||||||
|
{name: "model unavailable", code: "model_unavailable", retryable: true, delay: 30 * time.Second},
|
||||||
|
{name: "timeout", code: "timeout", retryable: true, delay: 30 * time.Second},
|
||||||
|
{name: "storage", code: "storage_error", retryable: true, delay: 30 * time.Second},
|
||||||
|
{name: "bad response", code: "bad_response", retryable: true, delay: 2 * time.Minute},
|
||||||
|
{name: "unknown", code: "unknown", retryable: true, delay: 2 * time.Minute},
|
||||||
|
{name: "bad audio", code: "bad_audio"},
|
||||||
|
{name: "bad input", code: "bad_input"},
|
||||||
|
{name: "context length", code: "context_length"},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
got := retryPolicyForError(tt.code)
|
||||||
|
if got.Retryable != tt.retryable {
|
||||||
|
t.Fatalf("Retryable = %v, want %v", got.Retryable, tt.retryable)
|
||||||
|
}
|
||||||
|
if got.Delay != tt.delay {
|
||||||
|
t.Fatalf("Delay = %s, want %s", got.Delay, tt.delay)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -548,19 +548,23 @@ func (s *Store) FailJob(ctx context.Context, id uuid.UUID, in model.FailJob) (*m
|
|||||||
errorCode = "unknown"
|
errorCode = "unknown"
|
||||||
}
|
}
|
||||||
errorMessage := strings.TrimSpace(in.ErrorMessage)
|
errorMessage := strings.TrimSpace(in.ErrorMessage)
|
||||||
|
policy := retryPolicyForError(errorCode)
|
||||||
const q = `
|
const q = `
|
||||||
UPDATE ai_jobs
|
UPDATE ai_jobs
|
||||||
SET status = 'failed',
|
SET status = CASE WHEN $4 AND attempts < max_attempts THEN 'pending' ELSE 'failed' END,
|
||||||
error_code = $2,
|
error_code = $2,
|
||||||
error_message = $3,
|
error_message = $3,
|
||||||
completed_at = NOW(),
|
scheduled_at = CASE WHEN $4 AND attempts < max_attempts THEN NOW() + make_interval(secs => $5) ELSE scheduled_at END,
|
||||||
heartbeat_at = NOW(),
|
started_at = CASE WHEN $4 AND attempts < max_attempts THEN NULL ELSE started_at END,
|
||||||
|
completed_at = CASE WHEN $4 AND attempts < max_attempts THEN NULL ELSE NOW() END,
|
||||||
|
worker_id = NULL,
|
||||||
|
heartbeat_at = CASE WHEN $4 AND attempts < max_attempts THEN NULL ELSE NOW() END,
|
||||||
updated_at = NOW()
|
updated_at = NOW()
|
||||||
WHERE id = $1
|
WHERE id = $1
|
||||||
AND status = 'running'
|
AND status = 'running'
|
||||||
RETURNING ` + jobSelectColumns + `
|
RETURNING ` + jobSelectColumns + `
|
||||||
`
|
`
|
||||||
job, err := scanJob(s.pool.QueryRow(ctx, q, id, errorCode, errorMessage))
|
job, err := scanJob(s.pool.QueryRow(ctx, q, id, errorCode, errorMessage, policy.Retryable, int(policy.Delay.Seconds())))
|
||||||
if errors.Is(err, pgx.ErrNoRows) {
|
if errors.Is(err, pgx.ErrNoRows) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user