Add automatic retry policy for AI jobs
Some checks failed
CI / test (push) Failing after 9s
Build and Deploy / build-and-deploy (push) Successful in 33s

This commit is contained in:
Grendgi
2026-06-10 15:58:41 +03:00
parent f54400e8e2
commit 2a481fdc54
3 changed files with 67 additions and 4 deletions

View File

@@ -548,19 +548,23 @@ func (s *Store) FailJob(ctx context.Context, id uuid.UUID, in model.FailJob) (*m
errorCode = "unknown"
}
errorMessage := strings.TrimSpace(in.ErrorMessage)
policy := retryPolicyForError(errorCode)
const q = `
UPDATE ai_jobs
SET status = 'failed',
SET status = CASE WHEN $4 AND attempts < max_attempts THEN 'pending' ELSE 'failed' END,
error_code = $2,
error_message = $3,
completed_at = NOW(),
heartbeat_at = NOW(),
scheduled_at = CASE WHEN $4 AND attempts < max_attempts THEN NOW() + make_interval(secs => $5) ELSE scheduled_at END,
started_at = CASE WHEN $4 AND attempts < max_attempts THEN NULL ELSE started_at END,
completed_at = CASE WHEN $4 AND attempts < max_attempts THEN NULL ELSE NOW() END,
worker_id = NULL,
heartbeat_at = CASE WHEN $4 AND attempts < max_attempts THEN NULL ELSE NOW() END,
updated_at = NOW()
WHERE id = $1
AND status = 'running'
RETURNING ` + jobSelectColumns + `
`
job, err := scanJob(s.pool.QueryRow(ctx, q, id, errorCode, errorMessage))
job, err := scanJob(s.pool.QueryRow(ctx, q, id, errorCode, errorMessage, policy.Retryable, int(policy.Delay.Seconds())))
if errors.Is(err, pgx.ErrNoRows) {
return nil, nil
}