Requeue stale AI jobs
All checks were successful
CI / test (push) Successful in 12s
Build and Deploy / build-and-deploy (push) Successful in 27s

This commit is contained in:
Grendgi
2026-06-08 13:54:07 +03:00
parent 24c5d89c7b
commit 59e1073d96
6 changed files with 55 additions and 2 deletions

View File

@@ -264,6 +264,44 @@ RETURNING ` + jobSelectColumns + `
return job, err
}
// RequeueStaleRunning releases jobs that are stuck in the 'running' state.
//
// A job is considered stale when the first non-NULL value of heartbeat_at,
// started_at and updated_at is older than olderThan. Stale jobs are handled
// oldest first, at most limit per call:
//
//   - jobs with attempts < max_attempts go back to 'pending', have their error
//     fields and completed_at cleared, and are scheduled for NOW();
//   - all other jobs are marked 'failed' with error_code 'stale_worker' and
//     completed_at set to NOW().
//
// In both cases worker_id and heartbeat_at are cleared and updated_at is
// bumped.
//
// A non-positive olderThan defaults to 15 minutes. A non-positive limit
// defaults to 100, and limit is capped at 1000.
//
// It returns the number of rows that were updated.
func (s *Store) RequeueStaleRunning(ctx context.Context, olderThan time.Duration, limit int) (int, error) {
	if olderThan <= 0 {
		olderThan = 15 * time.Minute
	}
	if limit <= 0 {
		limit = 100
	}
	if limit > 1000 {
		limit = 1000
	}

	// FOR UPDATE SKIP LOCKED locks the candidate rows before they are rewritten
	// and skips rows another transaction is modifying right now (for example a
	// worker that is writing a heartbeat or a final result), so those rows are
	// left alone instead of being overwritten. The status check on the UPDATE
	// guards against touching a row that is no longer 'running'.
	const q = `
WITH picked AS (
    SELECT id
    FROM ai_jobs
    WHERE status = 'running'
      AND COALESCE(heartbeat_at, started_at, updated_at) < NOW() - make_interval(secs => $1)
    ORDER BY COALESCE(heartbeat_at, started_at, updated_at) ASC
    LIMIT $2
    FOR UPDATE SKIP LOCKED
)
UPDATE ai_jobs j
SET status = CASE WHEN j.attempts < j.max_attempts THEN 'pending' ELSE 'failed' END,
    error_code = CASE WHEN j.attempts < j.max_attempts THEN NULL ELSE 'stale_worker' END,
    error_message = CASE WHEN j.attempts < j.max_attempts THEN NULL ELSE 'worker lease expired' END,
    worker_id = NULL,
    heartbeat_at = NULL,
    completed_at = CASE WHEN j.attempts < j.max_attempts THEN NULL ELSE NOW() END,
    scheduled_at = CASE WHEN j.attempts < j.max_attempts THEN NOW() ELSE j.scheduled_at END,
    updated_at = NOW()
FROM picked
WHERE j.id = picked.id
  AND j.status = 'running'
`

	// make_interval's secs argument is double precision, so pass fractional
	// seconds. Truncating to int would turn any sub-second olderThan into a
	// zero-length interval and make every running job look stale.
	tag, err := s.pool.Exec(ctx, q, olderThan.Seconds(), limit)
	if err != nil {
		return 0, err
	}
	return int(tag.RowsAffected()), nil
}
func (s *Store) Stats(ctx context.Context) (*model.Stats, error) {
out := &model.Stats{At: time.Now().UTC()}