Requeue stale AI jobs
This commit is contained in:
@@ -264,6 +264,44 @@ RETURNING ` + jobSelectColumns + `
|
||||
return job, err
|
||||
}
|
||||
|
||||
func (s *Store) RequeueStaleRunning(ctx context.Context, olderThan time.Duration, limit int) (int, error) {
|
||||
if olderThan <= 0 {
|
||||
olderThan = 15 * time.Minute
|
||||
}
|
||||
if limit <= 0 {
|
||||
limit = 100
|
||||
}
|
||||
if limit > 1000 {
|
||||
limit = 1000
|
||||
}
|
||||
const q = `
|
||||
WITH picked AS (
|
||||
SELECT id
|
||||
FROM ai_jobs
|
||||
WHERE status = 'running'
|
||||
AND COALESCE(heartbeat_at, started_at, updated_at) < NOW() - make_interval(secs => $1)
|
||||
ORDER BY COALESCE(heartbeat_at, started_at, updated_at) ASC
|
||||
LIMIT $2
|
||||
)
|
||||
UPDATE ai_jobs j
|
||||
SET status = CASE WHEN j.attempts < j.max_attempts THEN 'pending' ELSE 'failed' END,
|
||||
error_code = CASE WHEN j.attempts < j.max_attempts THEN NULL ELSE 'stale_worker' END,
|
||||
error_message = CASE WHEN j.attempts < j.max_attempts THEN NULL ELSE 'worker lease expired' END,
|
||||
worker_id = NULL,
|
||||
heartbeat_at = NULL,
|
||||
completed_at = CASE WHEN j.attempts < j.max_attempts THEN NULL ELSE NOW() END,
|
||||
scheduled_at = CASE WHEN j.attempts < j.max_attempts THEN NOW() ELSE j.scheduled_at END,
|
||||
updated_at = NOW()
|
||||
FROM picked
|
||||
WHERE j.id = picked.id
|
||||
`
|
||||
tag, err := s.pool.Exec(ctx, q, int(olderThan.Seconds()), limit)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return int(tag.RowsAffected()), nil
|
||||
}
|
||||
|
||||
func (s *Store) Stats(ctx context.Context) (*model.Stats, error) {
|
||||
out := &model.Stats{At: time.Now().UTC()}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user