Requeue stale AI jobs
This commit is contained in:
@@ -21,6 +21,7 @@ type Config struct {
|
||||
WorkerID string
|
||||
WorkerPollInterval time.Duration
|
||||
WorkerClaimLimit int
|
||||
WorkerLeaseTimeout time.Duration
|
||||
}
|
||||
|
||||
func Load() Config {
|
||||
@@ -39,6 +40,7 @@ func Load() Config {
|
||||
WorkerID: envString("WORKER_ID", hostname()),
|
||||
WorkerPollInterval: envDuration("WORKER_POLL_INTERVAL", 2*time.Second),
|
||||
WorkerClaimLimit: envInt("WORKER_CLAIM_LIMIT", 4),
|
||||
WorkerLeaseTimeout: envDuration("WORKER_LEASE_TIMEOUT", 15*time.Minute),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -264,6 +264,44 @@ RETURNING ` + jobSelectColumns + `
|
||||
return job, err
|
||||
}
|
||||
|
||||
func (s *Store) RequeueStaleRunning(ctx context.Context, olderThan time.Duration, limit int) (int, error) {
|
||||
if olderThan <= 0 {
|
||||
olderThan = 15 * time.Minute
|
||||
}
|
||||
if limit <= 0 {
|
||||
limit = 100
|
||||
}
|
||||
if limit > 1000 {
|
||||
limit = 1000
|
||||
}
|
||||
const q = `
|
||||
WITH picked AS (
|
||||
SELECT id
|
||||
FROM ai_jobs
|
||||
WHERE status = 'running'
|
||||
AND COALESCE(heartbeat_at, started_at, updated_at) < NOW() - make_interval(secs => $1)
|
||||
ORDER BY COALESCE(heartbeat_at, started_at, updated_at) ASC
|
||||
LIMIT $2
|
||||
)
|
||||
UPDATE ai_jobs j
|
||||
SET status = CASE WHEN j.attempts < j.max_attempts THEN 'pending' ELSE 'failed' END,
|
||||
error_code = CASE WHEN j.attempts < j.max_attempts THEN NULL ELSE 'stale_worker' END,
|
||||
error_message = CASE WHEN j.attempts < j.max_attempts THEN NULL ELSE 'worker lease expired' END,
|
||||
worker_id = NULL,
|
||||
heartbeat_at = NULL,
|
||||
completed_at = CASE WHEN j.attempts < j.max_attempts THEN NULL ELSE NOW() END,
|
||||
scheduled_at = CASE WHEN j.attempts < j.max_attempts THEN NOW() ELSE j.scheduled_at END,
|
||||
updated_at = NOW()
|
||||
FROM picked
|
||||
WHERE j.id = picked.id
|
||||
`
|
||||
tag, err := s.pool.Exec(ctx, q, int(olderThan.Seconds()), limit)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return int(tag.RowsAffected()), nil
|
||||
}
|
||||
|
||||
func (s *Store) Stats(ctx context.Context) (*model.Stats, error) {
|
||||
out := &model.Stats{At: time.Now().UTC()}
|
||||
|
||||
|
||||
@@ -24,12 +24,16 @@ type Worker struct {
|
||||
modelProfile string
|
||||
pollInterval time.Duration
|
||||
claimLimit int
|
||||
leaseTimeout time.Duration
|
||||
}
|
||||
|
||||
func New(store *store.Store, llmClient *llm.Client, workerID, modelProfile string, pollInterval time.Duration, claimLimit int) *Worker {
|
||||
func New(store *store.Store, llmClient *llm.Client, workerID, modelProfile string, pollInterval, leaseTimeout time.Duration, claimLimit int) *Worker {
|
||||
if pollInterval <= 0 {
|
||||
pollInterval = 2 * time.Second
|
||||
}
|
||||
if leaseTimeout <= 0 {
|
||||
leaseTimeout = 15 * time.Minute
|
||||
}
|
||||
if claimLimit <= 0 {
|
||||
claimLimit = 4
|
||||
}
|
||||
@@ -43,6 +47,7 @@ func New(store *store.Store, llmClient *llm.Client, workerID, modelProfile strin
|
||||
modelProfile: modelProfile,
|
||||
pollInterval: pollInterval,
|
||||
claimLimit: claimLimit,
|
||||
leaseTimeout: leaseTimeout,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,6 +65,11 @@ func (w *Worker) Run(ctx context.Context) {
|
||||
}
|
||||
|
||||
func (w *Worker) tick(ctx context.Context) {
|
||||
if reset, err := w.store.RequeueStaleRunning(ctx, w.leaseTimeout, 100); err != nil {
|
||||
slog.Error("requeue stale jobs failed", "error", err)
|
||||
} else if reset > 0 {
|
||||
slog.Warn("requeued stale jobs", "count", reset)
|
||||
}
|
||||
jobs, err := w.store.ClaimJobs(ctx, model.ClaimJobs{
|
||||
WorkerID: w.workerID,
|
||||
TaskTypes: []string{TaskLLMChat, TaskChatCompletion},
|
||||
|
||||
Reference in New Issue
Block a user