Requeue stale AI jobs
This commit is contained in:
@@ -24,12 +24,16 @@ type Worker struct {
|
||||
modelProfile string
|
||||
pollInterval time.Duration
|
||||
claimLimit int
|
||||
leaseTimeout time.Duration
|
||||
}
|
||||
|
||||
func New(store *store.Store, llmClient *llm.Client, workerID, modelProfile string, pollInterval time.Duration, claimLimit int) *Worker {
|
||||
func New(store *store.Store, llmClient *llm.Client, workerID, modelProfile string, pollInterval, leaseTimeout time.Duration, claimLimit int) *Worker {
|
||||
if pollInterval <= 0 {
|
||||
pollInterval = 2 * time.Second
|
||||
}
|
||||
if leaseTimeout <= 0 {
|
||||
leaseTimeout = 15 * time.Minute
|
||||
}
|
||||
if claimLimit <= 0 {
|
||||
claimLimit = 4
|
||||
}
|
||||
@@ -43,6 +47,7 @@ func New(store *store.Store, llmClient *llm.Client, workerID, modelProfile strin
|
||||
modelProfile: modelProfile,
|
||||
pollInterval: pollInterval,
|
||||
claimLimit: claimLimit,
|
||||
leaseTimeout: leaseTimeout,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,6 +65,11 @@ func (w *Worker) Run(ctx context.Context) {
|
||||
}
|
||||
|
||||
func (w *Worker) tick(ctx context.Context) {
|
||||
if reset, err := w.store.RequeueStaleRunning(ctx, w.leaseTimeout, 100); err != nil {
|
||||
slog.Error("requeue stale jobs failed", "error", err)
|
||||
} else if reset > 0 {
|
||||
slog.Warn("requeued stale jobs", "count", reset)
|
||||
}
|
||||
jobs, err := w.store.ClaimJobs(ctx, model.ClaimJobs{
|
||||
WorkerID: w.workerID,
|
||||
TaskTypes: []string{TaskLLMChat, TaskChatCompletion},
|
||||
|
||||
Reference in New Issue
Block a user