feat: initial watchdog implementation (#1341)

* feat: initial watchdog implementation

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

* fiuxups

* Add more output

* wip: idletime checker

* wire idle watchdog checks

* enlarge watchdog time window

* small fixes

* Use stopmodel

* Always delete process

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2023-11-26 18:36:23 +01:00 committed by GitHub
parent 9482acfdfc
commit 824612f1b4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 341 additions and 13 deletions

47
main.go
View file

@ -10,6 +10,7 @@ import (
"path/filepath"
"strings"
"syscall"
"time"
api "github.com/go-skynet/LocalAI/api"
"github.com/go-skynet/LocalAI/api/backend"
@ -154,6 +155,30 @@ func main() {
Usage: "List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys.",
EnvVars: []string{"API_KEY"},
},
&cli.BoolFlag{
Name: "enable-watchdog-idle",
Usage: "Enable watchdog for stopping idle backends. This will stop the backends if are in idle state for too long.",
EnvVars: []string{"WATCHDOG_IDLE"},
Value: false,
},
&cli.BoolFlag{
Name: "enable-watchdog-busy",
Usage: "Enable watchdog for stopping busy backends that exceed a defined threshold.",
EnvVars: []string{"WATCHDOG_BUSY"},
Value: false,
},
&cli.StringFlag{
Name: "watchdog-busy-timeout",
Usage: "Watchdog timeout. This will restart the backend if it crashes.",
EnvVars: []string{"WATCHDOG_BUSY_TIMEOUT"},
Value: "5m",
},
&cli.StringFlag{
Name: "watchdog-idle-timeout",
Usage: "Watchdog idle timeout. This will restart the backend if it crashes.",
EnvVars: []string{"WATCHDOG_IDLE_TIMEOUT"},
Value: "15m",
},
&cli.BoolFlag{
Name: "preload-backend-only",
Usage: "If set, the api is NOT launched, and only the preloaded models / backends are started. This is intended for multi-node setups.",
@ -198,6 +223,28 @@ For a list of compatible model, check out: https://localai.io/model-compatibilit
options.WithUploadLimitMB(ctx.Int("upload-limit")),
options.WithApiKeys(ctx.StringSlice("api-keys")),
}
idleWatchDog := ctx.Bool("enable-watchdog-idle")
busyWatchDog := ctx.Bool("enable-watchdog-busy")
if idleWatchDog || busyWatchDog {
opts = append(opts, options.EnableWatchDog)
if idleWatchDog {
opts = append(opts, options.EnableWatchDogIdleCheck)
dur, err := time.ParseDuration(ctx.String("watchdog-idle-timeout"))
if err != nil {
return err
}
opts = append(opts, options.SetWatchDogIdleTimeout(dur))
}
if busyWatchDog {
opts = append(opts, options.EnableWatchDogBusyCheck)
dur, err := time.ParseDuration(ctx.String("watchdog-busy-timeout"))
if err != nil {
return err
}
opts = append(opts, options.SetWatchDogBusyTimeout(dur))
}
}
if ctx.Bool("parallel-requests") {
opts = append(opts, options.EnableParallelBackendRequests)
}