Merge branch 'master' into lumina

Commit 505013ce66 by Ettore Di Giacinto, 2025-04-09 23:11:07 +02:00 (committed via GitHub)
43 changed files with 1169 additions and 388 deletions

.env

@ -29,6 +29,9 @@
## Enable/Disable single backend (useful if only one GPU is available)
# LOCALAI_SINGLE_ACTIVE_BACKEND=true
# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
## Specify a build type. Available: cublas, openblas, clblas.
## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.


@ -15,7 +15,7 @@ jobs:
strategy:
matrix:
include:
- base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
- base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
runs-on: 'ubuntu-latest'
platforms: 'linux/amd64'
runs-on: ${{matrix.runs-on}}


@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }}
uses: securego/gosec@v2.22.0
uses: securego/gosec@v2.22.3
with:
# we let the report content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...'


@ -6,7 +6,7 @@ BINARY_NAME=local-ai
DETECT_LIBS?=true
# llama.cpp versions
CPPLLAMA_VERSION?=5dec47dcd411fdf815a3708fd6194e2b13d19006
CPPLLAMA_VERSION?=b32efad2bc42460637c3a364c9554ea8217b3d7f
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@ -809,7 +809,7 @@ docker-aio-all:
docker-image-intel:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@ -817,7 +817,7 @@ docker-image-intel:
docker-image-intel-xpu:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \


@ -50,7 +50,23 @@
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic...) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and more) locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
| Talk Interface | Generate Audio |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](https://github.com/user-attachments/assets/9841b1ee-88af-4b96-8ec0-41b17364efa7) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](https://github.com/user-attachments/assets/d729f6f4-0621-4715-bda3-35fe6e159524) |
| Models Overview | Generate Images |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](https://github.com/user-attachments/assets/3cf0b918-ba8e-498a-a3cd-485db5984325) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](https://github.com/user-attachments/assets/6753d23d-218b-4e07-94b8-9e6c5a4f2311) |
| Chat Interface | API Overview |
| --- | --- |
| ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](https://github.com/user-attachments/assets/048eab31-0f0c-4d52-a920-3715233f9bf3) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](https://github.com/user-attachments/assets/2540e8ce-1a2c-4c12-800c-763bd9be247f) |
| Login | Swarm |
| --- | --- |
|![Screenshot 2025-03-31 at 12-09-59 ](https://github.com/user-attachments/assets/5af681b0-dd8e-4fe8-a234-a22f8a040547) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](https://github.com/user-attachments/assets/b9527176-63d6-4d2e-8ed1-7fde13a9b0ad) |
## Quickstart
Run the installer script:
@ -92,6 +108,8 @@ local-ai run oci://localai/phi-2:latest
## 📰 Latest project news
- Apr 2025: WebUI overhaul, AIO image updates
- Feb 2025: Backend cleanup, breaking changes, new backends (kokoro, OuteTTS, faster-whisper), Nvidia L4T images
- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )


@ -2,7 +2,7 @@
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is a hack for now, but it should be fixed in the future.
set(TARGET myclip)
add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
install(TARGETS ${TARGET} LIBRARY)
target_include_directories(myclip PUBLIC .)
target_include_directories(myclip PUBLIC ../..)


@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
TARGET?=--target grpc-server
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
@ -36,11 +36,18 @@ else ifeq ($(OS),Darwin)
endif
ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl" \
-DGGML_SYCL_F16=ON
endif
ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl"
endif
llama.cpp:
@ -73,8 +80,8 @@ grpc-server: llama.cpp llama.cpp/examples/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)"
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET) -j$(nproc)"
else
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET) -j$(nproc)
endif
cp llama.cpp/build/bin/grpc-server .
cp llama.cpp/build/bin/grpc-server .


@ -509,15 +509,15 @@ struct llama_server_context
bool load_model(const common_params &params_)
{
params = params_;
if (!params.mmproj.empty()) {
if (!params.mmproj.path.empty()) {
multimodal = true;
LOG_INFO("Multi Modal Mode Enabled", {});
clp_ctx = clip_init(params.mmproj.c_str(), clip_context_params {
clp_ctx = clip_init(params.mmproj.path.c_str(), clip_context_params {
/* use_gpu */ has_gpu,
/*verbosity=*/ 1,
/*verbosity=*/ GGML_LOG_LEVEL_INFO,
});
if(clp_ctx == nullptr) {
LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
return false;
}
@ -531,7 +531,7 @@ struct llama_server_context
ctx = common_init.context.release();
if (model == nullptr)
{
LOG_ERR("unable to load model: %s", params.model.c_str());
LOG_ERR("unable to load model: %s", params.model.path.c_str());
return false;
}
@ -2122,7 +2122,11 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
}
std::function<void(int)> shutdown_handler;
inline void signal_handler(int signal) { shutdown_handler(signal); }
inline void signal_handler(int signal) {
exit(1);
}
/////////////////////////////////
////////////////////////////////
@ -2322,11 +2326,11 @@ static void params_parse(const backend::ModelOptions* request,
// this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
params.model = request->modelfile();
params.model.path = request->modelfile();
if (!request->mmproj().empty()) {
// get the directory of modelfile
std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
params.mmproj = model_dir + "/"+ request->mmproj();
std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
params.mmproj.path = model_dir + "/"+ request->mmproj();
}
// params.model_alias ??
params.model_alias = request->modelfile();
@ -2401,7 +2405,7 @@ static void params_parse(const backend::ModelOptions* request,
scale_factor = request->lorascale();
}
// get the directory of modelfile
std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
}
params.use_mlock = request->mlock();
@ -2649,6 +2653,20 @@ void RunServer(const std::string& server_address) {
int main(int argc, char** argv) {
std::string server_address("localhost:50051");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = signal_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
sigaction(SIGTERM, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
// Define long and short options
struct option long_options[] = {
{"addr", required_argument, nullptr, 'a'},


@ -21,6 +21,7 @@ fi
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is a hack for now, but it should be fixed in the future.
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h


@ -16,7 +16,7 @@ type Application struct {
func newApplication(appConfig *config.ApplicationConfig) *Application {
return &Application{
backendLoader: config.NewBackendConfigLoader(appConfig.ModelPath),
modelLoader: model.NewModelLoader(appConfig.ModelPath),
modelLoader: model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend),
applicationConfig: appConfig,
templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
}


@ -143,7 +143,7 @@ func New(opts ...config.AppOption) (*Application, error) {
}()
}
if options.LoadToMemory != nil {
if options.LoadToMemory != nil && !options.SingleBackend {
for _, m := range options.LoadToMemory {
cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
if err != nil {


@ -17,6 +17,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
if err != nil {
return nil, err
}
defer loader.Close()
var fn func() ([]float32, error)
switch model := inferenceModel.(type) {

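The change above is part of a pattern repeated in the hunks that follow (image generation, chat inference, rerank, sound generation, TTS, VAD, transcription, stores): the loader is constructed with the single-active-backend flag, and each caller releases the backend with defer loader.Close() once its request finishes. Below is a minimal sketch of that flow, assuming the loader package ships at github.com/mudler/LocalAI/pkg/model and eliding the option plumbing normally built by backend.ModelOptions.

```go
package main

import (
	"fmt"

	"github.com/mudler/LocalAI/pkg/model" // assumed import path for the loader package
)

func main() {
	// Second argument is the single-active-backend flag now threaded through
	// from LOCALAI_SINGLE_ACTIVE_BACKEND / appConfig.SingleBackend.
	loader := model.NewModelLoader("/models", true)
	defer func() { _ = loader.StopAllGRPC() }() // the CLI commands do this on exit

	// Backend options are normally built via backend.ModelOptions(backendConfig, appConfig);
	// they are elided here.
	client, err := loader.Load()
	if err != nil {
		fmt.Println("load failed:", err)
		return
	}
	// Release the single active backend so the next request can load a different one.
	defer loader.Close()

	_ = client // run inference through the returned gRPC client here
}
```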

@ -16,6 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
if err != nil {
return nil, err
}
defer loader.Close()
fn := func() error {
_, err := inferenceModel.GenerateImage(


@ -53,6 +53,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
if err != nil {
return nil, err
}
defer loader.Close()
var protoMessages []*proto.Message
// if we are using the tokenizer template, we need to convert the messages to proto messages


@ -40,10 +40,6 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
grpcOpts := grpcModelOpts(c)
defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
if so.SingleBackend {
defOpts = append(defOpts, model.WithSingleActiveBackend())
}
if so.ParallelBackendRequests {
defOpts = append(defOpts, model.EnableParallelRequests)
}
@ -121,7 +117,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
triggers := make([]*pb.GrammarTrigger, 0)
for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
triggers = append(triggers, &pb.GrammarTrigger{
Word: t.Word,
Word: t.Word,
})
}
@ -161,33 +157,33 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
DisableLogStatus: c.DisableLogStatus,
DType: c.DType,
// LimitMMPerPrompt vLLM
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
MMProj: c.MMProj,
FlashAttention: c.FlashAttention,
CacheTypeKey: c.CacheTypeK,
CacheTypeValue: c.CacheTypeV,
NoKVOffload: c.NoKVOffloading,
YarnExtFactor: c.YarnExtFactor,
YarnAttnFactor: c.YarnAttnFactor,
YarnBetaFast: c.YarnBetaFast,
YarnBetaSlow: c.YarnBetaSlow,
NGQA: c.NGQA,
RMSNormEps: c.RMSNormEps,
MLock: mmlock,
RopeFreqBase: c.RopeFreqBase,
RopeScaling: c.RopeScaling,
Type: c.ModelType,
RopeFreqScale: c.RopeFreqScale,
NUMA: c.NUMA,
Embeddings: embeddings,
LowVRAM: lowVRAM,
NGPULayers: int32(nGPULayers),
MMap: mmap,
MainGPU: c.MainGPU,
Threads: int32(*c.Threads),
TensorSplit: c.TensorSplit,
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
MMProj: c.MMProj,
FlashAttention: c.FlashAttention,
CacheTypeKey: c.CacheTypeK,
CacheTypeValue: c.CacheTypeV,
NoKVOffload: c.NoKVOffloading,
YarnExtFactor: c.YarnExtFactor,
YarnAttnFactor: c.YarnAttnFactor,
YarnBetaFast: c.YarnBetaFast,
YarnBetaSlow: c.YarnBetaSlow,
NGQA: c.NGQA,
RMSNormEps: c.RMSNormEps,
MLock: mmlock,
RopeFreqBase: c.RopeFreqBase,
RopeScaling: c.RopeScaling,
Type: c.ModelType,
RopeFreqScale: c.RopeFreqScale,
NUMA: c.NUMA,
Embeddings: embeddings,
LowVRAM: lowVRAM,
NGPULayers: int32(nGPULayers),
MMap: mmap,
MainGPU: c.MainGPU,
Threads: int32(*c.Threads),
TensorSplit: c.TensorSplit,
// AutoGPTQ
ModelBaseName: c.AutoGPTQ.ModelBaseName,
Device: c.AutoGPTQ.Device,


@ -12,10 +12,10 @@ import (
func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
opts := ModelOptions(backendConfig, appConfig)
rerankModel, err := loader.Load(opts...)
if err != nil {
return nil, err
}
defer loader.Close()
if rerankModel == nil {
return nil, fmt.Errorf("could not load rerank model")


@ -26,10 +26,10 @@ func SoundGeneration(
opts := ModelOptions(backendConfig, appConfig)
soundGenModel, err := loader.Load(opts...)
if err != nil {
return "", nil, err
}
defer loader.Close()
if soundGenModel == nil {
return "", nil, fmt.Errorf("could not load sound generation model")


@ -20,6 +20,7 @@ func TokenMetrics(
if err != nil {
return nil, err
}
defer loader.Close()
if model == nil {
return nil, fmt.Errorf("could not load model")


@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
opts := ModelOptions(backendConfig, appConfig)
inferenceModel, err = loader.Load(opts...)
if err != nil {
return schema.TokenizeResponse{}, err
}
defer loader.Close()
predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
predictOptions.Prompt = s


@ -24,6 +24,7 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
if err != nil {
return nil, err
}
defer ml.Close()
if transcriptionModel == nil {
return nil, fmt.Errorf("could not load transcription model")


@ -23,10 +23,10 @@ func ModelTTS(
) (string, *proto.Result, error) {
opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
ttsModel, err := loader.Load(opts...)
if err != nil {
return "", nil, err
}
defer loader.Close()
if ttsModel == nil {
return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)


@ -19,6 +19,8 @@ func VAD(request *schema.VADRequest,
if err != nil {
return nil, err
}
defer ml.Close()
req := proto.VADRequest{
Audio: request.Audio,
}


@ -38,7 +38,7 @@ type RunCMD struct {
F16 bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
Threads int `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`
ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" help:"Default context size for models" group:"performance"`
Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"`


@ -74,7 +74,7 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
AssetsDestination: t.BackendAssetsPath,
ExternalGRPCBackends: externalBackends,
}
ml := model.NewModelLoader(opts.ModelPath)
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
defer func() {
err := ml.StopAllGRPC()


@ -32,7 +32,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
}
cl := config.NewBackendConfigLoader(t.ModelsPath)
ml := model.NewModelLoader(opts.ModelPath)
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
return err
}


@ -41,7 +41,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
AudioDir: outputDir,
AssetsDestination: t.BackendAssetsPath,
}
ml := model.NewModelLoader(opts.ModelPath)
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
defer func() {
err := ml.StopAllGRPC()


@ -389,16 +389,6 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
cfg.Embeddings = &falseV
}
// Value passed by the top level are treated as default (no implicit defaults)
// defaults are set by the user
if ctx == 0 {
ctx = 1024
}
if cfg.ContextSize == nil {
cfg.ContextSize = &ctx
}
if threads == 0 {
// Threads can't be 0
threads = 4
@ -420,7 +410,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
cfg.Debug = &trueV
}
guessDefaultsFromFile(cfg, lo.modelPath)
guessDefaultsFromFile(cfg, lo.modelPath, ctx)
}
func (c *BackendConfig) Validate() bool {

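With the hard-coded context default removed above, guessDefaultsFromFile now receives the top-level ctx value and decides the fallback itself (see core/config/gguf.go below). A rough reconstruction of the resulting resolution order, using an illustrative helper name that does not exist in the codebase:

```go
package main

import "fmt"

// effectiveContextSize is an illustrative reconstruction of the new behaviour,
// not a function from the codebase: an explicit value in the model config wins,
// then the CLI/environment value, then the estimate read from the GGUF
// metadata, then the defaultContextSize fallback.
func effectiveContextSize(cfgCtx *int, defaultCtx int, ggufEstimate uint64) int {
	switch {
	case cfgCtx != nil:
		return *cfgCtx // set explicitly in the model YAML
	case defaultCtx != 0:
		return defaultCtx // LOCALAI_CONTEXT_SIZE / --context-size, no longer defaulted
	case ggufEstimate > 0:
		return int(ggufEstimate) // from f.EstimateLLaMACppUsage().ContextSize
	default:
		return 1024 // defaultContextSize in core/config/gguf.go
	}
}

func main() {
	fmt.Println(effectiveContextSize(nil, 0, 8192)) // GGUF metadata wins: 8192
	fmt.Println(effectiveContextSize(nil, 0, 0))    // nothing set anywhere: 1024
}
```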
core/config/gguf.go (new file)

@ -0,0 +1,253 @@
package config
import (
"strings"
"github.com/rs/zerolog/log"
gguf "github.com/thxcode/gguf-parser-go"
)
type familyType uint8
const (
Unknown familyType = iota
LLaMa3
CommandR
Phi3
ChatML
Mistral03
Gemma
DeepSeek2
)
const (
defaultContextSize = 1024
)
type settingsConfig struct {
StopWords []string
TemplateConfig TemplateConfig
RepeatPenalty float64
}
// default settings to adopt with a given model family
var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
Gemma: {
RepeatPenalty: 1.0,
StopWords: []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
TemplateConfig: TemplateConfig{
Chat: "{{.Input }}\n<start_of_turn>model\n",
ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
Completion: "{{.Input}}",
},
},
DeepSeek2: {
StopWords: []string{"<end▁of▁sentence>"},
TemplateConfig: TemplateConfig{
ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
{{ end -}}
{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<endofsentence>{{end}}
{{if eq .RoleName "system" -}}{{.Content}}
{{end -}}`,
Chat: "{{.Input -}}\nAssistant: ",
},
},
LLaMa3: {
StopWords: []string{"<|eot_id|>"},
TemplateConfig: TemplateConfig{
Chat: "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
},
},
CommandR: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
You are a function calling AI model, you can call the following functions:
## Available Tools
{{range .Functions}}
- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
{{end}}
When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
ChatMessage: `{{if eq .RoleName "user" -}}
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "system" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "assistant" -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "tool" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if .FunctionCall -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
{{- end -}}`,
},
StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
},
Phi3: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input}}\n<|assistant|>",
ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
Completion: "{{.Input}}",
},
StopWords: []string{"<|end|>", "<|endoftext|>"},
},
ChatML: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input -}}\n<|im_start|>assistant",
Functions: `<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant`,
ChatMessage: `<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>`,
},
StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
},
Mistral03: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input -}}",
Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
ChatMessage: `{{if eq .RoleName "user" -}}
[INST] {{.Content }} [/INST]
{{- else if .FunctionCall -}}
[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
{{- else if eq .RoleName "tool" -}}
[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
{{- else -}}
{{ .Content -}}
{{ end -}}`,
},
StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
},
}
// this maps well known template used in HF to model families defined above
var knownTemplates = map[string]familyType{
`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
}
func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
if defaultCtx == 0 && cfg.ContextSize == nil {
ctxSize := f.EstimateLLaMACppUsage().ContextSize
if ctxSize > 0 {
cSize := int(ctxSize)
cfg.ContextSize = &cSize
} else {
defaultCtx = defaultContextSize
cfg.ContextSize = &defaultCtx
}
}
if cfg.HasTemplate() {
// nothing to guess here
log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
return
}
log.Debug().
Any("eosTokenID", f.Tokenizer().EOSTokenID).
Any("bosTokenID", f.Tokenizer().BOSTokenID).
Any("modelName", f.Model().Name).
Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
// guess the name
if cfg.Name == "" {
cfg.Name = f.Model().Name
}
family := identifyFamily(f)
if family == Unknown {
log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
return
}
// identify template
settings, ok := defaultsSettings[family]
if ok {
cfg.TemplateConfig = settings.TemplateConfig
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
if len(cfg.StopWords) == 0 {
cfg.StopWords = settings.StopWords
}
if cfg.RepeatPenalty == 0.0 {
cfg.RepeatPenalty = settings.RepeatPenalty
}
} else {
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
}
if cfg.HasTemplate() {
return
}
// identify from well known templates first, otherwise use the raw jinja template
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
if found {
// try to use the jinja template
cfg.TemplateConfig.JinjaTemplate = true
cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
}
}
func identifyFamily(f *gguf.GGUFFile) familyType {
// identify from well known templates first
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
if found && chatTemplate.ValueString() != "" {
if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
return family
}
}
// otherwise try to identify from the model properties
arch := f.Architecture().Architecture
eosTokenID := f.Tokenizer().EOSTokenID
bosTokenID := f.Tokenizer().BOSTokenID
isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
// WTF! Mistral0.3 and isYi have the same bosTokenID and eosTokenID
llama3 := arch == "llama" && eosTokenID == 128009
commandR := arch == "command-r" && eosTokenID == 255001
qwen2 := arch == "qwen2"
phi3 := arch == "phi-3"
gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
deepseek2 := arch == "deepseek2"
switch {
case deepseek2:
return DeepSeek2
case gemma:
return Gemma
case llama3:
return LLaMa3
case commandR:
return CommandR
case phi3:
return Phi3
case qwen2, isYI:
return ChatML
default:
return Unknown
}
}
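Everything identifyFamily and guessGGUFFromFile key on (architecture, BOS/EOS token IDs, model name, chat template, estimated context size) is read straight from the GGUF header via gguf-parser-go. A small standalone sketch that dumps those fields for a local model file, using only the accessors that appear above; the file path is a placeholder:

```go
package main

import (
	"fmt"
	"log"

	gguf "github.com/thxcode/gguf-parser-go"
)

func main() {
	// Placeholder path; point it at any local GGUF model.
	f, err := gguf.ParseGGUFFile("/models/my-model.gguf")
	if err != nil {
		log.Fatalf("not a GGUF file: %v", err)
	}

	fmt.Println("name:        ", f.Model().Name)
	fmt.Println("architecture:", f.Architecture().Architecture)
	fmt.Println("bosTokenID:  ", f.Tokenizer().BOSTokenID)
	fmt.Println("eosTokenID:  ", f.Tokenizer().EOSTokenID)
	fmt.Println("context size:", f.EstimateLLaMACppUsage().ContextSize)

	// The raw Jinja chat template, when present, is what knownTemplates matches on.
	if tmpl, found := f.Header.MetadataKV.Get("tokenizer.chat_template"); found {
		fmt.Println("chat_template:", tmpl.ValueString())
	}
}
```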


@ -3,147 +3,12 @@ package config
import (
"os"
"path/filepath"
"strings"
"github.com/rs/zerolog/log"
gguf "github.com/thxcode/gguf-parser-go"
)
type familyType uint8
const (
Unknown familyType = iota
LLaMa3
CommandR
Phi3
ChatML
Mistral03
Gemma
DeepSeek2
)
type settingsConfig struct {
StopWords []string
TemplateConfig TemplateConfig
RepeatPenalty float64
}
// default settings to adopt with a given model family
var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
Gemma: {
RepeatPenalty: 1.0,
StopWords: []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
TemplateConfig: TemplateConfig{
Chat: "{{.Input }}\n<start_of_turn>model\n",
ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
Completion: "{{.Input}}",
},
},
DeepSeek2: {
StopWords: []string{"<end▁of▁sentence>"},
TemplateConfig: TemplateConfig{
ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
{{ end -}}
{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<endofsentence>{{end}}
{{if eq .RoleName "system" -}}{{.Content}}
{{end -}}`,
Chat: "{{.Input -}}\nAssistant: ",
},
},
LLaMa3: {
StopWords: []string{"<|eot_id|>"},
TemplateConfig: TemplateConfig{
Chat: "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
},
},
CommandR: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
You are a function calling AI model, you can call the following functions:
## Available Tools
{{range .Functions}}
- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
{{end}}
When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
ChatMessage: `{{if eq .RoleName "user" -}}
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "system" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "assistant" -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "tool" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if .FunctionCall -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
{{- end -}}`,
},
StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
},
Phi3: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input}}\n<|assistant|>",
ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
Completion: "{{.Input}}",
},
StopWords: []string{"<|end|>", "<|endoftext|>"},
},
ChatML: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input -}}\n<|im_start|>assistant",
Functions: `<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant`,
ChatMessage: `<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>`,
},
StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
},
Mistral03: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input -}}",
Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
ChatMessage: `{{if eq .RoleName "user" -}}
[INST] {{.Content }} [/INST]
{{- else if .FunctionCall -}}
[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
{{- else if eq .RoleName "tool" -}}
[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
{{- else -}}
{{ .Content -}}
{{ end -}}`,
},
StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
},
}
// this maps well known template used in HF to model families defined above
var knownTemplates = map[string]familyType{
`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
}
func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) {
if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
log.Debug().Msgf("guessDefaultsFromFile: %s", "guessing disabled with LOCALAI_DISABLE_GUESSING")
return
@ -154,106 +19,20 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
return
}
if cfg.HasTemplate() {
// nothing to guess here
log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
return
}
// We try to guess only if we don't have a template defined already
guessPath := filepath.Join(modelPath, cfg.ModelFileName())
// try to parse the gguf file
f, err := gguf.ParseGGUFFile(guessPath)
if err != nil {
// Only valid for gguf files
log.Debug().Str("filePath", guessPath).Msg("guessDefaultsFromFile: not a GGUF file")
if err == nil {
guessGGUFFromFile(cfg, f, defaultCtx)
return
}
log.Debug().
Any("eosTokenID", f.Tokenizer().EOSTokenID).
Any("bosTokenID", f.Tokenizer().BOSTokenID).
Any("modelName", f.Model().Name).
Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
// guess the name
if cfg.Name == "" {
cfg.Name = f.Model().Name
}
family := identifyFamily(f)
if family == Unknown {
log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
return
}
// identify template
settings, ok := defaultsSettings[family]
if ok {
cfg.TemplateConfig = settings.TemplateConfig
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
if len(cfg.StopWords) == 0 {
cfg.StopWords = settings.StopWords
if cfg.ContextSize == nil {
if defaultCtx == 0 {
defaultCtx = defaultContextSize
}
if cfg.RepeatPenalty == 0.0 {
cfg.RepeatPenalty = settings.RepeatPenalty
}
} else {
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
}
if cfg.HasTemplate() {
return
}
// identify from well known templates first, otherwise use the raw jinja template
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
if found {
// try to use the jinja template
cfg.TemplateConfig.JinjaTemplate = true
cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
}
}
func identifyFamily(f *gguf.GGUFFile) familyType {
// identify from well known templates first
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
if found && chatTemplate.ValueString() != "" {
if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
return family
}
}
// otherwise try to identify from the model properties
arch := f.Architecture().Architecture
eosTokenID := f.Tokenizer().EOSTokenID
bosTokenID := f.Tokenizer().BOSTokenID
isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
// WTF! Mistral0.3 and isYi have the same bosTokenID and eosTokenID
llama3 := arch == "llama" && eosTokenID == 128009
commandR := arch == "command-r" && eosTokenID == 255001
qwen2 := arch == "qwen2"
phi3 := arch == "phi-3"
gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
deepseek2 := arch == "deepseek2"
switch {
case deepseek2:
return DeepSeek2
case gemma:
return Gemma
case llama3:
return LLaMa3
case commandR:
return CommandR
case phi3:
return Phi3
case qwen2, isYI:
return ChatML
default:
return Unknown
cfg.ContextSize = &defaultCtx
}
}


@ -122,15 +122,15 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
"id": modalName(m),
"tabindex": "-1",
"aria-hidden": "true",
"class": "hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full",
"class": "hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-full max-h-full bg-gray-900/50",
},
elem.Div(
attrs.Props{
"class": "relative p-4 w-full max-w-2xl max-h-full",
"class": "relative p-4 w-full max-w-2xl h-[90vh] mx-auto mt-[5vh]",
},
elem.Div(
attrs.Props{
"class": "relative p-4 w-full max-w-2xl max-h-full bg-white rounded-lg shadow dark:bg-gray-700",
"class": "relative bg-white rounded-lg shadow dark:bg-gray-700 h-full flex flex-col",
},
// header
elem.Div(
@ -164,14 +164,13 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
// body
elem.Div(
attrs.Props{
"class": "p-4 md:p-5 space-y-4",
"class": "p-4 md:p-5 space-y-4 overflow-y-auto flex-1 min-h-0",
},
elem.Div(
attrs.Props{
"class": "flex justify-center items-center",
},
elem.Img(attrs.Props{
// "class": "rounded-t-lg object-fit object-center h-96",
"class": "lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded",
"src": m.Icon,
"loading": "lazy",
@ -232,7 +231,6 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
),
),
)
}
func modelDescription(m *gallery.GalleryModel) elem.Node {


@ -21,6 +21,7 @@ func StoresSetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
if err != nil {
return err
}
defer sl.Close()
vals := make([][]byte, len(input.Values))
for i, v := range input.Values {
@ -48,6 +49,7 @@ func StoresDeleteEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationCo
if err != nil {
return err
}
defer sl.Close()
if err := store.DeleteCols(c.Context(), sb, input.Keys); err != nil {
return err
@ -69,6 +71,7 @@ func StoresGetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
if err != nil {
return err
}
defer sl.Close()
keys, vals, err := store.GetCols(c.Context(), sb, input.Keys)
if err != nil {
@ -100,6 +103,7 @@ func StoresFindEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConf
if err != nil {
return err
}
defer sl.Close()
keys, vals, similarities, err := store.Find(c.Context(), sb, input.Key, input.Topk)
if err != nil {


@ -40,7 +40,7 @@ func TestAssistantEndpoints(t *testing.T) {
cl := &config.BackendConfigLoader{}
//configsDir := "/tmp/localai/configs"
modelPath := "/tmp/localai/model"
var ml = model.NewModelLoader(modelPath)
var ml = model.NewModelLoader(modelPath, false)
appConfig := &config.ApplicationConfig{
ConfigsDir: configsDir,


@ -50,11 +50,10 @@ func RegisterLocalAIRoutes(router *fiber.App,
router.Post("/v1/vad", vadChain...)
// Stores
sl := model.NewModelLoader("")
router.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig))
router.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig))
router.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig))
router.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig))
router.Post("/stores/set", localai.StoresSetEndpoint(ml, appConfig))
router.Post("/stores/delete", localai.StoresDeleteEndpoint(ml, appConfig))
router.Post("/stores/get", localai.StoresGetEndpoint(ml, appConfig))
router.Post("/stores/find", localai.StoresFindEndpoint(ml, appConfig))
if !appConfig.DisableMetrics {
router.Get("/metrics", localai.LocalAIMetricsEndpoint())


@ -1,3 +1,3 @@
{
"version": "v2.26.0"
"version": "v2.27.0"
}


@ -8,9 +8,7 @@ config_file: |
chat_message: |-
<start_of_turn>{{if eq .RoleName "assistant" }}model{{else}}{{ .RoleName }}{{end}}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content -}}
@ -25,11 +23,14 @@ config_file: |
{{.Input}}
function: |
<start_of_turn>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
You have access to functions. If you decide to invoke any of the function(s),
you MUST put it in the format of
{"name": function name, "parameters": dictionary of argument name and its value}
You SHOULD NOT include any other text in the response if you call a function
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<end_of_turn>
{{.Input -}}
<start_of_turn>model

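The reworded function prompt above asks the model to reply with a single JSON object of the form {"name": ..., "parameters": {...}} and nothing else. A hedged sketch of consuming such a reply; the struct and sample tool name are illustrative and not LocalAI's actual parser:

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
)

// functionCall mirrors the JSON shape the prompt asks the model to emit.
type functionCall struct {
	Name       string         `json:"name"`
	Parameters map[string]any `json:"parameters"`
}

func main() {
	// Example reply in the format requested by the template above (hypothetical tool name).
	reply := `{"name": "get_weather", "parameters": {"city": "Rome", "unit": "celsius"}}`

	var call functionCall
	if err := json.Unmarshal([]byte(reply), &call); err != nil {
		log.Fatalf("model did not return the expected JSON object: %v", err)
	}
	fmt.Printf("call %s with %v\n", call.Name, call.Parameters)
}
```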

@ -78,6 +78,60 @@
- filename: gemma-3-1b-it-Q4_K_M.gguf
sha256: 8ccc5cd1f1b3602548715ae25a66ed73fd5dc68a210412eea643eb20eb75a135
uri: huggingface://ggml-org/gemma-3-1b-it-GGUF/gemma-3-1b-it-Q4_K_M.gguf
- !!merge <<: *gemma3
name: "gemma-3-12b-it-qat"
urls:
- https://huggingface.co/google/gemma-3-12b-it
- https://huggingface.co/vinimuchulski/gemma-3-12b-it-qat-q4_0-gguf
description: |
This model corresponds to the 12B instruction-tuned version of the Gemma 3 model in GGUF format using Quantization Aware Training (QAT). The GGUF corresponds to Q4_0 quantization.
Thanks to QAT, the model preserves quality similar to bfloat16 while significantly reducing the memory required to load the model.
You can find the half-precision version here.
overrides:
parameters:
model: gemma-3-12b-it-q4_0.gguf
files:
- filename: gemma-3-12b-it-q4_0.gguf
sha256: 6f1bb5f455414f7b46482bda51cbfdbf19786e21a5498c4403fdfc03d09b045c
uri: huggingface://vinimuchulski/gemma-3-12b-it-qat-q4_0-gguf/gemma-3-12b-it-q4_0.gguf
- !!merge <<: *gemma3
name: "gemma-3-4b-it-qat"
urls:
- https://huggingface.co/google/gemma-3-4b-it
- https://huggingface.co/vinimuchulski/gemma-3-4b-it-qat-q4_0-gguf
description: |
This model corresponds to the 4B instruction-tuned version of the Gemma 3 model in GGUF format using Quantization Aware Training (QAT). The GGUF corresponds to Q4_0 quantization.
Thanks to QAT, the model preserves quality similar to bfloat16 while significantly reducing the memory required to load the model.
You can find the half-precision version here.
overrides:
parameters:
model: gemma-3-4b-it-q4_0.gguf
files:
- filename: gemma-3-4b-it-q4_0.gguf
sha256: 2ca493d426ffcb43db27132f183a0230eda4a3621e58b328d55b665f1937a317
uri: huggingface://vinimuchulski/gemma-3-4b-it-qat-q4_0-gguf/gemma-3-4b-it-q4_0.gguf
- !!merge <<: *gemma3
name: "gemma-3-27b-it-qat"
urls:
- https://huggingface.co/google/gemma-3-27b-it
- https://huggingface.co/vinimuchulski/gemma-3-27b-it-qat-q4_0-gguf
description: |
This model corresponds to the 27B instruction-tuned version of the Gemma 3 model in GGUF format using Quantization Aware Training (QAT). The GGUF corresponds to Q4_0 quantization.
Thanks to QAT, the model preserves quality similar to bfloat16 while significantly reducing the memory required to load the model.
You can find the half-precision version here.
overrides:
parameters:
model: gemma-3-27b-it-q4_0.gguf
files:
- filename: gemma-3-27b-it-q4_0.gguf
sha256: 45e586879bc5f5d7a5b6527e812952057ce916d9fc7ba16f7262ec9972c9e2a2
uri: huggingface://vinimuchulski/gemma-3-27b-it-qat-q4_0-gguf/gemma-3-27b-it-q4_0.gguf
- !!merge <<: *gemma3
name: "qgallouedec_gemma-3-27b-it-codeforces-sft"
urls:
@ -386,6 +440,61 @@
- filename: Gemma-3-Starshine-12B.i1-Q4_K_M.gguf
sha256: 4c35a678e3784e20a8d85d4e7045d965509a1a71305a0da105fc5991ba7d6dc4
uri: huggingface://mradermacher/Gemma-3-Starshine-12B-i1-GGUF/Gemma-3-Starshine-12B.i1-Q4_K_M.gguf
- !!merge <<: *gemma3
name: "burtenshaw_gemmacoder3-12b"
icon: https://cdn-uploads.huggingface.co/production/uploads/62d648291fa3e4e7ae3fa6e8/zkcBr2UZFDpALAsMdgbze.gif
urls:
- https://huggingface.co/burtenshaw/GemmaCoder3-12B
- https://huggingface.co/bartowski/burtenshaw_GemmaCoder3-12B-GGUF
description: |
This model is a fine-tuned version of google/gemma-3-12b-it on the open-r1/codeforces-cots dataset. It has been trained using TRL.
overrides:
parameters:
model: burtenshaw_GemmaCoder3-12B-Q4_K_M.gguf
files:
- filename: burtenshaw_GemmaCoder3-12B-Q4_K_M.gguf
sha256: 47f0a2848eeed783cb03336afd8cc69f6ee0e088e3cec11ab6d9fe16457dc3d4
uri: huggingface://bartowski/burtenshaw_GemmaCoder3-12B-GGUF/burtenshaw_GemmaCoder3-12B-Q4_K_M.gguf
- !!merge <<: *gemma3
name: "tesslate_synthia-s1-27b"
icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/zgFDl7UvWhiPYqdote7XT.png
urls:
- https://huggingface.co/Tesslate/Synthia-S1-27b
- https://huggingface.co/bartowski/Tesslate_Synthia-S1-27b-GGUF
description: |
Synthia-S1-27b is a reasoning AI model developed by Tesslate AI, fine-tuned specifically for advanced reasoning, coding, and RP use cases. Built upon the robust Gemma3 architecture, Synthia-S1-27b excels in logical reasoning, creative writing, and deep contextual understanding. It supports multimodal inputs (text and images) with a large 128K token context window, enabling complex analysis suitable for research, academic tasks, and enterprise-grade AI applications.
overrides:
parameters:
model: Tesslate_Synthia-S1-27b-Q4_K_M.gguf
files:
- filename: Tesslate_Synthia-S1-27b-Q4_K_M.gguf
sha256: d953bf7f802dc68f85a35360deb24b9a8b446af051e82c77f2f0759065d2aa71
uri: huggingface://bartowski/Tesslate_Synthia-S1-27b-GGUF/Tesslate_Synthia-S1-27b-Q4_K_M.gguf
- &llama4
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master"
icon: https://avatars.githubusercontent.com/u/153379578
license: llama4
tags:
- llm
- gguf
- gpu
- cpu
- llama3.3
name: "meta-llama_llama-4-scout-17b-16e-instruct"
urls:
- https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
- https://huggingface.co/bartowski/meta-llama_Llama-4-Scout-17B-16E-Instruct-GGUF
description: |
The Llama 4 collection of models are natively multimodal AI models that enable text and multimodal experiences. These models leverage a mixture-of-experts architecture to offer industry-leading performance in text and image understanding.
These Llama 4 models mark the beginning of a new era for the Llama ecosystem. We are launching two efficient models in the Llama 4 series, Llama 4 Scout, a 17 billion parameter model with 16 experts, and Llama 4 Maverick, a 17 billion parameter model with 128 experts.
overrides:
parameters:
model: meta-llama_Llama-4-Scout-17B-16E-Instruct-Q3_K_S.gguf
files:
- filename: meta-llama_Llama-4-Scout-17B-16E-Instruct-Q3_K_S.gguf
sha256: 48dfc18d40691b4190b7fecf1f89b78cadc758c3a27a9e2a1cabd686fdb822e3
uri: huggingface://bartowski/meta-llama_Llama-4-Scout-17B-16E-Instruct-GGUF/meta-llama_Llama-4-Scout-17B-16E-Instruct-Q3_K_S.gguf
- &eurollm
name: "eurollm-9b-instruct"
icon: https://openeurollm.eu/_next/static/media/logo-dark.e7001867.svg
@ -1315,6 +1424,119 @@
- filename: Sao10K_Llama-3.3-70B-Vulpecula-r1-Q4_K_M.gguf
sha256: 817073c85286c25a9373f330aad32b503e6c13d626a3fbee926d96a7ab866845
uri: huggingface://bartowski/Sao10K_Llama-3.3-70B-Vulpecula-r1-GGUF/Sao10K_Llama-3.3-70B-Vulpecula-r1-Q4_K_M.gguf
- !!merge <<: *llama33
name: "tarek07_legion-v2.1-llama-70b"
icon: https://cdn-uploads.huggingface.co/production/uploads/64909c086073a0cd172d0411/mqajIk-EsgQ0ZVAZJ4trP.png
urls:
- https://huggingface.co/Tarek07/Legion-V2.1-LLaMa-70B
- https://huggingface.co/bartowski/Tarek07_Legion-V2.1-LLaMa-70B-GGUF
description: |
My biggest merge yet, consisting of a total of 20 specially curated models. My methodology in approaching this was to create 5 highly specialized models:
- A completely uncensored base
- A very intelligent model based on UGI, Willingness and NatInt scores on the UGI Leaderboard
- A highly descriptive writing model, specializing in creative and natural prose
- A RP model specially merged with fine-tuned models that use a lot of RP datasets
- The secret ingredient: A completely unhinged, uncensored final model
These five models went through a series of iterations until I got something I thought worked well and then combined them to make LEGION.
The full list of models used in this merge is below:
TheDrummer/Fallen-Llama-3.3-R1-70B-v1
Sao10K/Llama-3.3-70B-Vulpecula-r1
Sao10K/L3-70B-Euryale-v2.1
SicariusSicariiStuff/Negative_LLAMA_70B
allura-org/Bigger-Body-70b
Sao10K/70B-L3.3-mhnnn-x1
Sao10K/L3.3-70B-Euryale-v2.3
Doctor-Shotgun/L3.3-70B-Magnum-v4-SE
Sao10K/L3.1-70B-Hanami-x1
Sao10K/70B-L3.3-Cirrus-x1
EVA-UNIT-01/EVA-LLaMA-3.33-70B-v0.1
TheDrummer/Anubis-70B-v1
ArliAI/Llama-3.3-70B-ArliAI-RPMax-v1.4
LatitudeGames/Wayfarer-Large-70B-Llama-3.3
NeverSleep/Lumimaid-v0.2-70B
mlabonne/Hermes-3-Llama-3.1-70B-lorablated
ReadyArt/Forgotten-Safeword-70B-3.6
ReadyArt/Fallen-Abomination-70B-R1-v4.1
ReadyArt/Fallen-Safeword-70B-R1-v4.1
huihui-ai/Llama-3.3-70B-Instruct-abliterated
overrides:
parameters:
model: Tarek07_Legion-V2.1-LLaMa-70B-Q4_K_M.gguf
files:
- filename: Tarek07_Legion-V2.1-LLaMa-70B-Q4_K_M.gguf
sha256: 8f3de344ba83705f9491c2ed354fd6131a05946fca2eceae495b6fc67a7dbe7a
uri: huggingface://bartowski/Tarek07_Legion-V2.1-LLaMa-70B-GGUF/Tarek07_Legion-V2.1-LLaMa-70B-Q4_K_M.gguf
- !!merge <<: *llama33
name: "forgotten-abomination-70b-v5.0"
icon: https://huggingface.co/ReadyArt/Forgotten-Abomination-70B-v5.0/resolve/main/waifu5.webp
urls:
- https://huggingface.co/ReadyArt/Forgotten-Abomination-70B-v5.0
- https://huggingface.co/mradermacher/Forgotten-Abomination-70B-v5.0-GGUF
description: |
The Unholy Union of Safeword and Nevoria
This model represents the forbidden merger of:
🧬 Forgotten-Safeword-70B-v5.0 - Industrial-grade depravity matrix with 23 distinct taboo specializations
⚗️ L3.3-MS-Nevoria-70b - SteelSkull's masterpiece of storytelling and character adherence, free of positivity bias and ethical constraints
Expect:
🔥 Unparalleled erotic roleplaying with the least Llama refusals you've ever seen
📖 Novel-quality prose that follows your character card with precision
🧠 Handles complex multi-character scenarios effortlessly
💀 Will gleefully explore any taboo subject without hesitation
overrides:
parameters:
model: Forgotten-Abomination-70B-v5.0.Q4_K_M.gguf
files:
- filename: Forgotten-Abomination-70B-v5.0.Q4_K_M.gguf
sha256: a5f5e712e66b855f36ff45175f20c24441fa942ca8af47bd6f49107c6e0f025d
uri: huggingface://mradermacher/Forgotten-Abomination-70B-v5.0-GGUF/Forgotten-Abomination-70B-v5.0.Q4_K_M.gguf
- !!merge <<: *llama33
name: "watt-ai_watt-tool-70b"
urls:
- https://huggingface.co/watt-ai/watt-tool-70B
- https://huggingface.co/bartowski/watt-ai_watt-tool-70B-GGUF
description: |
watt-tool-70B is a fine-tuned language model based on LLaMa-3.3-70B-Instruct, optimized for tool usage and multi-turn dialogue. It achieves state-of-the-art performance on the Berkeley Function-Calling Leaderboard (BFCL).
Model Description
This model is specifically designed to excel at complex tool usage scenarios that require multi-turn interactions, making it ideal for empowering platforms like Lupan, an AI-powered workflow building tool. By leveraging a carefully curated and optimized dataset, watt-tool-70B demonstrates superior capabilities in understanding user requests, selecting appropriate tools, and effectively utilizing them across multiple turns of conversation.
Target Application: AI Workflow Building as in https://lupan.watt.chat/ and Coze.
Key Features
Enhanced Tool Usage: Fine-tuned for precise and efficient tool selection and execution.
Multi-Turn Dialogue: Optimized for maintaining context and effectively utilizing tools across multiple turns of conversation, enabling more complex task completion.
State-of-the-Art Performance: Achieves top performance on the BFCL, demonstrating its capabilities in function calling and tool usage.
Based on LLaMa-3.1-70B-Instruct: Inherits the strong language understanding and generation capabilities of the base model.
overrides:
parameters:
model: watt-ai_watt-tool-70B-Q4_K_M.gguf
files:
- filename: watt-ai_watt-tool-70B-Q4_K_M.gguf
sha256: 93806a5482b9e40e50ffca7a72abe3414d384749cc9e3d378eab5db8a8154b18
uri: huggingface://bartowski/watt-ai_watt-tool-70B-GGUF/watt-ai_watt-tool-70B-Q4_K_M.gguf
- !!merge <<: *llama33
name: "deepcogito_cogito-v1-preview-llama-70b"
icon: https://huggingface.co/deepcogito/cogito-v1-preview-llama-70B/resolve/main/images/deep-cogito-logo.png
urls:
- https://huggingface.co/deepcogito/cogito-v1-preview-llama-70B
- https://huggingface.co/bartowski/deepcogito_cogito-v1-preview-llama-70B-GGUF
description: |
The Cogito LLMs are instruction tuned generative models (text in/text out). All models are released under an open license for commercial use.
Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool calling capabilities than size equivalent counterparts.
In both standard and reasoning modes, Cogito v1-preview models outperform their size equivalent counterparts on common industry benchmarks.
Each model is trained in over 30 languages and supports a context length of 128k.
overrides:
parameters:
model: deepcogito_cogito-v1-preview-llama-70B-Q4_K_M.gguf
files:
- filename: deepcogito_cogito-v1-preview-llama-70B-Q4_K_M.gguf
sha256: d1deaf80c649e2a9446463cf5e1f7c026583647f46e3940d2b405a57cc685225
uri: huggingface://bartowski/deepcogito_cogito-v1-preview-llama-70B-GGUF/deepcogito_cogito-v1-preview-llama-70B-Q4_K_M.gguf
- &rwkv
url: "github:mudler/LocalAI/gallery/rwkv.yaml@master"
name: "rwkv-6-world-7b"
@ -2428,6 +2650,27 @@
- filename: Eximius_Persona_5B.Q4_K_M.gguf
sha256: 8a8e7a0fa1068755322c51900e53423d795e57976b4d95982242cbec41141c7b
uri: huggingface://mradermacher/Eximius_Persona_5B-GGUF/Eximius_Persona_5B.Q4_K_M.gguf
- !!merge <<: *llama32
name: "deepcogito_cogito-v1-preview-llama-3b"
icon: https://huggingface.co/deepcogito/cogito-v1-preview-llama-3B/resolve/main/images/deep-cogito-logo.png
urls:
- https://huggingface.co/deepcogito/cogito-v1-preview-llama-3B
- https://huggingface.co/bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF
description: |
The Cogito LLMs are instruction tuned generative models (text in/text out). All models are released under an open license for commercial use.
Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool calling capabilities than size equivalent counterparts.
In both standard and reasoning modes, Cogito v1-preview models outperform their size equivalent counterparts on common industry benchmarks.
Each model is trained in over 30 languages and supports a context length of 128k.
overrides:
parameters:
model: deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf
files:
- filename: deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf
sha256: 726a0ef5f818b8d238f2844f3204848bea66fb9c172b8ae0f6dc51b7bc081dd5
uri: huggingface://bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF/deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf
- &qwen25
name: "qwen2.5-14b-instruct" ## Qwen2.5
icon: https://avatars.githubusercontent.com/u/141221163
@ -5219,6 +5462,401 @@
- filename: Qwen2.5-14B-Instruct-1M-Unalign.i1-Q4_K_M.gguf
sha256: 11b2eb96a8a4d512fceb3344dccc694972801c964cf748d723fdf436bc368915
uri: huggingface://mradermacher/Qwen2.5-14B-Instruct-1M-Unalign-i1-GGUF/Qwen2.5-14B-Instruct-1M-Unalign.i1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "tesslate_tessa-t1-32b"
icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/I7XzH-NMKUshcGU86u6VA.png
urls:
- https://huggingface.co/Tesslate/Tessa-T1-32B
- https://huggingface.co/bartowski/Tesslate_Tessa-T1-32B-GGUF
description: |
Tessa-T1 is an innovative transformer-based React reasoning model, fine-tuned from the powerful Qwen2.5-Coder-32B-Instruct base model. Designed specifically for React frontend development, Tessa-T1 leverages advanced reasoning to autonomously generate well-structured, semantic React components. Its integration into agent systems makes it a powerful tool for automating web interface development and frontend code intelligence.
Model Highlights
React-specific Reasoning: Accurately generates functional and semantic React components.
Agent Integration: Seamlessly fits into AI-driven coding agents and autonomous frontend systems.
Context-Aware Generation: Effectively understands and utilizes UI context to provide relevant code solutions.
overrides:
parameters:
model: Tesslate_Tessa-T1-32B-Q4_K_M.gguf
files:
- filename: Tesslate_Tessa-T1-32B-Q4_K_M.gguf
sha256: e52a2a0a877ce1de78f2ea472c9e3bc7a0c20d6998423e9d99a59175809d3a22
uri: huggingface://bartowski/Tesslate_Tessa-T1-32B-GGUF/Tesslate_Tessa-T1-32B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "tesslate_tessa-t1-14b"
icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/I7XzH-NMKUshcGU86u6VA.png
urls:
- https://huggingface.co/Tesslate/Tessa-T1-14B
- https://huggingface.co/bartowski/Tesslate_Tessa-T1-14B-GGUF
description: |
Tessa-T1 is an innovative transformer-based React reasoning model, fine-tuned from the powerful Qwen2.5-Coder-14B-Instruct base model. Designed specifically for React frontend development, Tessa-T1 leverages advanced reasoning to autonomously generate well-structured, semantic React components. Its integration into agent systems makes it a powerful tool for automating web interface development and frontend code intelligence.
Model Highlights
React-specific Reasoning: Accurately generates functional and semantic React components.
Agent Integration: Seamlessly fits into AI-driven coding agents and autonomous frontend systems.
Context-Aware Generation: Effectively understands and utilizes UI context to provide relevant code solutions.
overrides:
parameters:
model: Tesslate_Tessa-T1-14B-Q4_K_M.gguf
files:
- filename: Tesslate_Tessa-T1-14B-Q4_K_M.gguf
sha256: 1b35ff651b9c1e4538d10e3117390ae36094b6455a9f937a4f3ab72162125bca
uri: huggingface://bartowski/Tesslate_Tessa-T1-14B-GGUF/Tesslate_Tessa-T1-14B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "tesslate_tessa-t1-7b"
icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/I7XzH-NMKUshcGU86u6VA.png
urls:
- https://huggingface.co/Tesslate/Tessa-T1-7B
- https://huggingface.co/bartowski/Tesslate_Tessa-T1-7B-GGUF
description: |
Tessa-T1 is an innovative transformer-based React reasoning model, fine-tuned from the powerful Qwen2.5-Coder-7B-Instruct base model. Designed specifically for React frontend development, Tessa-T1 leverages advanced reasoning to autonomously generate well-structured, semantic React components. Its integration into agent systems makes it a powerful tool for automating web interface development and frontend code intelligence.
Model Highlights
React-specific Reasoning: Accurately generates functional and semantic React components.
Agent Integration: Seamlessly fits into AI-driven coding agents and autonomous frontend systems.
Context-Aware Generation: Effectively understands and utilizes UI context to provide relevant code solutions.
overrides:
parameters:
model: Tesslate_Tessa-T1-7B-Q4_K_M.gguf
files:
- filename: Tesslate_Tessa-T1-7B-Q4_K_M.gguf
sha256: 7968332d01b5479dee99aff7c9764b9e61c2a6d2828c266163596dd783bdee18
uri: huggingface://bartowski/Tesslate_Tessa-T1-7B-GGUF/Tesslate_Tessa-T1-7B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "tesslate_tessa-t1-3b"
icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/I7XzH-NMKUshcGU86u6VA.png
urls:
- https://huggingface.co/Tesslate/Tessa-T1-3B
- https://huggingface.co/bartowski/Tesslate_Tessa-T1-3B-GGUF
description: |
Tessa-T1 is an innovative transformer-based React reasoning model, fine-tuned from the powerful Qwen2.5-Coder-3B-Instruct base model. Designed specifically for React frontend development, Tessa-T1 leverages advanced reasoning to autonomously generate well-structured, semantic React components. Its integration into agent systems makes it a powerful tool for automating web interface development and frontend code intelligence.
Model Highlights
React-specific Reasoning: Accurately generates functional and semantic React components.
Agent Integration: Seamlessly fits into AI-driven coding agents and autonomous frontend systems.
Context-Aware Generation: Effectively understands and utilizes UI context to provide relevant code solutions.
overrides:
parameters:
model: Tesslate_Tessa-T1-3B-Q4_K_M.gguf
files:
- filename: Tesslate_Tessa-T1-3B-Q4_K_M.gguf
sha256: d6b9d31d78d36094cab2725a7df318f8f3556990df736a21998c952d9a6ee0bf
uri: huggingface://bartowski/Tesslate_Tessa-T1-3B-GGUF/Tesslate_Tessa-T1-3B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "chaoticneutrals_very_berry_qwen2_7b"
icon: https://cdn-uploads.huggingface.co/production/uploads/626dfb8786671a29c715f8a9/1J817kx3zZccf5yvQYiGM.png
urls:
- https://huggingface.co/ChaoticNeutrals/Very_Berry_Qwen2_7B
- https://huggingface.co/bartowski/ChaoticNeutrals_Very_Berry_Qwen2_7B-GGUF
description: |
It do the stuff.
overrides:
parameters:
model: ChaoticNeutrals_Very_Berry_Qwen2_7B-Q4_K_M.gguf
files:
- filename: ChaoticNeutrals_Very_Berry_Qwen2_7B-Q4_K_M.gguf
sha256: cbda41c638c23a3e8e9fb33c27ca0d0a0ee044b6813941a0017fd46369a35ec5
uri: huggingface://bartowski/ChaoticNeutrals_Very_Berry_Qwen2_7B-GGUF/ChaoticNeutrals_Very_Berry_Qwen2_7B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "galactic-qwen-14b-exp1"
icon: https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/SjM3y5Qcr2RX6zC3GQxR3.png
urls:
- https://huggingface.co/prithivMLmods/Galactic-Qwen-14B-Exp1
- https://huggingface.co/mradermacher/Galactic-Qwen-14B-Exp1-GGUF
description: |
Galactic-Qwen-14B-Exp1 is based on the Qwen 2.5 14B modality architecture, designed to enhance the reasoning capabilities of 14B-parameter models. This model is optimized for general-purpose reasoning and answering, excelling in contextual understanding, logical deduction, and multi-step problem-solving. It has been fine-tuned using a long chain-of-thought reasoning model and specialized datasets to improve comprehension, structured responses, and conversational intelligence.
overrides:
parameters:
model: Galactic-Qwen-14B-Exp1.Q4_K_M.gguf
files:
- filename: Galactic-Qwen-14B-Exp1.Q4_K_M.gguf
sha256: 26e99578c341c879cc2676c4c7a45b6c0d00b30bd17c8ee7494fcc4092480ef0
uri: huggingface://mradermacher/Galactic-Qwen-14B-Exp1-GGUF/Galactic-Qwen-14B-Exp1.Q4_K_M.gguf
- !!merge <<: *qwen25
name: "hammer2.0-7b"
urls:
- https://huggingface.co/MadeAgents/Hammer2.0-7b
- https://huggingface.co/Nekuromento/Hammer2.0-7b-Q5_K_M-GGUF
description: |
Hammer2.0 is finetuned from the Qwen 2.5 series and Qwen 2.5 Coder series using function masking techniques. It is trained on the APIGen Function Calling Datasets (60,000 samples), supplemented by the xlam-irrelevance-7.5k set we generated. Hammer2.0 achieves exceptional performance across numerous function calling benchmarks. For more details, please refer to Hammer: Robust Function-Calling for On-Device Language Models via Function Masking and the Hammer GitHub repository.
overrides:
parameters:
model: hammer2.0-7b-q5_k_m.gguf
files:
- filename: hammer2.0-7b-q5_k_m.gguf
sha256: 3682843c857595765f0786cf24b3d501af96fe5d99a9fb2526bc7707e28bae1e
uri: huggingface://Nekuromento/Hammer2.0-7b-Q5_K_M-GGUF/hammer2.0-7b-q5_k_m.gguf
- !!merge <<: *qwen25
icon: https://github.com/All-Hands-AI/OpenHands/blob/main/docs/static/img/logo.png?raw=true
name: "all-hands_openhands-lm-32b-v0.1"
urls:
- https://huggingface.co/all-hands/openhands-lm-32b-v0.1
- https://huggingface.co/bartowski/all-hands_openhands-lm-32b-v0.1-GGUF
description: |
Autonomous agents for software development are already contributing to a wide range of software development tasks. But up to this point, strong coding agents have relied on proprietary models, which means that even if you use an open-source agent like OpenHands, you are still reliant on API calls to an external service.
Today, we are excited to introduce OpenHands LM, a new open coding model that:
Is open and available on Hugging Face, so you can download it and run it locally
Is a reasonable size, 32B, so it can be run locally on hardware such as a single 3090 GPU
Achieves strong performance on software engineering tasks, including 37.2% resolve rate on SWE-Bench Verified
Read below for more details and our future plans!
What is OpenHands LM?
OpenHands LM is built on the foundation of Qwen Coder 2.5 Instruct 32B, leveraging its powerful base capabilities for coding tasks. What sets OpenHands LM apart is our specialized fine-tuning process:
We used training data generated by OpenHands itself on a diverse set of open-source repositories
Specifically, we use an RL-based framework outlined in SWE-Gym, where we set up a training environment, generate training data using an existing agent, and then fine-tune the model on examples that were resolved successfully
It features a 128K token context window, ideal for handling large codebases and long-horizon software engineering tasks
overrides:
parameters:
model: all-hands_openhands-lm-32b-v0.1-Q4_K_M.gguf
files:
- filename: all-hands_openhands-lm-32b-v0.1-Q4_K_M.gguf
sha256: f7c2311d3264cc1e021a21a319748a9c75b74ddebe38551786aa4053448e5e74
uri: huggingface://bartowski/all-hands_openhands-lm-32b-v0.1-GGUF/all-hands_openhands-lm-32b-v0.1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "all-hands_openhands-lm-7b-v0.1"
icon: https://github.com/All-Hands-AI/OpenHands/blob/main/docs/static/img/logo.png?raw=true
urls:
- https://huggingface.co/all-hands/openhands-lm-7b-v0.1
- https://huggingface.co/bartowski/all-hands_openhands-lm-7b-v0.1-GGUF
description: |
This is a smaller 7B model trained following the recipe of all-hands/openhands-lm-32b-v0.1. Autonomous agents for software development are already contributing to a wide range of software development tasks. But up to this point, strong coding agents have relied on proprietary models, which means that even if you use an open-source agent like OpenHands, you are still reliant on API calls to an external service.
Today, we are excited to introduce OpenHands LM, a new open coding model that:
Is open and available on Hugging Face, so you can download it and run it locally
Is a reasonable size, 32B, so it can be run locally on hardware such as a single 3090 GPU
Achieves strong performance on software engineering tasks, including 37.2% resolve rate on SWE-Bench Verified
Read below for more details and our future plans!
What is OpenHands LM?
OpenHands LM is built on the foundation of Qwen Coder 2.5 Instruct 32B, leveraging its powerful base capabilities for coding tasks. What sets OpenHands LM apart is our specialized fine-tuning process:
We used training data generated by OpenHands itself on a diverse set of open-source repositories
Specifically, we use an RL-based framework outlined in SWE-Gym, where we set up a training environment, generate training data using an existing agent, and then fine-tune the model on examples that were resolved successfully
It features a 128K token context window, ideal for handling large codebases and long-horizon software engineering tasks
overrides:
parameters:
model: all-hands_openhands-lm-7b-v0.1-Q4_K_M.gguf
files:
- filename: all-hands_openhands-lm-7b-v0.1-Q4_K_M.gguf
sha256: d50031b04bbdad714c004a0dc117c18d26a026297c236cda36089c20279b2ec1
uri: huggingface://bartowski/all-hands_openhands-lm-7b-v0.1-GGUF/all-hands_openhands-lm-7b-v0.1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "all-hands_openhands-lm-1.5b-v0.1"
icon: https://github.com/All-Hands-AI/OpenHands/blob/main/docs/static/img/logo.png?raw=true
urls:
- https://huggingface.co/all-hands/openhands-lm-1.5b-v0.1
- https://huggingface.co/bartowski/all-hands_openhands-lm-1.5b-v0.1-GGUF
description: |
This is a smaller 1.5B model trained following the recipe of all-hands/openhands-lm-32b-v0.1. It is intended to be used for speculative decoding. Autonomous agents for software development are already contributing to a wide range of software development tasks. But up to this point, strong coding agents have relied on proprietary models, which means that even if you use an open-source agent like OpenHands, you are still reliant on API calls to an external service.
Today, we are excited to introduce OpenHands LM, a new open coding model that:
Is open and available on Hugging Face, so you can download it and run it locally
Is a reasonable size, 32B, so it can be run locally on hardware such as a single 3090 GPU
Achieves strong performance on software engineering tasks, including 37.2% resolve rate on SWE-Bench Verified
Read below for more details and our future plans!
What is OpenHands LM?
OpenHands LM is built on the foundation of Qwen Coder 2.5 Instruct 32B, leveraging its powerful base capabilities for coding tasks. What sets OpenHands LM apart is our specialized fine-tuning process:
We used training data generated by OpenHands itself on a diverse set of open-source repositories
Specifically, we use an RL-based framework outlined in SWE-Gym, where we set up a training environment, generate training data using an existing agent, and then fine-tune the model on examples that were resolved successfully
It features a 128K token context window, ideal for handling large codebases and long-horizon software engineering tasks
overrides:
parameters:
model: all-hands_openhands-lm-1.5b-v0.1-Q4_K_M.gguf
files:
- filename: all-hands_openhands-lm-1.5b-v0.1-Q4_K_M.gguf
sha256: 30abd7860c4eb5f2f51546389407b0064360862f64ea55cdf95f97c6e155b3c6
uri: huggingface://bartowski/all-hands_openhands-lm-1.5b-v0.1-GGUF/all-hands_openhands-lm-1.5b-v0.1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "katanemo_arch-function-chat-7b"
urls:
- https://huggingface.co/katanemo/Arch-Function-Chat-7B
- https://huggingface.co/bartowski/katanemo_Arch-Function-Chat-7B-GGUF
description: |
The Arch-Function-Chat collection builds upon Katanemo's Arch-Function collection by extending its capabilities beyond function calling. This new collection maintains the state-of-the-art (SOTA) function calling performance of the original collection while adding powerful new features that make it even more versatile in real-world applications.
In addition to function calling capabilities, this collection now offers:
Clarify & refine: Generates natural follow-up questions to collect missing information for function calling
Interpret & respond: Provides human-friendly responses based on function execution results
Context management: Maintains context in complex multi-turn interactions
Note: Arch-Function-Chat is now the primary LLM used in the open source Arch Gateway, an AI-native proxy for agents. For more details about the project, check out the GitHub README.
overrides:
parameters:
model: katanemo_Arch-Function-Chat-7B-Q4_K_M.gguf
files:
- filename: katanemo_Arch-Function-Chat-7B-Q4_K_M.gguf
sha256: 6fd603511076ffea3697c8a76d82c054781c5e11f134b937a66cedfc49b3d2c5
uri: huggingface://bartowski/katanemo_Arch-Function-Chat-7B-GGUF/katanemo_Arch-Function-Chat-7B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "katanemo_arch-function-chat-1.5b"
urls:
- https://huggingface.co/katanemo/Arch-Function-Chat-1.5B
- https://huggingface.co/bartowski/katanemo_Arch-Function-Chat-1.5B-GGUF
description: |
The Arch-Function-Chat collection builds upon Katanemo's Arch-Function collection by extending its capabilities beyond function calling. This new collection maintains the state-of-the-art (SOTA) function calling performance of the original collection while adding powerful new features that make it even more versatile in real-world applications.
In addition to function calling capabilities, this collection now offers:
Clarify & refine: Generates natural follow-up questions to collect missing information for function calling
Interpret & respond: Provides human-friendly responses based on function execution results
Context management: Maintains context in complex multi-turn interactions
Note: Arch-Function-Chat is now the primary LLM used in the open source Arch Gateway, an AI-native proxy for agents. For more details about the project, check out the GitHub README.
overrides:
parameters:
model: katanemo_Arch-Function-Chat-1.5B-Q4_K_M.gguf
files:
- filename: katanemo_Arch-Function-Chat-1.5B-Q4_K_M.gguf
sha256: 5bfcb72803745c374a90b0ceb60f347a8c7d1239960cce6a2d22cc1276236098
uri: huggingface://bartowski/katanemo_Arch-Function-Chat-1.5B-GGUF/katanemo_Arch-Function-Chat-1.5B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "katanemo_arch-function-chat-3b"
urls:
- https://huggingface.co/katanemo/Arch-Function-Chat-3B
- https://huggingface.co/bartowski/katanemo_Arch-Function-Chat-3B-GGUF
description: |
The Arch-Function-Chat collection builds upon Katanemo's Arch-Function collection by extending its capabilities beyond function calling. This new collection maintains the state-of-the-art (SOTA) function calling performance of the original collection while adding powerful new features that make it even more versatile in real-world applications.
In addition to function calling capabilities, this collection now offers:
Clarify & refine: Generates natural follow-up questions to collect missing information for function calling
Interpret & respond: Provides human-friendly responses based on function execution results
Context management: Maintains context in complex multi-turn interactions
Note: Arch-Function-Chat is now the primary LLM used in the open source Arch Gateway, an AI-native proxy for agents. For more details about the project, check out the GitHub README.
overrides:
parameters:
model: katanemo_Arch-Function-Chat-3B-Q4_K_M.gguf
files:
- filename: katanemo_Arch-Function-Chat-3B-Q4_K_M.gguf
sha256: f59dbef397bf1364b5f0a2c23a7f67c40ec63cc666036c4e7615fa7d79d4e1a0
uri: huggingface://bartowski/katanemo_Arch-Function-Chat-3B-GGUF/katanemo_Arch-Function-Chat-3B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "open-thoughts_openthinker2-32b"
icon: https://huggingface.co/datasets/open-thoughts/open-thoughts-114k/resolve/main/open_thoughts.png
urls:
- https://huggingface.co/open-thoughts/OpenThinker2-32B
- https://huggingface.co/bartowski/open-thoughts_OpenThinker2-32B-GGUF
description: |
This model is a fine-tuned version of Qwen/Qwen2.5-32B-Instruct on the OpenThoughts2-1M dataset.
The OpenThinker2-32B model is the highest performing open-data model. This model improves upon our previous OpenThinker-32B model, which was trained on 114k examples from OpenThoughts-114k. The numbers reported in the table below are evaluated with our open-source tool Evalchemy.
overrides:
parameters:
model: open-thoughts_OpenThinker2-32B-Q4_K_M.gguf
files:
- filename: open-thoughts_OpenThinker2-32B-Q4_K_M.gguf
sha256: e9c7bf7cb349cfe07b4550759a3b4d7005834d0fa7580b23e483cbfeecd7a982
uri: huggingface://bartowski/open-thoughts_OpenThinker2-32B-GGUF/open-thoughts_OpenThinker2-32B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "open-thoughts_openthinker2-7b"
icon: https://huggingface.co/datasets/open-thoughts/open-thoughts-114k/resolve/main/open_thoughts.png
urls:
- https://huggingface.co/open-thoughts/OpenThinker2-7B
- https://huggingface.co/bartowski/open-thoughts_OpenThinker2-7B-GGUF
description: |
This model is a fine-tuned version of Qwen/Qwen2.5-7B-Instruct on the OpenThoughts2-1M dataset.
The OpenThinker2-7B model is the top 7B open-data reasoning model. It delivers performance comparable to state of the art 7B models like DeepSeek-R1-Distill-7B across a suite of tasks. This model improves upon our previous OpenThinker-7B model, which was trained on 114k examples from OpenThoughts-114k. The numbers reported in the table below are evaluated with our open-source tool Evalchemy.
overrides:
parameters:
model: open-thoughts_OpenThinker2-7B-Q4_K_M.gguf
files:
- filename: open-thoughts_OpenThinker2-7B-Q4_K_M.gguf
sha256: 481d785047d66ae2eeaf14650a9e659ec4f7766a6414b6c7e92854c944201734
uri: huggingface://bartowski/open-thoughts_OpenThinker2-7B-GGUF/open-thoughts_OpenThinker2-7B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "arliai_qwq-32b-arliai-rpr-v1"
icon: https://cdn-uploads.huggingface.co/production/uploads/6625f4a8a8d1362ebcc3851a/albSlnUy9dPVGVuLlsBua.jpeg
urls:
- https://huggingface.co/ArliAI/QwQ-32B-ArliAI-RpR-v1
- https://huggingface.co/bartowski/ArliAI_QwQ-32B-ArliAI-RpR-v1-GGUF
description: |
RpR (RolePlay with Reasoning) is a new series of models from ArliAI. This series builds directly upon the successful dataset curation methodology and training methods developed for the RPMax series.
RpR models use the same curated, deduplicated RP and creative writing dataset used for RPMax, with a focus on variety to ensure high creativity and minimize cross-context repetition. Users familiar with RPMax will recognize the unique, non-repetitive writing style unlike other finetuned-for-RP models.
With the release of QwQ as the first high-performing open-source reasoning model that can be easily trained, it was clear that the available instruct and creative writing reasoning datasets contain only one response per example. This type of single-response dataset, when used for training reasoning models, causes degraded output quality in long multi-turn chats, which is why Arli AI decided to create a true RP model capable of long multi-turn chat with reasoning.
In order to create RpR, we first had to create the reasoning RP dataset by re-processing our existing known-good RPMax dataset into a reasoning dataset. This was possible by using the base QwQ Instruct model itself to create the reasoning process for every turn in the RPMax dataset conversation examples, which was then further refined to make sure the reasoning is in line with the actual response examples from the dataset.
Another important thing to get right is to make sure the model is trained on examples that present reasoning blocks in the same way it encounters them during inference, which is to say never seeing the reasoning blocks in its context. To achieve this, the training run was completed using axolotl with a manual, template-free segments dataset, so that the model is never trained to see the reasoning block in the context, just as it will be used during inference.
The result of training QwQ on this dataset with this method is consistently coherent and interesting output, even in long multi-turn RP chats. This is, as far as we know, the first correctly-trained reasoning model for RP and creative writing.
overrides:
parameters:
model: ArliAI_QwQ-32B-ArliAI-RpR-v1-Q4_K_M.gguf
files:
- filename: ArliAI_QwQ-32B-ArliAI-RpR-v1-Q4_K_M.gguf
sha256: b0f2ca8f62a5d021e20db40608a109713e9d23e75b68b3b71b7654c04d596dcf
uri: huggingface://bartowski/ArliAI_QwQ-32B-ArliAI-RpR-v1-GGUF/ArliAI_QwQ-32B-ArliAI-RpR-v1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "mensa-beta-14b-instruct-i1"
icon: https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/DyO5Fvqwvee-UM9QqgWZS.png
urls:
- https://huggingface.co/prithivMLmods/Mensa-Beta-14B-Instruct
- https://huggingface.co/mradermacher/Mensa-Beta-14B-Instruct-i1-GGUF
description: |
weighted/imatrix quants of https://huggingface.co/prithivMLmods/Mensa-Beta-14B-Instruct
overrides:
parameters:
model: Mensa-Beta-14B-Instruct.i1-Q4_K_M.gguf
files:
- filename: Mensa-Beta-14B-Instruct.i1-Q4_K_M.gguf
sha256: 86ccd640d72dcf3129fdd5b94381a733a684672b22487784e388b2ee9de57760
uri: huggingface://mradermacher/Mensa-Beta-14B-Instruct-i1-GGUF/Mensa-Beta-14B-Instruct.i1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "cogito-v1-preview-qwen-14B"
icon: https://huggingface.co/deepcogito/cogito-v1-preview-qwen-14B/resolve/main/images/deep-cogito-logo.png
urls:
- https://huggingface.co/deepcogito/cogito-v1-preview-qwen-14B
- https://huggingface.co/NikolayKozloff/cogito-v1-preview-qwen-14B-Q4_K_M-GGUF
description: |
The Cogito LLMs are instruction tuned generative models (text in/text out). All models are released under an open license for commercial use.
Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool calling capabilities than size equivalent counterparts.
In both standard and reasoning modes, Cogito v1-preview models outperform their size equivalent counterparts on common industry benchmarks.
Each model is trained in over 30 languages and supports a context length of 128k.
overrides:
parameters:
model: cogito-v1-preview-qwen-14b-q4_k_m.gguf
files:
- filename: cogito-v1-preview-qwen-14b-q4_k_m.gguf
sha256: 42ddd667bac3e5f0989f52b3dca5767ed15d0e5077c6f537e4b3873862ff7096
uri: huggingface://NikolayKozloff/cogito-v1-preview-qwen-14B-Q4_K_M-GGUF/cogito-v1-preview-qwen-14b-q4_k_m.gguf
- !!merge <<: *qwen25
name: "deepcogito_cogito-v1-preview-qwen-32b"
icon: https://huggingface.co/deepcogito/cogito-v1-preview-qwen-32B/resolve/main/images/deep-cogito-logo.png
urls:
- https://huggingface.co/deepcogito/cogito-v1-preview-qwen-32B
- https://huggingface.co/bartowski/deepcogito_cogito-v1-preview-qwen-32B-GGUF
description: |
The Cogito LLMs are instruction tuned generative models (text in/text out). All models are released under an open license for commercial use.
Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool calling capabilities than size equivalent counterparts.
In both standard and reasoning modes, Cogito v1-preview models outperform their size equivalent counterparts on common industry benchmarks.
Each model is trained in over 30 languages and supports a context length of 128k.
overrides:
parameters:
model: deepcogito_cogito-v1-preview-qwen-32B-Q4_K_M.gguf
files:
- filename: deepcogito_cogito-v1-preview-qwen-32B-Q4_K_M.gguf
sha256: 985f2d49330090e64603309f7eb61030769f25a5da027ac0b0a740858d087ad8
uri: huggingface://bartowski/deepcogito_cogito-v1-preview-qwen-32B-GGUF/deepcogito_cogito-v1-preview-qwen-32B-Q4_K_M.gguf
- &llama31
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1
icon: https://avatars.githubusercontent.com/u/153379578
@ -7357,6 +7995,27 @@
- filename: TextSynth-8B.i1-Q4_K_M.gguf
sha256: 9186a8cb3a797cd2cd5b2eeaee99808674d96731824a9ee45685bbf480ba56c3
uri: huggingface://mradermacher/TextSynth-8B-i1-GGUF/TextSynth-8B.i1-Q4_K_M.gguf
- !!merge <<: *llama31
name: "deepcogito_cogito-v1-preview-llama-8b"
icon: https://huggingface.co/deepcogito/cogito-v1-preview-llama-8B/resolve/main/images/deep-cogito-logo.png
urls:
- https://huggingface.co/deepcogito/cogito-v1-preview-llama-8B
- https://huggingface.co/bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF
description: |
The Cogito LLMs are instruction tuned generative models (text in/text out). All models are released under an open license for commercial use.
Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool calling capabilities than size equivalent counterparts.
In both standard and reasoning modes, Cogito v1-preview models outperform their size equivalent counterparts on common industry benchmarks.
Each model is trained in over 30 languages and supports a context length of 128k.
overrides:
parameters:
model: deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf
files:
- filename: deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf
sha256: 445173fb1dacef3fa0be49ebb4512b948fdb1434d86732de198424695b017b50
uri: huggingface://bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF/deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf
- !!merge <<: *llama33
name: "llama-3.3-magicalgirl-2.5-i1"
icon: https://cdn-uploads.huggingface.co/production/uploads/633e85093a17ab61de8d9073/FGK0qBGmELj6DEUxbbrdR.png
@ -7835,6 +8494,20 @@
- filename: Fallen-Safeword-70B-R1-v4.1.Q4_K_M.gguf
sha256: aed6bd5bb03b7bd886939237bc10ea6331d4feb5a3b6712e0c5474a778acf817
uri: huggingface://mradermacher/Fallen-Safeword-70B-R1-v4.1-GGUF/Fallen-Safeword-70B-R1-v4.1.Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "agentica-org_deepcoder-14b-preview"
urls:
- https://huggingface.co/agentica-org/DeepCoder-14B-Preview
- https://huggingface.co/bartowski/agentica-org_DeepCoder-14B-Preview-GGUF
description: |
DeepCoder-14B-Preview is a code reasoning LLM fine-tuned from DeepSeek-R1-Distilled-Qwen-14B using distributed reinforcement learning (RL) to scale up to long context lengths. The model achieves 60.6% Pass@1 accuracy on LiveCodeBench v5 (8/1/24-2/1/25), representing an 8% improvement over the base model (53%) and achieving performance similar to OpenAI's o3-mini with just 14B parameters.
overrides:
parameters:
model: agentica-org_DeepCoder-14B-Preview-Q4_K_M.gguf
files:
- filename: agentica-org_DeepCoder-14B-Preview-Q4_K_M.gguf
sha256: 38f0f777de3116ca27d10ec84388b3290a1bf3f7db8c5bdc1f92d100e4231870
uri: huggingface://bartowski/agentica-org_DeepCoder-14B-Preview-GGUF/agentica-org_DeepCoder-14B-Preview-Q4_K_M.gguf
- &qwen2
url: "github:mudler/LocalAI/gallery/chatml.yaml@master" ## Start QWEN2
name: "qwen2-7b-instruct"
@ -9149,6 +9822,21 @@
- filename: BlackSheep-24B.i1-Q4_K_M.gguf
sha256: 95ae096eca05a95591254babf81b4d5617ceebbe8eda04c6cf8968ef4a69fc80
uri: huggingface://mradermacher/BlackSheep-24B-i1-GGUF/BlackSheep-24B.i1-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "eurydice-24b-v2-i1"
icon: https://cdn-uploads.huggingface.co/production/uploads/652c2a63d78452c4742cd3d3/Hm_tg4s0D6yWmtrTHII32.png
urls:
- https://huggingface.co/aixonlab/Eurydice-24b-v2
- https://huggingface.co/mradermacher/Eurydice-24b-v2-i1-GGUF
description: |
Eurydice 24b v2 is designed to be the perfect companion for multi-role conversations. It demonstrates exceptional contextual understanding and excels in creativity, natural conversation and storytelling. Built on Mistral 3.1, this model has been trained on a custom dataset specifically crafted to enhance its capabilities.
overrides:
parameters:
model: Eurydice-24b-v2.i1-Q4_K_M.gguf
files:
- filename: Eurydice-24b-v2.i1-Q4_K_M.gguf
sha256: fb4104a1b33dd860e1eca3b6906a10cacc5b91a2534db72d9749652a204fbcbf
uri: huggingface://mradermacher/Eurydice-24b-v2-i1-GGUF/Eurydice-24b-v2.i1-Q4_K_M.gguf
- &mudler
url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models
name: "LocalAI-llama3-8b-function-call-v0.2"

View file

@ -74,10 +74,9 @@ Version: ${version}
),
kong.UsageOnError(),
kong.Vars{
"basepath": kong.ExpandPath("."),
"remoteLibraryURL": "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/model_library.yaml",
"galleries": `[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]`,
"version": internal.PrintableVersion(),
"basepath": kong.ExpandPath("."),
"galleries": `[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]`,
"version": internal.PrintableVersion(),
},
)
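The CLI now ships only the galleries default; the remoteLibraryURL variable and the embedded model library it pointed to are dropped. For context, that default is a small JSON array of gallery descriptors. Below is a minimal sketch of decoding such a value, assuming an illustrative galleryEntry struct rather than LocalAI's actual gallery type:

package main

import (
	"encoding/json"
	"fmt"
)

// galleryEntry mirrors the shape of the "galleries" default above.
// It is illustrative only; the real LocalAI type may differ.
type galleryEntry struct {
	Name string `json:"name"`
	URL  string `json:"url"`
}

func main() {
	// Same literal as the kong.Vars default shown in this diff.
	defaultGalleries := `[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]`

	var galleries []galleryEntry
	if err := json.Unmarshal([]byte(defaultGalleries), &galleries); err != nil {
		panic(err)
	}
	for _, g := range galleries {
		fmt.Printf("gallery %q -> %s\n", g.Name, g.URL)
	}
}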

View file

@ -473,8 +473,6 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
backend = realBackend
}
ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
var backendToConsume string
switch backend {
@ -497,17 +495,37 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
}
func (ml *ModelLoader) stopActiveBackends(modelID string, singleActiveBackend bool) {
if !singleActiveBackend {
return
}
// If we can have only one backend active, kill all the others (except external backends)
if singleActiveBackend {
log.Debug().Msgf("Stopping all backends except '%s'", modelID)
err := ml.StopGRPC(allExcept(modelID))
if err != nil {
log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
}
// Stop all backends except the one we are going to load
log.Debug().Msgf("Stopping all backends except '%s'", modelID)
err := ml.StopGRPC(allExcept(modelID))
if err != nil {
log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
}
}
func (ml *ModelLoader) Close() {
if !ml.singletonMode {
return
}
ml.singletonLock.Unlock()
}
func (ml *ModelLoader) lockBackend() {
if !ml.singletonMode {
return
}
ml.singletonLock.Lock()
}
func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
ml.lockBackend() // grab the singleton lock if needed
o := NewOptions(opts...)
// Return earlier if we have a model already loaded
@ -518,17 +536,20 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
return m.GRPC(o.parallelRequests, ml.wd), nil
}
ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
ml.stopActiveBackends(o.modelID, ml.singletonMode)
// if a backend is defined, return the loader directly
if o.backendString != "" {
return ml.backendLoader(opts...)
}
// Otherwise scan for backends in the asset directory
var err error
// get backends embedded in the binary
autoLoadBackends, err := ml.ListAvailableBackends(o.assetDir)
if err != nil {
ml.Close() // we failed, release the lock
return nil, err
}
@ -560,5 +581,7 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
}
}
ml.Close() // make sure to release the lock in case of failure
return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())
}
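The single-active-backend behaviour now hinges on a dedicated singleton lock: lockBackend grabs it at the top of Load when singletonMode is set, Close releases it on every failure path, and callers release it once they are done with the backend. A minimal sketch of that pattern follows; the types are standalone stand-ins, not the actual ModelLoader:

package main

import (
	"fmt"
	"sync"
)

// loader is a stripped-down stand-in for ModelLoader, only to
// illustrate the singleton-lock flow introduced in this commit.
type loader struct {
	singletonMode bool       // corresponds to the single-active-backend flag
	singletonLock sync.Mutex // held while a single backend is active
}

func (l *loader) lockBackend() {
	if !l.singletonMode {
		return
	}
	l.singletonLock.Lock()
}

// Close releases the singleton lock; Load calls it on failure and
// callers call it once the backend is no longer needed.
func (l *loader) Close() {
	if !l.singletonMode {
		return
	}
	l.singletonLock.Unlock()
}

func (l *loader) Load(model string) (string, error) {
	l.lockBackend() // serialize loads when only one backend may run
	backend, err := start(model)
	if err != nil {
		l.Close() // failure: release the lock so the next load can proceed
		return "", err
	}
	return backend, nil
}

func start(model string) (string, error) { return "backend-for-" + model, nil }

func main() {
	l := &loader{singletonMode: true}
	b, err := l.Load("phi-2")
	if err != nil {
		panic(err)
	}
	defer l.Close() // release once done, allowing the next model to load
	fmt.Println(b)
}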

View file

@ -18,16 +18,19 @@ import (
// TODO: Split ModelLoader and TemplateLoader? Just to keep things more organized. Left together to share a mutex until I look into that. Would split if we separate directories for .bin/.yaml and .tmpl
type ModelLoader struct {
ModelPath string
mu sync.Mutex
models map[string]*Model
wd *WatchDog
ModelPath string
mu sync.Mutex
singletonLock sync.Mutex
singletonMode bool
models map[string]*Model
wd *WatchDog
}
func NewModelLoader(modelPath string) *ModelLoader {
func NewModelLoader(modelPath string, singleActiveBackend bool) *ModelLoader {
nml := &ModelLoader{
ModelPath: modelPath,
models: make(map[string]*Model),
ModelPath: modelPath,
models: make(map[string]*Model),
singletonMode: singleActiveBackend,
}
return nml
@ -142,26 +145,6 @@ func (ml *ModelLoader) LoadModel(modelID, modelName string, loader func(string,
func (ml *ModelLoader) ShutdownModel(modelName string) error {
ml.mu.Lock()
defer ml.mu.Unlock()
model, ok := ml.models[modelName]
if !ok {
return fmt.Errorf("model %s not found", modelName)
}
retries := 1
for model.GRPC(false, ml.wd).IsBusy() {
log.Debug().Msgf("%s busy. Waiting.", modelName)
dur := time.Duration(retries*2) * time.Second
if dur > retryTimeout {
dur = retryTimeout
}
time.Sleep(dur)
retries++
if retries > 10 && os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true" {
log.Warn().Msgf("Model %s is still busy after %d retries. Forcing shutdown.", modelName, retries)
break
}
}
return ml.deleteProcess(modelName)
}

View file

@ -17,10 +17,9 @@ type Options struct {
externalBackends map[string]string
grpcAttempts int
grpcAttemptsDelay int
singleActiveBackend bool
parallelRequests bool
grpcAttempts int
grpcAttemptsDelay int
parallelRequests bool
}
type Option func(*Options)
@ -88,12 +87,6 @@ func WithContext(ctx context.Context) Option {
}
}
func WithSingleActiveBackend() Option {
return func(o *Options) {
o.singleActiveBackend = true
}
}
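With WithSingleActiveBackend removed, the flag moves from a per-call option to the NewModelLoader constructor shown earlier in this commit. A hedged before/after sketch of a call site (paths and values are illustrative):

// Before this commit: the flag was passed as a load-time option.
//   ml := model.NewModelLoader("/models")
//   backend, err := ml.Load(model.WithModel("test"), model.WithSingleActiveBackend())
//
// After this commit: the flag is fixed at construction time and the
// loader's singleton lock is released via Close when the backend is done.
//   ml := model.NewModelLoader("/models", true) // true == single active backend
//   backend, err := ml.Load(model.WithModel("test"))
//   ...
//   ml.Close()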
func WithModelID(id string) Option {
return func(o *Options) {
o.modelID = id

View file

@ -21,7 +21,7 @@ var _ = Describe("ModelLoader", func() {
// Setup the model loader with a test directory
modelPath = "/tmp/test_model_path"
os.Mkdir(modelPath, 0755)
modelLoader = model.NewModelLoader(modelPath)
modelLoader = model.NewModelLoader(modelPath, false)
})
AfterEach(func() {

View file

@ -9,25 +9,43 @@ import (
"strconv"
"strings"
"syscall"
"time"
"github.com/hpcloud/tail"
process "github.com/mudler/go-processmanager"
"github.com/rs/zerolog/log"
)
var forceBackendShutdown bool = os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true"
func (ml *ModelLoader) deleteProcess(s string) error {
model, ok := ml.models[s]
if !ok {
log.Debug().Msgf("Model %s not found", s)
return fmt.Errorf("model %s not found", s)
}
defer delete(ml.models, s)
retries := 1
for model.GRPC(false, ml.wd).IsBusy() {
log.Debug().Msgf("%s busy. Waiting.", s)
dur := time.Duration(retries*2) * time.Second
if dur > retryTimeout {
dur = retryTimeout
}
time.Sleep(dur)
retries++
if retries > 10 && forceBackendShutdown {
log.Warn().Msgf("Model %s is still busy after %d retries. Forcing shutdown.", s, retries)
break
}
}
log.Debug().Msgf("Deleting process %s", s)
m, exists := ml.models[s]
if !exists {
log.Error().Msgf("Model does not exist %s", s)
// Nothing to do
return nil
}
process := m.Process()
process := model.Process()
if process == nil {
log.Error().Msgf("No process for %s", s)
// Nothing to do as there is no process
@ -44,9 +62,12 @@ func (ml *ModelLoader) deleteProcess(s string) error {
func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error {
var err error = nil
ml.mu.Lock()
defer ml.mu.Unlock()
for k, m := range ml.models {
if filter(k, m.Process()) {
e := ml.ShutdownModel(k)
e := ml.deleteProcess(k)
err = errors.Join(err, e)
}
}
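The busy-wait that previously lived in ShutdownModel now runs inside deleteProcess: the loader polls IsBusy with a linearly growing sleep capped at retryTimeout, and after 10 retries it only breaks out when LOCALAI_FORCE_BACKEND_SHUTDOWN=true. A minimal sketch of that wait policy in isolation (the helper name, cap value, and callback are illustrative):

package main

import (
	"fmt"
	"time"
)

const retryTimeout = 30 * time.Second // illustrative cap; the real value lives in the model package

// waitUntilIdle mirrors the loop added to deleteProcess: back off while the
// backend is busy, cap the sleep at retryTimeout, and give up after 10
// retries only when forced shutdown is enabled.
func waitUntilIdle(isBusy func() bool, force bool) {
	retries := 1
	for isBusy() {
		dur := time.Duration(retries*2) * time.Second
		if dur > retryTimeout {
			dur = retryTimeout
		}
		fmt.Printf("backend busy, sleeping %s (retry %d)\n", dur, retries)
		time.Sleep(dur)
		retries++
		if retries > 10 && force {
			fmt.Println("still busy after 10 retries, forcing shutdown")
			break
		}
	}
}

func main() {
	calls := 0
	busy := func() bool { calls++; return calls < 3 } // busy for the first two polls
	waitUntilIdle(busy, true)
}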

View file

@ -70,7 +70,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
model.WithModel("test"),
}
sl = model.NewModelLoader("")
sl = model.NewModelLoader("", false)
sc, err = sl.Load(storeOpts...)
Expect(err).ToNot(HaveOccurred())
Expect(sc).ToNot(BeNil())
@ -235,7 +235,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
keys := [][]float32{{1.0, 0.0, 0.0}, {0.0, 1.0, 0.0}, {0.0, 0.0, 1.0}, {-1.0, 0.0, 0.0}}
vals := [][]byte{[]byte("x"), []byte("y"), []byte("z"), []byte("-z")}
err := store.SetCols(context.Background(), sc, keys, vals);
err := store.SetCols(context.Background(), sc, keys, vals)
Expect(err).ToNot(HaveOccurred())
_, _, sims, err := store.Find(context.Background(), sc, keys[0], 4)
@ -247,7 +247,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
keys := [][]float32{{1.0, 0.0, 1.0}, {0.0, 2.0, 0.0}, {0.0, 0.0, -1.0}, {-1.0, 0.0, -1.0}}
vals := [][]byte{[]byte("x"), []byte("y"), []byte("z"), []byte("-z")}
err := store.SetCols(context.Background(), sc, keys, vals);
err := store.SetCols(context.Background(), sc, keys, vals)
Expect(err).ToNot(HaveOccurred())
_, _, sims, err := store.Find(context.Background(), sc, keys[0], 4)
@ -314,7 +314,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
normalize(keys[6:])
err := store.SetCols(context.Background(), sc, keys, vals);
err := store.SetCols(context.Background(), sc, keys, vals)
Expect(err).ToNot(HaveOccurred())
expectTriangleEq(keys, vals)
@ -341,7 +341,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
c += 1
}
err := store.SetCols(context.Background(), sc, keys, vals);
err := store.SetCols(context.Background(), sc, keys, vals)
Expect(err).ToNot(HaveOccurred())
expectTriangleEq(keys, vals)