From 0b910e059504d20ca2bae3450abc117b23486a9e Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Thu, 8 Jun 2023 03:11:52 -0400 Subject: [PATCH] stash progress for the night. loading GPTJ fails, hacked in PredictTEMP as kludge --- Makefile | 6 +- apiv2/config.go | 252 ++++++++++------------ apiv2/config_manager.go | 139 ++++++++++++ apiv2/engine.go | 205 ++++++++++++++++++ apiv2/localai.go | 29 ++- apiv2/localai_nethttp.go | 2 + config/gpt-3.5-turbo-chat.yaml | 2 + config/gpt-3.5-turbo-completion.yaml | 2 + go.mod | 4 +- go.sum | 12 +- openai-openapi/chi-interface.tmpl | 19 -- openai-openapi/config.yaml | 1 - openai-openapi/localai_model_patches.yaml | 8 + openai-openapi/test_segment.yml | 30 --- 14 files changed, 518 insertions(+), 193 deletions(-) create mode 100644 apiv2/config_manager.go create mode 100644 apiv2/engine.go delete mode 100644 openai-openapi/chi-interface.tmpl delete mode 100644 openai-openapi/test_segment.yml diff --git a/Makefile b/Makefile index d2b419e2..ddecf145 100644 --- a/Makefile +++ b/Makefile @@ -78,9 +78,9 @@ openai-openapi/transformed: openai-openapi/spec apiv2/localai.gen.go: prepare-sources echo "go mod download done, running YTT" - cp ./openai-openapi/transformed/localai.yaml ./openai-openapi/transformed/localai.orig.yaml - $(GOCMD) run github.com/vmware-tanzu/carvel-ytt/cmd/ytt --output-files ./openai-openapi/transformed -f ./openai-openapi/transformed/localai.yaml -f ./openai-openapi/localai_model_patches.yaml - # -f ./openai-openapi/remove_depreciated_openapi.yaml + # cp ./openai-openapi/transformed/localai.yaml ./openai-openapi/transformed/localai.orig.yaml + $(GOCMD) run github.com/vmware-tanzu/carvel-ytt/cmd/ytt --output-files ./openai-openapi/transformed -f ./openai-openapi/transformed/localai.yaml -f ./openai-openapi/localai_model_patches.yaml + cp ./openai-openapi/transformed/localai.yaml ./openai-openapi/transformed/localai.mid.yaml echo "YTT Done, generating code..." $(GOCMD) run github.com/deepmap/oapi-codegen/cmd/oapi-codegen --config=./openai-openapi/config.yaml ./openai-openapi/transformed/localai.yaml diff --git a/apiv2/config.go b/apiv2/config.go index e40957ec..844ceac5 100644 --- a/apiv2/config.go +++ b/apiv2/config.go @@ -2,14 +2,8 @@ package apiv2 import ( "fmt" - "os" - "path/filepath" - "strings" - "sync" llama "github.com/go-skynet/go-llama.cpp" - "github.com/mitchellh/mapstructure" - "gopkg.in/yaml.v2" ) type ConfigRegistration struct { @@ -38,6 +32,33 @@ type Config interface { GetRequestDefaults() interface{} GetLocalSettings() ConfigLocalSettings GetRegistration() ConfigRegistration + + // TODO: Test these. I am not sure. + ToPredictOptions() []llama.PredictOption + ToModelOptions() []llama.ModelOption + + // TODO also dubious? Technically some requests lack prompts, but it's pretty general and may just be worth sticking here. + GetPrompts() ([]Prompt, error) + GetN() (int, error) +} + +type Prompt interface { + AsString() string //, bool) + AsTokens() []int +} + +// How do Go people name these? Should I just ditch the interface entirely? 
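+// Convention-wise Go code usually keeps the interface name short and gives the concrete type a
+// descriptive name (stringPrompt / tokenPrompt) rather than an Impl suffix. The interface still
+// seems worth keeping, though: a completion prompt can arrive as text or as pre-tokenized ints,
+// and the engine wants to range over both uniformly. Illustrative values only:
+//
+//	PromptImpl{sVal: "Once upon a time"}    // text prompt
+//	PromptImpl{tVal: []int{9906, 11, 1917}} // token prompt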
+type PromptImpl struct { + sVal string + tVal []int +} + +func (p PromptImpl) AsString() string { + return p.sVal +} + +func (p PromptImpl) AsTokens() []int { + return p.tVal } func (cs ConfigStub) GetRequestDefaults() interface{} { @@ -52,6 +73,23 @@ func (cs ConfigStub) GetRegistration() ConfigRegistration { return cs.Registration } +func (cs ConfigStub) ToPredictOptions() []llama.PredictOption { + return []llama.PredictOption{} +} + +func (cs ConfigStub) ToModelOptions() []llama.ModelOption { + return []llama.ModelOption{} +} + +func (cs ConfigStub) GetPrompts() ([]Prompt, error) { + // Does this make sense? + return nil, fmt.Errorf("unsupported operation GetPrompts for %T", cs) +} + +func (cs ConfigStub) GetN() (int, error) { + return 0, fmt.Errorf("unsupported operation GetN for %T", cs) +} + func (sc SpecificConfig[RequestModel]) GetRequestDefaults() interface{} { return sc.RequestDefaults } @@ -68,133 +106,6 @@ func (sc SpecificConfig[RequestModel]) GetRegistration() ConfigRegistration { return sc.Registration } -type ConfigManager struct { - configs map[ConfigRegistration]Config - sync.Mutex -} - -func NewConfigManager() *ConfigManager { - return &ConfigManager{ - configs: make(map[ConfigRegistration]Config), - } -} - -// Private helper method doesn't enforce the mutex. This is because loading at the directory level keeps the lock up the whole time, and I like that. -func (cm *ConfigManager) loadConfigFile(path string) (*Config, error) { - fmt.Printf("INTERNAL loadConfigFile for %s\n", path) - stub := ConfigStub{} - f, err := os.ReadFile(path) - if err != nil { - return nil, fmt.Errorf("cannot read config file: %w", err) - } - if err := yaml.Unmarshal(f, &stub); err != nil { - return nil, fmt.Errorf("cannot unmarshal config file: %w", err) - } - fmt.Printf("RAW STUB: %+v\n", stub) - - endpoint := stub.Registration.Endpoint - - // EndpointConfigMap is generated over in localai.gen.go - // It's a map that translates a string endpoint function name to an empty SpecificConfig[T], with the type parameter for that request. 
- if structType, ok := EndpointConfigMap[endpoint]; ok { - fmt.Printf("~~ EndpointConfigMap[%s]: %+v\n", endpoint, structType) - tmpUnmarshal := map[string]interface{}{} - if err := yaml.Unmarshal(f, &tmpUnmarshal); err != nil { - if e, ok := err.(*yaml.TypeError); ok { - fmt.Println("\n!!!!!Type error:", e) - } - return nil, fmt.Errorf("cannot unmarshal config file for %s: %w", endpoint, err) - } - fmt.Printf("$$$ tmpUnmarshal: %+v\n", tmpUnmarshal) - mapstructure.Decode(tmpUnmarshal, &structType) - - fmt.Printf("AFTER UNMARSHAL %T\n%+v\n=======\n", structType, structType) - - // rawConfig.RequestDefaults = structType.GetRequestDefaults() - - cm.configs[structType.GetRegistration()] = structType - // fmt.Printf("\n\n\n!!!!!HIT BOTTOM!!!!!!") - return &structType, nil - // fmt.Printf("\n\n\n!!!!!\n\n\nBIG MISS!\n\n%+v\n\n%T\n%T=====", specificStruct, specificStruct, structType) - } - - // for i, ts := range EndpointToRequestBodyMap { - // fmt.Printf("%s: %+v\n", i, ts) - // } - - return nil, fmt.Errorf("failed to parse config for endpoint %s", endpoint) -} - -func (cm *ConfigManager) LoadConfigFile(path string) (*Config, error) { - fmt.Printf("LoadConfigFile TOP for %s", path) - - cm.Lock() - fmt.Println("cm.Lock done") - - defer cm.Unlock() - fmt.Println("cm.Unlock done") - - return cm.loadConfigFile(path) -} - -func (cm *ConfigManager) LoadConfigDirectory(path string) ([]ConfigRegistration, error) { - fmt.Printf("LoadConfigDirectory TOP for %s\n", path) - cm.Lock() - defer cm.Unlock() - files, err := os.ReadDir(path) - if err != nil { - return []ConfigRegistration{}, err - } - fmt.Printf("os.ReadDir done, found %d files\n", len(files)) - - for _, file := range files { - // Skip anything that isn't yaml - if !strings.Contains(file.Name(), ".yaml") { - continue - } - _, err := cm.loadConfigFile(filepath.Join(path, file.Name())) - if err != nil { - return []ConfigRegistration{}, err - } - } - - fmt.Printf("LoadConfigDirectory DONE %d", len(cm.configs)) - - return cm.listConfigs(), nil -} - -func (cm *ConfigManager) GetConfig(r ConfigRegistration) (Config, bool) { - cm.Lock() - defer cm.Unlock() - v, exists := cm.configs[r] - return v, exists -} - -// This is a convience function for endpoint functions to use. -// The advantage is it avoids errors in the endpoint string -// Not a clue what the performance cost of this is. -func (cm *ConfigManager) GetConfigForThisEndpoint(m string) (Config, bool) { - endpoint := printCurrentFunctionName(2) - return cm.GetConfig(ConfigRegistration{ - Model: m, - Endpoint: endpoint, - }) -} - -func (cm *ConfigManager) listConfigs() []ConfigRegistration { - var res []ConfigRegistration - for k := range cm.configs { - res = append(res, k) - } - return res -} - -func (cm *ConfigManager) ListConfigs() []ConfigRegistration { - cm.Lock() - defer cm.Unlock() - return cm.listConfigs() -} - // These functions I'm a bit dubious about. I think there's a better refactoring down in pkg/model // But to get a minimal test up and running, here we go! // TODO: non text completion @@ -328,3 +239,78 @@ func (sc SpecificConfig[RequestModel]) ToPredictOptions() []llama.PredictOption return llamaOpts } + +// It's unclear if this code belongs here or somewhere else, but I'm jamming it here for now. 
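+// The generated CreateCompletionRequestPrompt is a union, so the AsCreateCompletionRequestPrompt0..3
+// accessors below are assumed to decode to string, []string, []int and [][]int respectively; whichever
+// variant parses is flattened into a []Prompt so callers can just range over it. Rough expected usage
+// from an endpoint handler (hypothetical, nothing calls it this way yet):
+//
+//	prompts, err := cfg.GetPrompts()
+//	if err != nil {
+//		return nil, err
+//	}
+//	for _, p := range prompts {
+//		_ = p.AsString() // AsTokens() isn't consumed anywhere yet
+//	}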
+func (sc SpecificConfig[RequestModel]) GetPrompts() ([]Prompt, error) { + prompts := []Prompt{} + + switch req := sc.GetRequestDefaults().(type) { + case CreateCompletionRequest: + p0, err := req.Prompt.AsCreateCompletionRequestPrompt0() + if err == nil { + p := PromptImpl{sVal: p0} + return []Prompt{p}, nil + } + p1, err := req.Prompt.AsCreateCompletionRequestPrompt1() + if err == nil { + for _, m := range p1 { + prompts = append(prompts, PromptImpl{sVal: m}) + } + return prompts, nil + } + p2, err := req.Prompt.AsCreateCompletionRequestPrompt2() + if err == nil { + p := PromptImpl{tVal: p2} + return []Prompt{p}, nil + } + p3, err := req.Prompt.AsCreateCompletionRequestPrompt3() + if err == nil { + for _, t := range p3 { + prompts = append(prompts, PromptImpl{tVal: t}) + } + return prompts, nil + } + case CreateChatCompletionRequest: + + for _, message := range req.Messages { + + prompts = append(prompts, PromptImpl{sVal: message.Content}) + + // TODO Deal with ROLES + // var content string + // r := req.Roles[message.Role] + // if r != "" { + // content = fmt.Sprint(r, " ", message.Content) + // } else { + // content = message.Content + // } + + // if content != "" { + // prompt = prompt + content + // } + + } + return prompts, nil + } + + return nil, fmt.Errorf("string prompt not found for %T", sc.GetRequestDefaults()) +} + +func (sc SpecificConfig[RequestModel]) GetN() (int, error) { + switch req := sc.GetRequestDefaults().(type) { + + case CreateChatCompletionRequest: + case CreateCompletionRequest: + case CreateEditRequest: + case CreateImageRequest: + // TODO I AM SORRY FOR THIS DIRTY HACK. + // YTT is currently mangling the n property and renaming it to false. + // This needs to be fixed before merging. However for testing..... + return *req.False, nil + } + + return 0, fmt.Errorf("unsupported operation GetN for %T", sc) +} + +// TODO: Not even using this, but illustration of difficulty: should this be integrated to make GetPrompts(), returning an interface of {Tokens []int, String string} +// func (sc SpecificConfig[RequestModel]) GetTokenPrompts() ([]int, error) {} diff --git a/apiv2/config_manager.go b/apiv2/config_manager.go new file mode 100644 index 00000000..f1629dc2 --- /dev/null +++ b/apiv2/config_manager.go @@ -0,0 +1,139 @@ +package apiv2 + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "sync" + + "github.com/mitchellh/mapstructure" + "gopkg.in/yaml.v2" +) + +type ConfigManager struct { + configs map[ConfigRegistration]Config + sync.Mutex +} + +func NewConfigManager() *ConfigManager { + return &ConfigManager{ + configs: make(map[ConfigRegistration]Config), + } +} + +// Private helper method doesn't enforce the mutex. This is because loading at the directory level keeps the lock up the whole time, and I like that. +func (cm *ConfigManager) loadConfigFile(path string) (*Config, error) { + fmt.Printf("INTERNAL loadConfigFile for %s\n", path) + stub := ConfigStub{} + f, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("cannot read config file: %w", err) + } + if err := yaml.Unmarshal(f, &stub); err != nil { + return nil, fmt.Errorf("cannot unmarshal config file: %w", err) + } + fmt.Printf("RAW STUB: %+v\n", stub) + + endpoint := stub.Registration.Endpoint + + // EndpointConfigMap is generated over in localai.gen.go + // It's a map that translates a string endpoint function name to an empty SpecificConfig[T], with the type parameter for that request. 
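+	// An entry is assumed to look roughly like "CreateChatCompletion": SpecificConfig[CreateChatCompletionRequest]{}.
+	// The file is deliberately decoded twice: once into the ConfigStub above just to learn the endpoint,
+	// then into a generic map that mapstructure copies into the typed SpecificConfig so the
+	// request_defaults fields keep their concrete types.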
+ if structType, ok := EndpointConfigMap[endpoint]; ok { + fmt.Printf("~~ EndpointConfigMap[%s]: %+v\n", endpoint, structType) + tmpUnmarshal := map[string]interface{}{} + if err := yaml.Unmarshal(f, &tmpUnmarshal); err != nil { + if e, ok := err.(*yaml.TypeError); ok { + fmt.Println("\n!!!!!Type error:", e) + } + return nil, fmt.Errorf("cannot unmarshal config file for %s: %w", endpoint, err) + } + fmt.Printf("$$$ tmpUnmarshal: %+v\n", tmpUnmarshal) + mapstructure.Decode(tmpUnmarshal, &structType) + + fmt.Printf("AFTER UNMARSHAL %T\n%+v\n=======\n", structType, structType) + + // rawConfig.RequestDefaults = structType.GetRequestDefaults() + + cm.configs[structType.GetRegistration()] = structType + // fmt.Printf("\n\n\n!!!!!HIT BOTTOM!!!!!!") + return &structType, nil + // fmt.Printf("\n\n\n!!!!!\n\n\nBIG MISS!\n\n%+v\n\n%T\n%T=====", specificStruct, specificStruct, structType) + } + + // for i, ts := range EndpointToRequestBodyMap { + // fmt.Printf("%s: %+v\n", i, ts) + // } + + return nil, fmt.Errorf("failed to parse config for endpoint %s", endpoint) +} + +func (cm *ConfigManager) LoadConfigFile(path string) (*Config, error) { + fmt.Printf("LoadConfigFile TOP for %s", path) + + cm.Lock() + fmt.Println("cm.Lock done") + + defer cm.Unlock() + fmt.Println("cm.Unlock done") + + return cm.loadConfigFile(path) +} + +func (cm *ConfigManager) LoadConfigDirectory(path string) ([]ConfigRegistration, error) { + fmt.Printf("LoadConfigDirectory TOP for %s\n", path) + cm.Lock() + defer cm.Unlock() + files, err := os.ReadDir(path) + if err != nil { + return []ConfigRegistration{}, err + } + fmt.Printf("os.ReadDir done, found %d files\n", len(files)) + + for _, file := range files { + // Skip anything that isn't yaml + if !strings.Contains(file.Name(), ".yaml") { + continue + } + _, err := cm.loadConfigFile(filepath.Join(path, file.Name())) + if err != nil { + return []ConfigRegistration{}, err + } + } + + fmt.Printf("LoadConfigDirectory DONE %d", len(cm.configs)) + + return cm.listConfigs(), nil +} + +func (cm *ConfigManager) GetConfig(r ConfigRegistration) (Config, bool) { + cm.Lock() + defer cm.Unlock() + v, exists := cm.configs[r] + return v, exists +} + +// This is a convience function for endpoint functions to use. +// The advantage is it avoids errors in the endpoint string +// Not a clue what the performance cost of this is. 
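+// printCurrentFunctionName(2) presumably walks runtime.Caller up to the calling frame, so the lookup
+// key becomes the endpoint method's own name (e.g. "CreateChatCompletion") with no hand-typed strings.
+// A single caller lookup should be negligible next to inference. Hypothetical call site in a handler:
+//
+//	cfg, ok := las.configManager.GetConfigForThisEndpoint(request.Model)
+//	if !ok {
+//		return nil, fmt.Errorf("no config registered for model %q", request.Model)
+//	}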
+func (cm *ConfigManager) GetConfigForThisEndpoint(m string) (Config, bool) { + endpoint := printCurrentFunctionName(2) + return cm.GetConfig(ConfigRegistration{ + Model: m, + Endpoint: endpoint, + }) +} + +func (cm *ConfigManager) listConfigs() []ConfigRegistration { + var res []ConfigRegistration + for k := range cm.configs { + res = append(res, k) + } + return res +} + +func (cm *ConfigManager) ListConfigs() []ConfigRegistration { + cm.Lock() + defer cm.Unlock() + return cm.listConfigs() +} diff --git a/apiv2/engine.go b/apiv2/engine.go new file mode 100644 index 00000000..326004df --- /dev/null +++ b/apiv2/engine.go @@ -0,0 +1,205 @@ +package apiv2 + +import ( + "fmt" + "regexp" + "strings" + "sync" + + model "github.com/go-skynet/LocalAI/pkg/model" + transformers "github.com/go-skynet/go-ggml-transformers.cpp" + llama "github.com/go-skynet/go-llama.cpp" + "github.com/mitchellh/mapstructure" + gpt4all "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang" +) + +type LocalAIEngine struct { + loader *model.ModelLoader + mutexMapMutex sync.Mutex + mutexes map[ConfigRegistration]*sync.Mutex + cutstrings map[ConfigRegistration]map[string]*regexp.Regexp + cutstringMutex sync.Mutex +} + +func NewLocalAIEngine(loader *model.ModelLoader) LocalAIEngine { + return LocalAIEngine{ + loader: loader, + mutexes: make(map[ConfigRegistration]*sync.Mutex), + cutstrings: make(map[ConfigRegistration]map[string]*regexp.Regexp), + } +} + +// TODO model interface? Currently scheduled for phase 3 lol +func (e *LocalAIEngine) LoadModel(config Config) (interface{}, error) { + ls := config.GetLocalSettings() + fmt.Printf("LocalAIEngine.LoadModel => %+v\n\n", config) + return e.loader.BackendLoader(ls.Backend, ls.ModelPath, config.ToModelOptions(), uint32(ls.Threads)) +} + +func (e *LocalAIEngine) GetModelPredictionFunction(config Config, tokenCallback func(string) bool) (func() ([]string, error), error) { + + fmt.Printf("LocalAIEngine.GetModelPredictionFunction => %+v\n\n", config) + + supportStreams := false + var predictOnce func(p Prompt) (string, error) = nil + + inferenceModel, err := e.LoadModel(config) + if err != nil { + fmt.Printf("ERROR LOADING MODEL: %s\n", err.Error()) + return nil, err + } + + prompts, err := config.GetPrompts() + if err != nil { + fmt.Printf("ERROR GetPrompts: %s\n", err.Error()) + return nil, err + } + + switch localModel := inferenceModel.(type) { + case *llama.LLama: + fmt.Println("setting predictOnce for llama") + supportStreams = true + predictOnce = func(p Prompt) (string, error) { + + if tokenCallback != nil { + localModel.SetTokenCallback(tokenCallback) + } + + // TODO: AsTokens? I think that would need to be exposed from llama and the others. 
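+			// Every prompt is reduced to its string form for now; feeding AsTokens() straight through would
+			// need a Predict variant in the binding that takes []int, which doesn't appear to be exposed yet,
+			// so a token-only prompt currently predicts on an empty string.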
+ str, er := localModel.Predict( + p.AsString(), + config.ToPredictOptions()..., + ) + // Seems that if we don't free the callback explicitly we leave functions registered (that might try to send on closed channels) + // For instance otherwise the API returns: {"error":{"code":500,"message":"send on closed channel","type":""}} + // after a stream event has occurred + localModel.SetTokenCallback(nil) + return str, er + } + case *gpt4all.Model: + fmt.Println("setting predictOnce for gpt4all") + supportStreams = true + predictOnce = func(p Prompt) (string, error) { + if tokenCallback != nil { + localModel.SetTokenCallback(tokenCallback) + } + + mappedPredictOptions := gpt4all.PredictOptions{} + + mapstructure.Decode(config.ToPredictOptions(), &mappedPredictOptions) + + str, err := localModel.PredictTEMP( + p.AsString(), + mappedPredictOptions, + ) + // Seems that if we don't free the callback explicitly we leave functions registered (that might try to send on closed channels) + // For instance otherwise the API returns: {"error":{"code":500,"message":"send on closed channel","type":""}} + // after a stream event has occurred + localModel.SetTokenCallback(nil) + return str, err + } + case *transformers.GPTJ: + fmt.Println("setting predictOnce for GPTJ") + supportStreams = false // EXP + predictOnce = func(p Prompt) (string, error) { + mappedPredictOptions := transformers.PredictOptions{} + + mapstructure.Decode(config.ToPredictOptions(), &mappedPredictOptions) + + fmt.Printf("MAPPED OPTIONS: %+v\n", mappedPredictOptions) + + str, err := localModel.PredictTEMP( + p.AsString(), + mappedPredictOptions, + ) + return str, err + } + } + + if predictOnce == nil { + fmt.Printf("Failed to find a predictOnce for %T", inferenceModel) + return nil, fmt.Errorf("failed to find a predictOnce for %T", inferenceModel) + } + + req := config.GetRequestDefaults() + + return func() ([]string, error) { + // This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784 + e.mutexMapMutex.Lock() + r := config.GetRegistration() + l, ok := e.mutexes[r] + if !ok { + m := &sync.Mutex{} + e.mutexes[r] = m + l = m + } + e.mutexMapMutex.Unlock() + l.Lock() + defer l.Unlock() + + results := []string{} + + n, err := config.GetN() + + if err != nil { + // TODO live to regret this, but for now... + n = 1 + } + + for p_i, prompt := range prompts { + for n_i := 0; n_i < n; n_i++ { + res, err := predictOnce(prompt) + + // TODO: this used to be a part of finetune. For.... questionable parameter reasons I've moved it up here. Revisit this if it's smelly in the future. 
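+				// TODO: two more things to revisit here: Echo is a pointer in the generated types, so the
+				// dereference below panics if a completion request leaves echo unset, and the err from
+				// predictOnce only gets checked after res has already been echoed and finetuned.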
+ ccr, is_ccr := req.(CreateCompletionRequest) + if is_ccr { + if *ccr.Echo { + res = prompt.AsString() + res + } + } + + res = e.Finetune(config, res) + + if err != nil { + fmt.Printf("ERROR DURING GetModelPredictionFunction -> PredictionFunction for %T with p_i: %d/n_i: %d\n%s", config, p_i, n_i, err.Error()) + return nil, err + } + if tokenCallback != nil && !supportStreams { + tokenCallback(res) + } + results = append(results, res) + } + } + + return results, nil + + }, nil +} + +func (e *LocalAIEngine) Finetune(config Config, prediction string) string { + + reg := config.GetRegistration() + switch req := config.GetRequestDefaults().(type) { + case *CreateChatCompletionRequest: + case *CreateCompletionRequest: + ext := req.XLocalaiExtensions + if ext != nil { + for _, c := range *ext.Cutstrings { + e.cutstringMutex.Lock() + regex, ok := e.cutstrings[reg][c] + if !ok { + e.cutstrings[reg][c] = regexp.MustCompile(c) + regex = e.cutstrings[reg][c] + } + e.cutstringMutex.Unlock() + prediction = regex.ReplaceAllString(prediction, "") + } + + for _, c := range *ext.Trimstrings { + prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c)) + } + } + } + + return prediction +} diff --git a/apiv2/localai.go b/apiv2/localai.go index 29c9131e..395f401c 100644 --- a/apiv2/localai.go +++ b/apiv2/localai.go @@ -12,6 +12,7 @@ import ( type LocalAIServer struct { configManager *ConfigManager loader *model.ModelLoader + engine *LocalAIEngine } func combineRequestAndConfig[RequestType any](configManager *ConfigManager, model string, requestFromInput *RequestType) (*SpecificConfig[RequestType], error) { @@ -93,7 +94,33 @@ func (las *LocalAIServer) CreateChatCompletion(ctx context.Context, request Crea fmt.Printf("message #%d: %+v", i, m) } - return CreateChatCompletion200JSONResponse{}, nil + fmt.Println("Dodgy Stuff Below") + + predict, err := las.engine.GetModelPredictionFunction(chatRequestConfig, nil) + if err != nil { + fmt.Printf("!!!!!!!!!! Error obtaining predict fn %s\n", err.Error()) + return nil, err + } + + predictions, err := predict() + if err != nil { + fmt.Printf("!!!!!!!!!! 
Error INSIDE predict fn %s\n", err.Error()) + return nil, err + } + + resp := CreateChatCompletion200JSONResponse{} + + for i, prediction := range predictions { + resp.Choices = append(resp.Choices, CreateChatCompletionResponseChoice{ + Message: &ChatCompletionResponseMessage{ + Content: prediction, + Role: "asssistant", // TODO FIX + }, + Index: &i, + }) + } + + return resp, nil // panic("unimplemented") } diff --git a/apiv2/localai_nethttp.go b/apiv2/localai_nethttp.go index ca571f50..1eae95cf 100644 --- a/apiv2/localai_nethttp.go +++ b/apiv2/localai_nethttp.go @@ -7,9 +7,11 @@ import ( ) func NewLocalAINetHTTPServer(configManager *ConfigManager, loader *model.ModelLoader, address string) *LocalAIServer { + engine := NewLocalAIEngine(loader) localAI := LocalAIServer{ configManager: configManager, loader: loader, + engine: &engine, } var middlewares []StrictMiddlewareFunc diff --git a/config/gpt-3.5-turbo-chat.yaml b/config/gpt-3.5-turbo-chat.yaml index ee71495d..b92f3c7d 100644 --- a/config/gpt-3.5-turbo-chat.yaml +++ b/config/gpt-3.5-turbo-chat.yaml @@ -4,6 +4,8 @@ registration: local_paths: model: ggml-gpt4all-j template: chat-gpt4all + backend: gptj + threads: 4 request_defaults: top_p: 0.7 temperature: 0.2 diff --git a/config/gpt-3.5-turbo-completion.yaml b/config/gpt-3.5-turbo-completion.yaml index f890d09e..21d01dc0 100644 --- a/config/gpt-3.5-turbo-completion.yaml +++ b/config/gpt-3.5-turbo-completion.yaml @@ -4,6 +4,8 @@ registration: local_paths: model: ggml-gpt4all-j template: chat-gpt4all + backend: gptj + threads: 4 request_defaults: top_p: 0.7 temperature: 0.2 diff --git a/go.mod b/go.mod index 501611db..00cbb854 100644 --- a/go.mod +++ b/go.mod @@ -20,7 +20,7 @@ require ( github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230605194130-266f13aee9d8 github.com/onsi/ginkgo/v2 v2.9.7 - github.com/onsi/gomega v1.27.7 + github.com/onsi/gomega v1.27.8 github.com/otiai10/openaigo v1.1.0 github.com/rs/zerolog v1.29.1 github.com/sashabaranov/go-openai v1.10.0 @@ -28,7 +28,7 @@ require ( github.com/tmc/langchaingo v0.0.0-20230605114752-4afed6d7be4a github.com/urfave/cli/v2 v2.25.5 github.com/valyala/fasthttp v1.47.0 - github.com/vmware-tanzu/carvel-ytt v0.45.1 + github.com/vmware-tanzu/carvel-ytt v0.45.2 gopkg.in/yaml.v2 v2.4.0 gopkg.in/yaml.v3 v3.0.1 ) diff --git a/go.sum b/go.sum index 40b8f8a6..32084b0d 100644 --- a/go.sum +++ b/go.sum @@ -38,12 +38,12 @@ github.com/donomii/go-rwkv.cpp v0.0.0-20230531084548-c43cdf5fc5bf h1:upCz8WYdzMe github.com/donomii/go-rwkv.cpp v0.0.0-20230531084548-c43cdf5fc5bf/go.mod h1:gWy7FIWioqYmYxkaoFyBnaKApeZVrUkHhv9EV9pz4dM= github.com/donomii/go-rwkv.cpp v0.0.0-20230601111443-3b28b09469fc h1:RCGGh/zw+K09sjCIYHUV7lFenxONml+LS02RdN+AkwI= github.com/donomii/go-rwkv.cpp v0.0.0-20230601111443-3b28b09469fc/go.mod h1:gWy7FIWioqYmYxkaoFyBnaKApeZVrUkHhv9EV9pz4dM= +github.com/donomii/go-rwkv.cpp v0.0.0-20230604202420-1e18b2490e7e h1:Qne1BO0ltmyJcsizxZ61SV+uwuD1F8NztsfBDHOd0LI= +github.com/donomii/go-rwkv.cpp v0.0.0-20230604202420-1e18b2490e7e/go.mod h1:gWy7FIWioqYmYxkaoFyBnaKApeZVrUkHhv9EV9pz4dM= github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU= github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA= github.com/getkin/kin-openapi v0.117.0 h1:QT2DyGujAL09F4NrKDHJGsUoIprlIcFVHWDVDcUFE8A= github.com/getkin/kin-openapi v0.117.0/go.mod h1:l5e9PaFUo9fyLJCPGQeXI2ML8c3P8BHOEV2VaAVf/pc= 
-github.com/donomii/go-rwkv.cpp v0.0.0-20230604202420-1e18b2490e7e h1:Qne1BO0ltmyJcsizxZ61SV+uwuD1F8NztsfBDHOd0LI= -github.com/donomii/go-rwkv.cpp v0.0.0-20230604202420-1e18b2490e7e/go.mod h1:gWy7FIWioqYmYxkaoFyBnaKApeZVrUkHhv9EV9pz4dM= github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230520182345-041be06d5881 h1:dafqVivljYk51VLFnnpTXJnfWDe637EobWZ1l8PyEf8= github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230520182345-041be06d5881/go.mod h1:QIjZ9OktHFG7p+/m3sMvrAJKKdWrr1fZIK0rM6HZlyo= github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230523110439-77eab3fbfe5e h1:4PMorQuoUGAXmIzCtnNOHaasyLokXdgd8jUWwsraFTo= @@ -60,14 +60,14 @@ github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230601065548-3f7436e8a096 github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230601065548-3f7436e8a096/go.mod h1:QIjZ9OktHFG7p+/m3sMvrAJKKdWrr1fZIK0rM6HZlyo= github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230601124500-5b9e59bc07dd h1:os3FeYEIB4j5m5QlbFC3HkVcaAmLxNXz48uIfQAexm0= github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230601124500-5b9e59bc07dd/go.mod h1:QIjZ9OktHFG7p+/m3sMvrAJKKdWrr1fZIK0rM6HZlyo= +github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230606002726-57543c169e27 h1:boeMTUUBtnLU8JElZJHXrsUzROJar9/t6vGOFjkrhhI= +github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230606002726-57543c169e27/go.mod h1:QIjZ9OktHFG7p+/m3sMvrAJKKdWrr1fZIK0rM6HZlyo= github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE= github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= github.com/gin-gonic/gin v1.9.0 h1:OjyFBKICoexlu99ctXNR2gg+c5pKrKMuyjgARg9qeY8= github.com/gin-gonic/gin v1.9.0/go.mod h1:W1Me9+hsUSyj3CePGrd1/QrKJMSJ1Tu/0hFEH89961k= github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg= github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU= -github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230606002726-57543c169e27 h1:boeMTUUBtnLU8JElZJHXrsUzROJar9/t6vGOFjkrhhI= -github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230606002726-57543c169e27/go.mod h1:QIjZ9OktHFG7p+/m3sMvrAJKKdWrr1fZIK0rM6HZlyo= github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4= github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs= github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA= @@ -280,6 +280,7 @@ github.com/onsi/ginkgo/v2 v2.9.7 h1:06xGQy5www2oN160RtEZoTvnP2sPhEfePYmCDc2szss= github.com/onsi/ginkgo/v2 v2.9.7/go.mod h1:cxrmXWykAwTwhQsJOPfdIDiJ+l2RYq7U8hFU+M/1uw0= github.com/onsi/gomega v1.27.7 h1:fVih9JD6ogIiHUN6ePK7HJidyEDpWGVB5mzM7cWNXoU= github.com/onsi/gomega v1.27.7/go.mod h1:1p8OOlwo2iUUDsHnOrjE5UKYJ+e3W8eQ3qSlRahPmr4= +github.com/onsi/gomega v1.27.8/go.mod h1:2J8vzI/s+2shY9XHRApDkdgPo1TKT7P2u6fXeJKFnNQ= github.com/otiai10/mint v1.5.1 h1:XaPLeE+9vGbuyEHem1JNk3bYc7KKqyI/na0/mLd/Kks= github.com/otiai10/mint v1.5.1/go.mod h1:MJm72SBthJjz8qhefc4z1PYEieWmy8Bku7CjcAqyUSM= github.com/otiai10/openaigo v1.1.0 h1:zRvGBqZUW5PCMgdkJNsPVTBd8tOLCMTipXE5wD2pdTg= @@ -330,6 +331,7 @@ github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY= github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= 
github.com/swaggo/swag v1.16.1/go.mod h1:9/LMvHycG3NFHfR6LwvikHv5iFvmPADQ359cKikGxto= github.com/tinylib/msgp v1.1.6/go.mod h1:75BAfg2hauQhs3qedfdDZmWAPcFMAvJE5b9rGOMufyw= github.com/tinylib/msgp v1.1.8 h1:FCXC1xanKO4I8plpHGH2P7koL/RzZs12l/+r7vakfm0= @@ -359,6 +361,8 @@ github.com/valyala/tcplisten v1.0.0 h1:rBHj/Xf+E1tRGZyWIWwJDiRY0zc1Js+CV5DqwacVS github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc= github.com/vmware-tanzu/carvel-ytt v0.45.1 h1:zjiOnV7WiKJbkLHkJCRxlmABOMIL4WhKKyahfCzFoIk= github.com/vmware-tanzu/carvel-ytt v0.45.1/go.mod h1:+r+ZVZLsETAYlRsgINztFdUdUufj2OwrTXCfOzYB4fY= +github.com/vmware-tanzu/carvel-ytt v0.45.2 h1:0+aECp3BcMXkD8L/VXVn4KOh1jD3COAt3DOXZ76iZu0= +github.com/vmware-tanzu/carvel-ytt v0.45.2/go.mod h1:oHqFBnn/JvqaUjcQo9T/a/WPUP1ituKjUpFPH+BTzfc= github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRTfdpNzjtPYqr8smhKouy9mxVdGPU= github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= diff --git a/openai-openapi/chi-interface.tmpl b/openai-openapi/chi-interface.tmpl deleted file mode 100644 index 5fa41189..00000000 --- a/openai-openapi/chi-interface.tmpl +++ /dev/null @@ -1,19 +0,0 @@ -// NOT ACTUALLY USING THIS CURRENTLY??? - -// ServerInterface represents all server handlers. -type ServerInterface interface { -{{range .}}{{.SummaryAsComment }} -// ({{.Method}} {{.Path}}) -{{.OperationId}}(w http.ResponseWriter, r *http.Request{{genParamArgs .PathParams}}{{if .RequiresParamObject}}, params {{.OperationId}}Params{{end}}) -{{end}} -} - -// TypedServerInterface is used to give each endpoint a fully typed method signature for cases where we're able to route automatically -type TypedServerInterface interface { -{{range .}}{{.SummaryAsComment }} -// ({{.Method}} {{.Path}}) -{{$reqBody := genDefaultRequestBodyType . -}} -{{- if ne $reqBody "" }}{{$reqBody = printf ", body %s" $reqBody}}{{end -}} -{{.OperationId}}(w http.ResponseWriter{{genParamArgs .PathParams}}{{if .RequiresParamObject}}, params {{.OperationId}}Params{{end}}{{$reqBody}}) -{{end}} -} \ No newline at end of file diff --git a/openai-openapi/config.yaml b/openai-openapi/config.yaml index 2547f33a..07be6edf 100644 --- a/openai-openapi/config.yaml +++ b/openai-openapi/config.yaml @@ -11,7 +11,6 @@ output-options: - mapstructure user-templates: endpoint-body-mapping.tmpl: ./openai-openapi/endpoint-body-mapping.tmpl -# chi/chi-interface.tmpl: ./openai-openapi/chi-interface.tmpl # union.tmpl: "// SKIP" # union-and-additional-properties.tmpl: "// SKIP" # additional-properties.tmpl: "// SKIP" \ No newline at end of file diff --git a/openai-openapi/localai_model_patches.yaml b/openai-openapi/localai_model_patches.yaml index 6d8771b2..abe2f8ae 100644 --- a/openai-openapi/localai_model_patches.yaml +++ b/openai-openapi/localai_model_patches.yaml @@ -54,6 +54,14 @@ components: type: number nullable: true x-go-type: float64 + cutstrings: + type: array + items: + type: string + trimstrings: + type: array + items: + type: string #@overlay/match missing_ok=True LocalAIImageRequestExtension: allOf: diff --git a/openai-openapi/test_segment.yml b/openai-openapi/test_segment.yml deleted file mode 100644 index 20d98a2f..00000000 --- a/openai-openapi/test_segment.yml +++ /dev/null @@ -1,30 +0,0 @@ -#! This file is just for my reference during development and will be removed. 
-components: - schemas: - CreateChatCompletionRequest: - type: object - properties: - model: - description: ID of the model to use. Currently, only `gpt-3.5-turbo` and `gpt-3.5-turbo-0301` are supported. - type: string - messages: - description: The messages to generate chat completions for, in the [chat format](/docs/guides/chat/introduction). - type: array - minItems: 1 - items: - $ref: '#/components/schemas/ChatCompletionRequestMessage' - temperature: - type: number - minimum: 0 - maximum: 2 - default: 1 - example: 1 - nullable: true - description: *completions_temperature_description - CreateImageRequest: - type: object - properties: - prompt: - description: A text description of the desired image(s). The maximum length is 1000 characters. - type: string - example: "A cute baby sea otter" \ No newline at end of file