Reorganize repository layout

2025-05-28 06:25:00 +00:00 · 2023-04-11 23:43:43 +02:00 · 2023-04-11 23:43:43 +02:00 · ae30bd346d
commit ae30bd346d
parent 93d8977ba2
4 changed files with 357 additions and 1 deletions
--- a/api/api.go
+++ b/api/api.go
@ -0,0 +1,353 @@
+package main
+
+import (
+	"embed"
+	"fmt"
+	"net/http"
+	"strconv"
+	"strings"
+	"sync"
+
+	model "github.com/go-skynet/llama-cli/pkg/model"
+
+	llama "github.com/go-skynet/go-llama.cpp"
+	"github.com/gofiber/fiber/v2"
+	"github.com/gofiber/fiber/v2/middleware/cors"
+	"github.com/gofiber/fiber/v2/middleware/filesystem"
+	"github.com/gofiber/fiber/v2/middleware/recover"
+)
+
+// OpenAIResponse is the JSON envelope returned by the OpenAI-compatible
+// completion endpoints. Every field is omitted from the output while it holds
+// its zero value.
+//
+// Object is serialized under the "object" key; "chat.completion" is an example
+// of a value for that field, not the name of the field itself.
+type OpenAIResponse struct {
+	Created int      `json:"created,omitempty"`
+	Object  string   `json:"object,omitempty"`
+	ID      string   `json:"id,omitempty"`
+	Model   string   `json:"model,omitempty"`
+	Choices []Choice `json:"choices,omitempty"`
+}
+
+// Choice is one generated alternative inside an OpenAIResponse.
+//
+// In this file the completion endpoint fills Text and the chat endpoint fills
+// Message. NOTE(review): encoding/json's omitempty never omits a struct value,
+// so "message" is emitted even when Message is empty.
+type Choice struct {
+	Index        int     `json:"index,omitempty"`
+	FinishReason string  `json:"finish_reason,omitempty"`
+	Message      Message `json:"message,omitempty"`
+	Text         string  `json:"text,omitempty"`
+}
+
+// Message is a single chat message: the role of its author and its text.
+// It is used both for the messages decoded from a chat request and for the
+// reply placed in a Choice.
+type Message struct {
+	Role    string `json:"role,omitempty"`
+	Content string `json:"content,omitempty"`
+}
+
+// OpenAIModel is one entry of the model list served by GET /v1/models.
+// Both fields are always serialized (no omitempty).
+type OpenAIModel struct {
+	ID     string `json:"id"`
+	Object string `json:"object"`
+}
+
+// indexHTML holds index.html, embedded into the binary at build time. Start
+// serves it through the filesystem middleware.
+//
+//go:embed index.html
+var indexHTML embed.FS
+
+func completionEndpoint(defaultModel *llama.LLama, loader *model.ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+
+		var err error
+		var model *llama.LLama
+
+		// Get input data from the request body
+		input := new(struct {
+			Model  string `json:"model"`
+			Prompt string `json:"prompt"`
+		})
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+
+		if input.Model == "" {
+			if defaultModel == nil {
+				return fmt.Errorf("no default model loaded, and no model specified")
+			}
+			model = defaultModel
+		} else {
+			model, err = loader.LoadModel(input.Model)
+			if err != nil {
+				return err
+			}
+		}
+
+		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
+		if input.Model != "" {
+			mutexMap.Lock()
+			l, ok := mutexes[input.Model]
+			if !ok {
+				m := &sync.Mutex{}
+				mutexes[input.Model] = m
+				l = m
+			}
+			mutexMap.Unlock()
+			l.Lock()
+			defer l.Unlock()
+		} else {
+			defaultMutex.Lock()
+			defer defaultMutex.Unlock()
+		}
+
+		// Set the parameters for the language model prediction
+		topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
+		if err != nil {
+			return err
+		}
+
+		topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
+		if err != nil {
+			return err
+		}
+
+		temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
+		if err != nil {
+			return err
+		}
+
+		tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
+		if err != nil {
+			return err
+		}
+
+		predInput := input.Prompt
+		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+		templatedInput, err := loader.TemplatePrefix(input.Model, struct {
+			Input string
+		}{Input: input.Prompt})
+		if err == nil {
+			predInput = templatedInput
+		}
+
+		// Generate the prediction using the language model
+		prediction, err := model.Predict(
+			predInput,
+			llama.SetTemperature(temperature),
+			llama.SetTopP(topP),
+			llama.SetTopK(topK),
+			llama.SetTokens(tokens),
+			llama.SetThreads(threads),
+		)
+		if err != nil {
+			return err
+		}
+
+		// Return the prediction in the response body
+		return c.JSON(OpenAIResponse{
+			Model:   input.Model,
+			Choices: []Choice{{Text: prediction}},
+		})
+	}
+}
+
+func chatEndpoint(defaultModel *llama.LLama, loader *model.ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		var err error
+		var model *llama.LLama
+
+		// Get input data from the request body
+		input := new(struct {
+			Messages []Message `json:"messages"`
+			Model    string    `json:"model"`
+		})
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+
+		// TODO: drop me!
+		if input.Model == "gpt-3.5-turbo" {
+			input.Model = "ggml-koala-7b-model-q4_0-r2"
+		}
+
+		if input.Model == "" {
+			if defaultModel == nil {
+				return fmt.Errorf("no default model loaded, and no model specified")
+			}
+			model = defaultModel
+		} else {
+			model, err = loader.LoadModel(input.Model)
+			if err != nil {
+				return err
+			}
+		}
+
+		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
+		if input.Model != "" {
+			mutexMap.Lock()
+			l, ok := mutexes[input.Model]
+			if !ok {
+				m := &sync.Mutex{}
+				mutexes[input.Model] = m
+				l = m
+			}
+			mutexMap.Unlock()
+			l.Lock()
+			defer l.Unlock()
+		} else {
+			defaultMutex.Lock()
+			defer defaultMutex.Unlock()
+		}
+
+		// Set the parameters for the language model prediction
+		topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
+		if err != nil {
+			return err
+		}
+
+		topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
+		if err != nil {
+			return err
+		}
+
+		temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
+		if err != nil {
+			return err
+		}
+
+		tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
+		if err != nil {
+			return err
+		}
+
+		mess := []string{}
+		for _, i := range input.Messages {
+			mess = append(mess, i.Content)
+		}
+
+		predInput := strings.Join(mess, "\n")
+
+		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+		templatedInput, err := loader.TemplatePrefix(input.Model, struct {
+			Input string
+		}{Input: predInput})
+		if err == nil {
+			predInput = templatedInput
+		}
+
+		// Generate the prediction using the language model
+		prediction, err := model.Predict(
+			predInput,
+			llama.SetTemperature(temperature),
+			llama.SetTopP(topP),
+			llama.SetTopK(topK),
+			llama.SetTokens(tokens),
+			llama.SetThreads(threads),
+		)
+		if err != nil {
+			return err
+		}
+
+		// Return the prediction in the response body
+		return c.JSON(OpenAIResponse{
+			Model:   input.Model,
+			Choices: []Choice{{Message: Message{Role: "assistant", Content: prediction}}},
+		})
+	}
+}
+
+// Start builds the Fiber application and serves it on listenAddr.
+//
+// Registered routes:
+//   - POST /v1/chat/completions and POST /v1/completions: OpenAI-compatible
+//     endpoints backed by chatEndpoint and completionEndpoint.
+//   - GET /v1/models: lists the names returned by loader.ListModels.
+//   - POST /predict: runs defaultModel on the "text" field of the JSON body.
+//   - everything else: the embedded index.html, via the filesystem middleware.
+//
+// threads is forwarded to every prediction. Start returns the error reported
+// by app.Listen instead of discarding it.
+func Start(defaultModel *llama.LLama, loader *model.ModelLoader, listenAddr string, threads int) error {
+	app := fiber.New()
+
+	// Default middleware config
+	app.Use(recover.New())
+	app.Use(cors.New())
+
+	// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
+	var mutex = &sync.Mutex{}
+	mu := map[string]*sync.Mutex{}
+	var mumutex = &sync.Mutex{}
+
+	// badRequest reports a client-side failure as 400 while leaving errors
+	// that already carry a Fiber status code untouched.
+	badRequest := func(err error) error {
+		if _, ok := err.(*fiber.Error); ok {
+			return err
+		}
+		return fiber.NewError(fiber.StatusBadRequest, err.Error())
+	}
+
+	// openAI compatible API endpoint
+	app.Post("/v1/chat/completions", chatEndpoint(defaultModel, loader, threads, mutex, mumutex, mu))
+	app.Post("/v1/completions", completionEndpoint(defaultModel, loader, threads, mutex, mumutex, mu))
+	app.Get("/v1/models", func(c *fiber.Ctx) error {
+		models, err := loader.ListModels()
+		if err != nil {
+			return err
+		}
+
+		// Start from an empty, non-nil slice so that "data" encodes as []
+		// rather than null when there are no models.
+		dataModels := []OpenAIModel{}
+		for _, m := range models {
+			dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
+		}
+		return c.JSON(struct {
+			Object string        `json:"object"`
+			Data   []OpenAIModel `json:"data"`
+		}{
+			Object: "list",
+			Data:   dataModels,
+		})
+	})
+
+	app.Use("/", filesystem.New(filesystem.Config{
+		Root:         http.FS(indexHTML),
+		NotFoundFile: "index.html",
+	}))
+
+	/*
+		The prompt goes in the JSON body; the sampling settings are read from
+		the query string:
+
+		curl --location --request POST 'http://localhost:8080/predict?topP=0.8&topK=50&temperature=0.7&tokens=100' --header 'Content-Type: application/json' --data-raw '{
+		    "text": "What is an alpaca?"
+		}'
+	*/
+	// Endpoint to generate the prediction
+	app.Post("/predict", func(c *fiber.Ctx) error {
+		// Without a default model there is nothing to run; fail with a clear
+		// error instead of dereferencing a nil model.
+		if defaultModel == nil {
+			return fmt.Errorf("no default model loaded")
+		}
+
+		// Get input data from the request body
+		input := new(struct {
+			Text string `json:"text"`
+		})
+		if err := c.BodyParser(input); err != nil {
+			return badRequest(err)
+		}
+
+		// Set the parameters for the language model prediction
+		topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
+		if err != nil {
+			return badRequest(err)
+		}
+
+		topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
+		if err != nil {
+			return badRequest(err)
+		}
+
+		temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
+		if err != nil {
+			return badRequest(err)
+		}
+
+		tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
+		if err != nil {
+			return badRequest(err)
+		}
+
+		// Take the lock only once the request is known to be valid, so that
+		// a bad request never waits behind a running prediction.
+		mutex.Lock()
+		defer mutex.Unlock()
+
+		// Generate the prediction using the language model
+		prediction, err := defaultModel.Predict(
+			input.Text,
+			llama.SetTemperature(temperature),
+			llama.SetTopP(topP),
+			llama.SetTopK(topK),
+			llama.SetTokens(tokens),
+			llama.SetThreads(threads),
+		)
+		if err != nil {
+			return err
+		}
+
+		// Return the prediction in the response body
+		return c.JSON(struct {
+			Prediction string `json:"prediction"`
+		}{
+			Prediction: prediction,
+		})
+	})
+
+	// Start the server and surface any listen failure to the caller.
+	return app.Listen(listenAddr)
+}
--- a/api/index.html
+++ b/api/index.html
@ -0,0 +1,120 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>llama-cli</title>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css" crossorigin="anonymous" referrerpolicy="no-referrer" />
+    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css">
+</head>
+<style>
+    /* One full clockwise turn; used by the .waiting class below. */
+    @keyframes rotating {
+    from {
+        transform: rotate(0deg);
+    }
+    to {
+        transform: rotate(360deg);
+    }
+}
+
+/* Spins the element once per second, forever. The page script adds this class
+   to the submit icon while a request is in flight and removes it afterwards. */
+.waiting {
+    animation: rotating 1s linear infinite;
+}
+
+</style>
+<body>
+
+<div class="container mt-5" x-data="{ templates:[
+    {
+      name: 'Alpaca: Instruction without input',
+      text: `Below is an instruction that describes a task. Write a response that appropriately completes the request.
+
+### Instruction:
+{{.Instruction}}
+
+### Response:`,
+    },
+    {
+      name: 'Alpaca: Instruction with input',
+      text: `Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+### Instruction:
+{{.Instruction}}
+
+### Input:
+{{.Input}}
+
+### Response:`,
+    }
+  ], selectedTemplate: '', selectedTemplateText: '' }">
+    <h1>llama-cli API</h1>
+    <div class="form-group">
+        <label for="inputText">Input Text:</label>
+        <textarea class="form-control" id="inputText" rows="6" placeholder="Your text input here..." x-text="selectedTemplateText"></textarea>
+    </div>
+    <div class="form-group">
+        <label for="templateSelect">Select Template:</label>
+        <select class="form-control" id="templateSelect" x-model="selectedTemplateText">
+            <option value="">None</option>
+            <template x-for="(template, index) in templates" :key="index">
+                <option :value="template.text" x-text="template.name"></option>
+            </template>
+        </select>
+    </div>
+    <div class="form-group">
+        <label for="topP">Top P:</label>
+        <input type="range" step="0.01" min="0" max="1" class="form-control" id="topP" value="0.20" name="topP" onchange="this.nextElementSibling.value = this.value" required>
+        <output>0.20</output>
+    </div>
+    <div class="form-group">
+        <label for="topK">Top K:</label>
+        <input type="number" class="form-control" id="topK" value="10000" name="topK"  required>
+    </div>
+    <div class="form-group">
+        <label for="temperature">Temperature:</label>
+        <input type="range" step="0.01" min="0" max="1" value="0.9" class="form-control" id="temperature" name="temperature" onchange="this.nextElementSibling.value = this.value"  required>
+        <output>0.9</output>
+    </div>
+    <div class="form-group">
+        <label for="tokens">Tokens:</label>
+        <input type="number" class="form-control" id="tokens" name="tokens" value="128" required>
+    </div>
+    <button class="btn btn-primary" x-on:click="submitRequest()">Submit <i class="fas fa-paper-plane"></i></button>
+    <hr>
+    <div class="form-group">
+        <label for="outputText">Output Text:</label>
+        <textarea class="form-control" id="outputText" rows="5" readonly></textarea>
+    </div>
+</div>
+
+<script defer src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"></script>
+<script>
+    function submitRequest() {
+        var button = document.querySelector("i.fa-paper-plane");
+        button.classList.add("waiting");
+        var text = document.getElementById("inputText").value;
+        var url = "/predict";
+        var data = {
+            "text": text,
+            "topP": document.getElementById("topP").value,
+            "topK": document.getElementById("topK").value,
+            "temperature": document.getElementById("temperature").value,
+            "tokens": document.getElementById("tokens").value
+        };
+        fetch(url, {
+            method: "POST",
+            headers: {
+                "Content-Type": "application/json"
+            },
+            body: JSON.stringify(data)
+        })
+        .then(response => response.json())
+        .then(data => {
+            document.getElementById("outputText").value = data.prediction;
+            button.classList.remove("waiting");
+        })
+        .catch(error => { console.error(error); button.classList.remove("waiting"); });
+    }
+</script>
+</body>
+</html>