diff --git a/core/backend/llm.go b/core/backend/llm.go
index 9e121f79..9a4d0d46 100644
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -118,7 +118,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 
 			var partialRune []byte
 			err := inferenceModel.PredictStream(ctx, opts, func(reply *proto.Reply) {
-				msg := reply.GetMessage()
+				msg := reply.Message
 				partialRune = append(partialRune, msg...)
 
 				tokenUsage.Prompt = int(reply.PromptTokens)
diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index b03b18bd..1ac1387e 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -39,15 +39,11 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 		responses <- initialMessage
 
 		ComputeChoices(req, s, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
-			choices := []schema.Choice{}
-			if s != "" {
-				choices = append(choices, schema.Choice{Delta: &schema.Message{Content: &s}, Index: 0})
-			}
 			resp := schema.OpenAIResponse{
 				ID:      id,
 				Created: created,
 				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-				Choices: choices,
+				Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
 				Object:  "chat.completion.chunk",
 				Usage: schema.OpenAIUsage{
 					PromptTokens:     usage.Prompt,
@@ -469,9 +465,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			toolsCalled := false
 			for ev := range responses {
 				usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
-				if len(ev.Choices) == 0 {
-					break
-				}
				if len(ev.Choices[0].Delta.ToolCalls) > 0 {
					toolsCalled = true
				}
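An aside for reviewers, not part of the patch: `reply.GetMessage()` is the nil-safe accessor that protoc-gen-go generates for the `Message` field, whereas `reply.Message` dereferences the receiver directly, so the two are only interchangeable if the `PredictStream` callback is never invoked with a nil `*proto.Reply`. A minimal sketch of the difference, using a simplified stand-in struct rather than the real generated `proto.Reply`:

```go
package main

import "fmt"

// Reply is a simplified stand-in for a protoc-generated message such as
// proto.Reply; only the field relevant to the getter-vs-field question is shown.
type Reply struct {
	Message []byte
}

// protoc-gen-go emits getters of this shape: they tolerate a nil receiver.
func (r *Reply) GetMessage() []byte {
	if r != nil {
		return r.Message
	}
	return nil
}

func main() {
	var r *Reply // nil reply, the case the generated getter guards against

	fmt.Println(len(r.GetMessage())) // prints 0: the getter is nil-safe

	// Direct field access on the nil pointer would panic instead:
	//   _ = r.Message // panic: invalid memory address or nil pointer dereference
}
```

The same trade-off shows up in the second chat.go hunk: with the `len(ev.Choices) == 0` guard removed, `ev.Choices[0]` panics on any chunk that arrives without choices, which appears safe here only because the streaming callback above now always emits exactly one delta choice per chunk.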