mirror of
https://github.com/mudler/LocalAI.git
synced 2025-06-05 10:25:00 +00:00

Some checks are pending
Explorer deployment / build-linux (push) Waiting to run
GPU tests / ubuntu-latest (1.21.x) (push) Waiting to run
generate and publish intel docker caches / generate_caches (intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04, linux/amd64, ubuntu-latest) (push) Waiting to run
build container images / hipblas-jobs (-aio-gpu-hipblas, rocm/dev-ubuntu-22.04:6.1, hipblas, true, ubuntu:22.04, extras, latest-gpu-hipblas-extras, latest-aio-gpu-hipblas, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, auto, -hipblas-extras) (push) Waiting to run
build container images / hipblas-jobs (rocm/dev-ubuntu-22.04:6.1, hipblas, true, ubuntu:22.04, core, latest-gpu-hipblas, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -hipblas) (push) Waiting to run
build container images / self-hosted-jobs (-aio-gpu-intel-f16, quay.io/go-skynet/intel-oneapi-base:latest, sycl_f16, true, ubuntu:22.04, extras, latest-gpu-intel-f16-extras, latest-aio-gpu-intel-f16, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -sycl-f16-… (push) Waiting to run
build container images / self-hosted-jobs (-aio-gpu-intel-f32, quay.io/go-skynet/intel-oneapi-base:latest, sycl_f32, true, ubuntu:22.04, extras, latest-gpu-intel-f32-extras, latest-aio-gpu-intel-f32, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -sycl-f32-… (push) Waiting to run
build container images / self-hosted-jobs (-aio-gpu-nvidia-cuda-11, ubuntu:22.04, cublas, 11, 7, true, extras, latest-gpu-nvidia-cuda-11-extras, latest-aio-gpu-nvidia-cuda-11, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -cublas-cuda11-extras) (push) Waiting to run
build container images / self-hosted-jobs (-aio-gpu-nvidia-cuda-12, ubuntu:22.04, cublas, 12, 0, true, extras, latest-gpu-nvidia-cuda-12-extras, latest-aio-gpu-nvidia-cuda-12, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -cublas-cuda12-extras) (push) Waiting to run
build container images / self-hosted-jobs (quay.io/go-skynet/intel-oneapi-base:latest, sycl_f16, true, ubuntu:22.04, core, latest-gpu-intel-f16, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -sycl-f16) (push) Waiting to run
build container images / self-hosted-jobs (quay.io/go-skynet/intel-oneapi-base:latest, sycl_f32, true, ubuntu:22.04, core, latest-gpu-intel-f32, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -sycl-f32) (push) Waiting to run
build container images / core-image-build (-aio-cpu, ubuntu:22.04, , true, core, latest-cpu, latest-aio-cpu, --jobs=4 --output-sync=target, linux/amd64,linux/arm64, arc-runner-set, false, auto, ) (push) Waiting to run
build container images / core-image-build (ubuntu:22.04, cublas, 11, 7, true, core, latest-gpu-nvidia-cuda-11, --jobs=4 --output-sync=target, linux/amd64, arc-runner-set, false, false, -cublas-cuda11) (push) Waiting to run
build container images / core-image-build (ubuntu:22.04, cublas, 12, 0, true, core, latest-gpu-nvidia-cuda-12, --jobs=4 --output-sync=target, linux/amd64, arc-runner-set, false, false, -cublas-cuda12) (push) Waiting to run
build container images / core-image-build (ubuntu:22.04, vulkan, true, core, latest-gpu-vulkan, --jobs=4 --output-sync=target, linux/amd64, arc-runner-set, false, false, -vulkan) (push) Waiting to run
build container images / gh-runner (nvcr.io/nvidia/l4t-jetpack:r36.4.0, cublas, 12, 0, true, core, latest-nvidia-l4t-arm64, --jobs=4 --output-sync=target, linux/arm64, ubuntu-24.04-arm, true, false, -nvidia-l4t-arm64) (push) Waiting to run
Security Scan / tests (push) Waiting to run
Tests extras backends / tests-transformers (push) Waiting to run
Tests extras backends / tests-rerankers (push) Waiting to run
Tests extras backends / tests-diffusers (push) Waiting to run
Tests extras backends / tests-coqui (push) Waiting to run
tests / tests-linux (1.21.x) (push) Waiting to run
tests / tests-aio-container (push) Waiting to run
tests / tests-apple (1.21.x) (push) Waiting to run
* feat(realtime): Initial Realtime API implementation Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * chore: go mod tidy Signed-off-by: Richard Palethorpe <io@richiejp.com> * feat: Implement transcription only mode for realtime API Reduce the scope of the real time API for the initial realease and make transcription only mode functional. Signed-off-by: Richard Palethorpe <io@richiejp.com> * chore(build): Build backends on a separate layer to speed up core only changes Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Signed-off-by: Richard Palethorpe <io@richiejp.com> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
259 lines
8.6 KiB
Go
259 lines
8.6 KiB
Go
package openai
|
|
|
|
import (
	"context"
	"errors"
	"fmt"

	"github.com/mudler/LocalAI/core/backend"
	"github.com/mudler/LocalAI/core/config"
	grpcClient "github.com/mudler/LocalAI/pkg/grpc"
	"github.com/mudler/LocalAI/pkg/grpc/proto"
	model "github.com/mudler/LocalAI/pkg/model"
	"github.com/rs/zerolog/log"

	"google.golang.org/grpc"
)
|
|
|
|
var (
|
|
_ Model = new(wrappedModel)
|
|
_ Model = new(anyToAnyModel)
|
|
)
|
|
|
|
// wrappedModel represent a model which does not support Any-to-Any operations
// This means that we will fake an Any-to-Any model by overriding some of the gRPC client methods
// which are for Any-To-Any models, but instead we will call a pipeline (for e.g STT->LLM->TTS)
type wrappedModel struct {
	TTSConfig           *config.BackendConfig // backend config for the text-to-speech stage
	TranscriptionConfig *config.BackendConfig // backend config for the speech-to-text stage
	LLMConfig           *config.BackendConfig // backend config for the text-generation stage
	TTSClient           grpcClient.Backend    // client backing TTS calls
	TranscriptionClient grpcClient.Backend    // client backing Transcribe calls
	LLMClient           grpcClient.Backend    // client backing Predict/PredictStream calls

	VADConfig *config.BackendConfig // backend config for voice-activity detection
	VADClient grpcClient.Backend    // client backing VAD calls
}
|
|
|
|
// anyToAnyModel represent a model which supports Any-to-Any operations
// We have to wrap this out as well because we want to load two models one for VAD and one for the actual model.
// In the future there could be models that accept continuous audio input only so this design will be useful for that
type anyToAnyModel struct {
	LLMConfig *config.BackendConfig // backend config for the any-to-any model
	LLMClient grpcClient.Backend    // single client used for Predict, PredictStream and Transcribe

	VADConfig *config.BackendConfig // backend config for voice-activity detection
	VADClient grpcClient.Backend    // client backing VAD calls
}
|
|
|
|
// transcriptOnlyModel is a reduced pipeline used for transcription-only
// realtime sessions: it performs voice-activity detection and speech-to-text,
// while the text-generation methods (Predict/PredictStream) are rejected.
type transcriptOnlyModel struct {
	TranscriptionConfig *config.BackendConfig // backend config for the speech-to-text stage
	TranscriptionClient grpcClient.Backend    // client backing Transcribe calls
	VADConfig           *config.BackendConfig // backend config for voice-activity detection
	VADClient           grpcClient.Backend    // client backing VAD calls
}
|
|
|
|
func (m *transcriptOnlyModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) {
|
|
return m.VADClient.VAD(ctx, in)
|
|
}
|
|
|
|
func (m *transcriptOnlyModel) Transcribe(ctx context.Context, in *proto.TranscriptRequest, opts ...grpc.CallOption) (*proto.TranscriptResult, error) {
|
|
return m.TranscriptionClient.AudioTranscription(ctx, in, opts...)
|
|
}
|
|
|
|
func (m *transcriptOnlyModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) {
|
|
return nil, fmt.Errorf("predict operation not supported in transcript-only mode")
|
|
}
|
|
|
|
func (m *transcriptOnlyModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(reply *proto.Reply), opts ...grpc.CallOption) error {
|
|
return fmt.Errorf("predict stream operation not supported in transcript-only mode")
|
|
}
|
|
|
|
func (m *wrappedModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) {
|
|
return m.VADClient.VAD(ctx, in)
|
|
}
|
|
|
|
func (m *anyToAnyModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) {
|
|
return m.VADClient.VAD(ctx, in)
|
|
}
|
|
|
|
func (m *wrappedModel) Transcribe(ctx context.Context, in *proto.TranscriptRequest, opts ...grpc.CallOption) (*proto.TranscriptResult, error) {
|
|
return m.TranscriptionClient.AudioTranscription(ctx, in, opts...)
|
|
}
|
|
|
|
func (m *anyToAnyModel) Transcribe(ctx context.Context, in *proto.TranscriptRequest, opts ...grpc.CallOption) (*proto.TranscriptResult, error) {
|
|
// TODO: Can any-to-any models transcribe?
|
|
return m.LLMClient.AudioTranscription(ctx, in, opts...)
|
|
}
|
|
|
|
// Predict forwards the request directly to the wrapped LLM backend.
// NOTE(review): the intended STT->LLM->TTS conversion pipeline is not
// implemented yet, so audio input is not converted before the LLM call.
func (m *wrappedModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) {
	// TODO: Convert with pipeline (audio to text, text to llm, result to tts, and return it)
	// sound.BufferAsWAV(audioData, "audio.wav")

	return m.LLMClient.Predict(ctx, in)
}
|
|
|
|
// PredictStream forwards the streaming request directly to the wrapped LLM
// backend, invoking f for each reply.
// NOTE(review): the intended STT->LLM->TTS conversion pipeline is not
// implemented yet, so audio input is not converted before the LLM call.
func (m *wrappedModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(reply *proto.Reply), opts ...grpc.CallOption) error {
	// TODO: Convert with pipeline (audio to text, text to llm, result to tts, and return it)

	return m.LLMClient.PredictStream(ctx, in, f)
}
|
|
|
|
func (m *anyToAnyModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) {
|
|
return m.LLMClient.Predict(ctx, in)
|
|
}
|
|
|
|
func (m *anyToAnyModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(reply *proto.Reply), opts ...grpc.CallOption) error {
|
|
return m.LLMClient.PredictStream(ctx, in, f)
|
|
}
|
|
|
|
func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.BackendConfig, error) {
|
|
cfgVAD, err := cl.LoadBackendConfigFileByName(pipeline.VAD, ml.ModelPath)
|
|
if err != nil {
|
|
|
|
return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
|
|
}
|
|
|
|
if !cfgVAD.Validate() {
|
|
return nil, nil, fmt.Errorf("failed to validate config: %w", err)
|
|
}
|
|
|
|
opts := backend.ModelOptions(*cfgVAD, appConfig)
|
|
VADClient, err := ml.Load(opts...)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("failed to load tts model: %w", err)
|
|
}
|
|
|
|
cfgSST, err := cl.LoadBackendConfigFileByName(pipeline.Transcription, ml.ModelPath)
|
|
if err != nil {
|
|
|
|
return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
|
|
}
|
|
|
|
if !cfgSST.Validate() {
|
|
return nil, nil, fmt.Errorf("failed to validate config: %w", err)
|
|
}
|
|
|
|
opts = backend.ModelOptions(*cfgSST, appConfig)
|
|
transcriptionClient, err := ml.Load(opts...)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("failed to load SST model: %w", err)
|
|
}
|
|
|
|
return &transcriptOnlyModel{
|
|
VADConfig: cfgVAD,
|
|
VADClient: VADClient,
|
|
TranscriptionConfig: cfgSST,
|
|
TranscriptionClient: transcriptionClient,
|
|
}, cfgSST, nil
|
|
}
|
|
|
|
// returns and loads either a wrapped model or a model that support audio-to-audio
|
|
func newModel(pipeline *config.Pipeline, cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, error) {
|
|
|
|
cfgVAD, err := cl.LoadBackendConfigFileByName(pipeline.VAD, ml.ModelPath)
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
|
}
|
|
|
|
if !cfgVAD.Validate() {
|
|
return nil, fmt.Errorf("failed to validate config: %w", err)
|
|
}
|
|
|
|
opts := backend.ModelOptions(*cfgVAD, appConfig)
|
|
VADClient, err := ml.Load(opts...)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to load tts model: %w", err)
|
|
}
|
|
|
|
// TODO: Do we always need a transcription model? It can be disabled. Note that any-to-any instruction following models don't transcribe as such, so if transcription is required it is a separate process
|
|
cfgSST, err := cl.LoadBackendConfigFileByName(pipeline.Transcription, ml.ModelPath)
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
|
}
|
|
|
|
if !cfgSST.Validate() {
|
|
return nil, fmt.Errorf("failed to validate config: %w", err)
|
|
}
|
|
|
|
opts = backend.ModelOptions(*cfgSST, appConfig)
|
|
transcriptionClient, err := ml.Load(opts...)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to load SST model: %w", err)
|
|
}
|
|
|
|
// TODO: Decide when we have a real any-to-any model
|
|
if false {
|
|
|
|
cfgAnyToAny, err := cl.LoadBackendConfigFileByName(pipeline.LLM, ml.ModelPath)
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
|
}
|
|
|
|
if !cfgAnyToAny.Validate() {
|
|
return nil, fmt.Errorf("failed to validate config: %w", err)
|
|
}
|
|
|
|
opts := backend.ModelOptions(*cfgAnyToAny, appConfig)
|
|
anyToAnyClient, err := ml.Load(opts...)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to load tts model: %w", err)
|
|
}
|
|
|
|
return &anyToAnyModel{
|
|
LLMConfig: cfgAnyToAny,
|
|
LLMClient: anyToAnyClient,
|
|
VADConfig: cfgVAD,
|
|
VADClient: VADClient,
|
|
}, nil
|
|
}
|
|
|
|
log.Debug().Msg("Loading a wrapped model")
|
|
|
|
// Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations
|
|
cfgLLM, err := cl.LoadBackendConfigFileByName(pipeline.LLM, ml.ModelPath)
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
|
}
|
|
|
|
if !cfgLLM.Validate() {
|
|
return nil, fmt.Errorf("failed to validate config: %w", err)
|
|
}
|
|
|
|
cfgTTS, err := cl.LoadBackendConfigFileByName(pipeline.TTS, ml.ModelPath)
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
|
}
|
|
|
|
if !cfgTTS.Validate() {
|
|
return nil, fmt.Errorf("failed to validate config: %w", err)
|
|
}
|
|
|
|
|
|
opts = backend.ModelOptions(*cfgTTS, appConfig)
|
|
ttsClient, err := ml.Load(opts...)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to load tts model: %w", err)
|
|
}
|
|
|
|
opts = backend.ModelOptions(*cfgLLM, appConfig)
|
|
llmClient, err := ml.Load(opts...)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to load LLM model: %w", err)
|
|
}
|
|
|
|
return &wrappedModel{
|
|
TTSConfig: cfgTTS,
|
|
TranscriptionConfig: cfgSST,
|
|
LLMConfig: cfgLLM,
|
|
TTSClient: ttsClient,
|
|
TranscriptionClient: transcriptionClient,
|
|
LLMClient: llmClient,
|
|
|
|
VADConfig: cfgVAD,
|
|
VADClient: VADClient,
|
|
}, nil
|
|
}
|