feat: add bert.cpp embeddings (#222)

2025-05-28 22:44:59 +00:00 · 2023-05-10 15:20:21 +02:00 · 2023-05-10 15:20:21 +02:00 · f8ee20991c
commit f8ee20991c
parent e6db14e2f1
14 changed files with 104 additions and 53 deletions
--- a/examples/query_data/README.md
+++ b/examples/query_data/README.md
@@ -12,11 +12,7 @@ Summary of the steps:

 ## Requirements

-For this in order to work, you will need LocalAI and a model compatible with the `llama.cpp` backend. This is will not work with gpt4all, however you can mix models (use a llama.cpp one to build the index database, and gpt4all to query it).
-
-The example uses `WizardLM` for both embeddings and Q&A. Edit the config files in `models/` accordingly to specify the model you use (change `HERE` in the configuration files).
-
-You will also need a training data set. Copy that over `data`.
+You will need a training data set. Copy that over `data`.

 ## Setup

@@ -28,7 +24,8 @@ git clone https://github.com/go-skynet/LocalAI

 cd LocalAI/examples/query_data

-# Copy your models, edit config files accordingly
+wget https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-q4_0.bin -O models/bert
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j

 # start with docker-compose
 docker-compose up -d --build
--- a/examples/query_data/models/embeddings.yaml
+++ b/examples/query_data/models/embeddings.yaml
@@ -1,18 +1,6 @@
 name: text-embedding-ada-002
 parameters:
-  model: HERE
-  top_k: 80
-  temperature: 0.2
-  top_p: 0.7
-context_size: 1024
+  model: bert
 threads: 14
-stopwords:
- "HUMAN:"
- "GPT:"
-roles:
-  user: " "
-  system: " "
+backend: bert-embeddings
 embeddings: true
-template:
-  completion: completion
-  chat: gpt4all
--- a/examples/query_data/models/gpt-3.5-turbo.yaml
+++ b/examples/query_data/models/gpt-3.5-turbo.yaml
@@ -1,12 +1,11 @@
 name: gpt-3.5-turbo
 parameters:
-  model: HERE
+  model: ggml-gpt4all-j
  top_k: 80
  temperature: 0.2
  top_p: 0.7
 context_size: 1024
 threads: 14
-embeddings: true
 stopwords:
 - "HUMAN:"
 - "GPT:"
@@ -15,4 +14,4 @@ roles:
  system: " "
 template:
  completion: completion
-  chat: wizardlm
+  chat: gpt4all
--- a/examples/query_data/models/wizardlm.tmpl
+++ b/examples/query_data/models/wizardlm.tmpl
@@ -1,3 +0,0 @@
-{{.Input}}
-
-### Response:
--- a/examples/query_data/query.py
+++ b/examples/query_data/query.py
@@ -13,7 +13,7 @@ base_path = os.environ.get('OPENAI_API_BASE', 'http://localhost:8080/v1')
 llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="gpt-3.5-turbo", openai_api_base=base_path))

 # Configure prompt parameters and initialise helper
-max_input_size = 1024
+max_input_size = 500
 num_output = 256
 max_chunk_overlap = 20

--- a/examples/query_data/store.py
+++ b/examples/query_data/store.py
@@ -13,15 +13,15 @@ base_path = os.environ.get('OPENAI_API_BASE', 'http://localhost:8080/v1')
 llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="gpt-3.5-turbo", openai_api_base=base_path))

 # Configure prompt parameters and initialise helper
-max_input_size = 512
-num_output = 512
+max_input_size = 400
+num_output = 400
 max_chunk_overlap = 30

 prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

 # Load documents from the 'data' directory
 documents = SimpleDirectoryReader('data').load_data()
-service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper, chunk_size_limit = 512)
+service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper, chunk_size_limit = 400)
 index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)
 index.storage_context.persist(persist_dir="./storage")