Mirror of https://github.com/mudler/LocalAI.git
fix: ExLlama Backend Context Size & Rope Scaling (#1311)
* fix: context_size not propagated to exllama backend
* fix: exllama rope scaling
This commit is contained in:
parent 480b14c8dc
commit 20d637e7b7

1 changed file with 13 additions and 0 deletions
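For reference, the two options arrive in the backend through the LoadModel gRPC call, as the diff below shows. A minimal sketch of such a request, assuming the ModelOptions message from LocalAI's backend.proto (the field names ContextSize, RopeFreqScale and ModelFile are taken from the backend code; the model directory path is hypothetical):

import backend_pb2

request = backend_pb2.ModelOptions(
    ModelFile="/models/llama2-exl",   # hypothetical model directory for the exllama backend
    ContextSize=4096,                 # now propagated to config.max_seq_len
    RopeFreqScale=2.0,                # used as the exllama NTK alpha value
)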
@@ -63,6 +63,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
             config = ExLlamaConfig(model_config_path)               # create config from config.json
             config.model_path = model_path                          # supply path to model weights file
+            if (request.ContextSize):
+                config.max_seq_len = request.ContextSize            # override max sequence length
+                config.max_attention_size = request.ContextSize**2  # Should be set to context_size^2.
+                # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
+
+            # Set Rope scaling.
+            if (request.RopeFreqScale):
+                # Alpha value for Rope scaling.
+                # Higher value increases context but adds perplexity.
+                # alpha_value and compress_pos_emb are mutually exclusive.
+                # https://github.com/turboderp/exllama/issues/115
+                config.alpha_value = request.RopeFreqScale
+                config.calculate_rotary_embedding_base()
 
             model = ExLlama(config)                                 # create ExLlama instance and load the weights
             tokenizer = ExLlamaTokenizer(tokenizer_path)            # create tokenizer from tokenizer model file
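A note on the two knobs set above: max_attention_size is simply the square of the requested context, and the alpha value performs NTK-style scaling of the rotary embedding base rather than compressing position ids, which is why alpha_value and compress_pos_emb are mutually exclusive. A rough standalone sketch of the effect, assuming the base * alpha ** (head_dim / (head_dim - 2)) form used by exllama's calculate_rotary_embedding_base() at the time (treat the exact exponent as an assumption), with a Llama-style head_dim of 128:

def ntk_scaled_rope_base(base: float = 10000.0, alpha: float = 1.0, head_dim: int = 128) -> float:
    # Stretch the rotary base instead of compressing position ids; a larger
    # alpha extends usable context at some cost in perplexity.
    return base * alpha ** (head_dim / (head_dim - 2))

context_size = 4096
max_attention_size = context_size ** 2      # 16_777_216, i.e. context_size^2 as in the diff
print(ntk_scaled_rope_base(alpha=2.0))      # ~20221, roughly doubling the effective context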