Mirror of https://github.com/mudler/LocalAI.git
fix: ExLlama Backend Context Size & Rope Scaling (#1311)
* fix: context_size not propagated to exllama backend
* fix: exllama rope scaling
This commit is contained in:
parent 480b14c8dc
commit 20d637e7b7

1 changed file with 13 additions and 0 deletions
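For reference, the two options arrive in the backend through the LoadModel gRPC call, as the diff below shows. A minimal sketch of such a request, assuming the ModelOptions message from LocalAI's backend.proto (the field names ContextSize, RopeFreqScale and ModelFile are taken from the backend code; the model directory path is hypothetical):

import backend_pb2

request = backend_pb2.ModelOptions(
    ModelFile="/models/llama2-exl",   # hypothetical model directory for the exllama backend
    ContextSize=4096,                 # now propagated to config.max_seq_len
    RopeFreqScale=2.0,                # used as the exllama NTK alpha value
)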
@@ -63,6 +63,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
             config = ExLlamaConfig(model_config_path)               # create config from config.json
             config.model_path = model_path                          # supply path to model weights file
+            if (request.ContextSize):
+                config.max_seq_len = request.ContextSize            # override max sequence length
+                config.max_attention_size = request.ContextSize**2  # Should be set to context_size^2.
+                # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
+
+            # Set Rope scaling.
+            if (request.RopeFreqScale):
+                # Alpha value for Rope scaling.
+                # Higher value increases context but adds perplexity.
+                # alpha_value and compress_pos_emb are mutually exclusive.
+                # https://github.com/turboderp/exllama/issues/115
+                config.alpha_value = request.RopeFreqScale
+                config.calculate_rotary_embedding_base()
 
             model = ExLlama(config)                                 # create ExLlama instance and load the weights
             tokenizer = ExLlamaTokenizer(tokenizer_path)            # create tokenizer from tokenizer model file
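A note on the two knobs set above: max_attention_size is simply the square of the requested context, and the alpha value performs NTK-style scaling of the rotary embedding base rather than compressing position ids, which is why alpha_value and compress_pos_emb are mutually exclusive. A rough standalone sketch of the effect, assuming the base * alpha ** (head_dim / (head_dim - 2)) form used by exllama's calculate_rotary_embedding_base() at the time (treat the exact exponent as an assumption), with a Llama-style head_dim of 128:

def ntk_scaled_rope_base(base: float = 10000.0, alpha: float = 1.0, head_dim: int = 128) -> float:
    # Stretch the rotary base instead of compressing position ids; a larger
    # alpha extends usable context at some cost in perplexity.
    return base * alpha ** (head_dim / (head_dim - 2))

context_size = 4096
max_attention_size = context_size ** 2      # 16_777_216, i.e. context_size^2 as in the diff
print(ntk_scaled_rope_base(alpha=2.0))      # ~20221, roughly doubling the effective context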