Avoid stomping extra_params["max_tokens"] = 1 during cache warming, which causes a loop of 1-token infinite-output responses with prefill models (#1842, #1841)

This commit is contained in:
Paul Gauthier 2024-09-30 14:22:11 -07:00
parent 8fb0362b47
commit f2e1e17741

View file

@@ -1073,7 +1073,7 @@ class Coder:
self.warming_pings_left -= 1
self.next_cache_warm = time.time() + delay
kwargs = self.main_model.extra_params or dict()
kwargs = dict(self.main_model.extra_params) or dict()
kwargs["max_tokens"] = 1
try: