feat(llama.cpp): upgrade and use libmtmd (#5379)

* WIP * wip * wip * Make it compile * Update json.hpp * this shouldn't be private for now * Add logs * Reset auto detected template Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Re-enable grammars * This seems to be broken - 360a9c98e1 (diff-a18a8e64e12a01167d8e98fc)[…]cccf0d4eed09d76d879L2998-L3207 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Placeholder * Simplify image loading * use completion type * disable streaming Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * correctly return timings Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Remove some debug logging * Adapt tests Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Keep header * embedding: do not use oai type Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Sync from server.cpp * Use utils and json directly from llama.cpp Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Sync with upstream Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix: copy json.hpp from the correct location Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix: add httplib * sync llama.cpp Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Embeddings: set OAICOMPAT_TYPE_EMBEDDING Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat: sync with server.cpp by including it Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * make it darwin-compatible Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 10:35:01 +00:00 · 2025-05-17 16:02:53 +02:00 · 2025-05-17 16:02:53 +02:00 · 6d5bde860b
commit 6d5bde860b
parent 6ef383033b
8 changed files with 648 additions and 27490 deletions
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@ -7,22 +7,46 @@ for patch in $(ls patches); do
    patch -d llama.cpp/ -p1 < patches/$patch
 done 

+set -e
+
 cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
 cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
-cp -rfv json.hpp llama.cpp/tools/grpc-server/
-cp -rfv utils.hpp llama.cpp/tools/grpc-server/
-    
+cp -rfv llama.cpp/common/json.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/tools/server/httplib.h llama.cpp/tools/grpc-server/
+
+set +e
 if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
    echo "grpc-server already added"
 else
    echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
 fi
+set -e

-## XXX: In some versions of CMake clip wasn't being built before llama.
-## This is an hack for now, but it should be fixed in the future.
-cp -rfv llama.cpp/tools/mtmd/clip.h llama.cpp/tools/grpc-server/clip.h
-cp -rfv llama.cpp/tools/mtmd/clip-impl.h llama.cpp/tools/grpc-server/clip-impl.h
-cp -rfv llama.cpp/tools/mtmd/llava.cpp llama.cpp/tools/grpc-server/llava.cpp
-echo '#include "llama.h"' > llama.cpp/tools/grpc-server/llava.h
-cat llama.cpp/tools/mtmd/llava.h >> llama.cpp/tools/grpc-server/llava.h
-cp -rfv llama.cpp/tools/mtmd/clip.cpp llama.cpp/tools/grpc-server/clip.cpp
+# To stay as close as possible to the upstream server.cpp, copy it with its main() function stripped out,
+# then delete the index.html.gz.hpp and loading.html.hpp includes from the copy.
+# TODO: upstream this to the original server.cpp by extracting the upstream main function to a separate file
+awk '
+/int[ \t]+main[ \t]*\(/ {          # If the line starts the main function
+    in_main=1;                     # Set a flag
+    open_braces=0;                 # Track number of open braces
+}
+in_main {
+    open_braces += gsub(/\{/, "{"); # Count opening braces
+    open_braces -= gsub(/\}/, "}"); # Count closing braces
+    if (open_braces == 0) {         # If all braces are closed
+        in_main=0;                  # End skipping
+    }
+    next;                           # Skip lines inside main
+}
+!in_main                           # Print lines not inside main
+' "llama.cpp/tools/server/server.cpp" > llama.cpp/tools/grpc-server/server.cpp
+
+# remove index.html.gz.hpp and loading.html.hpp includes
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    # macOS: BSD sed requires an explicit backup suffix after -i (empty here, so no backup file is kept)
+    sed -i '' '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
+else
+    # Linux and others: assumes GNU-style sed, where -i takes no separate suffix argument
+    sed -i '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
+fi