From 8b1fcb8d3dcf2963df1694b77db875c91d44c03e Mon Sep 17 00:00:00 2001
From: Svjatoslav Agejenko
Date: Sun, 17 May 2026 02:48:38 +0300
Subject: [PATCH] chore: add local Phi-4 LLM server launcher for TCP testing

Add tools/llama-server-phi-4 script to start a fast local llama.cpp server
with the Phi-4 model (Q4_K_M, ~8.3 GiB) on port 8081.

Intended for gradual transition to using LLM over TCP in the project.
Runs at low priority with single slot and 16K context.
---
 tools/llama-server-phi-4 | 49 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100755 tools/llama-server-phi-4

diff --git a/tools/llama-server-phi-4 b/tools/llama-server-phi-4
new file mode 100755
index 0000000..19eb65f
--- /dev/null
+++ b/tools/llama-server-phi-4
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+# =============================================================================
+# Script: llama-server-phi-4
+# =============================================================================
+# Purpose: Starts a fast local llama.cpp server with the Phi-4 model,
+#          providing an OpenAI-compatible API endpoint over TCP.
+#          Intended for local development and testing of LLM integration.
+#
+# Usage: ./llama-server-phi-4
+#
+# Example:
+#   ./llama-server-phi-4
+#   # Server starts on http://0.0.0.0:8081
+#
+# API Endpoint:
+#   - Host: 0.0.0.0 (listens on all network interfaces, not only loopback)
+#   - Port: 8081
+#   - OpenAI-compatible API at /v1/*
+#
+# Model Configuration:
+#   - Model: phi-4-Q4_K_M.gguf (8.3 GiB)
+#   - Backend: Vulkan (all layers on GPU, requested via -ngl 999)
+#   - Context size: 16384 tokens
+#   - Flash attention: enabled
+#   - Continuous batching: enabled
+#
+# Notes:
+#   - Runs at low priority (nice 19, ionice idle)
+#   - Single slot (-np 1) for minimal latency
+#   - Timeout: 3600 seconds (1 hour), set via --timeout
+# =============================================================================
+
+(
+  cd ~/data/AI/llama.cpp/build/bin/ || exit 1 # never run ./llama-server from the wrong directory
+  nice -n 19 ionice -c 3 ./llama-server \
+    -m ~/data/AI/models/phi-4-Q4_K_M.gguf \
+    --host 0.0.0.0 \
+    --port 8081 \
+    -ngl 999 \
+    -c 16384 \
+    --flash-attn on \
+    --cont-batching \
+    --threads 8 \
+    --threads-batch 8 \
+    -ub 512 \
+    -b 2048 \
+    -np 1 \
+    --timeout 3600
+)
-- 
2.20.1