From 8b1fcb8d3dcf2963df1694b77db875c91d44c03e Mon Sep 17 00:00:00 2001
From: Svjatoslav Agejenko <svjatoslav@svjatoslav.eu>
Date: Sun, 17 May 2026 02:48:38 +0300
Subject: [PATCH] chore: add local Phi-4 LLM server launcher for TCP testing

Add tools/llama-server-phi-4 script to start a fast local llama.cpp server
with the Phi-4 model (Q4_K_M, ~8.3 GiB) on port 8081.

Intended for gradual transition to using LLM over TCP in the project.
Runs at low priority with single slot and 16K context.
---
 tools/llama-server-phi-4 | 49 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100755 tools/llama-server-phi-4

diff --git a/tools/llama-server-phi-4 b/tools/llama-server-phi-4
new file mode 100755
index 0000000..19eb65f
--- /dev/null
+++ b/tools/llama-server-phi-4
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+# =============================================================================
+# Script: llama-server-phi-4
+# =============================================================================
+# Purpose: Starts a fast local llama.cpp server with the Phi-4 model,
+#          providing an OpenAI-compatible API endpoint over TCP.
+#          Intended for local development and testing of LLM integration.
+#
+# Usage:   ./llama-server-phi-4
+#
+# Example:
+#   ./llama-server-phi-4
+#   # Server starts on http://0.0.0.0:8081
+#
+# API Endpoint:
+#   - Host: 0.0.0.0 (binds all interfaces, not only localhost)
+#   - Port: 8081
+#   - OpenAI-compatible API at /v1/*
+#
+# Model Configuration:
+#   - Model: phi-4-Q4_K_M.gguf (8.3 GiB)
+#   - Backend: Vulkan (all layers on GPU)
+#   - Context size: 16384 tokens
+#   - Flash attention: enabled
+#   - Continuous batching: enabled
+#
+# Notes:
+#   - Runs at low priority (nice 19, ionice idle)
+#   - Single slot (-np 1) for minimal latency
+#   - Timeout: 3600 seconds (1 hour)
+# =============================================================================
+
+# Run from the llama.cpp build directory inside a subshell. The server binary
+# is invoked by relative path, so abort if the cd fails.
+(
+    cd ~/data/AI/llama.cpp/build/bin/ || exit 1
+    nice -n 19 ionice -c 3 ./llama-server \
+        -m ~/data/AI/models/phi-4-Q4_K_M.gguf \
+        --host 0.0.0.0 --port 8081 \
+        -ngl 999 \
+        -c 16384 \
+        --flash-attn on \
+        --cont-batching \
+        --threads 8 --threads-batch 8 \
+        -ub 512 \
+        -b 2048 \
+        -np 1 \
+        --timeout 3600
+)
-- 
2.20.1