--- /dev/null
+#!/usr/bin/env bash
+# =============================================================================
+# Script: llama-server-phi-4
+# =============================================================================
+# Purpose: Starts a fast local llama.cpp server with the Phi-4 model,
+# providing an OpenAI-compatible API endpoint over TCP.
+# Intended for local development and testing of LLM integration.
+#
+# Usage: ./llama-server-phi-4
+#
+# Example:
+# ./llama-server-phi-4
+# # Server listens on 0.0.0.0:8081 (all interfaces); connect via e.g. http://localhost:8081
+#
+# API Endpoint:
+# - Host: 0.0.0.0
+# - Port: 8081
+# - OpenAI-compatible API at /v1/*
+#
+# Model Configuration:
+# - Model: phi-4-Q4_K_M.gguf (8.3 GiB)
+# - Backend: Vulkan (all layers on GPU)
+# - Context size: 16384 tokens
+# - Flash attention: enabled
+# - Continuous batching: enabled
+#
+# Notes:
+# - Runs at low priority (nice 19, ionice idle)
+# - Single slot (-np 1) for minimal latency
+# - Timeout: 3600 seconds (1 hour)
+# =============================================================================
+
+(
+ cd ~/data/AI/llama.cpp/build/bin/
+ nice -n 19 ionice -c 3 ./llama-server \
+ -m ~/data/AI/models/phi-4-Q4_K_M.gguf \
+ --host 0.0.0.0 \
+ --port 8081 \
+ -ngl 999 \
+ -c 16384 \
+ --flash-attn on \
+ --cont-batching \
+ --threads 8 \
+ --threads-batch 8 \
+ -ub 512 \
+ -b 2048 \
+ -np 1 \
+ --timeout 3600
+)