import os
import sys
import signal
import platform
import argparse
import subprocess


def run_command(command, shell=False):
    """Run a system command and exit the script if it fails."""
    try:
        subprocess.run(command, shell=shell, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error occurred while running command: {e}")
        sys.exit(1)


def run_server():
    # Uses the module-level `args` parsed in the __main__ block below.
    build_dir = "build"
    if platform.system() == "Windows":
        server_path = os.path.join(build_dir, "bin", "Release", "llama-server.exe")
        # Fall back to the default bin layout if the Release build output is missing.
        if not os.path.exists(server_path):
            server_path = os.path.join(build_dir, "bin", "llama-server")
    else:
        server_path = os.path.join(build_dir, "bin", "llama-server")

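    # llama-server flags: -m model path, -c context size, -t CPU threads,
    # -n max tokens to predict, -ngl GPU layers to offload (0 = CPU only).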
    command = [
        server_path,
        '-m', args.model,
        '-c', str(args.ctx_size),
        '-t', str(args.threads),
        '-n', str(args.n_predict),
        '-ngl', '0',
        '--temp', str(args.temperature),
        '--host', args.host,
        '--port', str(args.port),
        '-cb'  # Enable continuous batching
    ]

    if args.prompt:
        command.extend(['-p', args.prompt])

    # Note: -cnv flag is removed as it's not supported by the server

    print(f"Starting server on {args.host}:{args.port}")
    run_command(command)


def signal_handler(sig, frame):
    print("Ctrl+C pressed, shutting down server...")
    sys.exit(0)


if __name__ == "__main__":
    # Install a Ctrl+C handler so the server script exits cleanly.
    signal.signal(signal.SIGINT, signal_handler)

    parser = argparse.ArgumentParser(description='Run llama.cpp server')
    parser.add_argument("-m", "--model", type=str, help="Path to model file", required=False, default="models/bitnet_b1_58-3B/ggml-model-i2_s.gguf")
    parser.add_argument("-p", "--prompt", type=str, help="System prompt for the model", required=False)
    parser.add_argument("-n", "--n-predict", type=int, help="Number of tokens to predict", required=False, default=4096)
    parser.add_argument("-t", "--threads", type=int, help="Number of threads to use", required=False, default=2)
    parser.add_argument("-c", "--ctx-size", type=int, help="Size of the context window", required=False, default=2048)
    parser.add_argument("--temperature", type=float, help="Temperature for sampling", required=False, default=0.8)
    parser.add_argument("--host", type=str, help="IP address to listen on", required=False, default="127.0.0.1")
    parser.add_argument("--port", type=int, help="Port to listen on", required=False, default=8080)

    args = parser.parse_args()
    run_server()