2023-11-10 05:39:42 -05:00
|
|
|
import ctypes
|
2024-09-18 20:00:19 -04:00
|
|
|
import multiprocessing
|
2023-11-10 05:39:42 -05:00
|
|
|
|
2023-12-16 15:59:26 -08:00
|
|
|
import numpy as np
|
|
|
|
|
from scipy.special import log_softmax
|
2023-11-10 05:39:42 -05:00
|
|
|
|
2024-09-18 20:00:19 -04:00
|
|
|
from huggingface_hub import hf_hub_download
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
2023-04-05 03:23:15 -04:00
|
|
|
import llama_cpp
|
2024-09-18 20:00:19 -04:00
|
|
|
import llama_cpp._internals as internals
|
|
|
|
|
|
2023-04-05 03:23:15 -04:00
|
|
|
|
2024-04-29 23:34:55 -04:00
|
|
|
# Vocab-only SPM model bundled with the vendored llama.cpp checkout; loaded
# with vocab_only=True below, so only tokenizer tests depend on it.
MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama-spm.gguf"
|
2023-04-05 03:23:15 -04:00
|
|
|
|
|
|
|
|
|
2024-09-18 20:00:19 -04:00
|
|
|
def test_llama_cpp_version():
    """The installed package must expose a non-empty version string."""
    version = llama_cpp.__version__
    assert version
|
|
|
|
|
|
|
|
|
|
|
2023-08-25 04:56:48 -04:00
|
|
|
def test_llama_cpp_tokenization():
    """Round-trip tokenize/detokenize against the bundled SPM vocab model."""
    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, verbose=False)

    assert llama
    assert llama._ctx.ctx is not None

    text = b"Hello World"

    # Default tokenization prepends the BOS token and round-trips exactly.
    tokens = llama.tokenize(text)
    assert tokens[0] == llama.token_bos()
    assert tokens == [1, 15043, 2787]
    assert llama.detokenize(tokens) == text

    # With add_bos=False the leading marker is absent, so the detokenized
    # form no longer matches the original bytes exactly.
    tokens = llama.tokenize(text, add_bos=False)
    assert tokens[0] != llama.token_bos()
    assert tokens == [15043, 2787]
    assert llama.detokenize(tokens) != text

    # Special-token text ("</s>") is tokenized literally by default...
    text = b"Hello World</s>"
    tokens = llama.tokenize(text)
    assert tokens[-1] != llama.token_eos()
    assert tokens == [1, 15043, 2787, 829, 29879, 29958]

    # ...and parsed into the actual EOS token when special=True.
    tokens = llama.tokenize(text, special=True)
    assert tokens[-1] == llama.token_eos()
    assert tokens == [1, 15043, 2787, 2]

    # Empty input still yields just the BOS token, which detokenizes to b"".
    text = b""
    tokens = llama.tokenize(text, add_bos=True, special=True)
    assert tokens[-1] != llama.token_eos()
    assert tokens == [llama.token_bos()]
    assert llama.detokenize(tokens) == text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def llama_cpp_model_path():
    """Download (and cache) a small real GGUF model from the Hugging Face Hub."""
    return hf_hub_download(
        "Qwen/Qwen2-0.5B-Instruct-GGUF",
        "qwen2-0_5b-instruct-q8_0.gguf",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_real_model(llama_cpp_model_path):
    """Exercise the low-level internals API end to end: load a model, build a
    context, tokenize, decode, and sample a short deterministic completion."""
    import os

    assert os.path.exists(llama_cpp_model_path)

    # Model-load parameters mirroring the host's capabilities.
    model_params = llama_cpp.llama_model_default_params()
    model_params.use_mmap = llama_cpp.llama_supports_mmap()
    model_params.use_mlock = llama_cpp.llama_supports_mlock()
    model_params.check_tensors = False

    model = internals.LlamaModel(path_model=llama_cpp_model_path, params=model_params)

    # A tiny 16-token context is enough for the prompt plus 4 generated tokens.
    ctx_params = llama_cpp.llama_context_default_params()
    ctx_params.n_ctx = 16
    ctx_params.n_batch = 16
    ctx_params.n_ubatch = 16
    ctx_params.n_threads = multiprocessing.cpu_count()
    ctx_params.n_threads_batch = multiprocessing.cpu_count()
    ctx_params.logits_all = False
    ctx_params.flash_attn = True

    context = internals.LlamaContext(model=model, params=ctx_params)

    # Sanity-check tokenization against known Qwen2 token ids.
    tokens = model.tokenize(b"Hello, world!", add_bos=True, special=True)
    assert tokens == [9707, 11, 1879, 0]

    tokens = model.tokenize(b"The quick brown fox jumps", add_bos=True, special=True)

    batch = internals.LlamaBatch(n_tokens=len(tokens), embd=0, n_seq_max=1)

    # Sampling chain: top-k -> top-p -> temperature -> seeded distribution,
    # so the draw is reproducible for a fixed seed.
    seed = 1337
    sampler = internals.LlamaSampler()
    sampler.add_top_k(50)
    sampler.add_top_p(0.9, 1)
    sampler.add_temp(0.8)
    sampler.add_dist(seed)

    generated = tokens
    n_past = 0
    for _ in range(4):
        # Feed the pending tokens (whole prompt first, then one token at a time).
        batch.set_batch(tokens, n_past=n_past, logits_all=False)
        context.decode(batch)
        n_past += len(tokens)
        next_token = sampler.sample(context, -1)
        tokens = [next_token]
        generated += tokens

    # Drop the 5 prompt tokens; the remainder is the sampled completion.
    completion = generated[5:]
    completion_text = model.detokenize(completion, special=True)
    assert completion_text == b" over the lazy dog"
|
|
|
|
|
|
|
|
|
|
def test_real_llama(llama_cpp_model_path):
    """End-to-end completion tests on a real model: seeded sampling,
    grammar-constrained output, logits processors, and state save/restore."""
    model = llama_cpp.Llama(
        llama_cpp_model_path,
        n_ctx=32,
        n_batch=32,
        n_ubatch=32,
        n_threads=multiprocessing.cpu_count(),
        n_threads_batch=multiprocessing.cpu_count(),
        logits_all=False,
        flash_attn=True,
    )

    # Seeded sampling is expected to be deterministic for this model/quant.
    output = model.create_completion(
        "The quick brown fox jumps",
        max_tokens=4,
        top_k=50,
        top_p=0.9,
        temperature=0.8,
        seed=1337
    )
    assert output["choices"][0]["text"] == " over the lazy dog"

    # A GBNF grammar restricts generation to exactly "true" or "false".
    output = model.create_completion(
        "The capital of france is paris, 'true' or 'false'?:\n",
        max_tokens=4,
        top_k=50,
        top_p=0.9,
        temperature=0.8,
        seed=1337,
        grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "true" | "false"
""")
    )
    assert output["choices"][0]["text"] == "true"

    # Boost the logits of the tokens for b"rot" so the completion of
    # "...par" is steered toward "rot".
    suffix = b"rot"
    tokens = model.tokenize(suffix, add_bos=True, special=True)
    def logit_processor_func(input_ids, logits):
        # Scale the target tokens' logits heavily so they dominate sampling.
        for token in tokens:
            logits[token] *= 1000
        return logits

    logit_processors = llama_cpp.LogitsProcessorList(
        [logit_processor_func]
    )

    output = model.create_completion(
        "The capital of france is par",
        max_tokens=4,
        top_k=50,
        top_p=0.9,
        temperature=0.8,
        seed=1337,
        logits_processor=logit_processors
    )
    assert output["choices"][0]["text"].lower().startswith("rot")

    # Save state, sample twice (the two draws should diverge), then restore
    # and verify the first draw is reproduced exactly.
    model.set_seed(1337)

    state = model.save_state()

    output = model.create_completion(
        "Pick a number from 1 to 10?:\n",
        max_tokens=4,
        top_k=50,
        top_p=0.9,
        temperature=0.8,
        grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
""")
    )
    number_1 = output["choices"][0]["text"]

    output = model.create_completion(
        "Pick a number from 1 to 10?:\n",
        max_tokens=4,
        top_k=50,
        top_p=0.9,
        temperature=0.8,
        grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
""")
    )
    number_2 = output["choices"][0]["text"]

    model.load_state(state)

    output = model.create_completion(
        "Pick a number from 1 to 10?:\n",
        max_tokens=4,
        top_k=50,
        top_p=0.9,
        temperature=0.8,
        grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
""")
    )
    number_3 = output["choices"][0]["text"]

    assert number_1 != number_2
    assert number_1 == number_3
|
2025-07-05 02:15:10 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_real_llama_embeddings(llama_cpp_model_path):
    """Smoke-test the embedding path on a real model."""
    n_threads = multiprocessing.cpu_count()
    model = llama_cpp.Llama(
        llama_cpp_model_path,
        n_ctx=32,
        n_batch=32,
        n_ubatch=32,
        n_threads=n_threads,
        n_threads_batch=n_threads,
        logits_all=False,
        flash_attn=True,
        embedding=True,
    )
    # Smoke test for now
    model.embed("Hello World")
|