import ctypes
import multiprocessing

import numpy as np
from scipy.special import log_softmax

from huggingface_hub import hf_hub_download
import pytest

import llama_cpp
import llama_cpp._internals as internals


MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama-spm.gguf"

def test_llama_cpp_version():
    """The installed package must expose a non-empty version string."""
    version = llama_cpp.__version__
    assert version
def test_llama_cpp_tokenization():
    """Round-trip tokenize/detokenize checks against the SPM vocab-only model."""
    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, verbose=False)

    assert llama
    assert llama._ctx.ctx is not None

    text = b"Hello World"

    # Default tokenization prepends BOS, and detokenizing restores the text.
    with_bos = llama.tokenize(text)
    assert with_bos[0] == llama.token_bos()
    assert with_bos == [1, 15043, 2787]
    assert llama.detokenize(with_bos) == text

    # Without BOS there is no leading BOS token and the round trip differs
    # from the original bytes (SPM adds a leading space on detokenize).
    without_bos = llama.tokenize(text, add_bos=False)
    assert without_bos[0] != llama.token_bos()
    assert without_bos == [15043, 2787]
    assert llama.detokenize(without_bos) != text

    # Special tokens like </s> are only parsed when special=True; otherwise
    # the literal characters are tokenized as plain text.
    text = b"Hello World</s>"
    plain = llama.tokenize(text)
    assert plain[-1] != llama.token_eos()
    assert plain == [1, 15043, 2787, 829, 29879, 29958]
    special = llama.tokenize(text, special=True)
    assert special[-1] == llama.token_eos()
    assert special == [1, 15043, 2787, 2]

    # Empty input tokenizes to just BOS and round-trips back to empty bytes.
    text = b""
    empty = llama.tokenize(text, add_bos=True, special=True)
    assert empty[-1] != llama.token_eos()
    assert empty == [llama.token_bos()]
    assert llama.detokenize(empty) == text
@pytest.fixture
def llama_cpp_model_path():
    """Download (or reuse from the HF cache) a small Qwen2 instruct model."""
    return hf_hub_download(
        "Qwen/Qwen2-0.5B-Instruct-GGUF",
        "qwen2-0_5b-instruct-q8_0.gguf",
    )
def test_real_model(llama_cpp_model_path):
    """Drive the low-level internals API (model, context, batch, sampler)
    through a short deterministic generation."""
    import os

    assert os.path.exists(llama_cpp_model_path)

    # Model parameters: enable mmap/mlock only where the build supports them.
    model_params = llama_cpp.llama_model_default_params()
    model_params.use_mmap = llama_cpp.llama_supports_mmap()
    model_params.use_mlock = llama_cpp.llama_supports_mlock()
    model_params.check_tensors = False

    model = internals.LlamaModel(path_model=llama_cpp_model_path, params=model_params)

    # Tiny context: the prompt plus four generated tokens fit within 16.
    ctx_params = llama_cpp.llama_context_default_params()
    ctx_params.n_ctx = 16
    ctx_params.n_batch = 16
    ctx_params.n_ubatch = 16
    ctx_params.n_threads = multiprocessing.cpu_count()
    ctx_params.n_threads_batch = multiprocessing.cpu_count()
    ctx_params.logits_all = False
    ctx_params.flash_attn = True

    context = internals.LlamaContext(model=model, params=ctx_params)

    # Sanity-check tokenization against known Qwen2 token ids.
    prompt_tokens = model.tokenize(b"Hello, world!", add_bos=True, special=True)
    assert prompt_tokens == [9707, 11, 1879, 0]

    prompt_tokens = model.tokenize(
        b"The quick brown fox jumps", add_bos=True, special=True
    )

    batch = internals.LlamaBatch(n_tokens=len(prompt_tokens), embd=0, n_seq_max=1)

    # Sampler chain: top-k -> top-p -> temperature -> seeded distribution,
    # which makes the sampled continuation deterministic.
    seed = 1337
    sampler = internals.LlamaSampler()
    sampler.add_top_k(50)
    sampler.add_top_p(0.9, 1)
    sampler.add_temp(0.8)
    sampler.add_dist(seed)

    # Feed-and-sample loop: decode the pending tokens, then sample one more.
    generated = prompt_tokens
    n_past = 0
    pending = prompt_tokens
    for _ in range(4):
        batch.set_batch(pending, n_past=n_past, logits_all=False)
        context.decode(batch)
        n_past += len(pending)
        next_token = sampler.sample(context, -1)
        pending = [next_token]
        generated += pending

    # Drop the 5-token prompt; only the sampled continuation remains.
    continuation = generated[5:]
    continuation_text = model.detokenize(continuation, special=True)
    assert continuation_text == b" over the lazy dog"
def test_real_llama(llama_cpp_model_path):
    """End-to-end checks on the high-level Llama API: seeded completion,
    grammar-constrained output, logits processors, and state save/restore."""
    model = llama_cpp.Llama(
        llama_cpp_model_path,
        n_ctx=32,
        n_batch=32,
        n_ubatch=32,
        n_threads=multiprocessing.cpu_count(),
        n_threads_batch=multiprocessing.cpu_count(),
        logits_all=False,
        flash_attn=True,
    )

    # A fixed seed makes the completion deterministic.
    output = model.create_completion(
        "The quick brown fox jumps",
        max_tokens=4,
        top_k=50,
        top_p=0.9,
        temperature=0.8,
        seed=1337,
    )
    assert output["choices"][0]["text"] == " over the lazy dog"

    # A GBNF grammar restricts generation to its productions.
    output = model.create_completion(
        "The capital of france is paris, 'true' or 'false'?:\n",
        max_tokens=4,
        top_k=50,
        top_p=0.9,
        temperature=0.8,
        seed=1337,
        grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "true" | "false"
"""),
    )
    assert output["choices"][0]["text"] == "true"

    # A logits processor can steer sampling: massively boost the logits of
    # the tokens spelling "rot" so "par" completes to "rot...".
    suffix = b"rot"
    boosted_tokens = model.tokenize(suffix, add_bos=True, special=True)

    def boost_suffix(input_ids, logits):
        for token_id in boosted_tokens:
            logits[token_id] *= 1000
        return logits

    processors = llama_cpp.LogitsProcessorList([boost_suffix])

    output = model.create_completion(
        "The capital of france is par",
        max_tokens=4,
        top_k=50,
        top_p=0.9,
        temperature=0.8,
        seed=1337,
        logits_processor=processors,
    )
    assert output["choices"][0]["text"].lower().startswith("rot")

    def pick_number():
        # Grammar-constrained "pick a number" completion; no explicit seed,
        # so consecutive calls advance the RNG state.
        result = model.create_completion(
            "Pick a number from 1 to 10?:\n",
            max_tokens=4,
            top_k=50,
            top_p=0.9,
            temperature=0.8,
            grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
"""),
        )
        return result["choices"][0]["text"]

    # Restoring a saved state must reproduce the first sample exactly,
    # while consecutive un-restored samples differ.
    model.set_seed(1337)
    state = model.save_state()
    number_1 = pick_number()
    number_2 = pick_number()
    model.load_state(state)
    number_3 = pick_number()

    assert number_1 != number_2
    assert number_1 == number_3
def test_real_llama_embeddings(llama_cpp_model_path):
    """Smoke test: a Llama opened in embedding mode can embed a string."""
    model = llama_cpp.Llama(
        llama_cpp_model_path,
        n_ctx=32,
        n_batch=32,
        n_ubatch=32,
        n_threads=multiprocessing.cpu_count(),
        n_threads_batch=multiprocessing.cpu_count(),
        logits_all=False,
        flash_attn=True,
        embedding=True,
    )
    # No value assertions yet — just verify the call completes.
    model.embed("Hello World")