# Example: schema-constrained, streamed chat completion with llama-cpp-python.
import llama_cpp
import llama_cpp.llama_tokenizer

# Use the Hugging Face tokenizer for the base model so prompt tokenization
# matches the original model, instead of the tokenizer embedded in the GGUF.
hf_tokenizer = llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
    "Qwen/Qwen3.5-0.8B"
)

# Fetch (and cache) the quantized weights from the Hugging Face Hub.
llama = llama_cpp.Llama.from_pretrained(
    repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF",
    filename="*Q8_0.gguf",  # glob pattern selecting the Q8_0 quantization
    tokenizer=hf_tokenizer,
    verbose=False,
)
# JSON schema the model's reply is constrained to produce.
ANSWER_SCHEMA = {
    "type": "object",
    "properties": {
        "country": {"type": "string"},
        "capital": {"type": "string"},
    },
    "required": ["country", "capital"],
}

# Ask the question, forcing schema-conformant JSON output, streamed
# chunk-by-chunk rather than returned as a single completion.
response = llama.create_chat_completion(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    response_format={
        "type": "json_object",
        "schema": ANSWER_SCHEMA,
    },
    stream=True,
)
# Echo each streamed text fragment as soon as it arrives. Deltas without a
# "content" key (e.g. the initial role-only delta) are simply skipped.
for chunk in response:
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        print(delta["content"], end="", flush=True)

# Finish the streamed line with a newline.
print()