"""Example: JSON-schema-constrained, streaming chat completion with
llama-cpp-python, pairing a GGUF model with its Hugging Face tokenizer
so the chat template is applied correctly."""
import llama_cpp
import llama_cpp.llama_tokenizer

# Download the GGUF weights from the Hugging Face Hub (cached locally).
# The filename is a glob that selects the Q8_0 quantization in the repo.
llama = llama_cpp.Llama.from_pretrained(
    repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF",
    filename="*Q8_0.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "Qwen/Qwen3.5-0.8B"
    ),
    verbose=False,
)

# Constrain generation to a JSON object matching the schema below.
# stream=True yields OpenAI-style chunks whose "delta" carries the
# incremental content.
response = llama.create_chat_completion(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    response_format={
        "type": "json_object",
        "schema": {
            "type": "object",
            "properties": {
                "country": {"type": "string"},
                "capital": {"type": "string"},
            },
            "required": ["country", "capital"],
        },
    },
    stream=True,
)

# Print tokens as they arrive. Some chunks (e.g. the role-only first chunk
# or the final finish chunk) have no "content" key, so skip those deltas.
for chunk in response:
    delta = chunk["choices"][0]["delta"]
    if "content" not in delta:
        continue
    print(delta["content"], end="", flush=True)

# Terminate the streamed output with a newline.
print()