|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Perplexity Test Script
|
|||
|
|
Tests GGUF model perplexity on multiple datasets using llama-perplexity.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import subprocess
|
|||
|
|
import time
|
|||
|
|
import csv
|
|||
|
|
import re
|
|||
|
|
from datetime import datetime
|
|||
|
|
from pathlib import Path
|
|||
|
|
import argparse
|
|||
|
|
import tempfile
|
|||
|
|
import shutil
|
|||
|
|
import statistics
|
|||
|
|
|
|||
|
|
|
|||
|
|
class PerplexityTester:
    """Run llama-perplexity for one GGUF model over a directory of datasets.

    Every immediate subdirectory of ``data_dir`` that contains a ``test.txt``
    is treated as one dataset.  When ``test_embeddings`` is enabled, the base
    model is first re-quantized with ``llama-quantize`` once per entry in
    ``self.embedding_types`` and each variant is evaluated on every dataset.
    Results accumulate in ``self.results`` and are written to a CSV file and
    a detailed log by :meth:`save_results`.
    """

    # Wall-clock limits (seconds) for the two external tools.  The
    # user-facing timeout messages below mention "1 hour" / "10 minutes".
    _PERPLEXITY_TIMEOUT_S = 3600
    _QUANTIZE_TIMEOUT_S = 600

    # "PPL = <mean> +/- <std>"
    _PPL_WITH_STD_RE = re.compile(
        r'PPL\s*=\s*(\d+\.?\d*)\s*\+/-\s*(\d+\.?\d*)', re.IGNORECASE)

    # Fallbacks without a standard deviation, tried in this order.  Matching
    # is case-insensitive, so separate lower/upper-case variants are not
    # needed.
    _PPL_FALLBACK_RES = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'Final estimate:\s*PPL\s*=\s*(\d+\.?\d*)',
            r'Final perplexity:\s*(\d+\.?\d*)',
            r'PPL\s*=\s*(\d+\.?\d*)',
            r'PPL:\s*(\d+\.?\d*)',
            r'perplexity:\s*(\d+\.?\d*)',
        )
    )

    def __init__(self, model_path, llama_perplexity_bin="../build/bin/llama-perplexity",
                 data_dir="../data", output_dir="perplexity_results", quick_mode=False,
                 quantize_bin="../build/bin/llama-quantize", test_embeddings=False, csv_output=None):
        """Store the configuration and validate the required files.

        Args:
            model_path: Path to the GGUF model file.
            llama_perplexity_bin: Path to the llama-perplexity executable.
            data_dir: Directory whose subdirectories hold ``test.txt`` files.
            output_dir: Directory for the CSV/log output (created if missing).
            quick_mode: Evaluate only a short excerpt of each dataset with a
                reduced context size.
            quantize_bin: Path to the llama-quantize executable; only
                required when ``test_embeddings`` is true.
            test_embeddings: Evaluate one re-quantized model per embedding
                type instead of the base model alone.
            csv_output: Optional explicit path for the CSV file.

        Raises:
            FileNotFoundError: If the perplexity binary, the quantize binary
                (when needed) or the model file does not exist.
        """
        self.model_path = Path(model_path)
        self.llama_perplexity_bin = Path(llama_perplexity_bin)
        self.quantize_bin = Path(quantize_bin)
        self.data_dir = Path(data_dir)
        self.output_dir = Path(output_dir)
        self.quick_mode = quick_mode
        self.test_embeddings = test_embeddings
        self.csv_output = Path(csv_output) if csv_output else None
        self.results = []
        self.created_models = set()  # Track newly created model files
        self.temp_files = []  # Track temporary files for cleanup

        # Embedding types to test: (llama-quantize type name, file suffix)
        self.embedding_types = [
            ('F32', 'f32'),
            ('F16', 'f16'),
            ('Q8_0', 'q8_0'),
            ('Q6_K', 'q6_k'),
            ('Q5_0', 'q5_0'),
            ('Q4_0', 'q4_0'),
            ('Q3_K', 'q3_k'),
            ('TQ2_0', 'tq2_0'),
        ]

        # Validate inputs before touching the filesystem so a failed
        # construction does not leave an empty output directory behind.
        if not self.llama_perplexity_bin.exists():
            raise FileNotFoundError(f"llama-perplexity binary not found: {self.llama_perplexity_bin}")

        if self.test_embeddings and not self.quantize_bin.exists():
            raise FileNotFoundError(f"llama-quantize binary not found: {self.quantize_bin}")

        if not self.model_path.exists():
            raise FileNotFoundError(f"Model file not found: {self.model_path}")

        self.output_dir.mkdir(parents=True, exist_ok=True)

    def find_datasets(self):
        """Find all test.txt files in dataset directories.

        Returns:
            A list of dicts with keys ``name``, ``path``, ``size`` (bytes)
            and ``size_mb``, ordered by directory name.  The list is empty
            when ``data_dir`` is missing or is not a directory.
        """
        datasets = []

        # is_dir() also rejects a regular file, which iterdir() would choke on.
        if not self.data_dir.is_dir():
            print(f"❌ Data directory not found: {self.data_dir}")
            return datasets

        print(f"\n🔍 Searching for datasets in {self.data_dir}...")

        for dataset_dir in sorted(self.data_dir.iterdir()):
            if not dataset_dir.is_dir():
                continue

            test_file = dataset_dir / "test.txt"
            if not test_file.exists():
                print(f" ⚠️ {dataset_dir.name:<20} (no test.txt found)")
                continue

            size_bytes = test_file.stat().st_size
            size_mb = size_bytes / (1024 * 1024)
            datasets.append({
                'name': dataset_dir.name,
                'path': test_file,
                'size': size_bytes,
                'size_mb': size_mb,
            })
            print(f" ✅ {dataset_dir.name:<20} ({size_mb:.2f} MB)")

        return datasets

    def create_quick_dataset(self, dataset_path, num_chars=4096):
        """Create a temporary dataset with only the first N characters.

        The temporary file is registered in ``self.temp_files`` so that
        :meth:`cleanup_temp_files` can remove it later.

        Returns:
            Path of the temporary excerpt, or ``dataset_path`` unchanged if
            the excerpt could not be written.
        """
        temp_file = tempfile.NamedTemporaryFile(
            mode='w', delete=False, suffix='.txt', encoding='utf-8')
        self.temp_files.append(temp_file.name)

        try:
            # The with-block closes both handles on success and on failure.
            with temp_file, open(dataset_path, 'r', encoding='utf-8', errors='ignore') as source:
                temp_file.write(source.read(num_chars))
        except (OSError, ValueError) as e:
            print(f"⚠️ Failed to create quick dataset: {e}")
            return dataset_path

        return Path(temp_file.name)

    def cleanup_temp_files(self):
        """Clean up temporary files (best effort) and reset the tracking list."""
        for temp_file in self.temp_files:
            try:
                os.unlink(temp_file)
            except OSError:
                # Already gone or not removable; nothing useful to do here.
                pass
        self.temp_files = []

    @staticmethod
    def _make_result(dataset_name, ppl, elapsed_time, status, stdout='', stderr=''):
        """Build the result record stored in ``self.results``."""
        return {
            'dataset': dataset_name,
            'perplexity': ppl,
            'time': elapsed_time,
            'status': status,
            'stdout': stdout,
            'stderr': stderr,
        }

    def run_perplexity_test(self, dataset_name, dataset_path, threads=16, ctx_size=512, model_override=None):
        """Run perplexity test on a single dataset.

        Args:
            dataset_name: Label stored in the result record.
            dataset_path: Text file passed to llama-perplexity via ``-f``.
            threads: Value for ``-t``.
            ctx_size: Value for ``-c``.
            model_override: Model to evaluate instead of ``self.model_path``.

        Returns:
            A dict with keys ``dataset``, ``perplexity`` (formatted string or
            ``None``), ``time`` (seconds), ``status`` (``success``,
            ``parse_error``, ``failed``, ``timeout`` or ``error``),
            ``stdout`` and ``stderr``.  This method reports failures through
            ``status`` instead of raising.
        """
        # Accept plain strings as well as Path objects for the override.
        test_model = Path(model_override) if model_override else self.model_path

        print(f"\n{'='*80}")
        print(f"📊 Testing on dataset: {dataset_name}")
        print(f" File: {dataset_path}")
        print(f" Model: {test_model.name}")
        print(f"{'='*80}")

        cmd = [
            str(self.llama_perplexity_bin),
            "-m", str(test_model),
            "-f", str(dataset_path),
            "-t", str(threads),
            "-c", str(ctx_size),
            "-ngl", "0",  # CPU only
        ]

        print(f"💻 Command: {' '.join(cmd)}")
        print("⏱️ Starting test...\n")

        start_time = time.time()

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                # One undecodable byte must not discard a whole run.
                errors='replace',
                timeout=self._PERPLEXITY_TIMEOUT_S,
                cwd=os.getcwd(),
            )
        except subprocess.TimeoutExpired:
            elapsed_time = time.time() - start_time
            print(f"\n❌ Timeout after {elapsed_time:.2f}s")
            return self._make_result(dataset_name, None, elapsed_time, 'timeout',
                                     stderr='Test exceeded 1 hour timeout')
        except Exception as e:
            # Deliberately broad: one broken run should be recorded, not
            # abort the whole session.
            elapsed_time = time.time() - start_time
            print(f"\n❌ Error: {e}")
            return self._make_result(dataset_name, None, elapsed_time, 'error',
                                     stderr=str(e))

        elapsed_time = time.time() - start_time

        if result.returncode != 0:
            print(f"\n❌ Test failed with return code {result.returncode}")
            print(f"Error: {result.stderr[:500]}")
            return self._make_result(dataset_name, None, elapsed_time, 'failed',
                                     result.stdout, result.stderr)

        # Parse perplexity from output (check both stdout and stderr)
        ppl = self.parse_perplexity(result.stdout + "\n" + result.stderr)

        if ppl is not None:
            print(f"\n✅ Perplexity: {ppl}")
            print(f"⏱️ Time: {elapsed_time:.2f}s ({elapsed_time/60:.2f} min)")
            status = "success"
        else:
            print("\n⚠️ Test completed but could not parse perplexity")
            print("Last 500 chars of stdout:")
            print(result.stdout[-500:])
            print("Last 500 chars of stderr:")
            print(result.stderr[-500:])
            status = "parse_error"

        return self._make_result(dataset_name, ppl, elapsed_time, status,
                                 result.stdout, result.stderr)

    def parse_perplexity(self, output):
        """Parse perplexity value (mean±std format) from llama-perplexity output.

        Returns:
            ``"<mean>±<std>"`` when a ``PPL = mean +/- std`` line is present,
            otherwise ``"<value>"`` from the first fallback pattern that
            matches (all numbers formatted to four decimals), or ``None``
            when nothing matches.
        """
        match = self._PPL_WITH_STD_RE.search(output)
        if match:
            mean = float(match.group(1))
            std = float(match.group(2))
            return f"{mean:.4f}±{std:.4f}"

        for pattern in self._PPL_FALLBACK_RES:
            match = pattern.search(output)
            if match:
                return f"{float(match.group(1)):.4f}"

        return None

    @staticmethod
    def _remove_partial_output(output_path):
        """Delete an incomplete model left behind by a failed quantization."""
        try:
            output_path.unlink()
        except FileNotFoundError:
            pass
        except OSError as e:
            print(f"⚠️ Could not remove incomplete model {output_path.name}: {e}")

    def quantize_embedding(self, embedding_type, output_suffix):
        """
        Quantize model with specific embedding type.

        The output is written next to the base model as
        ``ggml-model-i2_s-embed-<output_suffix>.gguf``.  An existing file
        with that name is reused as-is and is not recorded in
        ``self.created_models``.

        Args:
            embedding_type: Token embedding type (uppercase, e.g., 'Q6_K')
            output_suffix: Output file suffix (lowercase, e.g., 'q6_k')

        Returns:
            Path to quantized model or None if failed
        """
        output_path = self.model_path.parent / f"ggml-model-i2_s-embed-{output_suffix}.gguf"

        if output_path.exists():
            print(f"ℹ️ Model already exists: {output_path.name}")
            return output_path

        # The trailing positional arguments are passed exactly as this
        # script has always passed them to llama-quantize.
        cmd = [
            str(self.quantize_bin),
            "--token-embedding-type", embedding_type,
            str(self.model_path),
            str(output_path),
            "I2_S",
            "1",
            "1",
        ]

        print(f"\n{'='*80}")
        print(f"🔄 Quantizing with embedding type: {embedding_type}")
        print(f"📥 Input: {self.model_path.name}")
        print(f"📤 Output: {output_path.name}")
        print(f"💻 Command: {' '.join(cmd)}")
        print(f"{'='*80}\n")

        start_time = time.time()

        # From here on any file at output_path was produced by this call
        # (it did not exist above), so on failure it is removed; otherwise
        # the next run would pick up a truncated model as "already exists".
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                errors='replace',
                cwd=os.getcwd(),
                timeout=self._QUANTIZE_TIMEOUT_S,
            )
        except subprocess.TimeoutExpired:
            print("❌ Quantization timeout (exceeded 10 minutes)")
            self._remove_partial_output(output_path)
            return None
        except Exception as e:
            print(f"❌ Quantization error: {e}")
            self._remove_partial_output(output_path)
            return None

        duration = time.time() - start_time

        if result.returncode != 0:
            print(f"❌ Quantization failed with return code {result.returncode}")
            print(f"Error: {result.stderr[:500]}")
            self._remove_partial_output(output_path)
            return None

        try:
            file_size_mb = output_path.stat().st_size / (1024 * 1024)
        except OSError as e:
            # The tool reported success but produced no readable output file.
            print(f"❌ Quantization error: {e}")
            return None

        print("✅ Quantization successful!")
        print(f" Duration: {duration:.2f}s")
        print(f" Size: {file_size_mb:.2f} MB")

        # Mark as newly created so cleanup_model() is allowed to delete it.
        self.created_models.add(output_path)
        return output_path

    def cleanup_model(self, model_path):
        """Delete model file if it was created during this session."""
        if model_path not in self.created_models:
            print(f"ℹ️ Keeping existing file: {model_path.name}")
            return

        try:
            model_path.unlink()
        except Exception as e:
            print(f"⚠️ Failed to delete {model_path.name}: {e}")
        else:
            print(f"🗑️ Deleted: {model_path.name}")
            self.created_models.remove(model_path)

    def _test_path_for(self, dataset):
        """Return the file to evaluate: a quick-mode excerpt or the full dataset."""
        if self.quick_mode:
            return self.create_quick_dataset(dataset['path'])
        return dataset['path']

    def _print_session_header(self, datasets, threads, ctx_size):
        """Print the banner describing the session that is about to run."""
        print(f"\n{'='*80}")
        if self.test_embeddings:
            print("🧪 EMBEDDING QUANTIZATION TEST MODE")
            print(f"{'='*80}")
            print(f"📦 Base model: {self.model_path.name}")
            print(f"🔢 Embedding types to test: {len(self.embedding_types)}")
            print(f"📊 Datasets: {len(datasets)}")
        else:
            print(f"🚀 PERPLEXITY TEST SESSION{' (QUICK MODE)' if self.quick_mode else ''}")
            print(f"{'='*80}")
            print(f"📦 Model: {self.model_path.name}")
            print(f"📁 Model path: {self.model_path}")
            print(f"📊 Datasets {'to test' if self.quick_mode else 'found'}: {len(datasets)}")
        print(f"🧵 Threads: {threads}")
        print(f"📏 Context size: {ctx_size}")
        print(f"{'='*80}")

    def _run_embedding_sweep(self, datasets, threads, ctx_size):
        """Quantize once per embedding type and evaluate each variant on all datasets."""
        # Build the quick-mode excerpts once instead of once per embedding type.
        test_paths = [self._test_path_for(dataset) for dataset in datasets]
        type_count = len(self.embedding_types)

        for i, (embedding_type, output_suffix) in enumerate(self.embedding_types, 1):
            print(f"\n\n{'#'*80}")
            print(f"[{i}/{type_count}] Testing embedding type: {output_suffix} ({embedding_type})")
            print(f"{'#'*80}")

            quantized_model = self.quantize_embedding(embedding_type, output_suffix)
            if quantized_model is None:
                print(f"⚠️ Skipping tests for {output_suffix} due to quantization failure")
                continue

            try:
                for j, (dataset, test_path) in enumerate(zip(datasets, test_paths), 1):
                    print(f"\n[{j}/{len(datasets)}] Testing {dataset['name']} with {output_suffix}...")
                    self.results.append(self.run_perplexity_test(
                        f"{dataset['name']}_embed-{output_suffix}",
                        test_path,
                        threads,
                        ctx_size,
                        model_override=quantized_model,
                    ))
            finally:
                # Runs on interruption too, so a freshly quantized model is
                # never left behind on disk.
                print(f"\n🧹 Cleaning up {output_suffix} model...")
                self.cleanup_model(quantized_model)

            print(f"\n{'#'*80}")
            print(f"✅ Completed {output_suffix}")
            print(f"{'#'*80}")

    def _run_single_model(self, datasets, threads, ctx_size):
        """Evaluate the base model on every dataset."""
        for i, dataset in enumerate(datasets, 1):
            print(f"\n\n[{i}/{len(datasets)}] Processing {dataset['name']}...")
            self.results.append(self.run_perplexity_test(
                dataset['name'],
                self._test_path_for(dataset),
                threads,
                ctx_size,
            ))

    def run_all_tests(self, threads=16, ctx_size=512):
        """Run perplexity tests on all datasets, then save and summarize.

        In quick mode the context size is capped at 128 and only an excerpt
        of each dataset is evaluated.  Returns ``None``; results are kept in
        ``self.results``.
        """
        datasets = self.find_datasets()

        if not datasets:
            print(f"\n❌ No datasets found in {self.data_dir}")
            print(" Make sure each dataset directory has a test.txt file")
            return

        # Quick mode: test all datasets but only first 4096 chars with smaller context
        if self.quick_mode:
            ctx_size = min(ctx_size, 128)  # Use smaller context in quick mode
            print("\n⚡ QUICK TEST MODE ENABLED")
            print(" - Testing all datasets with first 4096 characters only")
            print(f" - Using reduced context size: {ctx_size}")

        self._print_session_header(datasets, threads, ctx_size)

        total_start = time.time()
        try:
            if self.test_embeddings:
                self._run_embedding_sweep(datasets, threads, ctx_size)
            else:
                self._run_single_model(datasets, threads, ctx_size)
        finally:
            # Temporary excerpts are removed even if a test raised or the
            # user interrupted the session.
            if self.quick_mode:
                print("\n🧹 Cleaning up temporary files...")
                self.cleanup_temp_files()
        total_time = time.time() - total_start

        self.save_results()
        self.print_summary(total_time)

    def save_results(self):
        """Save results to a CSV file and a detailed log.

        The CSV goes to ``self.csv_output`` when set, otherwise to
        ``<output_dir>/ppl_<model stem>_<timestamp>.csv``.  The log always
        goes to ``<output_dir>/ppl_<model stem>_<timestamp>.log``.  Both are
        written as UTF-8 because perplexity strings contain ``±`` and the
        captured tool output may contain arbitrary text.
        """
        now = datetime.now()
        timestamp = now.strftime("%Y%m%d_%H%M%S")
        model_name = self.model_path.stem

        if self.csv_output:
            csv_file = self.csv_output
            # Create parent directory if needed
            csv_file.parent.mkdir(parents=True, exist_ok=True)
        else:
            csv_file = self.output_dir / f"ppl_{model_name}_{timestamp}.csv"

        print("\n💾 Saving results...")

        with open(csv_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['dataset', 'perplexity', 'time_seconds', 'status'])
            writer.writeheader()
            for result in self.results:
                writer.writerow({
                    'dataset': result['dataset'],
                    'perplexity': result['perplexity'] if result['perplexity'] is not None else 'N/A',
                    'time_seconds': f"{result['time']:.2f}",
                    'status': result['status'],
                })

        print(f" ✅ CSV saved: {csv_file}")

        # Save detailed log
        log_file = self.output_dir / f"ppl_{model_name}_{timestamp}.log"
        with open(log_file, 'w', encoding='utf-8') as f:
            f.write("Perplexity Test Results\n")
            f.write(f"{'='*80}\n")
            f.write(f"Model: {self.model_path}\n")
            f.write(f"Timestamp: {now.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"{'='*80}\n\n")

            for result in self.results:
                f.write(f"\n{'='*80}\n")
                f.write(f"Dataset: {result['dataset']}\n")
                f.write(f"Perplexity: {result['perplexity']}\n")
                f.write(f"Time: {result['time']:.2f}s\n")
                f.write(f"Status: {result['status']}\n")
                f.write(f"\nOutput:\n{result['stdout']}\n")
                if result['stderr']:
                    f.write(f"\nErrors:\n{result['stderr']}\n")

        print(f" ✅ Log saved: {log_file}")

    @staticmethod
    def _ppl_sort_key(result):
        """Return the numeric mean of a result's perplexity for sorting.

        Handles both ``"mean±std"`` and plain ``"mean"`` strings; a string
        that is not a number sorts last.
        """
        ppl = result['perplexity']
        if not isinstance(ppl, str):
            return ppl
        try:
            return float(ppl.split('±')[0])
        except ValueError:
            return float('inf')

    def print_summary(self, total_time):
        """Print summary of all tests.

        Successful results are listed in ascending perplexity order (lower
        is better), followed by the failed tests and overall totals.
        """
        print(f"\n\n{'='*80}")
        print("📊 TEST SUMMARY")
        print(f"{'='*80}\n")

        successful = [r for r in self.results if r['perplexity'] is not None]
        failed = [r for r in self.results if r['perplexity'] is None]

        if successful:
            successful_sorted = sorted(successful, key=self._ppl_sort_key)

            # Widen the first column for long "<dataset>_embed-<type>" names
            # so the table stays aligned; the minimum keeps the usual layout.
            name_width = max(20, max(len(r['dataset']) for r in successful_sorted))

            print(f"{'Dataset':<{name_width}} {'Perplexity':>20} {'Time (s)':>12} {'Status':<15}")
            print('-' * (name_width + 60))

            for result in successful_sorted:
                print(f"{result['dataset']:<{name_width}} {str(result['perplexity']):>20} "
                      f"{result['time']:>12.2f} {result['status']:<15}")

            best = successful_sorted[0]
            print(f"\n🏆 Best result: {best['dataset']} "
                  f"(PPL: {best['perplexity']})")

        if failed:
            print(f"\n❌ Failed tests ({len(failed)}):")
            for result in failed:
                print(f" - {result['dataset']}: {result['status']}")

        print(f"\n{'='*80}")
        print(f"✅ Completed: {len(successful)}/{len(self.results)}")
        print(f"⏱️ Total time: {total_time:.2f}s ({total_time/60:.2f} min)")
        print(f"📁 Results saved in: {self.output_dir}")
        print(f"{'='*80}\n")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``.  ``None``
            (the default) keeps the normal command-line behaviour; passing a
            list lets other code or tests drive the CLI directly.

    Returns:
        Process exit status: 0 on success, 1 if a required file is missing,
        the run is interrupted, or an unexpected error occurs.  Argument
        errors exit through argparse's own ``SystemExit``.
    """
    parser = argparse.ArgumentParser(description='Test model perplexity on multiple datasets')
    parser.add_argument('--model', '-m',
                        required=True,
                        help='Path to GGUF model file')
    parser.add_argument('--data-dir', '-d',
                        default='data',
                        help='Directory containing dataset folders (default: data)')
    parser.add_argument('--threads', '-t',
                        type=int,
                        default=16,
                        help='Number of threads (default: 16)')
    parser.add_argument('--ctx-size', '-c',
                        type=int,
                        default=512,
                        help='Context size (default: 512)')
    parser.add_argument('--output-dir', '-o',
                        default='perplexity_results',
                        help='Output directory for results (default: perplexity_results)')
    parser.add_argument('--llama-perplexity',
                        default='./build/bin/llama-perplexity',
                        help='Path to llama-perplexity binary (default: ./build/bin/llama-perplexity)')
    parser.add_argument('--quick', '-q',
                        action='store_true',
                        help='Quick test mode: test all datasets with first 4096 characters and reduced context size (128)')
    parser.add_argument('--test-embeddings', '-e',
                        action='store_true',
                        help='Test different embedding quantization types (f32, f16, q8_0, q6_k, q5_0, q4_0, q3_k, tq2_0)')
    parser.add_argument('--csv-output',
                        help='Custom path for CSV output file (e.g., results/my_ppl_results.csv)')
    parser.add_argument('--quantize-bin',
                        default='./build/bin/llama-quantize',
                        help='Path to llama-quantize binary (default: ./build/bin/llama-quantize)')

    args = parser.parse_args(argv)

    try:
        tester = PerplexityTester(
            model_path=args.model,
            llama_perplexity_bin=args.llama_perplexity,
            data_dir=args.data_dir,
            output_dir=args.output_dir,
            quick_mode=args.quick,
            quantize_bin=args.quantize_bin,
            test_embeddings=args.test_embeddings,
            csv_output=args.csv_output
        )

        tester.run_all_tests(
            threads=args.threads,
            ctx_size=args.ctx_size
        )

    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        return 1
    except KeyboardInterrupt:
        print("\n\n⚠️ Test interrupted by user")
        return 1
    except Exception as e:
        # Top-level boundary: report the failure with a traceback and turn
        # it into a non-zero exit status.
        print(f"\n❌ Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # The built-in exit() is injected by the site module for interactive use
    # and is missing under `python -S` or in frozen builds; raising
    # SystemExit gives the same exit status without depending on it.
    raise SystemExit(main())
|