Benchmark
Decorator to mark evaluation functions as named benchmarks with metadata.
Signature
@benchmark(name, description="", **kwargs)
@evaluate(*models)
async def evaluate_function(model, dataset):
...
Parameters
name (str): Name of the benchmark.
description (str, optional): Description of what the benchmark tests.
**kwargs: Additional keyword arguments passed to the benchmark.
Returns
A decorator that adds benchmark metadata to the evaluation function.
Basic Usage
from benchwise import benchmark, evaluate
@benchmark("General QA", "Tests general knowledge questions")
@evaluate("gpt-3.5-turbo", "gemini-2.5-flash")
async def test_general_qa(model, dataset):
responses = await model.generate(dataset.prompts)
return {"responses": responses}
With Metadata
@benchmark(
"Medical QA v2.0",
"Medical question answering benchmark",
version="2.0",
domain="healthcare",
difficulty="hard",
dataset_size=100
)
@evaluate("gpt-3.5-turbo")
async def test_medical_qa(model, dataset):
...
Accessing Metadata
Benchmark metadata is stored in the function's _benchmark_metadata attribute:
@benchmark("Test Benchmark", "Description", version="1.0")
@evaluate("gpt-3.5-turbo")
async def my_test(model, dataset):
...
# Access metadata
metadata = my_test._benchmark_metadata
print(metadata["name"]) # "Test Benchmark"
print(metadata["description"]) # "Description"
print(metadata["version"]) # "1.0"
Decorator Order
@benchmark must be placed above @evaluate (i.e., @evaluate is applied to the function first, then @benchmark wraps the result):
# Correct order
@benchmark("My Benchmark", "Description")
@evaluate("gpt-3.5-turbo")
async def correct_test(model, dataset):
...
# Wrong order - will not work properly
@evaluate("gpt-3.5-turbo")
@benchmark("My Benchmark", "Description")
async def wrong_test(model, dataset):
...
Complete Example
import asyncio
from benchwise import benchmark, evaluate, create_qa_dataset, accuracy
dataset = create_qa_dataset(
questions=["What is AI?", "What is ML?"],
answers=["Artificial Intelligence", "Machine Learning"]
)
@benchmark(
name="AI Knowledge Test v1.0",
description="Tests understanding of AI and ML concepts",
version="1.0",
category="technology",
difficulty="beginner",
language="english"
)
@evaluate("gpt-3.5-turbo", "gemini-2.5-flash")
async def test_ai_knowledge(model, dataset):
responses = await model.generate(dataset.prompts, temperature=0)
scores = accuracy(responses, dataset.references)
return {
"accuracy": scores["accuracy"],
"total_questions": len(responses)
}
# Run the benchmark
results = asyncio.run(test_ai_knowledge(dataset))
# Access benchmark metadata
print(f"Benchmark: {test_ai_knowledge._benchmark_metadata['name']}")
print(f"Version: {test_ai_knowledge._benchmark_metadata['version']}")
# Process results
for result in results:
if result.success:
print(f"{result.model_name}: {result.result['accuracy']:.2%}")
Versioning Benchmarks
Track benchmark versions over time:
@benchmark("Customer QA", "Customer support QA", version="1.0")
@evaluate("gpt-3.5-turbo")
async def test_customer_qa_v1(model, dataset):
# Original version
pass
@benchmark("Customer QA", "Customer support QA", version="2.0")
@evaluate("gpt-3.5-turbo")
async def test_customer_qa_v2(model, dataset):
# Updated version with improvements
pass
Domain-Specific Metadata
# Medical domain
@benchmark(
"Medical Diagnosis",
"Diagnostic accuracy evaluation",
domain="healthcare",
specialty="general_medicine",
risk_level="high"
)
@evaluate("gpt-3.5-turbo")
async def test_medical(model, dataset):
...
# Legal domain
@benchmark(
"Legal Analysis",
"Contract analysis benchmark",
domain="legal",
jurisdiction="US",
contract_type="commercial"
)
@evaluate("gpt-3.5-turbo")
async def test_legal(model, dataset):
...
# Financial domain
@benchmark(
"Financial Forecasting",
"Stock price prediction",
domain="finance",
market="NYSE",
timeframe="daily"
)
@evaluate("gpt-3.5-turbo")
async def test_financial(model, dataset):
pass
Best Practices
1. Use Descriptive Names
# Good
@benchmark("Medical QA - Cardiology", "Heart disease diagnosis questions")
@evaluate("gpt-3.5-turbo")
async def test_medical_cardiology(model, dataset):
...
# Less descriptive
@benchmark("Test 1", "Some test")
@evaluate("gpt-3.5-turbo")
async def test_one(model, dataset):
...
2. Include Version Information
@benchmark("Product QA", "Product questions", version="2.1", updated="2024-11-16")
@evaluate("gpt-3.5-turbo")
async def test_product_qa(model, dataset):
...
3. Document Difficulty
@benchmark("Math Problems", "Algebra questions", difficulty="intermediate", grade_level="9-10")
@evaluate("gpt-3.5-turbo")
async def test_math_problems(model, dataset):
...
4. Specify Dataset Information
@benchmark(
"MMLU Sample",
"Multiple choice questions",
dataset_size=100,
source="MMLU",
sample_strategy="random"
)
@evaluate("gpt-3.5-turbo")
async def test_mmlu(model, dataset):
...
See Also
- @evaluate - Main evaluation decorator
- @stress_test - Performance testing
- Evaluation Guide - Evaluation patterns