Core Concepts
Understand the key concepts that power Benchwise.
Decorators
Benchwise uses decorators to make evaluation simple and intuitive.
@evaluate
The main decorator for running a test against one or more models:
from benchwise import evaluate

# Single model
@evaluate("gpt-3.5-turbo")
async def test_single(model, dataset):
    responses = await model.generate(dataset.prompts)
    return {"responses": responses}

# Multiple models - runs test on both
@evaluate("gpt-3.5-turbo", "gemini-2.5-flash")
async def test_multiple(model, dataset):
    responses = await model.generate(dataset.prompts)
    return {"responses": responses}

# With options
@evaluate("gpt-3.5-turbo", temperature=0.7, upload=True)
async def test_with_options(model, dataset):
    responses = await model.generate(dataset.prompts, temperature=0.7)
    return {"responses": responses}
@benchmark
Mark evaluations as named benchmarks:
from benchwise import benchmark, evaluate
@benchmark("Medical QA", "Evaluates medical question answering")
@evaluate("gpt-3.5-turbo", "gemini-2.5-flash")
async def test_medical_qa(model, dataset):
    responses = await model.generate(dataset.prompts)
    return {"responses": responses}
@stress_test
Performance and load testing:
from benchwise import stress_test, evaluate
@stress_test(concurrent_requests=10, duration=60)
@evaluate("gpt-3.5-turbo")
async def test_performance(model, dataset):
    response = await model.generate(["Hello, world!"])
    return {"response": response}
Models
Benchwise supports multiple LLM providers out of the box.
Supported Providers
# OpenAI models
@evaluate("gpt-3.5-turbo")
async def test_openai(model, dataset):
    ...

# Anthropic models
@evaluate("claude-4.5-sonnet")
async def test_anthropic(model, dataset):
    ...

# Google models
@evaluate("gemini-2.5-flash")
async def test_google(model, dataset):
    ...

# HuggingFace models
@evaluate("microsoft/DialoGPT-medium")
async def test_huggingface(model, dataset):
    ...

# Mock adapter for testing
@evaluate("mock-test")
async def test_mock(model, dataset):
    ...
Model Interface
All models provide a consistent async interface:
async def my_test(model, dataset):
    # Generate text
    responses = await model.generate(prompts, temperature=0.7, max_tokens=100)

    # Get token count
    # Note: Token count is currently an estimate and may not be reliable.
    tokens = model.get_token_count(text)

    # Estimate cost
    cost = model.get_cost_estimate(input_tokens, output_tokens)
Datasets
Datasets organize your evaluation data.
Creating Datasets
from benchwise import create_qa_dataset, create_summarization_dataset, load_dataset
# Question-Answer dataset
qa_data = create_qa_dataset(
    questions=["What is AI?", "Explain ML"],
    answers=["Artificial Intelligence", "Machine Learning"]
)
# Summarization dataset
summ_data = create_summarization_dataset(
    documents=["Long text here..."],
    summaries=["Summary here..."]
)
# Load from file
dataset = load_dataset("data.json") # or .csv
Dataset Properties
# Access prompts and references
prompts = dataset.prompts
references = dataset.references
# Access raw data
data = dataset.data
# Dataset operations
sample = dataset.sample(n=10, random_state=42)
train, test = dataset.split(train_ratio=0.8)
filtered = dataset.filter(lambda x: len(x["question"]) > 10)
Standard Benchmarks
from benchwise import load_mmlu_sample, load_hellaswag_sample, load_gsm8k_sample
# Load benchmark samples
# MMLU: Measures knowledge across 57 subjects.
mmlu = load_mmlu_sample()
# HellaSwag: Tests common sense reasoning.
hellaswag = load_hellaswag_sample()
# GSM8K: Evaluates multi-step math reasoning.
gsm8k = load_gsm8k_sample()
Metrics
Built-in metrics for evaluating model outputs across different dimensions.
Text Similarity
from benchwise import rouge_l, bleu_score, bert_score_metric
# ROUGE-L for summarization
rouge = rouge_l(predictions, references)
print(rouge["f1"], rouge["precision"], rouge["recall"])
# BLEU for translation
bleu = bleu_score(predictions, references)
# BERT score for semantic similarity
bert = bert_score_metric(predictions, references)
Accuracy & Correctness
from benchwise import accuracy, factual_correctness
# Exact match accuracy
acc = accuracy(predictions, references)
print(acc["accuracy"])
# Factual correctness
correctness = factual_correctness(predictions, references)
Semantic Similarity
from benchwise import semantic_similarity, coherence_score
# Embedding-based similarity
similarity = semantic_similarity(predictions, references)
print(similarity["mean_similarity"])
# Text coherence
coherence = coherence_score(texts)
Safety
from benchwise import safety_score
# Content safety evaluation
safety = safety_score(responses)
print(safety["mean_safety"])
Metric Collections
from benchwise import get_text_generation_metrics, get_qa_metrics, get_safety_metrics
# Use predefined metric bundles
text_metrics = get_text_generation_metrics()
qa_metrics = get_qa_metrics()
safety_metrics = get_safety_metrics()
# Evaluate with multiple metrics
results = qa_metrics.evaluate(responses, references)
Results
Handle and analyze evaluation results.
EvaluationResult
Each model evaluation returns an EvaluationResult:
result = results[0]
print(result.model_name) # Model identifier
print(result.result) # Your returned metrics
print(result.success) # Whether evaluation succeeded
print(result.error) # Error message if failed
print(result.duration) # Time taken
print(result.metadata) # Additional metadata
BenchmarkResult
Organize multiple results:
from benchwise import BenchmarkResult, save_results
benchmark = BenchmarkResult(benchmark_name="My Benchmark")
for result in results:
    benchmark.add_result(result)
# Save in different formats
save_results(benchmark, "results.json", format="json")
save_results(benchmark, "results.csv", format="csv")
save_results(benchmark, "report.md", format="markdown")
Analysis
from benchwise import ResultsAnalyzer
# Generate reports
report = ResultsAnalyzer.generate_report(benchmark, "markdown")
# Compare models
comparison = benchmark.compare_models("accuracy")
print(f"Best: {comparison['best_model']}")
Async-First Architecture
All evaluation functions are async:
import asyncio
# Define async evaluation
@evaluate("gpt-3.5-turbo")
async def my_test(model, dataset):
    responses = await model.generate(dataset.prompts)
    return {"responses": responses}

# Run from async code
async def main():
    results = await my_test(dataset)
    print(results)

asyncio.run(main())

# Or run directly from synchronous code
results = asyncio.run(my_test(dataset))
Next Steps
- Evaluation Guide - Learn evaluation patterns
- Metrics Guide - Deep dive into metrics
- Datasets Guide - Master dataset management