Results

Learn how to manage, analyze, and export evaluation results.

Understanding Results

EvaluationResult

Each model evaluation returns an EvaluationResult:

from benchwise import evaluate
import asyncio

@evaluate("gpt-4", "claude-3-opus")
async def my_test(model, dataset):
    responses = await model.generate(dataset.prompts)
    return {"accuracy": 0.85}

results = asyncio.run(my_test(dataset))

# Access result properties
for result in results:
    print(f"Model: {result.model_name}")
    print(f"Success: {result.success}")
    print(f"Result: {result.result}")
    print(f"Duration: {result.duration:.2f}s")
    print(f"Error: {result.error}")
    print(f"Metadata: {result.metadata}")

Checking Success

results = asyncio.run(my_test(dataset))

successful = [r for r in results if r.success]
failed = [r for r in results if not r.success]

print(f"Successful: {len(successful)}")
print(f"Failed: {len(failed)}")

for failure in failed:
    print(f"Failed: {failure.model_name} - {failure.error}")

Organizing Results

BenchmarkResult

Organize results from multiple evaluations into a single container:

from benchwise import BenchmarkResult, save_results

# Create benchmark result container
benchmark = BenchmarkResult(
    benchmark_name="My Benchmark",
    metadata={"date": "2024-11-16", "version": "1.0"}
)

# Add individual results
results = asyncio.run(my_test(dataset))
for result in results:
    benchmark.add_result(result)

# Access results
all_results = benchmark.results
print(f"Total results: {len(all_results)}")

Saving Results

JSON Format

from benchwise import save_results

save_results(benchmark, "results.json", format="json")

Example output:

{
  "name": "My Benchmark",
  "metadata": {"date": "2024-11-16"},
  "results": [
    {
      "model_name": "gpt-4",
      "result": {"accuracy": 0.85},
      "success": true,
      "duration": 12.5
    }
  ]
}

CSV Format

save_results(benchmark, "results.csv", format="csv")

Example output:

model_name,accuracy,success,duration
gpt-4,0.85,true,12.5
claude-3-opus,0.82,true,11.2
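
The exported CSV also loads cleanly into pandas if you want to slice it further. A minimal sketch (pandas is a separate dependency, not part of Benchwise):

import pandas as pd

# Load the exported results and rank models by accuracy
df = pd.read_csv("results.csv")
print(df.sort_values("accuracy", ascending=False))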

Markdown Report

save_results(benchmark, "report.md", format="markdown")

Example output:

# My Benchmark Results

| Model | Accuracy | Success | Duration |
|-------|----------|---------|----------|
| gpt-4 | 0.85 | ✅ | 12.5s |
| claude-3-opus | 0.82 | ✅ | 11.2s |

Loading Results

from benchwise import load_results

# Load previously saved results
benchmark = load_results("results.json")

print(f"Loaded: {benchmark.name}")
print(f"Results: {len(benchmark.results)}")

Analyzing Results

Compare Models

# Find best performing model
comparison = benchmark.compare_models("accuracy")

print(f"Best model: {comparison['best_model']}")
print(f"Best score: {comparison['best_score']:.2%}")
print(f"Worst model: {comparison['worst_model']}")
print(f"Worst score: {comparison['worst_score']:.2%}")

Generate Reports

from benchwise import ResultsAnalyzer

# Generate markdown report
report = ResultsAnalyzer.generate_report(benchmark, output_format="markdown")
print(report)

# Generate HTML report
html_report = ResultsAnalyzer.generate_report(benchmark, output_format="html")

# Generate text report
text_report = ResultsAnalyzer.generate_report(benchmark, output_format="text")

Caching Results

Benchwise automatically caches results to avoid re-running expensive evaluations:

from benchwise import cache

# Results are cached by default

# Clear cache when needed
cache.clear_cache()

# List cached results
cached = cache.list_cached_results()
print(f"Cached evaluations: {len(cached)}")

# Get specific cached result (requires model_name, test_name, and dataset_hash)
# You would typically get these from a list of cached results or a known evaluation.
# Example:
# cached_result = cache.load_result("gpt-4", "my_test", "dataset_hash_value")

Complete Example

from benchwise import (
    evaluate,
    benchmark,
    create_qa_dataset,
    accuracy,
    semantic_similarity,
    save_results,
    BenchmarkResult,
    ResultsAnalyzer
)
import asyncio

# Create dataset
dataset = create_qa_dataset(
    questions=["What is AI?", "What is ML?"],
    answers=["Artificial Intelligence", "Machine Learning"]
)

# Run evaluation
@benchmark("AI Knowledge Test", "Tests understanding of AI concepts")
@evaluate("gpt-4", "claude-3-opus", "gemini-pro")
async def test_ai_knowledge(model, dataset):
    responses = await model.generate(dataset.prompts)

    acc = accuracy(responses, dataset.references)
    sim = semantic_similarity(responses, dataset.references)

    return {
        "accuracy": acc["accuracy"],
        "similarity": sim["mean_similarity"]
    }

# Main execution
async def main():
    # Run evaluation
    results = await test_ai_knowledge(dataset)

    # Create benchmark result container
    # (named benchmark_result so it does not shadow the imported benchmark decorator)
    benchmark_result = BenchmarkResult(
        benchmark_name="AI Knowledge Benchmark",
        metadata={"date": "2024-11-16", "version": "1.0"}
    )

    for result in results:
        benchmark_result.add_result(result)

    # Save in multiple formats
    save_results(benchmark_result, "results.json", format="json")
    save_results(benchmark_result, "results.csv", format="csv")
    save_results(benchmark_result, "report.md", format="markdown")

    # Analyze
    comparison = benchmark_result.compare_models("accuracy")
    print(f"\nBest model: {comparison['best_model']}")
    print(f"Best accuracy: {comparison['best_score']:.2%}")

    # Generate report
    report = ResultsAnalyzer.generate_report(benchmark_result, output_format="markdown")
    print("\n" + report)

    # Summary statistics from the same comparison
    print(f"\nMean accuracy: {comparison['mean_score']:.2%}")
    print(f"Std deviation: {comparison['std_score']:.3f}")

asyncio.run(main())

Best Practices

1. Always Save Results

from datetime import datetime
# Save after every major evaluation, tagging the filename with a timestamp
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
save_results(benchmark, f"results_{timestamp}.json", format="json")

2. Include Metadata

benchmark = BenchmarkResult(
    benchmark_name="My Benchmark",
    metadata={
        "date": "2024-11-16",
        "version": "2.0",
        "dataset_size": len(dataset.data),
        "models_tested": ["gpt-4", "claude-3-opus"],
        "environment": "production"
    }
)

3. Check for Failures

results = asyncio.run(my_test(dataset))

failed = [r for r in results if not r.success]
if failed:
    print("WARNING: Some evaluations failed:")
    for f in failed:
        print(f" - {f.model_name}: {f.error}")

4. Compare Over Time

# Load previous results
old_results = load_results("results_2024_10.json")
new_results = load_results("results_2024_11.json")

# Compare improvements
old_best = old_results.compare_models("accuracy")["best_score"]
new_best = new_results.compare_models("accuracy")["best_score"]

improvement = new_best - old_best
print(f"Improvement: {improvement:+.2%}")

Complete Examples

For comprehensive result handling examples including saving, loading, comparing, and analyzing results, see Results.

Next Steps