Question Answering
Evaluate models on question-answering tasks.
Basic QA Evaluation
import asyncio
from benchwise import evaluate, benchmark, create_qa_dataset, accuracy, semantic_similarity
# Create QA dataset
# Each question is kept next to its reference answer so the two lists
# passed to create_qa_dataset cannot drift out of alignment.
_general_knowledge_pairs = [
    ("What is the capital of Japan?", "Tokyo"),
    ("Who wrote '1984'?", "George Orwell"),
    ("What is the speed of light?", "299,792,458 meters per second"),
    (
        "Explain photosynthesis in one sentence.",
        "Photosynthesis is the process by which plants convert sunlight into energy.",
    ),
    (
        "What causes rainbows?",
        "Rainbows are caused by light refraction and reflection in water droplets.",
    ),
]

qa_dataset = create_qa_dataset(
    questions=[question for question, _ in _general_knowledge_pairs],
    answers=[answer for _, answer in _general_knowledge_pairs],
    name="general_knowledge_qa",
)
@benchmark("General Knowledge QA", "Tests basic factual knowledge")
@evaluate("gpt-3.5-turbo", "claude-3-5-haiku-20241022", "gemini-pro")
async def test_general_knowledge(model, dataset):
    """Score a model's generated answers against the dataset references.

    Passes ``dataset.prompts`` to ``model.generate`` and returns a dict with
    the ``accuracy`` value, the ``mean_similarity`` value (reported as
    ``semantic_similarity``), and the number of responses.
    """
    generated = await model.generate(dataset.prompts)

    # Two metrics computed against the same reference answers.
    accuracy_scores = accuracy(generated, dataset.references)
    similarity_scores = semantic_similarity(generated, dataset.references)

    return {
        "accuracy": accuracy_scores["accuracy"],
        "semantic_similarity": similarity_scores["mean_similarity"],
        "total_questions": len(generated),
    }
# Run the evaluation
async def main():
    """Run the general-knowledge evaluation and print one entry per result."""
    outcomes = await test_general_knowledge(qa_dataset)

    print("\n=== General Knowledge QA Results ===")
    for outcome in outcomes:
        # Handle the failure case first so the success path stays flat.
        if not outcome.success:
            print(f"{outcome.model_name}: FAILED - {outcome.error}")
            continue

        print(f"{outcome.model_name}:")
        metrics = outcome.result
        print(f" Accuracy: {metrics['accuracy']:.2%}")
        print(f" Similarity: {metrics['semantic_similarity']:.3f}")
        print(f" Duration: {outcome.duration:.2f}s")


asyncio.run(main())
Medical QA Example
import asyncio
from benchwise import evaluate, benchmark, create_qa_dataset, accuracy, semantic_similarity
# Medical domain dataset
# Questions and reference answers are declared side by side, then split
# into the two parallel lists that create_qa_dataset receives.
_medical_pairs = [
    ("What is hypertension?", "High blood pressure"),
    (
        "What are the symptoms of diabetes?",
        "Increased thirst, frequent urination, fatigue, and blurred vision",
    ),
    (
        "How does aspirin work?",
        "Aspirin inhibits enzymes that produce prostaglandins, reducing inflammation and pain",
    ),
    (
        "What is the function of the liver?",
        "The liver filters blood, produces bile, metabolizes nutrients, and detoxifies harmful substances",
    ),
]

medical_qa = create_qa_dataset(
    questions=[question for question, _ in _medical_pairs],
    answers=[answer for _, answer in _medical_pairs],
    name="medical_qa",
)
@benchmark("Medical QA", "Medical question answering benchmark")
@evaluate("gpt-4", "claude-3-opus")
async def test_medical_qa(model, dataset):
    """Score a model's answers to the medical questions.

    Returns a dict holding the ``accuracy`` value and the ``mean_similarity``
    value (reported under the ``similarity`` key).
    """
    # temperature=0 is requested here for factual answers.
    generated = await model.generate(dataset.prompts, temperature=0)

    accuracy_scores = accuracy(generated, dataset.references)
    similarity_scores = semantic_similarity(generated, dataset.references)

    return {
        "accuracy": accuracy_scores["accuracy"],
        "similarity": similarity_scores["mean_similarity"],
    }
async def main():
    """Run the medical QA evaluation and print one entry per result."""
    outcomes = await test_medical_qa(medical_qa)

    print("\n=== Medical QA Results ===")
    for outcome in outcomes:
        # Handle the failure case first so the success path stays flat.
        if not outcome.success:
            print(f"\n{outcome.model_name}: FAILED - {outcome.error}")
            continue

        print(f"\n{outcome.model_name}:")
        metrics = outcome.result
        print(f" Accuracy: {metrics['accuracy']:.2%}")
        print(f" Similarity: {metrics['similarity']:.3f}")


asyncio.run(main())
Multi-Hop Reasoning
import asyncio
from benchwise import evaluate, create_qa_dataset, accuracy
# Complex reasoning questions
# Each entry in `answers` is the reference for the question at the same index.
reasoning_qa = create_qa_dataset(
    questions=[
        "If all roses are flowers and some flowers fade quickly, can we conclude that some roses fade quickly?",
        "A train leaves New York at 2 PM traveling at 60 mph. Another leaves Boston at 3 PM traveling at 80 mph toward New York. If they're 200 miles apart, when will they meet?",
        "If it takes 5 machines 5 minutes to make 5 widgets, how long does it take 100 machines to make 100 widgets?",
    ],
    answers=[
        "No, this is a logical fallacy. We cannot conclude that some roses fade quickly.",
        # Taking the 200 miles as the separation at 2 PM with the first train
        # heading toward Boston: by 3 PM it has covered 60 miles, leaving
        # 140 miles that close at 60 + 80 = 140 mph, i.e. one more hour.
        "They will meet at 4:00 PM",
        # Each machine makes one widget in 5 minutes, so 100 machines make
        # 100 widgets in the same 5 minutes.
        "5 minutes",
    ],
    name="reasoning_qa",
)
@evaluate("gpt-4", "claude-3-opus", temperature=0)
async def test_reasoning(model, dataset):
    """Score a model on the reasoning questions using a step-by-step prompt.

    Returns a dict with the ``accuracy`` value and the number of responses.
    """
    # Add reasoning instruction
    instruction = "Think step by step and answer: "
    step_by_step_prompts = [f"{instruction}{question}" for question in dataset.prompts]

    generated = await model.generate(step_by_step_prompts)
    accuracy_scores = accuracy(generated, dataset.references)

    return {
        "accuracy": accuracy_scores["accuracy"],
        "total": len(generated),
    }
async def main():
    """Run the reasoning evaluation and print one entry per result."""
    outcomes = await test_reasoning(reasoning_qa)

    print("\n=== Multi-Hop Reasoning Results ===")
    for outcome in outcomes:
        # Handle the failure case first so the success path stays flat.
        if not outcome.success:
            print(f"\n{outcome.model_name}: FAILED - {outcome.error}")
            continue

        print(f"\n{outcome.model_name}:")
        metrics = outcome.result
        print(f" Accuracy: {metrics['accuracy']:.2%}")
        print(f" Total: {metrics['total']}")


asyncio.run(main())
Handling Ambiguous Questions
import asyncio
from benchwise import evaluate, create_qa_dataset
# Questions with more than one defensible answer, each paired with a
# reference answer that spells out the ambiguity.
_ambiguous_pairs = [
    # Two countries named Congo
    (
        "What is the capital of Congo?",
        "Kinshasa (DRC) or Brazzaville (Republic of Congo)",
    ),
    # Disputed invention
    (
        "Who invented the telephone?",
        "Alexander Graham Bell (though disputed with Antonio Meucci)",
    ),
    # On Earth? In solar system?
    ("What is the tallest mountain?", "Mount Everest (on Earth)"),
]

ambiguous_qa = create_qa_dataset(
    questions=[question for question, _ in _ambiguous_pairs],
    answers=[answer for _, answer in _ambiguous_pairs],
    name="ambiguous_qa",
)
@evaluate("gpt-4", "claude-3-opus")
async def test_ambiguous(model, dataset):
    """Measure how often a model acknowledges ambiguity in its answers.

    Every prompt is prefixed with an instruction to note ambiguities. A
    response counts as acknowledging ambiguity when its lowercased text
    contains any of the marker phrases below (plain substring match).

    Returns a dict with ``ambiguity_acknowledgment_rate`` (fraction of
    responses containing a marker, 0.0 when there are no responses) and
    ``total`` (the number of responses).
    """
    # Phrases treated as evidence that the model flagged the ambiguity.
    ambiguity_markers = ("ambiguous", "unclear", "depends", "could be", "both")

    # Ask models to clarify ambiguity
    prompts = [
        f"Answer this question and note any ambiguities: {q}"
        for q in dataset.prompts
    ]
    responses = await model.generate(prompts)

    # Custom scoring: count responses that mention at least one marker.
    lowered_responses = (response.lower() for response in responses)
    acknowledged = sum(
        any(marker in lowered for marker in ambiguity_markers)
        for lowered in lowered_responses
    )

    total = len(responses)
    # Avoid ZeroDivisionError when the model returned no responses.
    rate = acknowledged / total if total else 0.0

    return {
        "ambiguity_acknowledgment_rate": rate,
        "total": total,
    }
async def main():
    """Run the ambiguous-question evaluation and print one entry per result."""
    outcomes = await test_ambiguous(ambiguous_qa)

    print("\n=== Ambiguous Questions Results ===")
    for outcome in outcomes:
        # Handle the failure case first so the success path stays flat.
        if not outcome.success:
            print(f"\n{outcome.model_name}: FAILED - {outcome.error}")
            continue

        print(f"\n{outcome.model_name}:")
        metrics = outcome.result
        print(f" Ambiguity Acknowledgment Rate: {metrics['ambiguity_acknowledgment_rate']:.2%}")
        print(f" Total: {metrics['total']}")


asyncio.run(main())
Batch Processing Large QA Datasets
import asyncio
from benchwise import evaluate, load_dataset, accuracy
# Load the large QA dataset from a local JSON file
# (presumably 1,000 items, judging by the file name; confirm against the data).
large_qa = load_dataset("data/qa_1000.json")
@evaluate("gpt-3.5-turbo")
async def test_large_batch(model, dataset):
    """Evaluate a model on a 100-item sample, generating in slices of 10.

    Prints a progress line after each slice and returns a dict with the
    ``accuracy`` value and the number of responses collected.
    """
    # Sample for testing
    subset = dataset.sample(n=100, random_state=42)

    # Process in batches of 10
    chunk_size = 10
    collected = []
    prompt_count = len(subset.prompts)
    for start in range(0, prompt_count, chunk_size):
        chunk = subset.prompts[start:start + chunk_size]
        collected.extend(await model.generate(chunk))
        print(f"Processed {len(collected)}/{prompt_count}")

    # Evaluate
    accuracy_scores = accuracy(collected, subset.references)
    return {
        "accuracy": accuracy_scores["accuracy"],
        "total_processed": len(collected),
    }
async def main():
    """Run the batched evaluation and print one entry per result.

    The returned results are reported in the same way as the other examples
    on this page instead of being discarded.
    """
    results = await test_large_batch(large_qa)

    print("\n=== Large Batch QA Results ===")
    for result in results:
        if result.success:
            print(f"\n{result.model_name}:")
            print(f" Accuracy: {result.result['accuracy']:.2%}")
            print(f" Total processed: {result.result['total_processed']}")
        else:
            print(f"\n{result.model_name}: FAILED - {result.error}")


asyncio.run(main())
Using Standard Benchmarks
import asyncio
from benchwise import evaluate, benchmark, load_mmlu_sample, accuracy
# Load the MMLU sample dataset using benchwise's load_mmlu_sample helper
mmlu = load_mmlu_sample()
@benchmark("MMLU Sample", "Multiple choice questions from MMLU")
@evaluate("gpt-4", "claude-3-opus")
async def test_mmlu(model, dataset):
    """Score a model on the MMLU sample.

    Generates with ``temperature=0`` and returns a dict with the ``accuracy``
    value and the number of responses.
    """
    generated = await model.generate(dataset.prompts, temperature=0)
    accuracy_scores = accuracy(generated, dataset.references)

    return {
        "accuracy": accuracy_scores["accuracy"],
        "total": len(generated),
    }
async def main():
    """Run the MMLU sample evaluation and print one entry per result."""
    outcomes = await test_mmlu(mmlu)

    print("\n=== MMLU Sample Results ===")
    for outcome in outcomes:
        # Handle the failure case first so the success path stays flat.
        if not outcome.success:
            print(f"\n{outcome.model_name}: FAILED - {outcome.error}")
            continue

        print(f"\n{outcome.model_name}:")
        metrics = outcome.result
        print(f" Accuracy: {metrics['accuracy']:.2%}")
        print(f" Total: {metrics['total']}")


asyncio.run(main())
Saving and Analyzing Results
import asyncio
from benchwise import (
evaluate,
benchmark,
create_qa_dataset,
accuracy,
semantic_similarity,
save_results,
BenchmarkResult,
ResultsAnalyzer
)
# Create QA dataset (same as in Basic QA Evaluation)
# Each question is kept next to its reference answer so the two lists
# passed to create_qa_dataset cannot drift out of alignment.
_general_knowledge_pairs = [
    ("What is the capital of Japan?", "Tokyo"),
    ("Who wrote '1984'?", "George Orwell"),
    ("What is the speed of light?", "299,792,458 meters per second"),
    (
        "Explain photosynthesis in one sentence.",
        "Photosynthesis is the process by which plants convert sunlight into energy.",
    ),
    (
        "What causes rainbows?",
        "Rainbows are caused by light refraction and reflection in water droplets.",
    ),
]

qa_dataset = create_qa_dataset(
    questions=[question for question, _ in _general_knowledge_pairs],
    answers=[answer for _, answer in _general_knowledge_pairs],
    name="general_knowledge_qa",
)
@benchmark("General Knowledge QA", "Tests basic factual knowledge")
@evaluate("gpt-3.5-turbo", "claude-3-5-haiku-20241022", "gemini-pro")
async def test_general_knowledge(model, dataset):
    """Score a model's generated answers against the dataset references.

    Returns a dict with the ``accuracy`` value, the ``mean_similarity`` value
    (reported as ``semantic_similarity``), and the number of responses.
    """
    generated = await model.generate(dataset.prompts)

    accuracy_scores = accuracy(generated, dataset.references)
    similarity_scores = semantic_similarity(generated, dataset.references)

    return {
        "accuracy": accuracy_scores["accuracy"],
        "semantic_similarity": similarity_scores["mean_similarity"],
        "total_questions": len(generated),
    }
async def run_complete_qa_evaluation():
    """Run the QA evaluation, save the results, and print an analysis.

    Collects every per-model result into a ``BenchmarkResult``, saves it as
    JSON, CSV, and Markdown, prints a Markdown report, and prints the best
    model by the ``accuracy`` metric.
    """
    results = await test_general_knowledge(qa_dataset)

    # Create benchmark result
    # Named `benchmark_result` rather than `benchmark` so the imported
    # `benchmark` decorator is not shadowed inside this function.
    benchmark_result = BenchmarkResult(
        "Complete QA Evaluation",
        metadata={"date": "2024-11-16", "version": "1.0"},
    )
    for result in results:
        benchmark_result.add_result(result)

    # Save in multiple formats
    save_results(benchmark_result, "qa_results.json", format="json")
    save_results(benchmark_result, "qa_results.csv", format="csv")
    save_results(benchmark_result, "qa_report.md", format="markdown")

    # Generate analysis report
    report = ResultsAnalyzer.generate_report(benchmark_result, "markdown")
    print(report)

    # Compare models
    comparison = benchmark_result.compare_models("accuracy")
    print(f"\nBest model: {comparison['best_model']}")
    print(f"Best accuracy: {comparison['best_score']:.2%}")


asyncio.run(run_complete_qa_evaluation())
Next Steps
- Summarization Example - Text summarization evaluation
- Multi-Model Comparison - Compare multiple models
- Metrics Guide - Learn about evaluation metrics