Summarization

Evaluate text summarization quality using ROUGE and other metrics.

Basic Summarization Evaluation

import asyncio
from benchwise import evaluate, benchmark, create_summarization_dataset, rouge_l

# Reference summaries, listed in the same order as the documents below
summaries = [
    "Climate change is long-term temperature shifts mainly caused by human fossil fuel use since the 1800s.",
    "AI is the simulation of human intelligence in machines programmed to think and learn.",
]

# Source documents to summarize
climate_document = """Climate change refers to long-term shifts in global temperatures and weather patterns.
    While climate variations are natural, since the 1800s human activities have been the main
    driver of climate change, primarily due to the burning of fossil fuels like coal, oil and
    gas, which produces heat-trapping gases."""

ai_document = """Artificial intelligence (AI) refers to the simulation of human intelligence in machines
    that are programmed to think and learn like humans. The term may also be applied to any
    machine that exhibits traits associated with a human mind such as learning and problem-solving."""

documents = [climate_document, ai_document]

# Bundle documents and reference summaries into a named dataset
summ_dataset = create_summarization_dataset(
    name="news_summarization",
    documents=documents,
    summaries=summaries,
)

@benchmark("News Summarization", "Evaluates summarization quality")
@evaluate("gpt-4", "claude-3-sonnet", "gemini-pro")
async def test_summarization(model, dataset):
    """Generate one-sentence summaries and score them with ROUGE-L.

    Returns a dict holding the ROUGE-L F1, precision and recall values
    computed against ``dataset.references``.
    """
    # One instruction-prefixed prompt per source document
    instructions = []
    for doc in dataset.prompts:
        instructions.append(f"Summarize this in one sentence: {doc}")
    generated = await model.generate(instructions, max_tokens=100, temperature=0)

    # ROUGE-L is the metric used for summarization quality here
    scores = rouge_l(generated, dataset.references)

    return {
        f"rouge_l_{part}": scores[part]
        for part in ("f1", "precision", "recall")
    }

# Execute the benchmark and print a per-model report
async def main():
    """Run ``test_summarization`` on ``summ_dataset`` and print each model's scores."""
    outcomes = await test_summarization(summ_dataset)

    print("\n=== Summarization Results ===")

    for outcome in outcomes:
        # Report failures first so the success path stays unindented
        if not outcome.success:
            print(f"\n{outcome.model_name}: FAILED - {outcome.error}")
            continue

        scores = outcome.result
        print(f"\n{outcome.model_name}:")
        print(f"  ROUGE-L F1: {scores['rouge_l_f1']:.3f}")
        print(f"  Precision: {scores['rouge_l_precision']:.3f}")
        print(f"  Recall: {scores['rouge_l_recall']:.3f}")
        print(f"  Duration: {outcome.duration:.2f}s")

asyncio.run(main())

Multi-Metric Summarization

import asyncio
from benchwise import evaluate, create_summarization_dataset, rouge_l, bleu_score, semantic_similarity

# Expected (reference) summaries, in the same order as the documents below
summaries = [
    "Climate change is long-term temperature shifts mainly caused by human fossil fuel use since the 1800s.",
    "AI is the simulation of human intelligence in machines programmed to think and learn.",
]

# Texts the models will be asked to summarize
climate_document = """Climate change refers to long-term shifts in global temperatures and weather patterns.
    While climate variations are natural, since the 1800s human activities have been the main
    driver of climate change, primarily due to the burning of fossil fuels like coal, oil and
    gas, which produces heat-trapping gases."""

ai_document = """Artificial intelligence (AI) refers to the simulation of human intelligence in machines
    that are programmed to think and learn like humans. The term may also be applied to any
    machine that exhibits traits associated with a human mind such as learning and problem-solving."""

documents = [climate_document, ai_document]

# Build the dataset used by the multi-metric evaluation
summ_dataset = create_summarization_dataset(
    name="news_summarization",
    documents=documents,
    summaries=summaries,
)

@evaluate("gpt-4", "claude-3-opus")
async def test_with_multiple_metrics(model, dataset):
    """Score generated summaries with ROUGE-L, BLEU and semantic similarity.

    Returns a dict with one headline value per metric: ``rouge_f1``,
    ``bleu`` and ``semantic_similarity``.
    """
    requests = []
    for doc in dataset.prompts:
        requests.append(f"Summarize concisely: {doc}")
    outputs = await model.generate(requests, temperature=0)

    # Run all three metrics against the same references
    rouge_scores = rouge_l(outputs, dataset.references)
    bleu_scores = bleu_score(outputs, dataset.references)
    similarity_scores = semantic_similarity(outputs, dataset.references)

    report = {}
    report["rouge_f1"] = rouge_scores["f1"]
    report["bleu"] = bleu_scores["bleu"]
    report["semantic_similarity"] = similarity_scores["mean_similarity"]
    return report

async def main():
    """Run the multi-metric evaluation and print one block of scores per model."""
    outcomes = await test_with_multiple_metrics(summ_dataset)

    print("\n=== Multi-Metric Summarization Results ===")
    for outcome in outcomes:
        if not outcome.success:
            print(f"\n{outcome.model_name}: FAILED - {outcome.error}")
            continue

        metrics = outcome.result
        print(f"\n{outcome.model_name}:")
        print(f"  ROUGE F1: {metrics['rouge_f1']:.3f}")
        print(f"  BLEU: {metrics['bleu']:.3f}")
        print(f"  Semantic Similarity: {metrics['semantic_similarity']:.3f}")

asyncio.run(main())

Abstractive vs Extractive Summarization

import asyncio
from benchwise import evaluate, create_summarization_dataset, rouge_l

# Sample article used as the single source document
article = """The stock market experienced significant volatility today, with the Dow Jones
Industrial Average dropping 500 points in early trading before recovering most losses by the
closing bell. Technology stocks led the decline, with major companies seeing 3-5% drops.
Analysts attribute the volatility to concerns about inflation and potential interest rate hikes."""

# Two hand-written summaries of the article. Only the abstractive one is
# passed to the dataset below; the extractive one is not used in this snippet.
abstractive_summary = "Markets were volatile today due to inflation concerns, with tech leading losses."
extractive_summary = "The stock market dropped 500 points before recovering. Technology stocks declined 3-5%."

abstractive_documents = [article]
abstractive_references = [abstractive_summary]

abstractive_dataset = create_summarization_dataset(
    name="abstractive_test",
    documents=abstractive_documents,
    summaries=abstractive_references,
)

@evaluate("gpt-4", temperature=0)
async def test_abstractive(model, dataset):
    """Prompt for abstractive summaries and report ROUGE-L F1 and mean word count.

    ``avg_length`` is the mean number of whitespace-separated words per
    generated summary.
    """
    # The prompt explicitly asks the model to paraphrase
    requests = []
    for doc in dataset.prompts:
        requests.append(f"Write an abstractive summary (use your own words): {doc}")
    outputs = await model.generate(requests)

    scores = rouge_l(outputs, dataset.references)
    f1 = scores["f1"]

    word_total = 0
    for text in outputs:
        word_total += len(text.split())

    return {"rouge_f1": f1, "avg_length": word_total / len(outputs)}

async def main():
    """Run the abstractive evaluation and print ROUGE F1 and average length per model."""
    outcomes = await test_abstractive(abstractive_dataset)

    print("\n=== Abstractive Summarization Results ===")
    for outcome in outcomes:
        if not outcome.success:
            print(f"\n{outcome.model_name}: FAILED - {outcome.error}")
            continue

        metrics = outcome.result
        print(f"\n{outcome.model_name}:")
        print(f"  ROUGE F1: {metrics['rouge_f1']:.3f}")
        print(f"  Avg Length: {metrics['avg_length']:.1f} words")

asyncio.run(main())

Length-Controlled Summarization

import asyncio
from benchwise import evaluate, create_summarization_dataset, rouge_l

# Placeholder strings standing in for the long source articles
quantum_article = """[Very long article about quantum computing - 1000+ words]"""
energy_article = """[Very long article about renewable energy - 1000+ words]"""

long_articles = [quantum_article, energy_article]

# Reference summaries, one per article in the same order
long_article_references = ["Quantum computing summary", "Renewable energy summary"]

long_article_dataset = create_summarization_dataset(
    name="long_articles",
    documents=long_articles,
    summaries=long_article_references,
)

@evaluate("gpt-4")
async def test_different_lengths(model, dataset):
    """Measure ROUGE-L and length compliance at several target summary lengths.

    For each target word count (25, 50 and 100) the model is prompted once per
    entry in ``dataset.prompts`` and the outputs are scored against
    ``dataset.references``.

    Returns:
        A dict mapping ``"length_<target>"`` to a dict with:

        - ``rouge_f1``: the ``"f1"`` value returned by ``rouge_l``.
        - ``target_length``: the requested word count.
        - ``actual_length``: mean whitespace-separated word count of the
          generated summaries.
        - ``length_accuracy``: mean per-summary accuracy, each computed as
          ``1 - |words - target| / target`` and clamped to the range 0..1.
    """
    results = {}

    # Test different summary lengths
    for length in [25, 50, 100]:
        prompts = [f"Summarize in exactly {length} words: {doc}"
                   for doc in dataset.prompts]
        # The token limit is set to twice the word target
        summaries = await model.generate(prompts, max_tokens=length * 2)

        # Check length compliance
        actual_lengths = [len(s.split()) for s in summaries]
        avg_length = sum(actual_lengths) / len(actual_lengths)

        # Score every summary on its own: averaging the lengths first would let
        # one over-long and one too-short summary cancel out to a perfect score.
        # Clamp at 0 so a summary more than twice the target length does not
        # produce a negative accuracy.
        per_summary_accuracy = [
            max(0.0, 1 - abs(words - length) / length)
            for words in actual_lengths
        ]
        length_accuracy = sum(per_summary_accuracy) / len(per_summary_accuracy)

        rouge = rouge_l(summaries, dataset.references)

        results[f"length_{length}"] = {
            "rouge_f1": rouge["f1"],
            "target_length": length,
            "actual_length": avg_length,
            "length_accuracy": length_accuracy
        }

    return results

async def main():
    """Run the length-controlled evaluation and print metrics for every target length."""
    outcomes = await test_different_lengths(long_article_dataset)

    print("\n=== Length-Controlled Summarization Results ===")
    for outcome in outcomes:
        if not outcome.success:
            print(f"\n{outcome.model_name}: FAILED - {outcome.error}")
            continue

        print(f"\n{outcome.model_name}:")
        # One nested entry per target length
        for label, stats in outcome.result.items():
            target = stats['target_length']
            actual = stats['actual_length']
            print(f"  {label}:")
            print(f"    ROUGE F1: {stats['rouge_f1']:.3f}")
            print(f"    Target: {target}, Actual: {actual:.1f}")
            print(f"    Length Accuracy: {stats['length_accuracy']:.2%}")

asyncio.run(main())

Domain-Specific Summarization

import asyncio
from benchwise import evaluate, create_summarization_dataset, rouge_l

# Dataset for scientific paper summarization: one abstract, one reference summary
scientific_abstract = """Abstract: This study investigates the effects of machine learning algorithms
        on medical diagnosis accuracy. We trained three models on a dataset of 10,000
        patient records and achieved 95% accuracy..."""

scientific_reference = "ML algorithms achieved 95% accuracy in medical diagnosis using 10,000 patient records."

scientific_dataset = create_summarization_dataset(
    name="scientific_summarization",
    documents=[scientific_abstract],
    summaries=[scientific_reference],
)

@evaluate("gpt-4", "claude-3-opus")
async def test_scientific_summarization(model, dataset):
    """Summarize scientific abstracts for a general audience and score with ROUGE-L."""
    requests = []
    for doc in dataset.prompts:
        requests.append(f"Summarize this scientific abstract for a general audience: {doc}")
    outputs = await model.generate(requests)

    scores = rouge_l(outputs, dataset.references)

    report = {"rouge_f1": scores["f1"]}
    # Custom metric slot; note the value is a constant True, not computed
    # from the generated summaries.
    report["readability_friendly"] = True
    return report

async def main():
    """Run the scientific summarization evaluation and print each model's results."""
    outcomes = await test_scientific_summarization(scientific_dataset)

    print("\n=== Scientific Summarization Results ===")
    for outcome in outcomes:
        if not outcome.success:
            print(f"\n{outcome.model_name}: FAILED - {outcome.error}")
            continue

        metrics = outcome.result
        print(f"\n{outcome.model_name}:")
        print(f"  ROUGE F1: {metrics['rouge_f1']:.3f}")
        print(f"  Readability Friendly: {metrics['readability_friendly']}")

asyncio.run(main())

Bullet Point Summaries

import asyncio
from benchwise import evaluate, create_summarization_dataset, rouge_l

# Bullet-formatted reference summaries, in the same order as the documents below
summaries = [
    "• Climate change is caused by human activities\n• Fossil fuels are the main driver\n• Effects started in the 1800s",
    "• AI simulates human intelligence\n• Machines learn and think\n• Applied to problem-solving tasks",
]

# Source documents
climate_document = """Climate change refers to long-term shifts in global temperatures and weather patterns.
    While climate variations are natural, since the 1800s human activities have been the main
    driver of climate change, primarily due to the burning of fossil fuels like coal, oil and
    gas, which produces heat-trapping gases."""

ai_document = """Artificial intelligence (AI) refers to the simulation of human intelligence in machines
    that are programmed to think and learn like humans. The term may also be applied to any
    machine that exhibits traits associated with a human mind such as learning and problem-solving."""

documents = [climate_document, ai_document]

bullet_dataset = create_summarization_dataset(
    name="bullet_summaries",
    documents=documents,
    summaries=summaries,
)

@evaluate("gpt-4", "claude-3-opus")
async def test_bullet_summaries(model, dataset):
    """Evaluate bullet-point summaries for ROUGE-L and bullet-count compliance.

    Returns:
        A dict with:

        - ``rouge_f1``: the ``"f1"`` value returned by ``rouge_l``.
        - ``avg_bullet_points``: mean number of bullet lines per summary.
        - ``bullet_compliance``: fraction of summaries that contain between
          3 and 5 bullet lines (inclusive), matching the prompt's request.
    """
    prompts = [f"Summarize in 3-5 bullet points:\n{doc}" for doc in dataset.prompts]
    summaries = await model.generate(prompts)

    def is_bullet(line):
        # "-" and "*" only count when followed by a space, so Markdown bold
        # headers ("**Key points**") and rules ("---", "***") are not mistaken
        # for bullets. "•" is unambiguous and accepted with or without a space.
        return line.strip().startswith(("- ", "* ", "•"))

    # Custom metric: count bullet lines in each summary
    bullet_counts = [
        sum(1 for line in summary.split('\n') if is_bullet(line))
        for summary in summaries
    ]

    avg_bullets = sum(bullet_counts) / len(bullet_counts)

    rouge = rouge_l(summaries, dataset.references)

    return {
        "rouge_f1": rouge["f1"],
        "avg_bullet_points": avg_bullets,
        "bullet_compliance": sum(1 for b in bullet_counts if 3 <= b <= 5) / len(bullet_counts)
    }

async def main():
    """Run the bullet-point evaluation and print ROUGE and bullet statistics per model."""
    outcomes = await test_bullet_summaries(bullet_dataset)

    print("\n=== Bullet Point Summaries Results ===")
    for outcome in outcomes:
        if not outcome.success:
            print(f"\n{outcome.model_name}: FAILED - {outcome.error}")
            continue

        metrics = outcome.result
        print(f"\n{outcome.model_name}:")
        print(f"  ROUGE F1: {metrics['rouge_f1']:.3f}")
        print(f"  Avg Bullet Points: {metrics['avg_bullet_points']:.1f}")
        print(f"  Bullet Compliance: {metrics['bullet_compliance']:.2%}")

asyncio.run(main())

Multilingual Summarization

import asyncio
from benchwise import evaluate, create_summarization_dataset, rouge_l

# One English and one French document, each with a reference summary
# written in the same language
multilingual_documents = [
    "This is a document in English about global warming...",
    "Ceci est un document en français sur le réchauffement climatique...",
]
multilingual_references = [
    "Summary in English",
    "Résumé en français",
]

multilingual_dataset = create_summarization_dataset(
    name="multilingual_summarization",
    documents=multilingual_documents,
    summaries=multilingual_references,
)

@evaluate("gpt-4", "claude-3-opus")
async def test_multilingual(model, dataset):
    """Ask for summaries in each document's own language and score them with ROUGE-L."""
    requests = []
    for doc in dataset.prompts:
        requests.append(f"Summarize in the same language as the input: {doc}")
    outputs = await model.generate(requests)

    scores = rouge_l(outputs, dataset.references)

    return {"rouge_f1": scores["f1"]}

async def main():
    """Run the multilingual evaluation and print the ROUGE F1 score per model."""
    outcomes = await test_multilingual(multilingual_dataset)

    print("\n=== Multilingual Summarization Results ===")
    for outcome in outcomes:
        if not outcome.success:
            print(f"\n{outcome.model_name}: FAILED - {outcome.error}")
            continue

        print(f"\n{outcome.model_name}:")
        print(f"  ROUGE F1: {outcome.result['rouge_f1']:.3f}")

asyncio.run(main())

Real-Time News Summarization

import asyncio
import time
from benchwise import evaluate, create_summarization_dataset, save_results, BenchmarkResult, benchmark, rouge_l

# Short news articles and their headline-style reference summaries
news_articles = [
    "Stock markets rallied today with tech stocks leading gains...",
    "Climate summit reaches historic agreement on emissions...",
]
news_references = [
    "Markets up, tech leads",
    "Climate deal reached",
]

news_dataset = create_summarization_dataset(
    name="news_articles",
    documents=news_articles,
    summaries=news_references,
)

@benchmark("News Summarization", "Real-time news article summarization")
@evaluate("gpt-4o-mini", "claude-3-5-haiku-20241022", "gemini-pro")
async def test_news_summarization(model, dataset):
    """Time a batch of news summaries and report ROUGE-L F1 with per-article latency.

    ``avg_latency`` is the wall-clock time of the single ``generate`` call
    divided by the number of summaries it returned.
    """
    # Short prompt aimed at fast, real-time style output
    requests = []
    for doc in dataset.prompts:
        requests.append(f"Headline and 2-sentence summary: {doc}")

    # Time only the generation call, not prompt building or scoring
    began = time.monotonic()
    outputs = await model.generate(requests, max_tokens=150, temperature=0.3)
    elapsed = time.monotonic() - began

    scores = rouge_l(outputs, dataset.references)
    f1 = scores["f1"]

    return {"rouge_f1": f1, "avg_latency": elapsed / len(outputs)}

async def main():
    """Run the news benchmark and report the models that meet the quality bar.

    Models whose run succeeded with a ROUGE F1 above 0.4 are printed in order
    of increasing per-article latency, followed by the name of the fastest
    one. Failed runs are reported rather than silently skipped.
    """
    results = await test_news_summarization(news_dataset)

    qualifying = []
    for result in results:
        if not result.success:
            print(f"{result.model_name}: FAILED - {result.error}")
        elif result.result["rouge_f1"] > 0.4:
            qualifying.append(result)

    # Find fastest model with good quality: order by latency, lowest first
    qualifying.sort(key=lambda r: r.result["avg_latency"])

    for result in qualifying:
        latency = result.result["avg_latency"]
        print(f"{result.model_name}: ROUGE {result.result['rouge_f1']:.3f}, "
              f"Latency: {latency:.2f}s per article")

    if qualifying:
        print(f"Fastest model with good quality: {qualifying[0].model_name}")
    else:
        print("No model reached ROUGE F1 > 0.4")

asyncio.run(main())

Saving Results

import asyncio
from benchwise import (
    evaluate,
    benchmark,
    create_summarization_dataset,
    rouge_l,
    save_results,
    BenchmarkResult,
    ResultsAnalyzer
)

# Reference summaries, in the same order as the documents below
summaries = [
    "Climate change is long-term temperature shifts mainly caused by human fossil fuel use since the 1800s.",
    "AI is the simulation of human intelligence in machines programmed to think and learn.",
]

# Documents to summarize
climate_document = """Climate change refers to long-term shifts in global temperatures and weather patterns.
    While climate variations are natural, since the 1800s human activities have been the main
    driver of climate change, primarily due to the burning of fossil fuels like coal, oil and
    gas, which produces heat-trapping gases."""

ai_document = """Artificial intelligence (AI) refers to the simulation of human intelligence in machines
    that are programmed to think and learn like humans. The term may also be applied to any
    machine that exhibits traits associated with a human mind such as learning and problem-solving."""

documents = [climate_document, ai_document]

# Dataset whose results are saved later in this example
summ_dataset = create_summarization_dataset(
    name="news_summarization",
    documents=documents,
    summaries=summaries,
)

@benchmark("News Summarization", "Evaluates summarization quality")
@evaluate("gpt-4", "claude-3-sonnet", "gemini-pro")
async def test_summarization(model, dataset):
    """Generate one-sentence summaries and return their ROUGE-L F1, precision and recall."""
    requests = []
    for doc in dataset.prompts:
        requests.append(f"Summarize this in one sentence: {doc}")
    outputs = await model.generate(requests, max_tokens=100, temperature=0)

    scores = rouge_l(outputs, dataset.references)

    report = {}
    report["rouge_l_f1"] = scores["f1"]
    report["rouge_l_precision"] = scores["precision"]
    report["rouge_l_recall"] = scores["recall"]
    return report

async def run_and_save():
    """Run the summarization benchmark, save the results and print a report.

    Collects every per-model result into a ``BenchmarkResult``, writes it to
    ``summarization_results.json`` and ``summarization_report.md``, then
    prints the markdown report produced by ``ResultsAnalyzer``.
    """
    # Imported locally so this snippet stays self-contained
    from datetime import date

    results = await test_summarization(summ_dataset)

    # Create benchmark result. The variable is deliberately not called
    # `benchmark`, which would shadow the imported `benchmark` decorator.
    benchmark_result = BenchmarkResult(
        "Summarization Benchmark",
        # Record the actual run date (ISO format) instead of a fixed literal
        metadata={"date": date.today().isoformat(), "task": "summarization"}
    )

    for result in results:
        benchmark_result.add_result(result)

    # Save results
    save_results(benchmark_result, "summarization_results.json", format="json")
    save_results(benchmark_result, "summarization_report.md", format="markdown")

    # Analyze
    report = ResultsAnalyzer.generate_report(benchmark_result, "markdown")
    print(report)

asyncio.run(run_and_save())

Next Steps

Safety Evaluation - Evaluate content safety
Classification - Text classification tasks
Metrics Guide - Learn about ROUGE and other metrics

Basic Summarization Evaluation

Multi-Metric Summarization

Abstractive vs Extractive Summarization

Length-Controlled Summarization

Domain-Specific Summarization

Bullet Point Summaries

Multilingual Summarization

Real-Time News Summarization

Saving Results

Next Steps

Basic Summarization Evaluation

Multi-Metric Summarization

Abstractive vs Extractive Summarization

Length-Controlled Summarization

Domain-Specific Summarization

Bullet Point Summaries

Multilingual Summarization

Real-Time News Summarization

Saving Results

Next Steps