web2llm 0.4.0

Fetch web pages and convert to clean Markdown for LLM pipelines
Documentation
use criterion::{Criterion, criterion_group, criterion_main};
use std::hint::black_box;
use std::time::Duration;
use web2llm::config::Web2llmConfig;
use web2llm::{CrawlConfig, FetchMode, Web2llm};
use wiremock::matchers::{method, path};
use wiremock::{Mock, MockServer, ResponseTemplate};

fn test_client() -> Web2llm {
    let config = Web2llmConfig {
        user_agent: "web2llm-benchmark".to_string(),
        timeout: Duration::from_secs(30),
        block_private_hosts: false,
        robots_check: false,
        rate_limit: 1000,
        max_concurrency: 100,
        fetch_mode: FetchMode::Static,
        ..Default::default()
    };
    Web2llm::new(config).unwrap()
}

fn benchmark_extraction_wikipedia(c: &mut Criterion) {
    let html = std::fs::read_to_string("benchmarks/fixtures/wikipedia.html")
        .expect("missing benchmark fixture");
    let rt = tokio::runtime::Runtime::new().unwrap();

    let (server, client) = rt.block_on(async {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .respond_with(ResponseTemplate::new(200).set_body_string(html))
            .mount(&server)
            .await;

        let client = test_client();
        (server, client)
    });

    c.bench_function("extract_wikipedia", |b| {
        b.to_async(&rt)
            .iter(|| async { client.fetch(black_box(&server.uri())).await.unwrap() })
    });
}

fn benchmark_extraction_simple(c: &mut Criterion) {
    let html = "<html><body><article>Just some simple content for a fast benchmark.</article></body></html>";
    let rt = tokio::runtime::Runtime::new().unwrap();

    let (server, client) = rt.block_on(async {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .respond_with(ResponseTemplate::new(200).set_body_string(html))
            .mount(&server)
            .await;

        let client = test_client();
        (server, client)
    });

    c.bench_function("extract_simple", |b| {
        b.to_async(&rt)
            .iter(|| async { client.fetch(black_box(&server.uri())).await.unwrap() })
    });
}

fn benchmark_batch_wikipedia(c: &mut Criterion) {
    let html = std::fs::read_to_string("benchmarks/fixtures/wikipedia.html")
        .expect("missing benchmark fixture");
    let rt = tokio::runtime::Runtime::new().unwrap();

    let (server, client) = rt.block_on(async {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .respond_with(ResponseTemplate::new(200).set_body_string(html))
            .mount(&server)
            .await;

        let client = test_client();
        (server, client)
    });

    c.bench_function("batch_fetch_wikipedia_100x", |b| {
        b.to_async(&rt).iter(|| async {
            let urls = vec![server.uri(); 100];
            client.batch_fetch(black_box(urls)).await
        })
    });
}

fn benchmark_crawl_depth_one(c: &mut Criterion) {
    let rt = tokio::runtime::Runtime::new().unwrap();

    let (server, client) = rt.block_on(async {
        let server = MockServer::start().await;

        Mock::given(path("/"))
            .respond_with(ResponseTemplate::new(200).set_body_string(format!(
                r#"
                <html><body>
                    <article>
                        Seed content with enough text to score properly.
                        <a href="{0}/one">One</a>
                        <a href="{0}/two">Two</a>
                    </article>
                </body></html>
                "#,
                server.uri()
            )))
            .mount(&server)
            .await;

        Mock::given(path("/one"))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"
                <html><body>
                    <article>First child content with enough text to score properly.</article>
                </body></html>
                "#,
            ))
            .mount(&server)
            .await;

        Mock::given(path("/two"))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"
                <html><body>
                    <article>Second child content with enough text to score properly.</article>
                </body></html>
                "#,
            ))
            .mount(&server)
            .await;

        let client = test_client();
        (server, client)
    });

    c.bench_function("crawl_depth_one", |b| {
        b.to_async(&rt).iter(|| async {
            client
                .crawl(
                    black_box(&server.uri()),
                    CrawlConfig {
                        max_depth: 1,
                        preserve_domain: true,
                    },
                )
                .await
        })
    });
}

criterion_group!(
    benches,
    benchmark_extraction_wikipedia,
    benchmark_extraction_simple,
    benchmark_batch_wikipedia,
    benchmark_crawl_depth_one
);
criterion_main!(benches);