crw-extract 0.6.2

HTML extraction and markdown conversion engine for the CRW web scraper
Documentation
use crw_core::types::OutputFormat;
use crw_extract::ExtractOptions;

/// Runs the extractor over a small inline HTML page and prints diagnostics
/// about the resulting Markdown.
///
/// The fixture has the same text in `<title>` and `og:title`, a `<nav>` that
/// contains an `<h1>`, and a single-paragraph `<article>`. Extraction is
/// requested with `only_main_content: true` and Markdown as the only output
/// format. The program then prints:
///
/// * the parsed `og_title` metadata,
/// * the Markdown length in bytes,
/// * the first 200 bytes of the Markdown (fewer if the cut would split a
///   character),
/// * whether the Markdown contains the title text, and
/// * whether the Markdown starts with the title rendered as an H1.
///
/// # Panics
///
/// Panics if `crw_extract::extract` returns an error or if the result carries
/// no Markdown.
fn main() {
    let html = r##"<html><head>
        <title>New extended temperature range for Compute Module 4 - Raspberry Pi</title>
        <meta property="og:title" content="New extended temperature range for Compute Module 4 - Raspberry Pi">
    </head><body>
        <nav><h1>News</h1><a href="/news">All news</a></nav>
        <article><p>While the Raspberry Pi project has its origins in education, the majority of Raspberry Pi computers we make today are destined for industrial and embedded applications. Compute Module 4 has been used by thousands of customers in challenging environments.</p></article>
    </body></html>"##;
    let data = crw_extract::extract(ExtractOptions {
        raw_html: html,
        source_url: "https://www.raspberrypi.com/news/x/",
        status_code: 200,
        rendered_with: Some("http".into()),
        elapsed_ms: 0,
        render_decision: None,
        credit_cost: 0,
        warnings: Vec::new(),
        formats: &[OutputFormat::Markdown],
        only_main_content: true,
        include_tags: &[],
        exclude_tags: &[],
        css_selector: None,
        xpath: None,
        chunk_strategy: None,
        query: None,
        filter_mode: None,
        top_k: None,
        domain_selectors: None,
        captured_responses: &[],
        llm_fallback: None,
        debug: false,
        debug_sink: None,
    })
    .expect("crw_extract::extract failed on the inline HTML fixture");
    let md = data
        .markdown
        .expect("extract returned no markdown although OutputFormat::Markdown was requested");
    println!("og_title: {:?}", data.metadata.og_title);
    println!("md_len: {}", md.len());

    // Slicing a `str` at a byte offset inside a multi-byte character panics,
    // so back the cut up to the nearest character boundary. Offset 0 is always
    // a boundary, which guarantees the loop terminates.
    let mut preview_end = md.len().min(200);
    while !md.is_char_boundary(preview_end) {
        preview_end -= 1;
    }
    println!("first 200:\n{}", &md[..preview_end]);
    println!();
    println!(
        "contains core: {}",
        md.contains("New extended temperature range for Compute Module 4")
    );
    println!(
        "starts with prepended title H1: {}",
        md.starts_with("# New extended temperature range")
    );
}