Skip to main content

webfetch/convert/
mod.rs

1//! Output dispatcher: routes an HTML document to the requested format.
2
3pub mod markdown;
4pub mod structured;
5pub mod text;
6
7use crate::compress::{compress_block, compress_text};
8use crate::types::{ContentType, UrlReference};
9
/// Reports whether `name` is an element whose contents are left out of
/// extracted output (scripts, styling, embedded documents).
///
/// Shared by the walkers so that every output format drops the same set of
/// elements. The check is an exact string comparison against the names in
/// the table below.
pub(crate) fn is_skippable(name: &str) -> bool {
    const SKIPPED: [&str; 7] = [
        "script", "style", "noscript", "svg", "head", "template", "iframe",
    ];
    SKIPPED.contains(&name)
}
19
/// A converted document: the rendered `content` plus any preserved references.
///
/// Returned by [`convert`]; what each field holds depends on the requested
/// content type.
pub struct Converted {
    /// Rendered output in the requested format (plain text, Markdown, or the
    /// JSON serialization of the structured document).
    pub content: String,
    /// References preserved alongside the content. Empty for Markdown output.
    pub references: Vec<UrlReference>,
}
25
26/// Convert HTML to the requested content type.
27///
28/// For [`ContentType::Text`], the reference list is rendered into a trailing
29/// `References:` block appended to the content (and also returned separately).
30pub fn convert(html: &str, base_url: &str, content_type: ContentType) -> Converted {
31    match content_type {
32        ContentType::Text => {
33            let (body, references) = text::html_to_text_with_refs(html, base_url);
34            let body = compress_block(&body);
35            let refs_block = text::render_references(&references);
36            let content = if refs_block.is_empty() {
37                body
38            } else {
39                format!("{}\n\n{}", body, refs_block)
40            };
41            Converted {
42                content,
43                references,
44            }
45        }
46        ContentType::Markdown => {
47            let md = markdown::html_to_markdown(html, base_url);
48            Converted {
49                content: compress_block(&md),
50                references: Vec::new(),
51            }
52        }
53        ContentType::Structured => {
54            let doc = structured::html_to_structured(html, base_url);
55            let _ = compress_text; // available for callers extending block kinds
56            Converted {
57                content: structured::to_json(&doc),
58                references: doc.references,
59            }
60        }
61    }
62}