Skip to main content

webfetch/convert/
structured.rs

1//! Structured conversion: emit the page as an ordered list of typed blocks,
2//! serialized to JSON. Links are preserved as reference indices (same scheme
3//! as the text path), so structured output is both machine-parseable and
4//! token-frugal inline.
5
6use serde::{Deserialize, Serialize};
7
8use super::text::html_to_text_with_refs;
9use crate::compress::compress_text;
10use crate::types::UrlReference;
11
/// A page converted to structured form: an ordered list of typed blocks
/// together with the link references collected during conversion.
///
/// Serializable and deserializable via serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StructuredDoc {
    /// Content blocks, in the order they were produced.
    pub blocks: Vec<Block>,
    /// Link references carried alongside the blocks.
    // NOTE(review): presumably these are the targets of the inline reference
    // indices mentioned in the module docs — confirm against `UrlReference`.
    pub references: Vec<UrlReference>,
}
17
/// One typed unit of page content inside a [`StructuredDoc`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Block {
    /// The type tag saying what sort of content this block holds.
    pub kind: BlockKind,
    /// The block's text content.
    pub text: String,
}
23
24#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
25#[serde(rename_all = "lowercase")]
26pub enum BlockKind {
27    Paragraph,
28}
29
30/// Build a structured document. Each non-empty line of the reference-style
31/// text becomes a paragraph block; references are carried alongside.
32pub fn html_to_structured(html: &str, base_url: &str) -> StructuredDoc {
33    let (text, references) = html_to_text_with_refs(html, base_url);
34    let blocks = text
35        .lines()
36        .map(compress_text)
37        .filter(|l| !l.is_empty())
38        .map(|text| Block {
39            kind: BlockKind::Paragraph,
40            text,
41        })
42        .collect();
43    StructuredDoc { blocks, references }
44}
45
46pub fn to_json(doc: &StructuredDoc) -> String {
47    serde_json::to_string_pretty(doc).unwrap_or_else(|_| "{}".to_string())
48}