webfetch/convert/
structured.rs1use serde::{Deserialize, Serialize};
7
8use super::text::html_to_text_with_refs;
9use crate::compress::compress_text;
10use crate::types::UrlReference;
11
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct StructuredDoc {
14 pub blocks: Vec<Block>,
15 pub references: Vec<UrlReference>,
16}
17
18#[derive(Debug, Clone, Serialize, Deserialize)]
19pub struct Block {
20 pub kind: BlockKind,
21 pub text: String,
22}
23
24#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
25#[serde(rename_all = "lowercase")]
26pub enum BlockKind {
27 Paragraph,
28}
29
30pub fn html_to_structured(html: &str, base_url: &str) -> StructuredDoc {
33 let (text, references) = html_to_text_with_refs(html, base_url);
34 let blocks = text
35 .lines()
36 .map(compress_text)
37 .filter(|l| !l.is_empty())
38 .map(|text| Block {
39 kind: BlockKind::Paragraph,
40 text,
41 })
42 .collect();
43 StructuredDoc { blocks, references }
44}
45
46pub fn to_json(doc: &StructuredDoc) -> String {
47 serde_json::to_string_pretty(doc).unwrap_or_else(|_| "{}".to_string())
48}