Skip to main content

grapheme_stdlib/
html.rs

1use serde_json::{json, Value as JsonValue};
2
3pub fn to_md(args: &JsonValue) -> JsonValue {
4    let request = match ToMdRequest::from_args(args) {
5        Ok(request) => request,
6        Err(err) => {
7            return json!({
8                "error": err,
9                "text": "",
10                "markdown": "",
11            });
12        }
13    };
14
15    match html_to_markdown_rs::convert(&request.html, request.options.clone()) {
16        Ok(result) => {
17            let markdown = result.content.as_deref().unwrap_or_default().to_string();
18            let result_json = serde_json::to_value(&result).unwrap_or(JsonValue::Null);
19            json!({
20                "text": markdown.clone(),
21                "markdown": markdown,
22                "result": result_json,
23                "used_options": request.options,
24            })
25        }
26        Err(err) => json!({
27            "error": format!("html to markdown conversion failed: {err}"),
28            "text": "",
29            "markdown": "",
30        }),
31    }
32}
33
34pub fn clean_text(args: &JsonValue) -> JsonValue {
35    let request = CleanTextRequest::from_args(args);
36    let cleaned = clean_page_text_raw(&request.text, request.max_chars);
37    json!({
38        "text": cleaned,
39        "length": cleaned.chars().count(),
40    })
41}
42
/// Strips boilerplate and script/style residue from text extracted from a web page.
///
/// Processing is line based:
/// 1. Every line is trimmed; blank lines are dropped.
/// 2. A line starting with three backticks toggles "inside a code fence"; the
///    fence line and everything inside the fence are dropped.
/// 3. If the line contains an inline script/CSS marker (such as `" const "` or
///    `" @media "`), it is cut at the *earliest* marker occurrence. When that
///    leaves 20 bytes or fewer of leading text, the whole line is dropped.
/// 4. Lines that look like site chrome (cookie banners, "sign in", ...) or that
///    begin with a JavaScript/CSS keyword are dropped.
///
/// Surviving lines are joined with `\n`. When `max_chars` is `Some(limit)`, the
/// result is truncated to at most `limit` characters (`char`s, not bytes).
pub fn clean_page_text_raw(raw: &str, max_chars: Option<usize>) -> String {
    /// Substrings that signal script/CSS text glued onto the end of a prose line.
    const INLINE_MARKERS: &[&str] = &[
        " const ",
        " window.",
        " document.",
        " function ",
        " @media ",
        " input[type=",
        " { font-family",
        "::-webkit",
        " appearance:",
        " let ",
        " var ",
    ];
    /// A marker must start after this many bytes for the leading text to be kept.
    const MIN_PREFIX_BYTES: usize = 20;
    /// Case-insensitive prefixes of navigation/footer chrome.
    const CHROME_PREFIXES: &[&str] = &[
        "skip to",
        "open menu",
        "sign in",
        "sign up",
        "footer",
        "copyright",
    ];
    /// Case-insensitive phrases that mark a line as chrome wherever they occur.
    const CHROME_PHRASES: &[&str] = &[
        "cookie",
        "privacy policy",
        "terms of service",
        "keyboard shortcuts",
    ];
    /// Case-sensitive prefixes of lines that are script/CSS statements.
    const CODE_PREFIXES: &[&str] = &[
        "window.",
        "document.",
        "function ",
        "const ",
        "let ",
        "var ",
        "@media",
    ];

    let mut kept: Vec<&str> = Vec::new();
    let mut in_code = false;

    for line in raw.lines().map(str::trim) {
        if line.starts_with("```") {
            in_code = !in_code;
            continue;
        }
        if in_code || line.is_empty() {
            continue;
        }

        // Cut at the earliest marker in the line. Picking the first marker in
        // list order instead would leave script text behind when a marker
        // listed later appears sooner in the line.
        let cut = INLINE_MARKERS
            .iter()
            .filter_map(|&marker| line.find(marker))
            .min();
        let text = match cut {
            // `find` returns the start of a match, which is a char boundary.
            Some(idx) if idx > MIN_PREFIX_BYTES => line[..idx].trim_end(),
            // Too little text precedes the marker: treat the line as code.
            Some(_) => continue,
            None => line,
        };

        let lower = text.to_lowercase();
        let is_chrome = CHROME_PREFIXES.iter().any(|&p| lower.starts_with(p))
            || CHROME_PHRASES.iter().any(|&p| lower.contains(p));
        let is_code = CODE_PREFIXES.iter().any(|&p| text.starts_with(p));
        if is_chrome || is_code {
            continue;
        }

        kept.push(text);
    }

    let mut cleaned = kept.join("\n");
    if let Some(limit) = max_chars {
        // The byte offset of the char at index `limit` exists only when the
        // text is longer than `limit` chars; truncating there keeps `limit`.
        if let Some((byte_idx, _)) = cleaned.char_indices().nth(limit) {
            cleaned.truncate(byte_idx);
        }
    }
    cleaned
}
119
120#[derive(Debug, Clone)]
121struct ToMdRequest {
122    html: String,
123    options: Option<html_to_markdown_rs::ConversionOptions>,
124}
125
126impl ToMdRequest {
127    fn from_args(args: &JsonValue) -> Result<Self, String> {
128        Ok(Self {
129            html: arg_text(args, "html"),
130            options: parse_html_to_md_options(args)?,
131        })
132    }
133}
134
135#[derive(Debug, Clone)]
136struct CleanTextRequest {
137    text: String,
138    max_chars: Option<usize>,
139}
140
141impl CleanTextRequest {
142    fn from_args(args: &JsonValue) -> Self {
143        Self {
144            text: arg_text(args, "text"),
145            max_chars: arg_u64(args, "max_chars").map(|v| v as usize),
146        }
147    }
148}
149
150fn parse_html_to_md_options(
151    args: &JsonValue,
152) -> Result<Option<html_to_markdown_rs::ConversionOptions>, String> {
153    let Some(raw) = args.get("options") else {
154        return Ok(None);
155    };
156
157    if !raw.is_object() {
158        return Err("html.to_md options must be an object".to_string());
159    }
160
161    serde_json::from_value::<html_to_markdown_rs::ConversionOptions>(raw.clone())
162        .map(Some)
163        .map_err(|err| format!("invalid html.to_md options: {err}"))
164}
165
166fn arg_text(args: &JsonValue, key: &str) -> String {
167    args.get(key)
168        .and_then(|v| v.as_str())
169        .map(ToOwned::to_owned)
170        .or_else(|| {
171            args.get("__input")
172                .and_then(|v| v.as_str())
173                .map(ToOwned::to_owned)
174        })
175        .unwrap_or_default()
176}
177
178fn arg_u64(args: &JsonValue, key: &str) -> Option<u64> {
179    args.get(key).and_then(|v| {
180        v.as_u64()
181            .or_else(|| v.as_str().and_then(|s| s.parse::<u64>().ok()))
182    })
183}
184
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Script statements and navigation chrome are removed; prose survives.
    #[test]
    fn clean_text_filters_script_noise() {
        let payload = json!({
            "text": "const x = 1;\nKeyboard shortcuts\nReal content line"
        });

        let out = clean_text(&payload);
        let text = match out.get("text").and_then(|v| v.as_str()) {
            Some(text) => text,
            None => "",
        };

        assert!(text.contains("Real content line"));
        assert!(!text.contains("Keyboard shortcuts"));
        assert!(!text.contains("const x = 1"));
    }

    /// Valid options are accepted and echoed back under `used_options`.
    #[test]
    fn to_md_options_enable_document_structure() {
        let payload = json!({
            "html": "<h1>Title</h1><p>Hello</p>",
            "options": {
                "include_document_structure": true,
                "extract_metadata": true,
                "output_format": "markdown"
            }
        });

        let out = to_md(&payload);

        assert!(out.get("error").is_none());
        assert!(out.get("result").is_some());

        let used_options = out.get("used_options");
        let structure_flag = used_options
            .and_then(|v| v.get("include_document_structure"))
            .and_then(|v| v.as_bool());
        assert_eq!(structure_flag, Some(true));
    }

    /// A non-object `options` value yields an error message.
    #[test]
    fn to_md_rejects_non_object_options() {
        let payload = json!({
            "html": "<p>hello</p>",
            "options": "strict"
        });

        let out = to_md(&payload);
        let error = out.get("error").and_then(|v| v.as_str());

        assert!(error.is_some());
    }
}