Skip to main content

grapheme_stdlib/
html.rs

1use serde_json::{json, Value as JsonValue};
2
3pub fn to_md(args: &JsonValue) -> JsonValue {
4    let request = match ToMdRequest::from_args(args) {
5        Ok(request) => request,
6        Err(err) => {
7            return json!({
8                "error": err,
9                "text": "",
10                "markdown": "",
11            });
12        }
13    };
14
15    match html_to_markdown_rs::convert(&request.html, request.options.clone()) {
16        Ok(result) => {
17            let markdown = result.content.as_deref().unwrap_or_default().to_string();
18            let result_json = serde_json::to_value(&result).unwrap_or(JsonValue::Null);
19            json!({
20                "text": markdown.clone(),
21                "markdown": markdown,
22                "result": result_json,
23                "used_options": request.options,
24            })
25        }
26        Err(err) => json!({
27            "error": format!("html to markdown conversion failed: {err}"),
28            "text": "",
29            "markdown": "",
30        }),
31    }
32}
33
34pub fn clean_text(args: &JsonValue) -> JsonValue {
35    let request = CleanTextRequest::from_args(args);
36    let cleaned = clean_page_text_raw(&request.text, request.max_chars);
37    json!({
38        "text": cleaned,
39        "length": cleaned.chars().count(),
40    })
41}
42
/// Substrings that indicate inline script or stylesheet text beginning partway
/// through a line.
const INLINE_CODE_MARKERS: [&str; 11] = [
    " const ",
    " window.",
    " document.",
    " function ",
    " @media ",
    " input[type=",
    " { font-family",
    "::-webkit",
    " appearance:",
    " let ",
    " var ",
];

/// A marker must start beyond this byte offset for the text before it to be
/// kept; otherwise the whole line is treated as code and dropped.
const MIN_PROSE_PREFIX_BYTES: usize = 20;

/// Strips script/style residue and common site-chrome lines from page text.
///
/// Processing is line based:
/// * every line is trimmed, and blank lines are dropped;
/// * a line starting with three backticks toggles "inside code fence" state,
///   and both the fence lines and everything between them are dropped;
/// * if a line contains any inline-code marker, it is cut at the **earliest**
///   marker occurrence (or dropped entirely when that occurrence is within the
///   first 20 bytes);
/// * lines that look like navigation/legal boilerplate or that start with a
///   script/style keyword are dropped.
///
/// The surviving lines are joined with `\n`. When `max_chars` is `Some(limit)`
/// the result is truncated to at most `limit` `char`s (never splitting a
/// multi-byte character).
pub fn clean_page_text_raw(raw: &str, max_chars: Option<usize>) -> String {
    let mut kept: Vec<&str> = Vec::new();
    let mut in_code = false;

    for original in raw.lines() {
        let line = original.trim();

        if line.starts_with("```") {
            in_code = !in_code;
            continue;
        }
        if in_code || line.is_empty() {
            continue;
        }

        let candidate = strip_inline_code(line);
        if candidate.is_empty() || is_noise_line(candidate) {
            continue;
        }

        kept.push(candidate);
    }

    let mut cleaned = kept.join("\n");
    if let Some(limit) = max_chars {
        // The `limit`-th char exists only when the text is longer than `limit`
        // chars; its byte offset is exactly where the first `limit` chars end.
        if let Some((byte_idx, _)) = cleaned.char_indices().nth(limit) {
            cleaned.truncate(byte_idx);
        }
    }
    cleaned
}

/// Returns the prose part of `line` that precedes any inline code marker.
///
/// The cut is made at the earliest marker position in the line, regardless of
/// the order in which markers are listed, so no code fragment that precedes a
/// later marker can leak through. Yields an empty string when the earliest
/// marker leaves too little leading text to be worth keeping.
fn strip_inline_code(line: &str) -> &str {
    let earliest = INLINE_CODE_MARKERS
        .iter()
        .filter_map(|&marker| line.find(marker))
        .min();

    match earliest {
        // `find` returns the byte offset of a match start, which is always a
        // char boundary, so this slice cannot panic.
        Some(idx) if idx > MIN_PROSE_PREFIX_BYTES => line[..idx].trim(),
        Some(_) => "",
        None => line,
    }
}

/// Reports whether `line` is site boilerplate or starts with script/style code.
///
/// Boilerplate phrases are matched case-insensitively; code keywords are
/// matched case-sensitively against the line as given.
fn is_noise_line(line: &str) -> bool {
    const CHROME_PREFIXES: [&str; 6] = [
        "skip to",
        "open menu",
        "sign in",
        "sign up",
        "footer",
        "copyright",
    ];
    const CHROME_PHRASES: [&str; 4] = [
        "cookie",
        "privacy policy",
        "terms of service",
        "keyboard shortcuts",
    ];
    const CODE_PREFIXES: [&str; 7] = [
        "window.",
        "document.",
        "function ",
        "const ",
        "let ",
        "var ",
        "@media",
    ];

    let lower = line.to_lowercase();

    CHROME_PREFIXES.iter().any(|&prefix| lower.starts_with(prefix))
        || CHROME_PHRASES.iter().any(|&phrase| lower.contains(phrase))
        || CODE_PREFIXES.iter().any(|&prefix| line.starts_with(prefix))
}
119
120#[derive(Debug, Clone)]
121struct ToMdRequest {
122    html: String,
123    options: Option<html_to_markdown_rs::ConversionOptions>,
124}
125
126impl ToMdRequest {
127    fn from_args(args: &JsonValue) -> Result<Self, String> {
128        Ok(Self {
129            html: arg_text(args, "html"),
130            options: parse_html_to_md_options(args)?,
131        })
132    }
133}
134
135#[derive(Debug, Clone)]
136struct CleanTextRequest {
137    text: String,
138    max_chars: Option<usize>,
139}
140
141impl CleanTextRequest {
142    fn from_args(args: &JsonValue) -> Self {
143        Self {
144            text: arg_text(args, "text"),
145            max_chars: arg_u64(args, "max_chars").map(|v| v as usize),
146        }
147    }
148}
149
150fn parse_html_to_md_options(args: &JsonValue) -> Result<Option<html_to_markdown_rs::ConversionOptions>, String> {
151    let Some(raw) = args.get("options") else {
152        return Ok(None);
153    };
154
155    if !raw.is_object() {
156        return Err("html.to_md options must be an object".to_string());
157    }
158
159    serde_json::from_value::<html_to_markdown_rs::ConversionOptions>(raw.clone())
160        .map(Some)
161        .map_err(|err| format!("invalid html.to_md options: {err}"))
162}
163
164fn arg_text(args: &JsonValue, key: &str) -> String {
165    args.get(key)
166        .and_then(|v| v.as_str())
167        .map(ToOwned::to_owned)
168        .or_else(|| args.get("__input").and_then(|v| v.as_str()).map(ToOwned::to_owned))
169        .unwrap_or_default()
170}
171
172fn arg_u64(args: &JsonValue, key: &str) -> Option<u64> {
173    args.get(key)
174        .and_then(|v| v.as_u64().or_else(|| v.as_str().and_then(|s| s.parse::<u64>().ok())))
175}
176
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Script statements and site-chrome lines are removed; real text survives.
    #[test]
    fn clean_text_filters_script_noise() {
        let args = json!({
            "text": "const x = 1;\nKeyboard shortcuts\nReal content line"
        });
        let out = clean_text(&args);

        // Indexing yields `null` for a missing key, which maps to "" here.
        let text = out["text"].as_str().unwrap_or("");

        assert!(text.contains("Real content line"));
        assert!(!text.contains("Keyboard shortcuts"));
        assert!(!text.contains("const x = 1"));
    }

    /// Options passed as an object are accepted and echoed in `used_options`.
    #[test]
    fn to_md_options_enable_document_structure() {
        let args = json!({
            "html": "<h1>Title</h1><p>Hello</p>",
            "options": {
                "include_document_structure": true,
                "extract_metadata": true,
                "output_format": "markdown"
            }
        });
        let out = to_md(&args);

        assert!(out.get("error").is_none());
        assert!(out.get("result").is_some());

        let echoed_flag = out["used_options"]["include_document_structure"].as_bool();
        assert_eq!(echoed_flag, Some(true));
    }

    /// A non-object `options` value produces a string `error` field.
    #[test]
    fn to_md_rejects_non_object_options() {
        let args = json!({
            "html": "<p>hello</p>",
            "options": "strict"
        });
        let out = to_md(&args);

        assert!(out["error"].is_string());
    }
}
222}