1use serde_json::{json, Value as JsonValue};
2
3pub fn to_md(args: &JsonValue) -> JsonValue {
4 let request = match ToMdRequest::from_args(args) {
5 Ok(request) => request,
6 Err(err) => {
7 return json!({
8 "error": err,
9 "text": "",
10 "markdown": "",
11 });
12 }
13 };
14
15 match html_to_markdown_rs::convert(&request.html, request.options.clone()) {
16 Ok(result) => {
17 let markdown = result.content.as_deref().unwrap_or_default().to_string();
18 let result_json = serde_json::to_value(&result).unwrap_or(JsonValue::Null);
19 json!({
20 "text": markdown.clone(),
21 "markdown": markdown,
22 "result": result_json,
23 "used_options": request.options,
24 })
25 }
26 Err(err) => json!({
27 "error": format!("html to markdown conversion failed: {err}"),
28 "text": "",
29 "markdown": "",
30 }),
31 }
32}
33
34pub fn clean_text(args: &JsonValue) -> JsonValue {
35 let request = CleanTextRequest::from_args(args);
36 let cleaned = clean_page_text_raw(&request.text, request.max_chars);
37 json!({
38 "text": cleaned,
39 "length": cleaned.chars().count(),
40 })
41}
42
/// Substrings that signal inline script or stylesheet residue inside a text line.
const INLINE_NOISE_MARKERS: [&str; 11] = [
    " const ",
    " window.",
    " document.",
    " function ",
    " @media ",
    " input[type=",
    " { font-family",
    "::-webkit",
    " appearance:",
    " let ",
    " var ",
];

/// A marker must start beyond this byte offset for the text before it to be
/// kept; otherwise the whole line is discarded.
const MIN_KEPT_PREFIX_BYTES: usize = 20;

/// Strips code, script/style residue and navigation boilerplate from page text.
///
/// Processing is line based:
/// * every line is trimmed, and blank lines are dropped;
/// * lines starting with ` ``` ` toggle a fenced-code state; the fence lines and
///   everything between them are dropped;
/// * a line containing one of the inline noise markers is cut at the *earliest*
///   marker occurrence, or dropped entirely when that occurrence starts within
///   the first 20 bytes;
/// * lines that look like boilerplate (cookie banners, sign-in links, ...) or
///   that start with a script/CSS keyword are dropped.
///
/// The surviving lines are joined with `\n`. When `max_chars` is `Some(limit)`
/// the result is capped at `limit` `char`s (never splitting a code point).
pub fn clean_page_text_raw(raw: &str, max_chars: Option<usize>) -> String {
    let mut kept: Vec<&str> = Vec::new();
    let mut in_code = false;

    for original in raw.lines() {
        let line = original.trim();

        if line.starts_with("```") {
            in_code = !in_code;
            continue;
        }
        if in_code || line.is_empty() {
            continue;
        }

        let candidate = strip_inline_noise(line);
        if candidate.is_empty() || is_noise_line(candidate) {
            continue;
        }
        kept.push(candidate);
    }

    let mut cleaned = kept.join("\n");
    if let Some(limit) = max_chars {
        // The byte offset of the char at index `limit` is exactly where the
        // first `limit` chars end; `None` means the text is already short enough.
        if let Some((cut, _)) = cleaned.char_indices().nth(limit) {
            cleaned.truncate(cut);
        }
    }
    cleaned
}

/// Returns the part of `line` preceding the earliest inline noise marker,
/// or an empty string when that marker sits too close to the line start.
fn strip_inline_noise(line: &str) -> &str {
    // Use the minimum position over all markers: taking the first marker in
    // list order could cut after an earlier marker and leave script text behind.
    let earliest = INLINE_NOISE_MARKERS
        .iter()
        .filter_map(|marker| line.find(*marker))
        .min();

    match earliest {
        Some(idx) if idx > MIN_KEPT_PREFIX_BYTES => line[..idx].trim(),
        Some(_) => "",
        None => line,
    }
}

/// Reports whether a trimmed, non-empty line is boilerplate or script/CSS code.
fn is_noise_line(candidate: &str) -> bool {
    // Matched case-insensitively at the start of the line.
    const BOILERPLATE_PREFIXES: [&str; 6] = [
        "skip to",
        "open menu",
        "sign in",
        "sign up",
        "footer",
        "copyright",
    ];
    // Matched case-insensitively anywhere in the line.
    const BOILERPLATE_FRAGMENTS: [&str; 4] = [
        "cookie",
        "privacy policy",
        "terms of service",
        "keyboard shortcuts",
    ];
    // Matched case-sensitively at the start of the line.
    const CODE_PREFIXES: [&str; 7] = [
        "window.",
        "document.",
        "function ",
        "const ",
        "let ",
        "var ",
        "@media",
    ];

    let lower = candidate.to_lowercase();
    BOILERPLATE_PREFIXES
        .iter()
        .any(|prefix| lower.starts_with(*prefix))
        || BOILERPLATE_FRAGMENTS
            .iter()
            .any(|fragment| lower.contains(*fragment))
        || CODE_PREFIXES
            .iter()
            .any(|prefix| candidate.starts_with(*prefix))
}
119
120#[derive(Debug, Clone)]
121struct ToMdRequest {
122 html: String,
123 options: Option<html_to_markdown_rs::ConversionOptions>,
124}
125
126impl ToMdRequest {
127 fn from_args(args: &JsonValue) -> Result<Self, String> {
128 Ok(Self {
129 html: arg_text(args, "html"),
130 options: parse_html_to_md_options(args)?,
131 })
132 }
133}
134
135#[derive(Debug, Clone)]
136struct CleanTextRequest {
137 text: String,
138 max_chars: Option<usize>,
139}
140
141impl CleanTextRequest {
142 fn from_args(args: &JsonValue) -> Self {
143 Self {
144 text: arg_text(args, "text"),
145 max_chars: arg_u64(args, "max_chars").map(|v| v as usize),
146 }
147 }
148}
149
150fn parse_html_to_md_options(args: &JsonValue) -> Result<Option<html_to_markdown_rs::ConversionOptions>, String> {
151 let Some(raw) = args.get("options") else {
152 return Ok(None);
153 };
154
155 if !raw.is_object() {
156 return Err("html.to_md options must be an object".to_string());
157 }
158
159 serde_json::from_value::<html_to_markdown_rs::ConversionOptions>(raw.clone())
160 .map(Some)
161 .map_err(|err| format!("invalid html.to_md options: {err}"))
162}
163
164fn arg_text(args: &JsonValue, key: &str) -> String {
165 args.get(key)
166 .and_then(|v| v.as_str())
167 .map(ToOwned::to_owned)
168 .or_else(|| args.get("__input").and_then(|v| v.as_str()).map(ToOwned::to_owned))
169 .unwrap_or_default()
170}
171
172fn arg_u64(args: &JsonValue, key: &str) -> Option<u64> {
173 args.get(key)
174 .and_then(|v| v.as_u64().or_else(|| v.as_str().and_then(|s| s.parse::<u64>().ok())))
175}
176
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Script-like lines and navigation boilerplate are removed; real content stays.
    #[test]
    fn clean_text_filters_script_noise() {
        let input = json!({
            "text": "const x = 1;\nKeyboard shortcuts\nReal content line"
        });
        let response = clean_text(&input);
        let cleaned = response
            .get("text")
            .and_then(|value| value.as_str())
            .unwrap_or("");

        assert!(cleaned.contains("Real content line"));
        for unwanted in ["Keyboard shortcuts", "const x = 1"] {
            assert!(!cleaned.contains(unwanted));
        }
    }

    /// Supplied options are accepted and echoed back under `used_options`.
    #[test]
    fn to_md_options_enable_document_structure() {
        let input = json!({
            "html": "<h1>Title</h1><p>Hello</p>",
            "options": {
                "include_document_structure": true,
                "extract_metadata": true,
                "output_format": "markdown"
            }
        });
        let response = to_md(&input);

        assert!(response.get("error").is_none());
        assert!(response.get("result").is_some());

        let structure_flag = response
            .get("used_options")
            .and_then(|options| options.get("include_document_structure"))
            .and_then(|flag| flag.as_bool());
        assert_eq!(structure_flag, Some(true));
    }

    /// A non-object `options` value produces an error response.
    #[test]
    fn to_md_rejects_non_object_options() {
        let input = json!({
            "html": "<p>hello</p>",
            "options": "strict"
        });
        let response = to_md(&input);
        let error = response.get("error").and_then(|value| value.as_str());
        assert!(error.is_some());
    }
}