1use serde_json::{json, Value as JsonValue};
2
3pub fn to_md(args: &JsonValue) -> JsonValue {
4 let request = match ToMdRequest::from_args(args) {
5 Ok(request) => request,
6 Err(err) => {
7 return json!({
8 "error": err,
9 "text": "",
10 "markdown": "",
11 });
12 }
13 };
14
15 match html_to_markdown_rs::convert(&request.html, request.options.clone()) {
16 Ok(result) => {
17 let markdown = result.content.as_deref().unwrap_or_default().to_string();
18 let result_json = serde_json::to_value(&result).unwrap_or(JsonValue::Null);
19 json!({
20 "text": markdown.clone(),
21 "markdown": markdown,
22 "result": result_json,
23 "used_options": request.options,
24 })
25 }
26 Err(err) => json!({
27 "error": format!("html to markdown conversion failed: {err}"),
28 "text": "",
29 "markdown": "",
30 }),
31 }
32}
33
34pub fn clean_text(args: &JsonValue) -> JsonValue {
35 let request = CleanTextRequest::from_args(args);
36 let cleaned = clean_page_text_raw(&request.text, request.max_chars);
37 json!({
38 "text": cleaned,
39 "length": cleaned.chars().count(),
40 })
41}
42
/// Substrings that mark the start of inline script/CSS residue inside a text line.
const INLINE_NOISE_MARKERS: &[&str] = &[
    " const ",
    " window.",
    " document.",
    " function ",
    " @media ",
    " input[type=",
    " { font-family",
    "::-webkit",
    " appearance:",
    " let ",
    " var ",
];

/// A marker must sit beyond this many bytes for the text before it to be kept;
/// otherwise the whole line is treated as script/CSS residue.
const MIN_KEPT_PREFIX_BYTES: usize = 20;

/// Case-insensitive prefixes of boilerplate lines (compared against the lowercased line).
const BOILERPLATE_PREFIXES: &[&str] = &[
    "skip to",
    "open menu",
    "sign in",
    "sign up",
    "footer",
    "copyright",
];

/// Case-insensitive substrings of boilerplate lines (compared against the lowercased line).
const BOILERPLATE_SUBSTRINGS: &[&str] = &[
    "cookie",
    "privacy policy",
    "terms of service",
    "keyboard shortcuts",
];

/// Case-sensitive prefixes of lines that are script or CSS statements.
const SCRIPT_PREFIXES: &[&str] = &[
    "window.",
    "document.",
    "function ",
    "const ",
    "let ",
    "var ",
    "@media",
];

/// Strips script, style and navigation noise from extracted page text.
///
/// Processing is line based:
/// - lines starting with a ``` fence toggle "code" mode; the fence lines and
///   everything between them are dropped,
/// - blank lines are dropped,
/// - inline script/CSS residue is cut off at the earliest marker in the line
///   (the whole line is dropped when that marker is within the first 20 bytes),
/// - boilerplate lines (cookie banners, "skip to", "sign in", ...) and lines
///   that start like a script statement are dropped.
///
/// The surviving lines are trimmed and joined with `\n`. When `max_chars` is
/// `Some(limit)`, the result is truncated to at most `limit` characters
/// (counted in `char`s, so multi-byte text is never split mid-character).
pub fn clean_page_text_raw(raw: &str, max_chars: Option<usize>) -> String {
    let mut kept: Vec<&str> = Vec::new();
    let mut in_code = false;

    for original in raw.lines() {
        let line = original.trim();

        if line.starts_with("```") {
            in_code = !in_code;
            continue;
        }
        if in_code || line.is_empty() {
            continue;
        }

        let candidate = strip_inline_noise(line).trim();
        if candidate.is_empty() || is_noise_line(candidate) {
            continue;
        }

        kept.push(candidate);
    }

    let cleaned = kept.join("\n");
    match max_chars {
        Some(limit) => truncate_chars(cleaned, limit),
        None => cleaned,
    }
}

/// Returns the part of `line` that precedes the earliest inline-noise marker.
///
/// The earliest *position* wins regardless of the marker's place in the list,
/// so no script fragment that starts before another marker can survive.
fn strip_inline_noise(line: &str) -> &str {
    let earliest = INLINE_NOISE_MARKERS
        .iter()
        .filter_map(|marker| line.find(*marker))
        .min();

    match earliest {
        // `find` returns a char-boundary byte offset, so slicing here cannot panic.
        Some(idx) if idx > MIN_KEPT_PREFIX_BYTES => &line[..idx],
        Some(_) => "",
        None => line,
    }
}

/// Reports whether a trimmed line is navigation/legal boilerplate or a script statement.
fn is_noise_line(candidate: &str) -> bool {
    let lower = candidate.to_lowercase();

    BOILERPLATE_PREFIXES
        .iter()
        .any(|prefix| lower.starts_with(*prefix))
        || BOILERPLATE_SUBSTRINGS
            .iter()
            .any(|needle| lower.contains(*needle))
        || SCRIPT_PREFIXES
            .iter()
            .any(|prefix| candidate.starts_with(*prefix))
}

/// Shortens `text` in place to at most `limit` characters.
fn truncate_chars(mut text: String, limit: usize) -> String {
    // The byte offset of the character at index `limit` is where the string must end;
    // when there is no such character the text already fits.
    if let Some((byte_idx, _)) = text.char_indices().nth(limit) {
        text.truncate(byte_idx);
    }
    text
}
119
120#[derive(Debug, Clone)]
121struct ToMdRequest {
122 html: String,
123 options: Option<html_to_markdown_rs::ConversionOptions>,
124}
125
126impl ToMdRequest {
127 fn from_args(args: &JsonValue) -> Result<Self, String> {
128 Ok(Self {
129 html: arg_text(args, "html"),
130 options: parse_html_to_md_options(args)?,
131 })
132 }
133}
134
135#[derive(Debug, Clone)]
136struct CleanTextRequest {
137 text: String,
138 max_chars: Option<usize>,
139}
140
141impl CleanTextRequest {
142 fn from_args(args: &JsonValue) -> Self {
143 Self {
144 text: arg_text(args, "text"),
145 max_chars: arg_u64(args, "max_chars").map(|v| v as usize),
146 }
147 }
148}
149
150fn parse_html_to_md_options(
151 args: &JsonValue,
152) -> Result<Option<html_to_markdown_rs::ConversionOptions>, String> {
153 let Some(raw) = args.get("options") else {
154 return Ok(None);
155 };
156
157 if !raw.is_object() {
158 return Err("html.to_md options must be an object".to_string());
159 }
160
161 serde_json::from_value::<html_to_markdown_rs::ConversionOptions>(raw.clone())
162 .map(Some)
163 .map_err(|err| format!("invalid html.to_md options: {err}"))
164}
165
166fn arg_text(args: &JsonValue, key: &str) -> String {
167 args.get(key)
168 .and_then(|v| v.as_str())
169 .map(ToOwned::to_owned)
170 .or_else(|| {
171 args.get("__input")
172 .and_then(|v| v.as_str())
173 .map(ToOwned::to_owned)
174 })
175 .unwrap_or_default()
176}
177
178fn arg_u64(args: &JsonValue, key: &str) -> Option<u64> {
179 args.get(key).and_then(|v| {
180 v.as_u64()
181 .or_else(|| v.as_str().and_then(|s| s.parse::<u64>().ok()))
182 })
183}
184
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Script statements and boilerplate lines are removed; real content stays.
    #[test]
    fn clean_text_filters_script_noise() {
        let input = json!({
            "text": "const x = 1;\nKeyboard shortcuts\nReal content line"
        });

        let out = clean_text(&input);
        // Indexing a missing key yields `Null`, so this falls back to "" just like `get`.
        let text = out["text"].as_str().unwrap_or("");

        assert!(text.contains("Real content line"));
        assert!(!text.contains("Keyboard shortcuts"));
        assert!(!text.contains("const x = 1"));
    }

    /// Valid options are accepted and echoed back under `used_options`.
    #[test]
    fn to_md_options_enable_document_structure() {
        let input = json!({
            "html": "<h1>Title</h1><p>Hello</p>",
            "options": {
                "include_document_structure": true,
                "extract_metadata": true,
                "output_format": "markdown"
            }
        });

        let out = to_md(&input);

        assert!(out.get("error").is_none());
        assert!(out.get("result").is_some());

        let echoed = out
            .pointer("/used_options/include_document_structure")
            .and_then(JsonValue::as_bool);
        assert_eq!(echoed, Some(true));
    }

    /// A non-object `options` value produces a string error message.
    #[test]
    fn to_md_rejects_non_object_options() {
        let input = json!({
            "html": "<p>hello</p>",
            "options": "strict"
        });

        let out = to_md(&input);

        assert!(out["error"].is_string());
    }
}