// webspec_index/parse/algorithms.rs
use htmd::HtmlToMarkdown;
use scraper::{ElementRef, Node};
4
5pub fn render_algorithm_ol(ol_element: &ElementRef, converter: &HtmlToMarkdown) -> String {
9 let mut result = String::new();
10 let mut step_number = 1;
11
12 for child in ol_element.children() {
13 if let Some(child_element) = ElementRef::wrap(child) {
14 let tag_name = child_element.value().name();
15
16 if tag_name == "li" {
17 let step_text = render_li_recursive(&child_element, &[step_number], 0, converter);
18 result.push_str(&step_text);
19 step_number += 1;
20 } else {
21 let elem_md = converter
23 .convert(&child_element.html())
24 .unwrap_or_default()
25 .trim()
26 .to_string();
27
28 if !elem_md.is_empty() {
29 result.push_str("\n\n");
30 result.push_str(&elem_md);
31 result.push('\n');
32 }
33 }
34 }
35 }
36
37 result.trim_end().to_string()
38}
39
40fn render_li_recursive(
47 li: &ElementRef,
48 numbering: &[usize],
49 indent: usize,
50 converter: &HtmlToMarkdown,
51) -> String {
52 let mut result = String::new();
53
54 for _ in 0..indent {
56 result.push_str(" ");
57 }
58
59 let step_num = numbering.last().unwrap_or(&1);
61 result.push_str(&format!("{}. ", step_num));
62
63 let mut content_html = String::new();
66 let mut first_chunk = true;
67
68 for child in li.children() {
69 if let Some(child_element) = ElementRef::wrap(child) {
70 let tag_name = child_element.value().name();
71
72 if tag_name == "ol" {
73 flush_content_html(
75 &mut result,
76 &mut content_html,
77 &mut first_chunk,
78 indent,
79 converter,
80 );
81
82 result.push_str("\n\n");
84 let mut sub_step = 1;
85 for sub_child in child_element.children() {
86 if let Some(sub_li) = ElementRef::wrap(sub_child) {
87 if sub_li.value().name() == "li" {
88 let mut new_numbering = numbering.to_vec();
89 new_numbering.push(sub_step);
90 result.push_str(&render_li_recursive(
91 &sub_li,
92 &new_numbering,
93 indent + 1,
94 converter,
95 ));
96 sub_step += 1;
97 }
98 }
99 }
100 } else if tag_name == "ul" {
101 flush_content_html(
102 &mut result,
103 &mut content_html,
104 &mut first_chunk,
105 indent,
106 converter,
107 );
108
109 result.push_str("\n\n");
110 result.push_str(&render_ul(&child_element, indent + 1, converter));
111 } else {
112 content_html.push_str(&child_element.html());
114 }
115 } else if let Node::Text(text) = child.value() {
116 content_html.push_str(text);
117 }
118 }
119
120 flush_content_html(
122 &mut result,
123 &mut content_html,
124 &mut first_chunk,
125 indent,
126 converter,
127 );
128
129 if !result.ends_with('\n') {
131 result.push('\n');
132 }
133
134 result
135}
136
137fn flush_content_html(
140 result: &mut String,
141 content_html: &mut String,
142 first_chunk: &mut bool,
143 indent: usize,
144 converter: &HtmlToMarkdown,
145) {
146 if content_html.trim().is_empty() {
147 content_html.clear();
148 return;
149 }
150
151 let md = converter
152 .convert(content_html)
153 .unwrap_or_default()
154 .trim()
155 .to_string();
156 content_html.clear();
157
158 if md.is_empty() {
159 return;
160 }
161
162 if *first_chunk {
163 result.push_str(&md);
164 *first_chunk = false;
165 } else {
166 result.push_str("\n\n");
168 let indented = indent_lines(&md, indent + 1);
169 result.push_str(&indented);
170 }
171}
172
/// Prefixes every non-blank line of `text` with `indent` indentation units.
///
/// Lines that are empty or whitespace-only are passed through unchanged.
/// Lines are re-joined with `\n`, so a trailing newline in `text` is not
/// preserved.
fn indent_lines(text: &str, indent: usize) -> String {
    let pad = " ".repeat(indent);
    let mut out = String::with_capacity(text.len());

    for (idx, line) in text.lines().enumerate() {
        if idx > 0 {
            out.push('\n');
        }
        if !line.trim().is_empty() {
            out.push_str(&pad);
        }
        out.push_str(line);
    }

    out
}
187
188fn render_ul(ul: &ElementRef, indent: usize, converter: &HtmlToMarkdown) -> String {
190 let mut result = String::new();
191
192 for child in ul.children() {
193 if let Some(li_element) = ElementRef::wrap(child) {
194 if li_element.value().name() == "li" {
195 for _ in 0..indent {
197 result.push_str(" ");
198 }
199
200 result.push_str("* ");
202
203 let li_html = li_element.html();
205 let li_content = converter
206 .convert(&li_html)
207 .unwrap_or_default()
208 .trim()
209 .to_string();
210
211 let li_content = li_content.strip_prefix("*").unwrap_or(&li_content).trim();
213
214 result.push_str(li_content);
215 result.push('\n');
216 }
217 }
218 }
219
220 result
221}
222
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parse::markdown;
    use scraper::{Html, Selector};

    /// Builds the converter used by every test in this module.
    fn test_converter() -> HtmlToMarkdown {
        markdown::build_converter("https://test.example.com")
    }

    /// Parses `html`, picks the first element matching `css`, and renders it
    /// with `render_algorithm_ol`.
    fn render_first(html: &str, css: &str) -> String {
        let document = Html::parse_fragment(html);
        let selector = Selector::parse(css).unwrap();
        let list = document.select(&selector).next().unwrap();
        render_algorithm_ol(&list, &test_converter())
    }

    #[test]
    fn test_simple_algorithm() {
        let rendered = render_first(
            r#"
            <ol>
                <li><p>First step</p></li>
                <li><p>Second step</p></li>
                <li><p>Third step</p></li>
            </ol>
            "#,
            "ol",
        );

        // Flat list: steps are numbered sequentially.
        assert!(rendered.contains("1. First step"));
        assert!(rendered.contains("2. Second step"));
        assert!(rendered.contains("3. Third step"));
    }

    #[test]
    fn test_nested_algorithm() {
        let rendered = render_first(
            r#"
            <ol>
                <li><p>Step one</p></li>
                <li><p>Step two</p>
                    <ol>
                        <li><p>Sub-step 2.1</p></li>
                        <li><p>Sub-step 2.2</p></li>
                    </ol>
                </li>
                <li><p>Step three</p></li>
            </ol>
            "#,
            "ol",
        );

        // Sub-steps restart at 1 and are indented below their parent.
        assert!(rendered.contains("1. Step one"));
        assert!(rendered.contains("2. Step two"));
        assert!(rendered.contains(" 1. Sub-step 2.1"));
        assert!(rendered.contains(" 2. Sub-step 2.2"));
        assert!(rendered.contains("3. Step three"));
    }

    #[test]
    fn test_deeply_nested_algorithm() {
        let rendered = render_first(
            r#"
            <ol>
                <li><p>Level 1</p>
                    <ol>
                        <li><p>Level 1.1</p>
                            <ol>
                                <li><p>Level 1.1.1</p></li>
                            </ol>
                        </li>
                    </ol>
                </li>
            </ol>
            "#,
            "ol",
        );

        assert!(rendered.contains("1. Level 1"));
        assert!(rendered.contains(" 1. Level 1.1"));
        assert!(rendered.contains(" 1. Level 1.1.1"));
    }

    #[test]
    fn test_algorithm_with_var_and_code() {
        let rendered = render_first(
            r#"
            <ol>
                <li><p>Let <var>foo</var> be a <code>Document</code>.</p></li>
                <li><p>Return <var>foo</var>.</p></li>
            </ol>
            "#,
            "ol",
        );

        // <var> becomes emphasis and <code> becomes inline code.
        assert!(rendered.contains("1. Let *foo* be a `Document`."));
        assert!(rendered.contains("2. Return *foo*."));
    }

    #[test]
    fn test_algorithm_from_fixture() {
        let rendered = render_first(
            include_str!("../../tests/fixtures/algorithms/bikeshed_algorithm.html"),
            "div.algorithm ol",
        );

        // The fixture must produce at least two numbered steps.
        assert!(rendered.contains("1. "));
        assert!(rendered.contains("2. "));

        assert!(!rendered.trim().is_empty());
    }

    #[test]
    fn test_indentation() {
        let rendered = render_first(
            r#"
            <ol>
                <li><p>Top</p>
                    <ol>
                        <li><p>Nested</p></li>
                    </ol>
                </li>
            </ol>
            "#,
            "ol",
        );

        assert!(rendered.contains("1. Top"));
        assert!(rendered.contains(" 1. Nested"));

        // The nested step's own line must begin with the indentation.
        let nested_line = rendered
            .lines()
            .find(|line| line.contains("Nested"))
            .unwrap();
        assert!(nested_line.starts_with(" 1."));
    }

    #[test]
    fn test_note_between_steps() {
        let rendered = render_first(
            r#"
            <ol>
                <li><p>First step</p></li>
                <li><p>Second step</p>
                    <div class="note">
                        <p>This is a note between steps.</p>
                    </div>
                </li>
                <li><p>Third step</p></li>
            </ol>
            "#,
            "ol",
        );

        assert!(rendered.contains("1. First step"));
        assert!(rendered.contains("2. Second step"));
        assert!(rendered.contains("3. Third step"));

        assert!(
            rendered.contains("> **Note:** This is a note between steps."),
            "Note should be a blockquote: {}",
            rendered
        );

        // The note belongs to step 2, so it must precede step 3 in the output.
        let line_index = |needle: &str| {
            rendered
                .lines()
                .position(|line| line.contains(needle))
                .unwrap()
        };
        let step3_index = line_index("3. Third step");
        let note_index = line_index("> **Note:**");

        assert!(
            step3_index > note_index,
            "Step 3 should appear after the note"
        );
    }

    #[test]
    fn test_nested_bullet_list() {
        let rendered = render_first(
            r#"
            <ol>
                <li><p>If all of the following are true:</p>
                    <ul>
                        <li><var>x</var> is null;</li>
                        <li><var>y</var> is null;</li>
                    </ul>
                    <p>then return.</p>
                </li>
                <li><p>Next step</p></li>
            </ol>
            "#,
            "ol",
        );

        assert!(rendered.contains("1. If all of the following are true:"));

        // Bullets are indented beneath the step that owns them.
        assert!(rendered.contains(" * *x* is null;"));
        assert!(rendered.contains(" * *y* is null;"));

        // Content after the <ul> must stay after the bullets.
        let x_pos = rendered.find("*x* is null").expect("x bullet should exist");
        let y_pos = rendered.find("*y* is null").expect("y bullet should exist");
        let then_pos = rendered
            .find("then return")
            .expect("then return should exist");

        assert!(x_pos < then_pos, "bullets should come before 'then return'");
        assert!(y_pos < then_pos, "bullets should come before 'then return'");

        assert!(
            rendered.contains(" then return"),
            "continuation content should be indented"
        );

        assert!(rendered.contains("2. Next step"));
    }
}