Skip to main content

webspec_index/parse/
algorithms.rs

// Algorithm rendering: convert <ol>/<li> to numbered markdown text
2use htmd::HtmlToMarkdown;
3use scraper::{ElementRef, Node};
4
5/// Render an algorithm's `<ol>` element with markdown-style numbering.
6/// Nested lists use simple numbering (1., 2., etc.) with indentation - markdown handles visual hierarchy.
7/// Inline content is converted to markdown using the provided converter.
8pub fn render_algorithm_ol(ol_element: &ElementRef, converter: &HtmlToMarkdown) -> String {
9    let mut result = String::new();
10    let mut step_number = 1;
11
12    for child in ol_element.children() {
13        if let Some(child_element) = ElementRef::wrap(child) {
14            let tag_name = child_element.value().name();
15
16            if tag_name == "li" {
17                let step_text = render_li_recursive(&child_element, &[step_number], 0, converter);
18                result.push_str(&step_text);
19                step_number += 1;
20            } else {
21                // Handle other elements between list items (notes, examples, etc.)
22                let elem_md = converter
23                    .convert(&child_element.html())
24                    .unwrap_or_default()
25                    .trim()
26                    .to_string();
27
28                if !elem_md.is_empty() {
29                    result.push_str("\n\n");
30                    result.push_str(&elem_md);
31                    result.push('\n');
32                }
33            }
34        }
35    }
36
37    result.trim_end().to_string()
38}
39
40/// Recursively render a `<li>` element with simple numbering (markdown handles hierarchy via indentation)
41///
42/// Collects inline content (text, `<var>`, `<emu-xref>`, `<p>`, etc.) into HTML chunks
43/// and converts each chunk to markdown in one pass. This prevents inline elements from
44/// being broken across lines. Nested `<ol>`/`<ul>` are rendered separately with proper
45/// indentation.
46fn render_li_recursive(
47    li: &ElementRef,
48    numbering: &[usize],
49    indent: usize,
50    converter: &HtmlToMarkdown,
51) -> String {
52    let mut result = String::new();
53
54    // Add indentation (4 spaces per level for markdown list continuation)
55    for _ in 0..indent {
56        result.push_str("    ");
57    }
58
59    // Add step number
60    let step_num = numbering.last().unwrap_or(&1);
61    result.push_str(&format!("{}. ", step_num));
62
63    // Process children in document order. Accumulate inline/block content HTML into a
64    // buffer, flush it when we hit a nested list, then render the list recursively.
65    let mut content_html = String::new();
66    let mut first_chunk = true;
67
68    for child in li.children() {
69        if let Some(child_element) = ElementRef::wrap(child) {
70            let tag_name = child_element.value().name();
71
72            if tag_name == "ol" {
73                // Flush accumulated content before the nested list
74                flush_content_html(
75                    &mut result,
76                    &mut content_html,
77                    &mut first_chunk,
78                    indent,
79                    converter,
80                );
81
82                // Nested numbered list
83                result.push_str("\n\n");
84                let mut sub_step = 1;
85                for sub_child in child_element.children() {
86                    if let Some(sub_li) = ElementRef::wrap(sub_child) {
87                        if sub_li.value().name() == "li" {
88                            let mut new_numbering = numbering.to_vec();
89                            new_numbering.push(sub_step);
90                            result.push_str(&render_li_recursive(
91                                &sub_li,
92                                &new_numbering,
93                                indent + 1,
94                                converter,
95                            ));
96                            sub_step += 1;
97                        }
98                    }
99                }
100            } else if tag_name == "ul" {
101                flush_content_html(
102                    &mut result,
103                    &mut content_html,
104                    &mut first_chunk,
105                    indent,
106                    converter,
107                );
108
109                result.push_str("\n\n");
110                result.push_str(&render_ul(&child_element, indent + 1, converter));
111            } else {
112                // Accumulate element HTML (p, var, emu-xref, div, span, etc.)
113                content_html.push_str(&child_element.html());
114            }
115        } else if let Node::Text(text) = child.value() {
116            content_html.push_str(text);
117        }
118    }
119
120    // Flush any remaining content
121    flush_content_html(
122        &mut result,
123        &mut content_html,
124        &mut first_chunk,
125        indent,
126        converter,
127    );
128
129    // Ensure the step ends with a newline
130    if !result.ends_with('\n') {
131        result.push('\n');
132    }
133
134    result
135}
136
137/// Convert accumulated HTML content to markdown and append to the result.
138/// First chunk goes inline with the step number; subsequent chunks get continuation indentation.
139fn flush_content_html(
140    result: &mut String,
141    content_html: &mut String,
142    first_chunk: &mut bool,
143    indent: usize,
144    converter: &HtmlToMarkdown,
145) {
146    if content_html.trim().is_empty() {
147        content_html.clear();
148        return;
149    }
150
151    let md = converter
152        .convert(content_html)
153        .unwrap_or_default()
154        .trim()
155        .to_string();
156    content_html.clear();
157
158    if md.is_empty() {
159        return;
160    }
161
162    if *first_chunk {
163        result.push_str(&md);
164        *first_chunk = false;
165    } else {
166        // Continuation content after a nested list needs indentation
167        result.push_str("\n\n");
168        let indented = indent_lines(&md, indent + 1);
169        result.push_str(&indented);
170    }
171}
172
/// Prefix each non-blank line of `text` with `indent` levels of 4-space indentation.
///
/// Lines that are empty or whitespace-only are passed through untouched. Lines are
/// re-joined with `\n`, so a trailing newline in the input is not preserved.
fn indent_lines(text: &str, indent: usize) -> String {
    let pad = "    ".repeat(indent);
    let mut out = String::new();

    for (idx, line) in text.lines().enumerate() {
        if idx > 0 {
            out.push('\n');
        }
        if !line.trim().is_empty() {
            out.push_str(&pad);
        }
        out.push_str(line);
    }

    out
}
187
188/// Render a `<ul>` element with proper indentation
189fn render_ul(ul: &ElementRef, indent: usize, converter: &HtmlToMarkdown) -> String {
190    let mut result = String::new();
191
192    for child in ul.children() {
193        if let Some(li_element) = ElementRef::wrap(child) {
194            if li_element.value().name() == "li" {
195                // Add indentation (4 spaces per level for markdown consistency)
196                for _ in 0..indent {
197                    result.push_str("    ");
198                }
199
200                // Add bullet marker
201                result.push_str("* ");
202
203                // Extract and convert the li content to markdown
204                let li_html = li_element.html();
205                let li_content = converter
206                    .convert(&li_html)
207                    .unwrap_or_default()
208                    .trim()
209                    .to_string();
210
211                // Remove the outer <li> tags that the converter might leave
212                let li_content = li_content.strip_prefix("*").unwrap_or(&li_content).trim();
213
214                result.push_str(li_content);
215                result.push('\n');
216            }
217        }
218    }
219
220    result
221}
222
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parse::markdown;
    use scraper::{Html, Selector};

    /// Converter used by every test in this module.
    fn test_converter() -> HtmlToMarkdown {
        markdown::build_converter("https://test.example.com")
    }

    /// Parse `html` as a fragment, take the first element matching `css`, and render it
    /// with `render_algorithm_ol`.
    fn render_first(html: &str, css: &str) -> String {
        let fragment = Html::parse_fragment(html);
        let selector = Selector::parse(css).unwrap();
        let ol = fragment.select(&selector).next().unwrap();

        render_algorithm_ol(&ol, &test_converter())
    }

    #[test]
    fn test_simple_algorithm() {
        let html = r#"
            <ol>
                <li><p>First step</p></li>
                <li><p>Second step</p></li>
                <li><p>Third step</p></li>
            </ol>
        "#;

        let result = render_first(html, "ol");

        assert!(result.contains("1. First step"));
        assert!(result.contains("2. Second step"));
        assert!(result.contains("3. Third step"));
    }

    #[test]
    fn test_nested_algorithm() {
        let html = r#"
            <ol>
                <li><p>Step one</p></li>
                <li><p>Step two</p>
                    <ol>
                        <li><p>Sub-step 2.1</p></li>
                        <li><p>Sub-step 2.2</p></li>
                    </ol>
                </li>
                <li><p>Step three</p></li>
            </ol>
        "#;

        let result = render_first(html, "ol");

        assert!(result.contains("1. Step one"));
        assert!(result.contains("2. Step two"));
        assert!(result.contains("    1. Sub-step 2.1"));
        assert!(result.contains("    2. Sub-step 2.2"));
        assert!(result.contains("3. Step three"));
    }

    #[test]
    fn test_deeply_nested_algorithm() {
        let html = r#"
            <ol>
                <li><p>Level 1</p>
                    <ol>
                        <li><p>Level 1.1</p>
                            <ol>
                                <li><p>Level 1.1.1</p></li>
                            </ol>
                        </li>
                    </ol>
                </li>
            </ol>
        "#;

        let result = render_first(html, "ol");

        assert!(result.contains("1. Level 1"));
        assert!(result.contains("    1. Level 1.1"));
        assert!(result.contains("        1. Level 1.1.1"));
    }

    #[test]
    fn test_algorithm_with_var_and_code() {
        let html = r#"
            <ol>
                <li><p>Let <var>foo</var> be a <code>Document</code>.</p></li>
                <li><p>Return <var>foo</var>.</p></li>
            </ol>
        "#;

        let result = render_first(html, "ol");

        // <var> renders as *italic* in markdown, <code> as `backtick`
        assert!(result.contains("1. Let *foo* be a `Document`."));
        assert!(result.contains("2. Return *foo*."));
    }

    #[test]
    fn test_algorithm_from_fixture() {
        let html = include_str!("../../tests/fixtures/algorithms/bikeshed_algorithm.html");

        let result = render_first(html, "div.algorithm ol");

        // Should have numbered steps
        assert!(result.contains("1. "));
        assert!(result.contains("2. "));

        // Check that it's not empty
        assert!(!result.trim().is_empty());
    }

    #[test]
    fn test_indentation() {
        let html = r#"
            <ol>
                <li><p>Top</p>
                    <ol>
                        <li><p>Nested</p></li>
                    </ol>
                </li>
            </ol>
        "#;

        let result = render_first(html, "ol");

        // Top level should have no indentation
        assert!(result.contains("1. Top"));

        // Nested numbered steps should have 4 spaces indentation per level
        assert!(result.contains("    1. Nested"));
        let nested_line = result
            .lines()
            .find(|line| line.contains("Nested"))
            .unwrap();
        assert!(nested_line.starts_with("    1."));
    }

    #[test]
    fn test_note_between_steps() {
        // Notes/examples/warnings between steps should be formatted as blockquotes
        let html = r#"
            <ol>
                <li><p>First step</p></li>
                <li><p>Second step</p>
                    <div class="note">
                        <p>This is a note between steps.</p>
                    </div>
                </li>
                <li><p>Third step</p></li>
            </ol>
        "#;

        let result = render_first(html, "ol");

        // All three steps should be present
        for expected in ["1. First step", "2. Second step", "3. Third step"] {
            assert!(result.contains(expected));
        }

        // Note should be formatted as blockquote with prefix
        assert!(
            result.contains("> **Note:** This is a note between steps."),
            "Note should be a blockquote: {}",
            result
        );

        // Third step should start on a new line after the blockquote
        let line_index = |needle: &str| {
            result
                .lines()
                .position(|line| line.contains(needle))
                .unwrap()
        };
        let step3_index = line_index("3. Third step");
        let note_index = line_index("> **Note:**");

        // Step 3 should come after the note
        assert!(
            step3_index > note_index,
            "Step 3 should appear after the note"
        );
    }

    #[test]
    fn test_nested_bullet_list() {
        // Test that nested <ul> lists are properly indented and in document order
        let html = r#"
            <ol>
                <li><p>If all of the following are true:</p>
                    <ul>
                        <li><var>x</var> is null;</li>
                        <li><var>y</var> is null;</li>
                    </ul>
                    <p>then return.</p>
                </li>
                <li><p>Next step</p></li>
            </ol>
        "#;

        let result = render_first(html, "ol");

        // Step 1 should contain the intro text
        assert!(result.contains("1. If all of the following are true:"));

        // Bullet items should be indented (4 spaces) and appear BEFORE "then return"
        assert!(result.contains("    * *x* is null;"));
        assert!(result.contains("    * *y* is null;"));

        // The "then return" should come AFTER the bullets
        let x_pos = result.find("*x* is null").expect("x bullet should exist");
        let y_pos = result.find("*y* is null").expect("y bullet should exist");
        let then_pos = result
            .find("then return")
            .expect("then return should exist");

        assert!(x_pos < then_pos, "bullets should come before 'then return'");
        assert!(y_pos < then_pos, "bullets should come before 'then return'");

        // The "then return" should be indented (continuation content)
        assert!(
            result.contains("    then return"),
            "continuation content should be indented"
        );

        // Step 2 should be present
        assert!(result.contains("2. Next step"));
    }
}