html2md/scraper/
lists.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
use super::StructuredPrinter;
use super::TagHandler;

use markup5ever_rcdom::Handle;

/// Collects the list container tags (`ul`, `ol`, `menu`) currently recorded in
/// `printer.parent_chain`.
///
/// The chain is walked back to front, so the last list tag in the chain comes first
/// in the result; callers treat that first element as the nearest enclosing list.
/// An empty vector means no list tag is present in the chain.
///
/// Only a shared borrow is needed because the printer is merely inspected; call
/// sites holding a `&mut StructuredPrinter` can still pass it unchanged.
fn list_hierarchy(printer: &StructuredPrinter) -> Vec<&String> {
    printer
        .parent_chain
        .iter()
        .rev()
        .filter(|tag| matches!(tag.as_str(), "ul" | "ol" | "menu"))
        .collect()
}

/// Tag handler for list container elements.
///
/// It carries no state: all it does is emit newlines around the list through the
/// printer it is handed.
#[derive(Debug, Default)]
pub struct ListHandler;

impl TagHandler for ListHandler {
    /// Called when a list container is entered; list items are handled elsewhere.
    ///
    /// Always emits one newline. When no list tag is found in the printer's parent
    /// chain (a non-nested list), a second newline is emitted as well.
    fn handle(&mut self, _tag: &Handle, printer: &mut StructuredPrinter) {
        printer.insert_newline();

        let is_nested = !list_hierarchy(printer).is_empty();
        if is_nested {
            return;
        }

        // non-nested list: one more newline on top of the one above
        printer.insert_newline();
    }

    /// Called once the list has been processed: emits a single newline after it.
    fn after_handle(&mut self, printer: &mut StructuredPrinter) {
        printer.insert_newline();
    }
}

/// Tag handler for a single list item.
///
/// `handle` fills in both fields and `after_handle` reads them back to post-process
/// the text that was appended to the printer in between.
#[derive(Debug, Default)]
pub struct ListItemHandler {
    /// Byte offset into the printer's output at which this item's content begins
    /// (recorded right after the item marker has been written). Defaults to 0.
    start_pos: usize,
    /// Tag name of the nearest enclosing list as found by `handle`; stays empty
    /// when no enclosing list has been found.
    list_type: String,
}

impl TagHandler for ListItemHandler {
    /// Writes the marker that opens a list item and remembers where its content starts.
    ///
    /// * Looks up the nearest enclosing list tag via `list_hierarchy`. If there is
    ///   none, nothing is printed; the handler only records that fact so that
    ///   `after_handle` leaves the output alone.
    /// * Makes sure the marker starts on its own line.
    /// * Prints `"* "` for `ul`/`menu` and `"<n>. "` for `ol`.
    fn handle(&mut self, _tag: &Handle, printer: &mut StructuredPrinter) {
        // Copy the tag name out immediately so the borrow of `printer` ends with this statement.
        let nearest_parent_list = list_hierarchy(printer)
            .first()
            .map(|tag| tag.as_str().to_owned());

        let list_type = match nearest_parent_list {
            Some(tag) => tag,
            None => {
                // `<li>` without an enclosing list. Reset the state explicitly: with a
                // stale `start_pos` (0 by default) `after_handle` would strip and
                // re-indent text that was printed long before this item.
                self.list_type.clear();
                self.start_pos = printer.data.len();
                return;
            }
        };
        self.list_type = list_type;

        if !printer.data.ends_with('\n') {
            // insert newline when declaring a list item only in case there isn't any
            // newline at the end of text
            printer.insert_newline();
        }

        match self.list_type.as_str() {
            // unordered list: *, *, *
            "ul" | "menu" => printer.append_str("* "),
            // ordered list: 1, 2, 3
            "ol" => {
                let current_depth = printer.parent_chain.len();
                // The number is one more than the count of siblings recorded at this
                // depth (presumably the items printed before this one — verify against
                // `StructuredPrinter`). A missing entry is treated as "no siblings yet"
                // instead of panicking on the index operator; the `0` fallback for a
                // sibling table smaller than the depth is kept as it was.
                let order = if printer.siblings.len() >= current_depth {
                    printer
                        .siblings
                        .get(&current_depth)
                        .map_or(1, |earlier| earlier.len() + 1)
                } else {
                    0
                };
                printer.append_str(&format!("{}. ", order));
            }
            // `list_hierarchy` only yields the three tags above
            _ => (),
        }

        self.start_pos = printer.data.len();
    }

    /// Post-processes the text printed for this item since `handle` ran.
    ///
    /// 1. Removes the run of newlines/spaces directly after the marker, so that e.g.
    ///    a `<p>` inside `<li>` produces a valid list element, not an empty line.
    /// 2. Inserts padding spaces after every newline in the item's content
    ///    (2 for `ul`, 3 for `ol`, 4 otherwise). Markdown requires that all
    ///    paragraphs in the list item except the first are indented.
    ///
    /// Does nothing when `handle` found no enclosing list.
    fn after_handle(&mut self, printer: &mut StructuredPrinter) {
        let padding = match self.list_type.as_str() {
            // no enclosing list: the content is not a list item, leave it untouched
            "" => return,
            "ul" => 2,
            "ol" => 3,
            _ => 4,
        };
        let start = self.start_pos;

        // Count the leading '\n' / ' ' bytes once and cut them out in a single
        // operation instead of shifting the tail for every removed character.
        let leading = printer.data.as_bytes().get(start..).map_or(0, |tail| {
            tail.iter()
                .take_while(|&&byte| byte == b'\n' || byte == b' ')
                .count()
        });
        if leading > 0 {
            // Both ends are char boundaries: the removed bytes are all ASCII.
            printer.data.replace_range(start..start + leading, "");
        }

        // Newlines strictly after `start` get the padding inserted right behind them.
        // Going back to front keeps the offsets collected here valid while inserting.
        let indent = " ".repeat(padding);
        let newline_offsets: Vec<usize> = printer
            .data
            .bytes()
            .enumerate()
            .skip(start + 1)
            .filter(|&(_, byte)| byte == b'\n')
            .map(|(offset, _)| offset)
            .collect();
        for offset in newline_offsets.into_iter().rev() {
            printer.insert_str(offset + 1, &indent);
        }
    }
}