taboc/utils/
toc.rs

1#[cfg(feature = "memmap2")]
2use memmap2::MmapMut;
3#[cfg(feature = "memmap2")]
4use std::cmp::min;
5
6use crate::prelude::*;
7use std::{
8    cell::Cell,
9    fs::OpenOptions,
10    io::{BufRead, BufReader, Seek, Write},
11    path::Path,
12};
13
14#[cfg(not(feature = "memmap2"))]
15use std::io::Read;
16
/// # Table of contents struct
///
/// This is the main struct responsible for reading a markdown file (e.g. `README.md`) and
/// building its table of contents, with one link per heading, in this format:
///
/// ```txt
/// - [Heading 1](#heading-1)
///   - [Heading 2](#heading-2)
///     - [Heading 3](#heading-3)
///     - [Heading 3 2](#heading-3-2)
///   - [Heading 2 2](#heading-2-2)
/// ```
#[derive(Debug)]
pub struct Taboc {
    /// Markdown file whose lines are read when the table of contents is parsed.
    pub file: std::fs::File,
    /// Whether the line currently being parsed lies inside a fenced code block.
    code_block: Cell<bool>,
    /// Deepest heading level (number of leading `#`) that is still listed.
    max_depth: usize,
}
34
35impl Taboc {
36    const MIN_HEADING: usize = 1;
37    const HEADING_CHAR: char = '#';
38    const CODE_BLOCK_STR: &'static str = "```";
39    const TOC_HEADING: &'static str = "## Table of contents";
40
41    pub fn new(file: std::fs::File, max_depth: usize) -> Self {
42        Self {
43            file,
44            code_block: Cell::new(false),
45            max_depth,
46        }
47    }
48
49    /// See [`RFC3986`](https://www.rfc-editor.org/rfc/rfc3986).
50    fn percent_encode(c: char) -> String {
51        let mut utf8_bytes: [u8; 4] = [0u8; 4];
52        let bytes = c.encode_utf8(&mut utf8_bytes);
53        let mut encoded = String::with_capacity(bytes.len() * 3);
54
55        for byte in utf8_bytes {
56            if byte == 0 {
57                break;
58            }
59            encoded.push_str(&format!("%{:02X}", byte));
60        }
61
62        encoded
63    }
64
65    /// Make a Table of contents line based on the current heading level.
66    fn make_link(heading_name: &str) -> String {
67        let mut res = String::with_capacity(heading_name.len());
68
69        /// Most are based on
70        /// [RFC3986#section-2.2](https://www.rfc-editor.org/rfc/rfc3986#section-2.2) with some
71        /// additional ones like: \`, `'`, `~`, `{` and `}`.
72        ///
73        /// NOTE: It's possible that not all excludable characters are covered. If you encounter a
74        /// issue with a missed one then feel free to post an issue on
75        /// [GitHub](https://github.com/1Git2Clone/taboc/issues).
76        const IGNORED_CHARACTERS: &[char] = &[
77            '+', ':', ';', '.', ',', '{', '}', '"', '@', '#', '>', '<', '[', ']', '|', '/', '?',
78            '!', '$', '*', '=', '&', '\'', '(', ')', '~',
79        ];
80
81        for c in heading_name.chars() {
82            if IGNORED_CHARACTERS.contains(&c) {
83                continue;
84            }
85
86            if c == ' ' {
87                res.push('-');
88                continue;
89            }
90
91            if c.is_ascii() {
92                res.push(c.to_ascii_lowercase());
93                continue;
94            }
95
96            if c.is_uppercase() {
97                res.push_str(&c.to_lowercase().to_string());
98                continue;
99            }
100
101            if !c.is_alphanumeric() {
102                res.push_str(&Self::percent_encode(c));
103                continue;
104            }
105
106            res.push(c);
107        }
108
109        res
110    }
111
112    /// Make a Table of contents line based on the current heading level.
113    fn make_line(heading_level: usize, line: &str) -> String {
114        format!(
115            "{}- [{}](#{})\n",
116            "  ".repeat(heading_level - 1),
117            line,
118            Self::make_link(line)
119        )
120    }
121
122    /// Check if a markdown line is valid.
123    fn valid_heading(&self, heading_level: usize, line: &str) -> bool {
124        if !(Self::MIN_HEADING..=self.max_depth).contains(&heading_level) {
125            return false;
126        }
127        if line.len() <= heading_level || line.chars().nth(heading_level) != Some(' ') {
128            return false;
129        }
130        if line.chars().nth(heading_level + 1).is_none() {
131            return false;
132        }
133        true
134    }
135
136    /// We shouldn't parse headings that are in code blocks: ```.
137    fn is_in_code_block(&self, line: &str) -> bool {
138        if line.starts_with(Self::CODE_BLOCK_STR) {
139            self.code_block.replace(!self.code_block.get());
140        }
141        self.code_block.get()
142    }
143
144    /// Make the table of contents based on a file.
145    pub fn parse(&self) -> Result<String, Error> {
146        let mut res = format!("\n\n{}\n\n", Self::TOC_HEADING);
147
148        for l in BufReader::new(&self.file).lines() {
149            let line = l?;
150
151            if self.is_in_code_block(&line) {
152                continue;
153            }
154
155            let heading_count = line
156                .chars()
157                .take_while(|c| *c == Self::HEADING_CHAR)
158                .count();
159
160            if !self.valid_heading(heading_count, &line) {
161                continue;
162            }
163
164            if line.starts_with(Self::TOC_HEADING) {
165                continue;
166            }
167
168            let heading = line
169                .chars()
170                .skip(heading_count)
171                .skip_while(|c| c.is_whitespace())
172                .collect::<String>();
173
174            res.push_str(&Self::make_line(heading_count, &heading));
175        }
176
177        // remove the trailing newline symbol.
178        res.pop();
179
180        Ok(res)
181    }
182
183    /// Writes to the specified path.
184    ///
185    /// NOTE: This ensures that there's no table of contents as the first second-level heading of a
186    /// markdown document but it doesn't ensure it if it's located anywhere else.
187    pub fn write_to_file<P: AsRef<Path>>(
188        &self,
189        path: P,
190        input: &str,
191        update_existing: bool,
192    ) -> Result<(), Error> {
193        let mut target_file = OpenOptions::new().read(true).write(true).open(path)?;
194
195        let mut pos = 0;
196        let lookup_header = "## ";
197        let mut line_buf = Vec::new();
198        let mut reader = BufReader::new(&target_file);
199
200        let mut already_exists = false;
201
202        while let Ok(char_count) = reader.read_until(b'\n', &mut line_buf) {
203            if char_count == 0 {
204                break;
205            }
206
207            if line_buf.starts_with(lookup_header.as_bytes()) {
208                let windows_toc = line_buf[line_buf.len().saturating_sub(2)] != b'\r'
209                    && &line_buf[0..line_buf.len().saturating_sub(1)]
210                        == Self::TOC_HEADING.as_bytes();
211                let unix_toc = &line_buf[0..line_buf.len()] == Self::TOC_HEADING.as_bytes();
212                if !update_existing && (windows_toc || unix_toc) {
213                    return Err(
214                        anyhow!("There's already a table of contents in the first heading of the second level of this file.")
215                    );
216                } else if windows_toc || unix_toc {
217                    already_exists = true;
218                }
219                // I wish I had an explanation for the off-by-one error here.
220                pos -= lookup_header.len() as u64 - 1;
221                break;
222            }
223
224            pos += char_count as u64;
225
226            line_buf.clear();
227        }
228
229        target_file.seek(std::io::SeekFrom::Start(pos))?;
230        #[cfg(feature = "memmap2")]
231        let rest_map = unsafe { MmapMut::map_mut(&target_file)? };
232        #[cfg(feature = "memmap2")]
233        let mut rest = &rest_map[..];
234        #[cfg(not(feature = "memmap2"))]
235        let mut rest = Vec::<u8>::new();
236        #[cfg(not(feature = "memmap2"))]
237        target_file.read_to_end(&mut rest)?;
238
239        target_file.seek(std::io::SeekFrom::Start(pos))?;
240
241        if already_exists {
242            let mut reader = BufReader::new(&target_file);
243            let mut drain_pos = 0;
244
245            // reads [`Self::TOC_HEADING`] twice. Log each line to understand why it's like this.
246            let mut end_heading_count = 3;
247            let mut last_line_char_count = 0;
248
249            while let Ok(char_count) = reader.read_until(b'\n', &mut line_buf) {
250                // println!("LINE: {}", String::from_utf8_lossy(&line_buf));
251
252                if line_buf.starts_with(b"#") {
253                    end_heading_count -= 1;
254                } else if end_heading_count == 0 {
255                    drain_pos -= last_line_char_count;
256                    break;
257                }
258                if line_buf.trim_ascii().is_empty() && end_heading_count == 1 {
259                    drain_pos -= char_count;
260                }
261
262                drain_pos += char_count;
263
264                line_buf.clear();
265                last_line_char_count = char_count;
266            }
267
268            #[cfg(feature = "memmap2")]
269            {
270                rest = &rest_map[min(drain_pos - 1, rest_map.len() - 1)..];
271            }
272            #[cfg(not(feature = "memmap2"))]
273            rest.drain(..drain_pos);
274        }
275
276        target_file.seek(std::io::SeekFrom::Start(pos))?;
277        target_file.write_all(input.as_bytes())?;
278        #[cfg(feature = "memmap2")]
279        target_file.write_all(rest)?;
280        #[cfg(feature = "memmap2")]
281        rest_map.flush()?;
282        #[cfg(not(feature = "memmap2"))]
283        target_file.write_all(&rest)?;
284
285        Ok(())
286    }
287}
288
#[cfg(test)]
mod tests {
    use super::Taboc;

    /// Multi-byte characters must yield one `%XX` triplet per UTF-8 byte.
    #[test]
    fn percent_encode() {
        let cases = [
            ('😁', "%F0%9F%98%81"),
            ('♊', "%E2%99%8A"),
            ('⏳', "%E2%8F%B3"),
            ('❌', "%E2%9D%8C"),
            ('⏪', "%E2%8F%AA"),
            ('⛪', "%E2%9B%AA"),
            ('⟣', "%E2%9F%A3"),
            ('⛟', "%E2%9B%9F"),
        ];

        for (input, expected) in cases {
            assert_eq!(Taboc::percent_encode(input), expected);
        }
    }
}