html_to_markdown_rs/
text.rs

1//! Text processing utilities for Markdown conversion.
2
3use once_cell::sync::Lazy;
4use regex::Regex;
5
6/// Regex for escaping miscellaneous characters
7static ESCAPE_MISC_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"([\\&<`\[\]>~#=+|\-])").unwrap());
8
9/// Regex for escaping numbered lists
10static ESCAPE_NUMBERED_LIST_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"([0-9])([.)])").unwrap());
11
12/// Regex for escaping ASCII punctuation (CommonMark spec example 12)
13/// Matches: ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~
14static ESCAPE_ASCII_RE: Lazy<Regex> =
15    Lazy::new(|| Regex::new(r"([!\x22#$%&\x27()*+,\-./:;<=>?@\[\\\]^_`{|}~])").unwrap());
16
17/// Escape Markdown special characters in text.
18///
19/// # Arguments
20///
21/// * `text` - Text to escape
22/// * `escape_misc` - Escape miscellaneous characters (\ & < ` [ > ~ # = + | -)
23/// * `escape_asterisks` - Escape asterisks (*)
24/// * `escape_underscores` - Escape underscores (_)
25/// * `escape_ascii` - Escape all ASCII punctuation (for CommonMark spec compliance)
26///
27/// # Returns
28///
29/// Escaped text
30pub fn escape(
31    text: &str,
32    escape_misc: bool,
33    escape_asterisks: bool,
34    escape_underscores: bool,
35    escape_ascii: bool,
36) -> String {
37    if text.is_empty() {
38        return String::new();
39    }
40
41    let mut result = text.to_string();
42
43    if escape_ascii {
44        result = ESCAPE_ASCII_RE.replace_all(&result, r"\$1").to_string();
45        return result;
46    }
47
48    if escape_misc {
49        result = ESCAPE_MISC_RE.replace_all(&result, r"\$1").to_string();
50
51        result = ESCAPE_NUMBERED_LIST_RE.replace_all(&result, r"$1\$2").to_string();
52    }
53
54    if escape_asterisks {
55        result = result.replace('*', r"\*");
56    }
57
58    if escape_underscores {
59        result = result.replace('_', r"\_");
60    }
61
62    result
63}
64
/// Split text into boundary-whitespace markers and its trimmed core (chomp).
///
/// Returns a `(prefix, suffix, trimmed_text)` tuple:
///
/// * `prefix` is `" "` when `text` starts with any Unicode whitespace, otherwise `""`.
/// * `suffix` is `"\n\n"` when `text` ends with a blank line (`"\n\n"` or
///   `"\r\n\r\n"`), `" "` when it ends with a space or tab, and `""` otherwise.
///   In particular a single trailing newline yields `""`, so it never becomes a
///   trailing space in the output.
/// * `trimmed_text` is `text` with all leading and trailing whitespace removed.
///
/// Empty input yields `("", "", "")`.
pub fn chomp(text: &str) -> (&str, &str, &str) {
    if text.is_empty() {
        return ("", "", "");
    }

    let prefix = if text.starts_with(char::is_whitespace) {
        " "
    } else {
        ""
    };

    let suffix = if text.ends_with("\n\n") || text.ends_with("\r\n\r\n") {
        "\n\n"
    } else if text.ends_with([' ', '\t']) {
        " "
    } else {
        ""
    };

    // `trim` already strips every trailing newline, so no separate pass is needed
    // for the blank-line case.
    (prefix, suffix, text.trim())
}
99
100/// Normalize whitespace by collapsing consecutive spaces and tabs.
101///
102/// Multiple spaces and tabs are replaced with a single space.
103/// Newlines are preserved.
104/// Unicode spaces are normalized to ASCII spaces.
105///
106/// # Arguments
107///
108/// * `text` - The text to normalize
109///
110/// # Returns
111///
112/// Normalized text with collapsed spaces/tabs but preserved newlines
113pub fn normalize_whitespace(text: &str) -> String {
114    let mut result = String::with_capacity(text.len());
115    let mut prev_was_space = false;
116
117    for ch in text.chars() {
118        let is_space = ch == ' ' || ch == '\t' || is_unicode_space(ch);
119
120        if is_space {
121            if !prev_was_space {
122                result.push(' ');
123                prev_was_space = true;
124            }
125        } else {
126            result.push(ch);
127            prev_was_space = false;
128        }
129    }
130
131    result
132}
133
134/// Decode common HTML entities.
135///
136/// Decodes the most common HTML entities to their character equivalents:
137/// - `&quot;` → `"`
138/// - `&apos;` → `'`
139/// - `&lt;` → `<`
140/// - `&gt;` → `>`
141/// - `&amp;` → `&` (must be last to avoid double-decoding)
142///
143/// # Arguments
144///
145/// * `text` - Text containing HTML entities
146///
147/// # Returns
148///
149/// Text with entities decoded
150pub fn decode_html_entities(text: &str) -> String {
151    html_escape::decode_html_entities(text).into_owned()
152}
153
/// Report whether `ch` is one of the non-ASCII space characters this module
/// treats as collapsible whitespace.
///
/// Accepted code points: U+00A0 (no-break space), U+1680, the contiguous run
/// U+2000 through U+200A, U+202F, U+205F and U+3000. ASCII space, tab and
/// newlines are not matched here.
fn is_unicode_space(ch: char) -> bool {
    match ch {
        '\u{00A0}' | '\u{1680}' => true,
        '\u{2000}'..='\u{200A}' => true,
        '\u{202F}' | '\u{205F}' | '\u{3000}' => true,
        _ => false,
    }
}
178
/// Underline text with a character.
///
/// Trailing whitespace is stripped from `text` first. The result is the text,
/// a newline, `pad_char` repeated once per character of the text, and a blank
/// line: `"{text}\n{underline}\n\n"`. If nothing remains after stripping, an
/// empty string is returned.
///
/// The underline length is measured in `char`s rather than UTF-8 bytes, so
/// non-ASCII titles get an underline as long as the text itself.
pub fn underline(text: &str, pad_char: char) -> String {
    let text = text.trim_end();
    if text.is_empty() {
        return String::new();
    }

    // `str::len` is a byte count and would over-extend the rule for multi-byte text.
    let width = text.chars().count();
    format!("{}\n{}\n\n", text, pad_char.to_string().repeat(width))
}
187
/// Prefix every non-empty line of `text` with `indent_str` repeated `level` times.
///
/// Lines are split with `str::lines`, so a trailing line terminator is not
/// reproduced and `\r\n` endings come back as `\n`. Empty lines are left empty
/// (no prefix is added to them). Empty input yields an empty string.
pub fn indent(text: &str, level: usize, indent_str: &str) -> String {
    if text.is_empty() {
        return String::new();
    }

    let pad = indent_str.repeat(level);
    let mut out = String::with_capacity(text.len());

    for (idx, line) in text.lines().enumerate() {
        if idx > 0 {
            out.push('\n');
        }
        if !line.is_empty() {
            out.push_str(&pad);
            out.push_str(line);
        }
    }

    out
}
206
#[cfg(test)]
mod tests {
    use super::*;

    // Argument order for `escape`: (text, misc, asterisks, underscores, ascii).

    #[test]
    fn test_escape_misc() {
        assert_eq!(escape("foo & bar", true, false, false, false), r"foo \& bar");
        assert_eq!(escape("foo [bar]", true, false, false, false), r"foo \[bar\]");
        assert_eq!(escape("1. Item", true, false, false, false), r"1\. Item");
        assert_eq!(escape("1) Item", true, false, false, false), r"1\) Item");
    }

    #[test]
    fn test_escape_asterisks() {
        assert_eq!(escape("foo * bar", false, true, false, false), r"foo \* bar");
        assert_eq!(escape("**bold**", false, true, false, false), r"\*\*bold\*\*");
    }

    #[test]
    fn test_escape_underscores() {
        assert_eq!(escape("foo_bar", false, false, true, false), r"foo\_bar");
        assert_eq!(escape("__bold__", false, false, true, false), r"\_\_bold\_\_");
    }

    #[test]
    fn test_escape_ascii() {
        assert_eq!(escape(r##"!"#$%&"##, false, false, false, true), r##"\!\"\#\$\%\&"##);
        assert_eq!(escape("*+,-./", false, false, false, true), r"\*\+\,\-\.\/");
        assert_eq!(escape("<=>?@", false, false, false, true), r"\<\=\>\?\@");
        assert_eq!(escape(r"[\]^_`", false, false, false, true), r"\[\\\]\^\_\`");
        assert_eq!(escape("{|}~", false, false, false, true), r"\{\|\}\~");
    }

    #[test]
    fn test_escape_empty() {
        assert_eq!(escape("", true, true, true, false), "");
        assert_eq!(escape("", false, false, false, true), "");
    }

    #[test]
    fn test_chomp() {
        assert_eq!(chomp("  text  "), (" ", " ", "text"));
        assert_eq!(chomp("text"), ("", "", "text"));
        assert_eq!(chomp(" text"), (" ", "", "text"));
        assert_eq!(chomp("text "), ("", " ", "text"));
        assert_eq!(chomp(""), ("", "", ""));
    }

    #[test]
    fn test_chomp_newlines() {
        // A single trailing newline must not turn into a trailing space.
        assert_eq!(chomp("text\n"), ("", "", "text"));
        // A trailing blank line is reported as a paragraph break.
        assert_eq!(chomp("text\n\n"), ("", "\n\n", "text"));
        assert_eq!(chomp("text\r\n\r\n"), ("", "\n\n", "text"));
    }

    #[test]
    fn test_normalize_whitespace() {
        assert_eq!(normalize_whitespace("a  \t b"), "a b");
        assert_eq!(normalize_whitespace("a\u{00A0}\u{2003}b"), "a b");
        assert_eq!(normalize_whitespace("a \n b"), "a \n b");
        assert_eq!(normalize_whitespace(""), "");
    }

    #[test]
    fn test_decode_html_entities() {
        assert_eq!(decode_html_entities("a &lt; b &amp; c"), "a < b & c");
        assert_eq!(decode_html_entities("plain"), "plain");
    }

    #[test]
    fn test_underline() {
        assert_eq!(underline("Title", '='), "Title\n=====\n\n");
        assert_eq!(underline("Subtitle", '-'), "Subtitle\n--------\n\n");
        assert_eq!(underline("", '='), "");
    }

    #[test]
    fn test_indent() {
        assert_eq!(indent("line1\nline2", 1, "\t"), "\tline1\n\tline2");
        assert_eq!(indent("text", 2, "  "), "    text");
        assert_eq!(indent("", 1, "\t"), "");
    }

    #[test]
    fn test_indent_blank_lines() {
        // Empty lines stay empty rather than receiving the prefix.
        assert_eq!(indent("a\n\nb", 1, "  "), "  a\n\n  b");
    }
}