Skip to main content

rumdl_lib/utils/
mkdocs_abbreviations.rs

1/// MkDocs/Python-Markdown Abbreviations extension support
2///
3/// This module provides support for the Python-Markdown Abbreviations extension,
4/// which allows defining abbreviations that get expanded with `<abbr>` tags.
5///
6/// ## Syntax
7///
8/// Abbreviation definitions appear at the end of the document:
9/// ```markdown
10/// The HTML specification is maintained by the W3C.
11///
12/// *[HTML]: Hypertext Markup Language
13/// *[W3C]: World Wide Web Consortium
14/// ```
15///
16/// When rendered, each occurrence of HTML and W3C in the document text
17/// gets wrapped in an `<abbr>` tag with a `title` attribute.
18///
19/// ## Format Requirements
20///
21/// - Must start with `*[` followed by the abbreviation
22/// - Abbreviation is closed with `]:`
23/// - Definition follows after the colon, optionally with whitespace
24/// - Typically placed at the end of the document (but can appear anywhere)
25///
26/// ## References
27///
28/// - [Python-Markdown Abbreviations](https://python-markdown.github.io/extensions/abbreviations/)
29/// - [MkDocs Material - Abbreviations](https://squidfunk.github.io/mkdocs-material/reference/tooltips/#adding-abbreviations)
30use regex::Regex;
31use std::sync::LazyLock;
32
33/// Pattern to match abbreviation definitions: `*[ABBR]: Definition`
34/// Supports:
35/// - Simple abbreviations: `*[HTML]: Hypertext Markup Language`
36/// - Multi-word abbreviations: `*[W3C]: World Wide Web Consortium`
37/// - Abbreviations with numbers: `*[CSS3]: Cascading Style Sheets Level 3`
38static ABBREVIATION_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\*\[([^\]]+)\]:\s*(.*)$").unwrap());
39
/// A single parsed abbreviation definition (`*[ABBR]: Definition`).
///
/// Equality compares all three fields, so two definitions of the same
/// abbreviation on different lines are not equal.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Abbreviation {
    /// The abbreviation text (e.g., "HTML")
    pub abbr: String,
    /// The definition/expansion (e.g., "Hypertext Markup Language")
    pub definition: String,
    /// Line number where defined (1-indexed)
    pub line: usize,
}
50
51/// Check if a line is an abbreviation definition
52#[inline]
53pub fn is_abbreviation_definition(line: &str) -> bool {
54    // Fast path: check for distinctive prefix
55    if !line.trim_start().starts_with("*[") {
56        return false;
57    }
58    ABBREVIATION_PATTERN.is_match(line)
59}
60
/// Cheap pre-check for lines that could be abbreviation definitions.
///
/// Returns `true` when `line`, after leading whitespace is removed, starts
/// with `*[` and contains `]:` somewhere after that prefix. No regex is
/// involved, so a `true` result is only a hint, not a full validation.
#[inline]
pub fn might_be_abbreviation(line: &str) -> bool {
    // `]:` cannot overlap the `*[` prefix, so searching the remainder is equivalent
    // to searching the whole trimmed line.
    line.trim_start()
        .strip_prefix("*[")
        .is_some_and(|rest| rest.contains("]:"))
}
67
68/// Parse an abbreviation definition from a line
69///
70/// # Returns
71/// Some(Abbreviation) if the line is a valid abbreviation definition, None otherwise
72///
73/// # Examples
74/// ```
75/// use rumdl_lib::utils::mkdocs_abbreviations::parse_abbreviation;
76///
77/// let abbr = parse_abbreviation("*[HTML]: Hypertext Markup Language", 1);
78/// assert!(abbr.is_some());
79/// let abbr = abbr.unwrap();
80/// assert_eq!(abbr.abbr, "HTML");
81/// assert_eq!(abbr.definition, "Hypertext Markup Language");
82/// ```
83pub fn parse_abbreviation(line: &str, line_num: usize) -> Option<Abbreviation> {
84    if let Some(caps) = ABBREVIATION_PATTERN.captures(line) {
85        let abbr = caps.get(1)?.as_str().to_string();
86        let definition = caps.get(2).map(|m| m.as_str().to_string()).unwrap_or_default();
87
88        Some(Abbreviation {
89            abbr,
90            definition,
91            line: line_num,
92        })
93    } else {
94        None
95    }
96}
97
98/// Extract all abbreviation definitions from content
99///
100/// # Returns
101/// A vector of Abbreviation structs for each definition found
102pub fn extract_abbreviations(content: &str) -> Vec<Abbreviation> {
103    let mut abbreviations = Vec::new();
104
105    for (line_idx, line) in content.lines().enumerate() {
106        if let Some(abbr) = parse_abbreviation(line, line_idx + 1) {
107            abbreviations.push(abbr);
108        }
109    }
110
111    abbreviations
112}
113
114/// Check if a position in a line is within an abbreviation definition
115pub fn is_in_abbreviation_definition(line: &str, position: usize) -> bool {
116    // If the line is an abbreviation definition, all positions are within it
117    if is_abbreviation_definition(line) {
118        return position < line.len();
119    }
120    false
121}
122
123/// Get all abbreviation terms from content (just the abbreviation part, not definitions)
124///
125/// # Returns
126/// A vector of abbreviation terms (e.g., ["HTML", "CSS", "W3C"])
127pub fn get_abbreviation_terms(content: &str) -> Vec<String> {
128    extract_abbreviations(content).into_iter().map(|a| a.abbr).collect()
129}
130
131/// Check if a word in content matches a defined abbreviation
132///
133/// This is useful for rules like MD013 that need to know if a word
134/// should be treated specially because it's a defined abbreviation.
135pub fn is_defined_abbreviation(content: &str, word: &str) -> bool {
136    for line in content.lines() {
137        if let Some(abbr) = parse_abbreviation(line, 0)
138            && abbr.abbr == word
139        {
140            return true;
141        }
142    }
143    false
144}
145
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `line` with the given line number, panicking when it is rejected.
    fn parsed(line: &str, line_num: usize) -> Abbreviation {
        parse_abbreviation(line, line_num)
            .unwrap_or_else(|| panic!("expected {line:?} to parse as an abbreviation"))
    }

    #[test]
    fn test_is_abbreviation_definition() {
        // Well-formed definitions, including ones whose definition text is empty.
        let accepted = [
            "*[HTML]: Hypertext Markup Language",
            "*[CSS]: Cascading Style Sheets",
            "*[W3C]: World Wide Web Consortium",
            "*[CSS3]: CSS Level 3",
            "*[abbr]: definition",
            "*[HTML]:",
            "*[HTML]: ",
        ];
        for line in accepted {
            assert!(is_abbreviation_definition(line), "should be accepted: {line:?}");
        }

        // Lines that miss the `*[`, the closing `]`, or the colon.
        let rejected = [
            "# Heading",
            "Regular text",
            "[HTML]: Not an abbr",
            "*HTML: Not an abbr",
            "*[HTML] Not an abbr",
        ];
        for line in rejected {
            assert!(!is_abbreviation_definition(line), "should be rejected: {line:?}");
        }
    }

    #[test]
    fn test_parse_abbreviation() {
        let html = parsed("*[HTML]: Hypertext Markup Language", 1);
        assert_eq!(
            (html.abbr.as_str(), html.definition.as_str(), html.line),
            ("HTML", "Hypertext Markup Language", 1)
        );

        let css3 = parsed("*[CSS3]: CSS Level 3", 5);
        assert_eq!(
            (css3.abbr.as_str(), css3.definition.as_str(), css3.line),
            ("CSS3", "CSS Level 3", 5)
        );

        assert_eq!(parse_abbreviation("Not an abbreviation", 1), None);
    }

    #[test]
    fn test_extract_abbreviations() {
        let content = r#"# Document

The HTML specification is maintained by the W3C.

CSS is used for styling.

*[HTML]: Hypertext Markup Language
*[W3C]: World Wide Web Consortium
*[CSS]: Cascading Style Sheets
"#;
        let abbreviations = extract_abbreviations(content);
        let pairs: Vec<(&str, &str)> = abbreviations
            .iter()
            .map(|entry| (entry.abbr.as_str(), entry.definition.as_str()))
            .collect();

        // Comparing the whole list checks the count, the order, and every field pair.
        assert_eq!(
            pairs,
            [
                ("HTML", "Hypertext Markup Language"),
                ("W3C", "World Wide Web Consortium"),
                ("CSS", "Cascading Style Sheets"),
            ]
        );
    }

    #[test]
    fn test_is_defined_abbreviation() {
        let content = r#"Some text.

*[HTML]: Hypertext Markup Language
*[CSS]: Cascading Style Sheets
"#;
        for word in ["HTML", "CSS"] {
            assert!(is_defined_abbreviation(content, word), "should be defined: {word:?}");
        }

        // "W3C" is never defined, and "html" differs in case from the defined "HTML".
        for word in ["W3C", "html"] {
            assert!(!is_defined_abbreviation(content, word), "should not be defined: {word:?}");
        }
    }

    #[test]
    fn test_get_abbreviation_terms() {
        let content = r#"Text here.

*[HTML]: Hypertext Markup Language
*[CSS]: Cascading Style Sheets
*[W3C]: World Wide Web Consortium
"#;
        assert_eq!(get_abbreviation_terms(content), ["HTML", "CSS", "W3C"]);
    }

    #[test]
    fn test_might_be_abbreviation() {
        // The second candidate has leading spaces.
        for line in ["*[HTML]: Definition", "  *[HTML]: Definition"] {
            assert!(might_be_abbreviation(line), "should be a candidate: {line:?}");
        }

        for line in ["*HTML: Not abbr", "[HTML]: Not abbr", "Regular text"] {
            assert!(!might_be_abbreviation(line), "should not be a candidate: {line:?}");
        }
    }

    #[test]
    fn test_abbreviation_with_special_characters() {
        // Abbreviations can contain various characters
        let cases = [("*[C++]: C Plus Plus", "C++"), ("*[.NET]: Dot NET Framework", ".NET")];
        for (line, expected) in cases {
            assert_eq!(parsed(line, 1).abbr, expected);
        }
    }

    #[test]
    fn test_multi_word_definitions() {
        let api = parsed("*[API]: Application Programming Interface", 1);
        assert_eq!(api.definition, "Application Programming Interface");
    }

    #[test]
    fn test_empty_definition() {
        let html = parsed("*[HTML]:", 1);
        assert_eq!((html.abbr.as_str(), html.definition.as_str()), ("HTML", ""));
    }
}
280}