Skip to main content

rdx_transform/transforms/
abbreviation.rs

1use rdx_ast::*;
2
3use crate::{Transform, synthetic_pos};
4
5// ---------------------------------------------------------------------------
6// Transform
7// ---------------------------------------------------------------------------
8
/// Reads `abbreviations` from the document frontmatter and wraps the **first**
/// occurrence of each abbreviation in the document text with an `<Abbr>`
/// inline component that carries a `title` attribute containing the expansion.
///
/// # Frontmatter format
///
/// ```yaml
/// ---
/// abbreviations:
///   HTML: HyperText Markup Language
///   CSS: Cascading Style Sheets
/// ---
/// ```
///
/// The above will wrap the first occurrence of the literal string `"HTML"` with
/// `<Abbr title="HyperText Markup Language">HTML</Abbr>` and similarly for
/// `"CSS"`.
///
/// Subsequent occurrences of the same abbreviation are left as plain text.
/// "First" is tracked across the whole document: a single set of
/// already-wrapped abbreviations is shared by the entire tree traversal.
///
/// # Notes
///
/// - Only [`Node::Text`] nodes inside the document body are searched.
/// - If the frontmatter is absent, or `abbreviations` is missing or is not a
///   map, the transform is a no-op.
/// - Map entries whose value is not a string are ignored.
/// - Abbreviation matching is **case-sensitive** and is a plain sub-string
///   search (the abbreviation must appear as-is in the text).  No
///   word-boundary check is performed, so an abbreviation can also match
///   inside a longer word.
pub struct AbbreviationExpand;
37
38impl Transform for AbbreviationExpand {
39    fn name(&self) -> &str {
40        "abbreviation-expand"
41    }
42
43    fn transform(&self, root: &mut Root, _source: &str) {
44        // Extract the abbreviations map from frontmatter.
45        let abbreviations = match root.frontmatter.as_ref() {
46            Some(fm) => match fm.get("abbreviations") {
47                Some(serde_json::Value::Object(map)) => map
48                    .iter()
49                    .filter_map(|(k, v)| {
50                        if let serde_json::Value::String(expansion) = v {
51                            Some((k.clone(), expansion.clone()))
52                        } else {
53                            None
54                        }
55                    })
56                    .collect::<Vec<_>>(),
57                _ => return,
58            },
59            None => return,
60        };
61
62        if abbreviations.is_empty() {
63            return;
64        }
65
66        // Track which abbreviations have already been wrapped (first-occurrence only).
67        let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
68
69        expand_nodes(&mut root.children, &abbreviations, &mut seen);
70    }
71}
72
73// ---------------------------------------------------------------------------
74// Internal helpers
75// ---------------------------------------------------------------------------
76
77/// Build an `<Abbr title="...">text</Abbr>` inline component node.
78fn make_abbr_component(abbr: &str, expansion: &str) -> Node {
79    Node::Component(ComponentNode {
80        name: "Abbr".to_string(),
81        is_inline: true,
82        attributes: vec![AttributeNode {
83            name: "title".to_string(),
84            value: AttributeValue::String(expansion.to_string()),
85            position: synthetic_pos(),
86        }],
87        children: vec![Node::Text(TextNode {
88            value: abbr.to_string(),
89            position: synthetic_pos(),
90        })],
91        raw_content: String::new(),
92        position: synthetic_pos(),
93    })
94}
95
96/// Try to expand the first unseen abbreviation found in `text`.
97///
98/// Returns `None` if no abbreviation from the list (that hasn't been seen yet)
99/// appears in `text`.  Returns `Some(Vec<Node>)` of replacement nodes
100/// otherwise — the split text fragments interspersed with the `<Abbr>`
101/// component for the matched abbreviation.
102///
103/// We look for the *longest* matching abbreviation at the *earliest* position
104/// to produce predictable output.
105fn split_on_first_abbr(
106    text: &str,
107    abbreviations: &[(String, String)],
108    seen: &std::collections::HashSet<String>,
109) -> Option<(String, Vec<Node>)> {
110    // Find the earliest (leftmost) occurrence of any unseen abbreviation.
111    // If two abbreviations start at the same position, prefer the longer one.
112    let mut best: Option<(usize, usize, &str, &str)> = None; // (start, end, abbr, expansion)
113
114    for (abbr, expansion) in abbreviations {
115        if seen.contains(abbr.as_str()) {
116            continue;
117        }
118        if let Some(pos) = text.find(abbr.as_str()) {
119            let end = pos + abbr.len();
120            let is_better = match best {
121                None => true,
122                Some((best_start, best_end, _, _)) => {
123                    pos < best_start || (pos == best_start && end > best_end)
124                }
125            };
126            if is_better {
127                best = Some((pos, end, abbr.as_str(), expansion.as_str()));
128            }
129        }
130    }
131
132    let (start, end, abbr, expansion) = best?;
133
134    let mut nodes: Vec<Node> = Vec::new();
135    if start > 0 {
136        nodes.push(Node::Text(TextNode {
137            value: text[..start].to_string(),
138            position: synthetic_pos(),
139        }));
140    }
141    nodes.push(make_abbr_component(abbr, expansion));
142    if end < text.len() {
143        nodes.push(Node::Text(TextNode {
144            value: text[end..].to_string(),
145            position: synthetic_pos(),
146        }));
147    }
148
149    Some((abbr.to_string(), nodes))
150}
151
152/// Recursively process a children vector, expanding text nodes in place.
153///
154/// When a text node is expanded into multiple nodes, we replace that single
155/// slot with the new nodes.  We then continue scanning the rest of the
156/// children (the newly inserted `Text` suffix node may contain further
157/// abbreviations).
158fn expand_nodes(
159    nodes: &mut Vec<Node>,
160    abbreviations: &[(String, String)],
161    seen: &mut std::collections::HashSet<String>,
162) {
163    let mut i = 0;
164    while i < nodes.len() {
165        let expansion_result = if let Node::Text(ref t) = nodes[i] {
166            split_on_first_abbr(&t.value, abbreviations, seen)
167        } else {
168            None
169        };
170
171        if let Some((matched_abbr, replacement_nodes)) = expansion_result {
172            seen.insert(matched_abbr);
173            let how_many = replacement_nodes.len();
174            // Replace the single text node with the replacement sequence.
175            nodes.splice(i..=i, replacement_nodes);
176            // Advance past all newly inserted nodes so we don't re-process them,
177            // except for the trailing text node (if any) which may contain more
178            // abbreviations.  We advance `i` to just before the last inserted
179            // node and let the loop increment handle the rest.
180            if how_many > 0 {
181                i += how_many - 1;
182            }
183        } else {
184            // Recurse into children of non-text or already-matched-text nodes.
185            if let Some(children) = nodes[i].children_mut() {
186                expand_nodes(children, abbreviations, seen);
187            }
188            i += 1;
189        }
190    }
191}
192
193// ---------------------------------------------------------------------------
194// Tests
195// ---------------------------------------------------------------------------
196
#[cfg(test)]
mod tests {
    use super::*;
    use rdx_parser::parse;

    /// Parses `input` and runs [`AbbreviationExpand`] over the resulting tree.
    fn expand(input: &str) -> Root {
        let mut root = parse(input);
        AbbreviationExpand.transform(&mut root, "");
        root
    }

    /// Counts every `<Abbr>` component visited by `crate::walk`.
    fn count_abbrs(root: &Root) -> usize {
        let mut total = 0;
        crate::walk(&root.children, &mut |node| {
            if matches!(node, Node::Component(c) if c.name == "Abbr") {
                total += 1;
            }
        });
        total
    }

    #[test]
    fn no_frontmatter_is_noop() {
        let root = expand("HTML is great.\n");
        // Without frontmatter the paragraph must be left untouched.
        let Node::Paragraph(p) = &root.children[0] else {
            panic!("Expected paragraph, got {:?}", &root.children[0]);
        };
        assert!(
            !p.children.iter().any(|n| matches!(n, Node::Component(_))),
            "Should have no Abbr components without frontmatter"
        );
    }

    #[test]
    fn first_occurrence_wrapped() {
        let root = expand(
            "---\nabbreviations:\n  HTML: HyperText Markup Language\n---\nHTML is a language. HTML again.\n",
        );

        // Every Abbr component in the tree must carry the expansion as title.
        crate::walk(&root.children, &mut |node| {
            let Node::Component(c) = node else {
                return;
            };
            if c.name != "Abbr" {
                return;
            }
            let title = c.attributes.iter().find_map(|attr| match &attr.value {
                AttributeValue::String(s) if attr.name == "title" => Some(s.as_str()),
                _ => None,
            });
            assert_eq!(
                title,
                Some("HyperText Markup Language"),
                "Abbr title should be the expansion"
            );
        });

        assert_eq!(
            count_abbrs(&root),
            1,
            "Only the first occurrence should be wrapped"
        );
    }

    #[test]
    fn second_occurrence_left_as_text() {
        let root = expand(
            "---\nabbreviations:\n  CSS: Cascading Style Sheets\n---\nCSS rules. CSS is cool.\n",
        );
        assert_eq!(
            count_abbrs(&root),
            1,
            "Should wrap only the first occurrence of CSS"
        );
    }

    #[test]
    fn multiple_abbreviations_each_first_wrapped() {
        let root = expand(
            "---\nabbreviations:\n  HTML: HyperText Markup Language\n  CSS: Cascading Style Sheets\n---\nHTML and CSS and HTML and CSS.\n",
        );
        assert_eq!(
            count_abbrs(&root),
            2,
            "First HTML and first CSS should each be wrapped once"
        );
    }

    #[test]
    fn abbreviation_not_in_text_is_noop() {
        let root = expand(
            "---\nabbreviations:\n  XML: Extensible Markup Language\n---\nNo abbreviations here.\n",
        );
        let has_abbr = count_abbrs(&root) > 0;
        assert!(
            !has_abbr,
            "No Abbr component should be created when abbreviation isn't in text"
        );
    }

    #[test]
    fn abbr_component_has_correct_children() {
        let root = expand(
            "---\nabbreviations:\n  API: Application Programming Interface\n---\nThe API endpoint.\n",
        );

        let mut found_abbr = false;
        crate::walk(&root.children, &mut |node| {
            let Node::Component(c) = node else {
                return;
            };
            if c.name != "Abbr" {
                return;
            }
            found_abbr = true;
            // The Abbr component's first child should be a Text node with value "API".
            match c.children.first() {
                Some(Node::Text(t)) => assert_eq!(t.value, "API"),
                other => panic!("Expected Text child in Abbr, got {:?}", other),
            }
        });
        assert!(found_abbr, "Should have found an Abbr component");
    }

    #[test]
    fn empty_abbreviations_map_is_noop() {
        // Should not panic and should produce no Abbr components.
        let root = expand("---\nabbreviations: {}\n---\nSome text.\n");
        let has_abbr = count_abbrs(&root) > 0;
        assert!(!has_abbr);
    }
}
360}