Skip to main content

terraphim_markdown_parser/
heading.rs

1use std::ops::Range;
2
3use markdown::mdast::Node;
4use serde::{Deserialize, Serialize};
5use ulid::Ulid;
6
7use crate::{MarkdownParserError, NormalizedMarkdown, children, collect_text_content};
8
9/// Strategy used to match a heading title against a `SectionPattern`'s pattern string.
10#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
11pub enum MatchStrategy {
12    /// Match when the heading title starts with the pattern.
13    Prefix,
14    /// Match when the heading title contains the pattern anywhere.
15    Contains,
16}
17
18#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
19pub enum SectionType {
20    Main,
21    Sidebar(String),
22    Career,
23    Assessment,
24}
25
26impl std::fmt::Display for SectionType {
27    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
28        match self {
29            SectionType::Main => write!(f, "Main"),
30            SectionType::Sidebar(s) => write!(f, "Sidebar({s})"),
31            SectionType::Career => write!(f, "Career"),
32            SectionType::Assessment => write!(f, "Assessment"),
33        }
34    }
35}
36
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct SectionPattern {
39    pub pattern: String,
40    pub section_type: SectionType,
41    pub match_strategy: MatchStrategy,
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
45pub struct SectionConfig {
46    pub rules: Vec<SectionPattern>,
47}
48
49impl SectionConfig {
50    pub fn textbook_default() -> Self {
51        Self {
52            rules: vec![
53                SectionPattern {
54                    pattern: "Power Selling".to_string(),
55                    section_type: SectionType::Sidebar("PowerSelling".to_string()),
56                    match_strategy: MatchStrategy::Prefix,
57                },
58                SectionPattern {
59                    pattern: "Power Player".to_string(),
60                    section_type: SectionType::Sidebar("PowerPlayer".to_string()),
61                    match_strategy: MatchStrategy::Prefix,
62                },
63                SectionPattern {
64                    pattern: "Power Point".to_string(),
65                    section_type: SectionType::Sidebar("PowerPoint".to_string()),
66                    match_strategy: MatchStrategy::Prefix,
67                },
68                // "Selling U" uses Contains because the phrase can appear
69                // mid-title (e.g. "Chapter 3: Selling U -- Your Career").
70                SectionPattern {
71                    pattern: "Selling U".to_string(),
72                    section_type: SectionType::Career,
73                    match_strategy: MatchStrategy::Contains,
74                },
75                SectionPattern {
76                    pattern: "Key Takeaways".to_string(),
77                    section_type: SectionType::Assessment,
78                    match_strategy: MatchStrategy::Prefix,
79                },
80                SectionPattern {
81                    pattern: "Test Your Power Knowledge".to_string(),
82                    section_type: SectionType::Assessment,
83                    match_strategy: MatchStrategy::Prefix,
84                },
85            ],
86        }
87    }
88
89    pub fn classify(&self, title: &str) -> SectionType {
90        let title_trimmed = title.trim();
91        for rule in &self.rules {
92            let matched = match rule.match_strategy {
93                MatchStrategy::Prefix => title_trimmed.starts_with(&rule.pattern),
94                MatchStrategy::Contains => title_trimmed.contains(&rule.pattern),
95            };
96            if matched {
97                return rule.section_type.clone();
98            }
99        }
100        SectionType::Main
101    }
102}
103
104impl Default for SectionConfig {
105    fn default() -> Self {
106        Self::textbook_default()
107    }
108}
109
110#[derive(Debug, Clone)]
111pub struct HeadingNode {
112    pub level: u8,
113    pub title: String,
114    pub section_type: SectionType,
115    pub blocks: Vec<Ulid>,
116    pub children: Vec<HeadingNode>,
117    pub byte_range: Range<usize>,
118}
119
120#[derive(Debug, Clone)]
121pub struct HeadingTree {
122    pub roots: Vec<HeadingNode>,
123}
124
125pub fn build_heading_tree(
126    normalized: &NormalizedMarkdown,
127) -> Result<HeadingTree, MarkdownParserError> {
128    let Some(ref ast) = normalized.ast else {
129        return Ok(HeadingTree { roots: vec![] });
130    };
131
132    let headings = extract_headings(ast);
133    let tree = build_tree_from_headings(&headings, normalized);
134    Ok(tree)
135}
136
137pub fn classify_sections(tree: &mut HeadingTree, config: &SectionConfig) {
138    for root in &mut tree.roots {
139        classify_node(root, config);
140    }
141}
142
143fn classify_node(node: &mut HeadingNode, config: &SectionConfig) {
144    node.section_type = config.classify(&node.title);
145    for child in &mut node.children {
146        classify_node(child, config);
147    }
148}
149
/// A heading as found in the markdown AST, before it is placed in the tree.
///
/// Derives `Debug` and `Clone` so it can be inspected and copied like the
/// other types in this module.
#[derive(Debug, Clone)]
pub struct RawHeading {
    /// Heading depth taken from the AST node.
    level: u8,
    /// Text content collected from the heading's children.
    title: String,
    /// Byte offset where the heading starts (0 when the node has no position).
    byte_start: usize,
    /// Byte offset where the heading ends (0 when the node has no position).
    byte_end: usize,
}
156
157fn extract_headings(node: &Node) -> Vec<RawHeading> {
158    let mut result = Vec::new();
159    collect_headings(node, &mut result);
160    result
161}
162
163fn collect_headings(node: &Node, out: &mut Vec<RawHeading>) {
164    if let Node::Heading(h) = node {
165        let title = collect_text_content(&h.children);
166        let (start, end) = if let Some(pos) = node.position() {
167            (pos.start.offset, pos.end.offset)
168        } else {
169            (0, 0)
170        };
171        out.push(RawHeading {
172            level: h.depth,
173            title,
174            byte_start: start,
175            byte_end: end,
176        });
177        return;
178    }
179
180    if let Some(children) = children(node) {
181        for child in children {
182            collect_headings(child, out);
183        }
184    }
185}
186
187fn build_tree_from_headings(
188    headings: &[RawHeading],
189    normalized: &NormalizedMarkdown,
190) -> HeadingTree {
191    if headings.is_empty() {
192        return HeadingTree { roots: vec![] };
193    }
194
195    let mut roots: Vec<HeadingNode> = Vec::new();
196    let mut stack: Vec<HeadingNode> = Vec::new();
197
198    for (i, heading) in headings.iter().enumerate() {
199        let next_byte_start = headings
200            .get(i + 1)
201            .map(|h| h.byte_start)
202            .unwrap_or(normalized.markdown.len());
203
204        let blocks = blocks_in_range(&normalized.blocks, heading.byte_end, next_byte_start);
205
206        let node = HeadingNode {
207            level: heading.level,
208            title: heading.title.clone(),
209            section_type: SectionType::Main,
210            blocks,
211            children: Vec::new(),
212            byte_range: heading.byte_start..next_byte_start,
213        };
214
215        while let Some(top) = stack.last() {
216            if top.level < node.level {
217                break;
218            }
219            let popped = stack.pop().unwrap();
220            if let Some(parent) = stack.last_mut() {
221                parent.children.push(popped);
222            } else {
223                roots.push(popped);
224            }
225        }
226
227        stack.push(node);
228    }
229
230    while let Some(popped) = stack.pop() {
231        if let Some(parent) = stack.last_mut() {
232            parent.children.push(popped);
233        } else {
234            roots.push(popped);
235        }
236    }
237
238    HeadingTree { roots }
239}
240
241fn blocks_in_range(blocks: &[crate::Block], start: usize, end: usize) -> Vec<Ulid> {
242    blocks
243        .iter()
244        .filter(|b| b.span.start >= start && b.span.start < end)
245        .map(|b| b.id)
246        .collect()
247}
248
/// Unit tests for title classification and heading-tree construction.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::normalize_markdown;

    #[test]
    fn section_config_classify_sidebar() {
        let config = SectionConfig::textbook_default();
        assert_eq!(
            config.classify("Power Selling: The Art of Persuasion"),
            SectionType::Sidebar("PowerSelling".to_string())
        );
    }

    #[test]
    fn section_config_classify_career() {
        let config = SectionConfig::textbook_default();
        assert_eq!(
            config.classify("Selling U: Your Career"),
            SectionType::Career
        );
    }

    #[test]
    fn section_config_classify_assessment() {
        let config = SectionConfig::textbook_default();
        assert_eq!(
            config.classify("Key Takeaways from Chapter 3"),
            SectionType::Assessment
        );
    }

    #[test]
    fn section_config_classify_main_fallback() {
        let config = SectionConfig::textbook_default();
        assert_eq!(config.classify("Introduction to Sales"), SectionType::Main);
    }

    #[test]
    fn build_heading_tree_simple() {
        let input = "# Chapter 1\n\nIntro paragraph\n\n## Section 1.1\n\nSome text\n\n# Chapter 2\n\nMore text\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();

        assert_eq!(tree.roots.len(), 2);
        assert_eq!(tree.roots[0].title, "Chapter 1");
        assert_eq!(tree.roots[0].level, 1);
        assert_eq!(tree.roots[0].children.len(), 1);
        assert_eq!(tree.roots[0].children[0].title, "Section 1.1");
        assert_eq!(tree.roots[1].title, "Chapter 2");
    }

    #[test]
    fn build_heading_tree_attaches_blocks() {
        let input = "# Chapter\n\nParagraph one\n\nParagraph two\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();

        assert_eq!(tree.roots.len(), 1);
        assert_eq!(tree.roots[0].blocks.len(), 2);
    }

    #[test]
    fn build_heading_tree_all_levels() {
        let input = "# H1\n\n## H2\n\n### H3\n\n#### H4\n\n##### H5\n\n###### H6\n\nText\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();

        assert_eq!(tree.roots.len(), 1);

        // Walk the single chain H1 -> H2 -> ... -> H6 and check every level,
        // not just the first three.
        let mut node = &tree.roots[0];
        for expected_level in 1..=6u8 {
            assert_eq!(node.level, expected_level);
            assert_eq!(node.title, format!("H{expected_level}"));
            if expected_level < 6 {
                assert_eq!(node.children.len(), 1);
                node = &node.children[0];
            } else {
                assert!(node.children.is_empty());
            }
        }
    }

    #[test]
    fn classify_sections_applies_config() {
        let input = "# Main Title\n\nText\n\n## Power Selling: Tips\n\nTip text\n\n## Selling U: Careers\n\nCareer text\n";
        let normalized = normalize_markdown(input).unwrap();
        let mut tree = build_heading_tree(&normalized).unwrap();
        classify_sections(&mut tree, &SectionConfig::textbook_default());

        assert_eq!(tree.roots[0].section_type, SectionType::Main);
        let ps = &tree.roots[0].children[0];
        assert_eq!(
            ps.section_type,
            SectionType::Sidebar("PowerSelling".to_string())
        );
        let su = &tree.roots[0].children[1];
        assert_eq!(su.section_type, SectionType::Career);
    }

    #[test]
    fn build_heading_tree_empty() {
        let input = "No headings here\n\nJust text\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();
        assert!(tree.roots.is_empty());
    }

    #[test]
    fn custom_section_config() {
        let config = SectionConfig {
            rules: vec![SectionPattern {
                pattern: "Experiment".to_string(),
                section_type: SectionType::Sidebar("Lab".to_string()),
                match_strategy: MatchStrategy::Prefix,
            }],
        };
        assert_eq!(
            config.classify("Experiment 3: Results"),
            SectionType::Sidebar("Lab".to_string())
        );
        assert_eq!(config.classify("Introduction"), SectionType::Main);
    }

    #[test]
    fn match_strategy_contains() {
        let config = SectionConfig {
            rules: vec![SectionPattern {
                pattern: "Selling U".to_string(),
                section_type: SectionType::Career,
                match_strategy: MatchStrategy::Contains,
            }],
        };
        // Contains matches mid-title
        assert_eq!(
            config.classify("Chapter 3: Selling U -- Your Career"),
            SectionType::Career,
        );
        // Contains also matches at the start
        assert_eq!(config.classify("Selling U: Careers"), SectionType::Career,);
        // No match when substring is absent
        assert_eq!(config.classify("Introduction"), SectionType::Main);
    }
}