typst_count/
counter.rs

//! Document counting logic for Typst documents.
//!
//! This module provides functionality to count words and characters in compiled
//! Typst documents by traversing the document's element tree and extracting
//! rendered text content.
6
7use typst::introspection::Introspector;
8use typst::math::EquationElem;
9use typst::model::{EmphElem, StrongElem};
10use typst::syntax::FileId;
11use typst::text::{OverlineElem, RawElem, StrikeElem, SubElem, SuperElem, UnderlineElem};
12
/// Word and character totals for a document.
///
/// The [`Default`] value has both counters set to zero, which makes it a
/// convenient starting point when totals are accumulated incrementally.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct Count {
    /// Total number of words in the document.
    ///
    /// Words are counted by splitting on whitespace, which works well for
    /// space-separated languages but may not be accurate for languages like
    /// Chinese or Japanese where words are not separated by spaces.
    pub words: usize,

    /// Total number of characters in the document.
    ///
    /// Counted as Unicode scalar values of the rendered text, so spaces and
    /// punctuation are included while markup syntax that does not appear in
    /// the rendered output is not.
    pub characters: usize,
}
29
30/// Counts words and characters in a compiled Typst document.
31///
32/// This function traverses all elements in the document using the introspector
33/// and extracts plain text content. It handles the following cases:
34///
35/// - **Text styling**: Skips styling elements (bold, italic, etc.) to avoid
36///   double-counting since their text is already included in parent elements.
37/// - **Math equations**: Skips mathematical notation to avoid counting math symbols as words.
38/// - **Imports**: Optionally excludes text from imported/included files.
39/// - **Rendered content**: Only counts text that appears in the final rendered
40///   document, ignoring code, comments, and markup syntax.
41///
42/// # Arguments
43///
44/// * `introspector` - The Typst introspector providing access to document elements
45/// * `exclude_imports` - If `true`, only counts text from the main file
46/// * `main_file_id` - File ID of the main document (used when `exclude_imports` is `true`)
47///
48/// # Returns
49///
50/// A `Count` struct containing the word and character counts.
51///
52/// # Examples
53///
54/// ```ignore
55/// use typst_count::count_document;
56///
57/// let count = count_document(&introspector, false, main_file_id);
58/// println!("Words: {}, Characters: {}", count.words, count.characters);
59/// ```
60///
61/// # Counting Method
62///
63/// - **Words**: Split by Unicode whitespace (equivalent to Rust's `split_whitespace()`)
64/// - **Characters**: Total Unicode scalar values (equivalent to Rust's `chars().count()`)
65///
66/// # Avoiding Double-Counting
67///
68/// Typst's document tree includes both container elements and their styled children.
69/// For example, `*bold text*` creates:
70/// - A paragraph element containing "bold text"
71/// - A `strong` element also containing "bold text"
72///
73/// To avoid counting the same text twice, we skip known styling elements whose
74/// content is already included in their parent elements.
75pub fn count_document(
76    introspector: &Introspector,
77    exclude_imports: bool,
78    main_file_id: FileId,
79) -> Count {
80    let mut words = 0;
81    let mut characters = 0;
82
83    for element in introspector.all() {
84        // Skip elements from imported/included files if requested
85        if exclude_imports
86            && let Some(file_id) = element.span().id()
87            && file_id != main_file_id
88        {
89            continue;
90        }
91
92        // Skip styling elements to avoid double-counting.
93        // These elements' text is already included in their parent elements
94        // (typically paragraphs or other text containers).
95        if is_styling_element(element) {
96            continue;
97        }
98
99        let text = element.plain_text();
100        if !text.is_empty() {
101            characters += text.chars().count();
102            words += text.split_whitespace().count();
103        }
104    }
105
106    Count { words, characters }
107}
108
109/// Checks if an element is a text styling element that should be skipped during counting.
110///
111/// Text styling elements (like bold, italic, underline) wrap text content but don't
112/// add new text. Their content is already included in parent elements, so counting
113/// them would result in double-counting.
114///
115/// # Arguments
116///
117/// * `element` - The element to check
118///
119/// # Returns
120///
121/// `true` if the element is a styling element that should be skipped, `false` otherwise.
122///
123/// # Styling Elements
124///
125/// The following element types are considered styling elements:
126/// - `strong` - Bold text (`*bold*`)
127/// - `emph` - Italic/emphasis text (`_italic_`)
128/// - `underline` - Underlined text
129/// - `strike` - Strikethrough text
130/// - `overline` - Overlined text
131/// - `sub` - Subscript text
132/// - `super` - Superscript text
133/// - `highlight` - Highlighted text
134/// - `equation` - Math equations (`$...$` or `$ ... $`)
135/// - `raw` - code blocks `code`
136///
137/// # Examples
138///
139/// ```ignore
140/// if is_styling_element(&element) {
141///     // Skip this element to avoid double-counting
142///     continue;
143/// }
144/// ```
145fn is_styling_element(element: &typst::foundations::Content) -> bool {
146    element.is::<StrongElem>()
147        || element.is::<EmphElem>()
148        || element.is::<UnderlineElem>()
149        || element.is::<StrikeElem>()
150        || element.is::<OverlineElem>()
151        || element.is::<SubElem>()
152        || element.is::<SuperElem>()
153        || element.is::<EquationElem>() // Skip math equations
154        || element.is::<RawElem>()
155        || element.func().name() == "highlight" // highlight doesn't have a public struct
156}
157
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand constructor so each test states only the values it cares about.
    fn count(words: usize, characters: usize) -> Count {
        Count { words, characters }
    }

    /// Values placed into a `Count` are read back unchanged from its fields.
    #[test]
    fn test_count_struct_creation() {
        let Count { words, characters } = count(42, 256);

        assert_eq!(words, 42);
        assert_eq!(characters, 256);
    }

    /// Two counts are equal exactly when both fields match.
    #[test]
    fn test_count_equality() {
        let baseline = count(10, 50);
        let twin = count(10, 50);
        // Differs from `baseline` in the word total only.
        let more_words = Count {
            words: 11,
            ..baseline
        };

        assert_eq!(baseline, twin);
        assert_ne!(baseline, more_words);
    }
}