typst_count/counter.rs
1//! Document counting logic for Typst documents.
2//!
3//! This module provides functionality to count words and characters in compiled
4//! Typst documents by traversing the document's element tree and extracting
5//! rendered text content.
6
7use typst::introspection::Introspector;
8use typst::math::EquationElem;
9use typst::model::{EmphElem, StrongElem};
10use typst::syntax::FileId;
11use typst::text::{OverlineElem, RawElem, StrikeElem, SubElem, SuperElem, UnderlineElem};
12
/// Word and character totals for a document.
///
/// Values of this type are produced by `count_document`. The type is a plain
/// pair of counters: it is `Copy`, comparable, hashable, and its `Default`
/// value is the zero count (`words == 0`, `characters == 0`).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct Count {
    /// Total number of words in the document.
    ///
    /// `count_document` counts words by splitting the extracted text on
    /// whitespace. That suits space-separated languages but may be inaccurate
    /// for languages such as Chinese or Japanese, where words are not
    /// separated by spaces.
    pub words: usize,

    /// Total number of characters in the document.
    ///
    /// `count_document` counts every `char` of the extracted plain text, so
    /// spaces and punctuation are included.
    pub characters: usize,
}
29
30/// Counts words and characters in a compiled Typst document.
31///
32/// This function traverses all elements in the document using the introspector
33/// and extracts plain text content. It handles the following cases:
34///
35/// - **Text styling**: Skips styling elements (bold, italic, etc.) to avoid
36/// double-counting since their text is already included in parent elements.
37/// - **Math equations**: Skips mathematical notation to avoid counting math symbols as words.
38/// - **Imports**: Optionally excludes text from imported/included files.
39/// - **Rendered content**: Only counts text that appears in the final rendered
40/// document, ignoring code, comments, and markup syntax.
41///
42/// # Arguments
43///
44/// * `introspector` - The Typst introspector providing access to document elements
45/// * `exclude_imports` - If `true`, only counts text from the main file
46/// * `main_file_id` - File ID of the main document (used when `exclude_imports` is `true`)
47///
48/// # Returns
49///
50/// A `Count` struct containing the word and character counts.
51///
52/// # Examples
53///
54/// ```ignore
55/// use typst_count::count_document;
56///
57/// let count = count_document(&introspector, false, main_file_id);
58/// println!("Words: {}, Characters: {}", count.words, count.characters);
59/// ```
60///
61/// # Counting Method
62///
63/// - **Words**: Split by Unicode whitespace (equivalent to Rust's `split_whitespace()`)
64/// - **Characters**: Total Unicode scalar values (equivalent to Rust's `chars().count()`)
65///
66/// # Avoiding Double-Counting
67///
68/// Typst's document tree includes both container elements and their styled children.
69/// For example, `*bold text*` creates:
70/// - A paragraph element containing "bold text"
71/// - A `strong` element also containing "bold text"
72///
73/// To avoid counting the same text twice, we skip known styling elements whose
74/// content is already included in their parent elements.
75pub fn count_document(
76 introspector: &Introspector,
77 exclude_imports: bool,
78 main_file_id: FileId,
79) -> Count {
80 let mut words = 0;
81 let mut characters = 0;
82
83 for element in introspector.all() {
84 // Skip elements from imported/included files if requested
85 if exclude_imports
86 && let Some(file_id) = element.span().id()
87 && file_id != main_file_id
88 {
89 continue;
90 }
91
92 // Skip styling elements to avoid double-counting.
93 // These elements' text is already included in their parent elements
94 // (typically paragraphs or other text containers).
95 if is_styling_element(element) {
96 continue;
97 }
98
99 let text = element.plain_text();
100 if !text.is_empty() {
101 characters += text.chars().count();
102 words += text.split_whitespace().count();
103 }
104 }
105
106 Count { words, characters }
107}
108
109/// Checks if an element is a text styling element that should be skipped during counting.
110///
111/// Text styling elements (like bold, italic, underline) wrap text content but don't
112/// add new text. Their content is already included in parent elements, so counting
113/// them would result in double-counting.
114///
115/// # Arguments
116///
117/// * `element` - The element to check
118///
119/// # Returns
120///
121/// `true` if the element is a styling element that should be skipped, `false` otherwise.
122///
123/// # Styling Elements
124///
125/// The following element types are considered styling elements:
126/// - `strong` - Bold text (`*bold*`)
127/// - `emph` - Italic/emphasis text (`_italic_`)
128/// - `underline` - Underlined text
129/// - `strike` - Strikethrough text
130/// - `overline` - Overlined text
131/// - `sub` - Subscript text
132/// - `super` - Superscript text
133/// - `highlight` - Highlighted text
134/// - `equation` - Math equations (`$...$` or `$ ... $`)
135/// - `raw` - code blocks `code`
136///
137/// # Examples
138///
139/// ```ignore
140/// if is_styling_element(&element) {
141/// // Skip this element to avoid double-counting
142/// continue;
143/// }
144/// ```
145fn is_styling_element(element: &typst::foundations::Content) -> bool {
146 element.is::<StrongElem>()
147 || element.is::<EmphElem>()
148 || element.is::<UnderlineElem>()
149 || element.is::<StrikeElem>()
150 || element.is::<OverlineElem>()
151 || element.is::<SubElem>()
152 || element.is::<SuperElem>()
153 || element.is::<EquationElem>() // Skip math equations
154 || element.is::<RawElem>()
155 || element.func().name() == "highlight" // highlight doesn't have a public struct
156}
157
#[cfg(test)]
mod tests {
    use super::*;

    /// Values given in a struct literal are read back unchanged.
    #[test]
    fn test_count_struct_creation() {
        let Count { words, characters } = Count {
            words: 42,
            characters: 256,
        };

        assert_eq!(words, 42);
        assert_eq!(characters, 256);
    }

    /// Counts with identical fields compare equal; a different `words` value
    /// makes them unequal.
    #[test]
    fn test_count_equality() {
        let baseline = Count {
            words: 10,
            characters: 50,
        };
        let identical = Count {
            words: 10,
            characters: 50,
        };
        let more_words = Count {
            words: 11,
            characters: 50,
        };

        assert_eq!(baseline, identical);
        assert_ne!(baseline, more_words);
    }
}