Skip to main content

tiptap_rusty_parser/
text.rs

1//! Text extraction utilities.
2
3use crate::node::Node;
4use unicode_segmentation::UnicodeSegmentation;
5
6impl Node {
7    /// Concatenate all descendant text, with no separators.
8    ///
9    /// Matches ProseMirror's `node.textContent`.
10    ///
11    /// ```
12    /// use tiptap_rusty_parser::Document;
13    /// let doc = Document::from_json_str(
14    ///     r#"{"type":"doc","content":[
15    ///         {"type":"paragraph","content":[{"type":"text","text":"Hello "},{"type":"text","text":"world"}]}
16    ///     ]}"#,
17    /// ).unwrap();
18    /// assert_eq!(doc.text_content(), "Hello world");
19    /// ```
20    pub fn text_content(&self) -> String {
21        let mut out = String::new();
22        self.walk(&mut |n| {
23            if let Some(t) = &n.text {
24                out.push_str(t);
25            }
26        });
27        out
28    }
29
30    /// Concatenate descendant text, inserting `sep` between adjacent block-level
31    /// siblings (a node is "block-level" if it has `content` and is not a `text`
32    /// node). Text within a block stays contiguous.
33    ///
34    /// ```
35    /// use tiptap_rusty_parser::Document;
36    /// let doc = Document::from_json_str(
37    ///     r#"{"type":"doc","content":[
38    ///         {"type":"paragraph","content":[{"type":"text","text":"Hello"}]},
39    ///         {"type":"paragraph","content":[{"type":"text","text":"world"}]}
40    ///     ]}"#,
41    /// ).unwrap();
42    /// assert_eq!(doc.text_content_with_separator("\n\n"), "Hello\n\nworld");
43    /// ```
44    pub fn text_content_with_separator(&self, sep: &str) -> String {
45        let mut out = String::new();
46        render_sep(self, sep, &mut out);
47        out
48    }
49
50    /// Total number of Unicode scalar values across all descendant text.
51    pub fn char_count(&self) -> usize {
52        let mut n = 0;
53        self.walk(&mut |node| {
54            if let Some(t) = &node.text {
55                n += t.chars().count();
56            }
57        });
58        n
59    }
60
61    /// Number of words across all text, via Unicode word segmentation.
62    ///
63    /// Blocks are separated by a space first, so words don't merge across block
64    /// boundaries. Correct for CJK and other complex scripts.
65    ///
66    /// ```
67    /// use tiptap_rusty_parser::Document;
68    /// let doc = Document::from_json_str(
69    ///     r#"{"type":"doc","content":[
70    ///         {"type":"paragraph","content":[{"type":"text","text":"Hello"}]},
71    ///         {"type":"paragraph","content":[{"type":"text","text":"brave world"}]}
72    ///     ]}"#,
73    /// ).unwrap();
74    /// assert_eq!(doc.word_count(), 3);
75    /// ```
76    pub fn word_count(&self) -> usize {
77        self.text_content_with_separator(" ")
78            .unicode_words()
79            .count()
80    }
81}
82
83fn is_block(node: &Node) -> bool {
84    node.content.is_some() && node.node_type.as_deref() != Some("text")
85}
86
87fn render_sep(node: &Node, sep: &str, out: &mut String) {
88    if let Some(t) = &node.text {
89        out.push_str(t);
90    }
91    if let Some(children) = &node.content {
92        let mut prev_block = false;
93        for child in children {
94            let block = is_block(child);
95            if block && prev_block {
96                out.push_str(sep);
97            }
98            render_sep(child, sep, out);
99            prev_block = block;
100        }
101    }
102}