tiptap_rusty_parser/text.rs
1//! Text extraction utilities.
2
3use crate::node::Node;
4use unicode_segmentation::UnicodeSegmentation;
5
6impl Node {
7 /// Concatenate all descendant text, with no separators.
8 ///
9 /// Matches ProseMirror's `node.textContent`.
10 ///
11 /// ```
12 /// use tiptap_rusty_parser::Document;
13 /// let doc = Document::from_json_str(
14 /// r#"{"type":"doc","content":[
15 /// {"type":"paragraph","content":[{"type":"text","text":"Hello "},{"type":"text","text":"world"}]}
16 /// ]}"#,
17 /// ).unwrap();
18 /// assert_eq!(doc.text_content(), "Hello world");
19 /// ```
20 pub fn text_content(&self) -> String {
21 let mut out = String::new();
22 self.walk(&mut |n| {
23 if let Some(t) = &n.text {
24 out.push_str(t);
25 }
26 });
27 out
28 }
29
30 /// Concatenate descendant text, inserting `sep` between adjacent block-level
31 /// siblings (a node is "block-level" if it has `content` and is not a `text`
32 /// node). Text within a block stays contiguous.
33 ///
34 /// ```
35 /// use tiptap_rusty_parser::Document;
36 /// let doc = Document::from_json_str(
37 /// r#"{"type":"doc","content":[
38 /// {"type":"paragraph","content":[{"type":"text","text":"Hello"}]},
39 /// {"type":"paragraph","content":[{"type":"text","text":"world"}]}
40 /// ]}"#,
41 /// ).unwrap();
42 /// assert_eq!(doc.text_content_with_separator("\n\n"), "Hello\n\nworld");
43 /// ```
44 pub fn text_content_with_separator(&self, sep: &str) -> String {
45 let mut out = String::new();
46 render_sep(self, sep, &mut out);
47 out
48 }
49
50 /// Total number of Unicode scalar values across all descendant text.
51 pub fn char_count(&self) -> usize {
52 let mut n = 0;
53 self.walk(&mut |node| {
54 if let Some(t) = &node.text {
55 n += t.chars().count();
56 }
57 });
58 n
59 }
60
61 /// Number of words across all text, via Unicode word segmentation.
62 ///
63 /// Blocks are separated by a space first, so words don't merge across block
64 /// boundaries. Correct for CJK and other complex scripts.
65 ///
66 /// ```
67 /// use tiptap_rusty_parser::Document;
68 /// let doc = Document::from_json_str(
69 /// r#"{"type":"doc","content":[
70 /// {"type":"paragraph","content":[{"type":"text","text":"Hello"}]},
71 /// {"type":"paragraph","content":[{"type":"text","text":"brave world"}]}
72 /// ]}"#,
73 /// ).unwrap();
74 /// assert_eq!(doc.word_count(), 3);
75 /// ```
76 pub fn word_count(&self) -> usize {
77 self.text_content_with_separator(" ")
78 .unicode_words()
79 .count()
80 }
81}
82
83fn is_block(node: &Node) -> bool {
84 node.content.is_some() && node.node_type.as_deref() != Some("text")
85}
86
87fn render_sep(node: &Node, sep: &str, out: &mut String) {
88 if let Some(t) = &node.text {
89 out.push_str(t);
90 }
91 if let Some(children) = &node.content {
92 let mut prev_block = false;
93 for child in children {
94 let block = is_block(child);
95 if block && prev_block {
96 out.push_str(sep);
97 }
98 render_sep(child, sep, out);
99 prev_block = block;
100 }
101 }
102}