// oxihuman_core/paragraph_detector.rs
#![allow(dead_code)]

5#[derive(Debug, Clone, PartialEq)]
12pub struct Paragraph {
13 pub text: String,
14 pub start_line: usize,
15 pub end_line: usize,
16 pub kind: ParagraphKind,
17}
18
19impl Paragraph {
20 pub fn line_count(&self) -> usize {
21 self.end_line.saturating_sub(self.start_line)
22 }
23
24 pub fn word_count(&self) -> usize {
25 self.text.split_whitespace().count()
26 }
27
28 pub fn is_empty_paragraph(&self) -> bool {
29 self.text.trim().is_empty()
30 }
31}
32
/// Coarse classification of a line or paragraph.
#[derive(Debug, Clone, PartialEq)]
pub enum ParagraphKind {
    /// Plain text that matches none of the other categories.
    Normal,
    /// Text whose first non-whitespace character is `#`.
    Heading,
    /// A bulleted or numbered list entry.
    ListItem,
    /// Text that opens with a triple-backtick fence.
    CodeBlock,
    /// A line that is empty or contains only whitespace.
    Empty,
}

/// Options controlling how text is split into paragraphs and classified.
#[derive(Debug, Clone)]
pub struct ParagraphConfig {
    /// Number of consecutive blank lines required to end a paragraph.
    pub min_blank_lines: usize,
    /// When `true`, lines starting with `#` are classified as headings.
    pub detect_headings: bool,
    /// When `true`, bulleted and numbered lines are classified as list items.
    pub detect_lists: bool,
}

54impl Default for ParagraphConfig {
55 fn default() -> Self {
56 Self {
57 min_blank_lines: 1,
58 detect_headings: true,
59 detect_lists: true,
60 }
61 }
62}
63
64fn infer_kind(line: &str, cfg: &ParagraphConfig) -> ParagraphKind {
66 let trimmed = line.trim_start();
67 if trimmed.is_empty() {
68 return ParagraphKind::Empty;
69 }
70 if cfg.detect_headings && trimmed.starts_with('#') {
71 return ParagraphKind::Heading;
72 }
73 if cfg.detect_lists && (trimmed.starts_with("- ") || trimmed.starts_with("* ")) {
74 return ParagraphKind::ListItem;
75 }
76 if cfg.detect_lists
77 && trimmed
78 .chars()
79 .next()
80 .map(|c| c.is_ascii_digit())
81 .unwrap_or(false)
82 && trimmed.contains(". ")
83 {
84 return ParagraphKind::ListItem;
85 }
86 if trimmed.starts_with("```") {
87 return ParagraphKind::CodeBlock;
88 }
89 ParagraphKind::Normal
90}
91
92pub fn detect_paragraphs(text: &str, cfg: &ParagraphConfig) -> Vec<Paragraph> {
94 let mut paragraphs: Vec<Paragraph> = Vec::new();
95 let lines: Vec<&str> = text.lines().collect();
96 let n = lines.len();
97
98 let mut start = 0usize;
99 let mut blank_run = 0usize;
100 let mut current_lines: Vec<&str> = Vec::new();
101
102 #[allow(clippy::ptr_arg)]
103 let flush = |lines: &Vec<&str>,
104 s: usize,
105 e: usize,
106 paragraphs: &mut Vec<Paragraph>,
107 cfg: &ParagraphConfig| {
108 let text = lines.join("\n");
109 if text.trim().is_empty() {
110 return;
111 }
112 let kind = lines
113 .first()
114 .map(|l| infer_kind(l, cfg))
115 .unwrap_or(ParagraphKind::Normal);
116 paragraphs.push(Paragraph {
117 text,
118 start_line: s,
119 end_line: e,
120 kind,
121 });
122 };
123
124 let mut i = 0;
125 while i < n {
126 let line = lines[i];
127 if line.trim().is_empty() {
128 blank_run += 1;
129 if blank_run >= cfg.min_blank_lines && !current_lines.is_empty() {
130 flush(¤t_lines, start, i, &mut paragraphs, cfg);
131 current_lines.clear();
132 start = i + 1;
133 }
134 } else {
135 blank_run = 0;
136 if current_lines.is_empty() {
137 start = i;
138 }
139 current_lines.push(line);
140 }
141 i += 1;
142 }
143
144 if !current_lines.is_empty() {
145 flush(¤t_lines, start, n, &mut paragraphs, cfg);
146 }
147
148 paragraphs
149}
150
151pub fn paragraph_count(text: &str) -> usize {
153 let cfg = ParagraphConfig::default();
154 detect_paragraphs(text, &cfg).len()
155}
156
157pub fn kind_summary(paragraphs: &[Paragraph]) -> (usize, usize, usize, usize) {
159 let normal = paragraphs
160 .iter()
161 .filter(|p| p.kind == ParagraphKind::Normal)
162 .count();
163 let heading = paragraphs
164 .iter()
165 .filter(|p| p.kind == ParagraphKind::Heading)
166 .count();
167 let list = paragraphs
168 .iter()
169 .filter(|p| p.kind == ParagraphKind::ListItem)
170 .count();
171 let code = paragraphs
172 .iter()
173 .filter(|p| p.kind == ParagraphKind::CodeBlock)
174 .count();
175 (normal, heading, list, code)
176}
177
178pub fn longest_paragraph(paragraphs: &[Paragraph]) -> Option<&Paragraph> {
180 paragraphs.iter().max_by_key(|p| p.word_count())
181}
182
183pub fn filter_by_min_words(paragraphs: Vec<Paragraph>, min_words: usize) -> Vec<Paragraph> {
185 paragraphs
186 .into_iter()
187 .filter(|p| p.word_count() >= min_words)
188 .collect()
189}
190
#[cfg(test)]
mod tests {
    use super::*;

    // Four paragraphs under the default config: a two-line normal paragraph,
    // a one-line normal paragraph, a heading and a list item.
    const SAMPLE: &str =
        "First paragraph.\nStill first.\n\nSecond paragraph.\n\n# A heading\n\n- list item\n";

    #[test]
    fn test_paragraph_count() {
        assert_eq!(paragraph_count(SAMPLE), 4);
    }

    #[test]
    fn test_heading_detected() {
        let cfg = ParagraphConfig::default();
        let paras = detect_paragraphs(SAMPLE, &cfg);
        assert!(paras.iter().any(|p| p.kind == ParagraphKind::Heading));
    }

    #[test]
    fn test_list_item_detected() {
        let cfg = ParagraphConfig::default();
        let paras = detect_paragraphs(SAMPLE, &cfg);
        assert!(paras.iter().any(|p| p.kind == ParagraphKind::ListItem));
    }

    #[test]
    fn test_normal_paragraph() {
        let cfg = ParagraphConfig::default();
        let paras = detect_paragraphs(SAMPLE, &cfg);
        assert!(paras.iter().any(|p| p.kind == ParagraphKind::Normal));
    }

    #[test]
    fn test_line_spans() {
        let cfg = ParagraphConfig::default();
        let paras = detect_paragraphs(SAMPLE, &cfg);
        assert_eq!(paras[0].text, "First paragraph.\nStill first.");
        assert_eq!((paras[0].start_line, paras[0].end_line), (0, 2));
        assert_eq!(paras[0].line_count(), 2);
        assert_eq!((paras[1].start_line, paras[1].end_line), (3, 4));
    }

    #[test]
    fn test_paragraph_word_count() {
        let p = Paragraph {
            text: "one two three".into(),
            start_line: 0,
            end_line: 1,
            kind: ParagraphKind::Normal,
        };
        assert_eq!(p.word_count(), 3);
    }

    #[test]
    fn test_line_count() {
        let p = Paragraph {
            text: "x".into(),
            start_line: 2,
            end_line: 5,
            kind: ParagraphKind::Normal,
        };
        assert_eq!(p.line_count(), 3);
    }

    #[test]
    fn test_kind_summary() {
        let cfg = ParagraphConfig::default();
        let paras = detect_paragraphs(SAMPLE, &cfg);
        assert_eq!(kind_summary(&paras), (2, 1, 1, 0));
    }

    #[test]
    fn test_filter_by_min_words() {
        let cfg = ParagraphConfig::default();
        let paras = detect_paragraphs(SAMPLE, &cfg);
        // A threshold of 3 drops only "Second paragraph." (two words), so the
        // `all` check below is not vacuous.
        let filtered = filter_by_min_words(paras, 3);
        assert_eq!(filtered.len(), 3);
        assert!(filtered.iter().all(|p| p.word_count() >= 3));
    }

    #[test]
    fn test_empty_text() {
        assert_eq!(paragraph_count(""), 0);
    }
}