Skip to main content

oxihuman_core/
paragraph_detector.rs

1// Copyright (C) 2026 COOLJAPAN OU (Team KitaSan)
2// SPDX-License-Identifier: Apache-2.0
3#![allow(dead_code)]
4
5//! Paragraph boundary detector stub.
6//!
7//! Splits text into paragraphs based on blank-line separators, with
8//! configurable minimum blank-line count and optional list/heading detection.
9
10/// A detected paragraph with its text and byte span.
11#[derive(Debug, Clone, PartialEq)]
12pub struct Paragraph {
13    pub text: String,
14    pub start_line: usize,
15    pub end_line: usize,
16    pub kind: ParagraphKind,
17}
18
19impl Paragraph {
20    pub fn line_count(&self) -> usize {
21        self.end_line.saturating_sub(self.start_line)
22    }
23
24    pub fn word_count(&self) -> usize {
25        self.text.split_whitespace().count()
26    }
27
28    pub fn is_empty_paragraph(&self) -> bool {
29        self.text.trim().is_empty()
30    }
31}
32
/// The inferred type of a paragraph.
///
/// The enum carries no data, so it is `Copy` and can be used as a key in
/// hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ParagraphKind {
    /// Text that matched none of the other categories.
    Normal,
    /// A heading: the first line starts with `#`.
    Heading,
    /// A bulleted or numbered list item.
    ListItem,
    /// A fenced code block: the first line starts with three backticks.
    CodeBlock,
    /// Blank or whitespace-only content.
    Empty,
}
42
/// Configuration for paragraph detection.
///
/// Implements `PartialEq`/`Eq` so configurations can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParagraphConfig {
    /// Number of consecutive blank lines required to start a new paragraph.
    ///
    /// The threshold is only checked once a blank line has been seen, so `0`
    /// behaves exactly like `1`.
    pub min_blank_lines: usize,
    /// Enable detection of Markdown-like headings (`#`, `##`, ...).
    pub detect_headings: bool,
    /// Enable detection of list items (`-`, `*`, `1.`).
    pub detect_lists: bool,
}

impl Default for ParagraphConfig {
    /// A single blank line separates paragraphs; heading and list detection
    /// are both enabled.
    fn default() -> Self {
        Self {
            min_blank_lines: 1,
            detect_headings: true,
            detect_lists: true,
        }
    }
}
63
64/// Infer the kind of paragraph from its first non-empty line.
65fn infer_kind(line: &str, cfg: &ParagraphConfig) -> ParagraphKind {
66    let trimmed = line.trim_start();
67    if trimmed.is_empty() {
68        return ParagraphKind::Empty;
69    }
70    if cfg.detect_headings && trimmed.starts_with('#') {
71        return ParagraphKind::Heading;
72    }
73    if cfg.detect_lists && (trimmed.starts_with("- ") || trimmed.starts_with("* ")) {
74        return ParagraphKind::ListItem;
75    }
76    if cfg.detect_lists
77        && trimmed
78            .chars()
79            .next()
80            .map(|c| c.is_ascii_digit())
81            .unwrap_or(false)
82        && trimmed.contains(". ")
83    {
84        return ParagraphKind::ListItem;
85    }
86    if trimmed.starts_with("```") {
87        return ParagraphKind::CodeBlock;
88    }
89    ParagraphKind::Normal
90}
91
92/// Split text into paragraphs.
93pub fn detect_paragraphs(text: &str, cfg: &ParagraphConfig) -> Vec<Paragraph> {
94    let mut paragraphs: Vec<Paragraph> = Vec::new();
95    let lines: Vec<&str> = text.lines().collect();
96    let n = lines.len();
97
98    let mut start = 0usize;
99    let mut blank_run = 0usize;
100    let mut current_lines: Vec<&str> = Vec::new();
101
102    #[allow(clippy::ptr_arg)]
103    let flush = |lines: &Vec<&str>,
104                 s: usize,
105                 e: usize,
106                 paragraphs: &mut Vec<Paragraph>,
107                 cfg: &ParagraphConfig| {
108        let text = lines.join("\n");
109        if text.trim().is_empty() {
110            return;
111        }
112        let kind = lines
113            .first()
114            .map(|l| infer_kind(l, cfg))
115            .unwrap_or(ParagraphKind::Normal);
116        paragraphs.push(Paragraph {
117            text,
118            start_line: s,
119            end_line: e,
120            kind,
121        });
122    };
123
124    let mut i = 0;
125    while i < n {
126        let line = lines[i];
127        if line.trim().is_empty() {
128            blank_run += 1;
129            if blank_run >= cfg.min_blank_lines && !current_lines.is_empty() {
130                flush(&current_lines, start, i, &mut paragraphs, cfg);
131                current_lines.clear();
132                start = i + 1;
133            }
134        } else {
135            blank_run = 0;
136            if current_lines.is_empty() {
137                start = i;
138            }
139            current_lines.push(line);
140        }
141        i += 1;
142    }
143
144    if !current_lines.is_empty() {
145        flush(&current_lines, start, n, &mut paragraphs, cfg);
146    }
147
148    paragraphs
149}
150
151/// Count paragraphs in text.
152pub fn paragraph_count(text: &str) -> usize {
153    let cfg = ParagraphConfig::default();
154    detect_paragraphs(text, &cfg).len()
155}
156
157/// Return paragraph kinds summary.
158pub fn kind_summary(paragraphs: &[Paragraph]) -> (usize, usize, usize, usize) {
159    let normal = paragraphs
160        .iter()
161        .filter(|p| p.kind == ParagraphKind::Normal)
162        .count();
163    let heading = paragraphs
164        .iter()
165        .filter(|p| p.kind == ParagraphKind::Heading)
166        .count();
167    let list = paragraphs
168        .iter()
169        .filter(|p| p.kind == ParagraphKind::ListItem)
170        .count();
171    let code = paragraphs
172        .iter()
173        .filter(|p| p.kind == ParagraphKind::CodeBlock)
174        .count();
175    (normal, heading, list, code)
176}
177
178/// Find the longest paragraph by word count.
179pub fn longest_paragraph(paragraphs: &[Paragraph]) -> Option<&Paragraph> {
180    paragraphs.iter().max_by_key(|p| p.word_count())
181}
182
183/// Filter paragraphs with fewer than `min_words` words.
184pub fn filter_by_min_words(paragraphs: Vec<Paragraph>, min_words: usize) -> Vec<Paragraph> {
185    paragraphs
186        .into_iter()
187        .filter(|p| p.word_count() >= min_words)
188        .collect()
189}
190
#[cfg(test)]
mod tests {
    use super::*;

    /// Four paragraphs separated by single blank lines: a two-line normal
    /// paragraph, a one-line normal paragraph, a heading and a list item.
    const SAMPLE: &str =
        "First paragraph.\nStill first.\n\nSecond paragraph.\n\n# A heading\n\n- list item\n";

    /// `SAMPLE` split with the default configuration.
    fn sample_paragraphs() -> Vec<Paragraph> {
        detect_paragraphs(SAMPLE, &ParagraphConfig::default())
    }

    #[test]
    fn test_paragraph_count() {
        assert_eq!(paragraph_count(SAMPLE), 4);
    }

    #[test]
    fn test_heading_detected() {
        let paras = sample_paragraphs();
        assert_eq!(paras[2].text, "# A heading");
        assert_eq!(paras[2].kind, ParagraphKind::Heading);
    }

    #[test]
    fn test_list_item_detected() {
        let paras = sample_paragraphs();
        assert_eq!(paras[3].text, "- list item");
        assert_eq!(paras[3].kind, ParagraphKind::ListItem);
    }

    #[test]
    fn test_numbered_list_item_detected() {
        let paras = detect_paragraphs("1. first\n\n2. second\n", &ParagraphConfig::default());
        assert_eq!(paras.len(), 2);
        assert!(paras.iter().all(|p| p.kind == ParagraphKind::ListItem));
    }

    #[test]
    fn test_normal_paragraph() {
        let paras = sample_paragraphs();
        assert_eq!(paras[0].text, "First paragraph.\nStill first.");
        assert_eq!(paras[0].kind, ParagraphKind::Normal);
        assert_eq!(paras[1].text, "Second paragraph.");
        assert_eq!(paras[1].kind, ParagraphKind::Normal);
    }

    #[test]
    fn test_line_spans() {
        let spans: Vec<(usize, usize)> = sample_paragraphs()
            .iter()
            .map(|p| (p.start_line, p.end_line))
            .collect();
        assert_eq!(spans, vec![(0, 2), (3, 4), (5, 6), (7, 8)]);
    }

    #[test]
    fn test_detection_can_be_disabled() {
        let cfg = ParagraphConfig {
            min_blank_lines: 1,
            detect_headings: false,
            detect_lists: false,
        };
        let paras = detect_paragraphs(SAMPLE, &cfg);
        assert_eq!(paras.len(), 4);
        assert!(paras.iter().all(|p| p.kind == ParagraphKind::Normal));
    }

    #[test]
    fn test_min_blank_lines_two() {
        let cfg = ParagraphConfig {
            min_blank_lines: 2,
            ..ParagraphConfig::default()
        };
        // One blank line no longer splits; two do.
        let paras = detect_paragraphs("a\n\nb\n\n\nc", &cfg);
        assert_eq!(paras.len(), 2);
        assert_eq!(paras[0].start_line, 0);
        assert_eq!(paras[0].word_count(), 2);
        assert_eq!(paras[1].start_line, 5);
        assert_eq!(paras[1].word_count(), 1);
    }

    #[test]
    fn test_paragraph_word_count() {
        let p = Paragraph {
            text: "one two three".into(),
            start_line: 0,
            end_line: 1,
            kind: ParagraphKind::Normal,
        };
        assert_eq!(p.word_count(), 3);
        assert!(!p.is_empty_paragraph());
    }

    #[test]
    fn test_line_count() {
        let p = Paragraph {
            text: "x".into(),
            start_line: 2,
            end_line: 5,
            kind: ParagraphKind::Normal,
        };
        assert_eq!(p.line_count(), 3);

        // An inverted span must not underflow.
        let inverted = Paragraph {
            start_line: 5,
            end_line: 2,
            ..p
        };
        assert_eq!(inverted.line_count(), 0);
    }

    #[test]
    fn test_kind_summary() {
        let paras = sample_paragraphs();
        assert_eq!(kind_summary(&paras), (2, 1, 1, 0));
    }

    #[test]
    fn test_longest_paragraph() {
        let paras = sample_paragraphs();
        let longest = longest_paragraph(&paras).map(|p| p.text.as_str());
        assert_eq!(longest, Some("First paragraph.\nStill first."));
        assert!(longest_paragraph(&[]).is_none());
    }

    #[test]
    fn test_filter_by_min_words() {
        // Word counts in SAMPLE are 4, 2, 3 and 3.
        let filtered = filter_by_min_words(sample_paragraphs(), 3);
        assert_eq!(filtered.len(), 3);
        assert!(filtered.iter().all(|p| p.word_count() >= 3));

        // A threshold above every paragraph removes them all.
        assert!(filter_by_min_words(sample_paragraphs(), 5).is_empty());
    }

    #[test]
    fn test_empty_text() {
        assert_eq!(paragraph_count(""), 0);
        assert_eq!(paragraph_count("\n  \n\t\n"), 0);
    }
}