triplets_core/
utils.rs

//! Text normalization and filesystem helpers shared by source implementations.
2
3use chrono::{DateTime, Utc};
4use line_ending::LineEnding;
5use std::fs;
6use std::path::Path;
7
8use crate::data::{RecordSection, SectionRole};
9use crate::types::Sentence;
10
11/// Returns the newline string for the current platform (`"\n"` on Unix, `"\r\n"` on Windows).
12pub fn platform_newline() -> &'static str {
13    LineEnding::from_current_platform().as_str()
14}
15
/// Collapse every run of whitespace into a single space and trim both ends.
///
/// Whitespace is whatever [`str::split_whitespace`] treats as a separator
/// (the Unicode `White_Space` property, the same definition used by
/// [`char::is_whitespace`]), so tabs, newlines, and non-breaking spaces all
/// count. The input is not modified; a freshly allocated `String` is returned.
/// Empty or whitespace-only input yields an empty string.
pub fn normalize_inline_whitespace<T: AsRef<str>>(text: T) -> String {
    let text = text.as_ref();
    // The result can never be longer than the input, so one allocation suffices.
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so a non-empty buffer means "not the first word".
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
34
35/// Split a block of text into sentences, falling back to the whole string when needed.
36/// Heuristic sentence splitter with tokenizer-friendly rules.
37pub fn sentences(text: &str) -> Vec<Sentence> {
38    let mut results = Vec::new();
39    let normalized = LineEnding::normalize(text);
40    let paragraph_sep = LineEnding::LF.as_str().repeat(2);
41    for block in normalized.split(&paragraph_sep) {
42        if block.trim().is_empty() {
43            continue;
44        }
45        let normalized = normalize_inline_whitespace(block);
46        if normalized.is_empty() {
47            continue;
48        }
49        push_block_sentences(&normalized, &mut results);
50    }
51
52    results
53}
54
55/// Convenience helper to construct a `RecordSection` with normalized text metadata.
56/// Convenience helper to build a `RecordSection` with precomputed sentences.
57pub fn make_section(role: SectionRole, heading: Option<&str>, text: &str) -> RecordSection {
58    RecordSection {
59        role,
60        heading: heading.map(|h| h.to_string()),
61        text: text.to_string(),
62        sentences: sentences(text),
63    }
64}
65
66fn push_block_sentences(block: &str, results: &mut Vec<Sentence>) {
67    let chars: Vec<char> = block.chars().collect();
68    let mut buffer = String::new();
69
70    for (idx, ch) in chars.iter().enumerate() {
71        buffer.push(*ch);
72        if is_sentence_boundary(&chars, idx) {
73            let trimmed = buffer.trim();
74            if !trimmed.is_empty() {
75                results.push(trimmed.to_string());
76            }
77            buffer.clear();
78        }
79    }
80
81    let trailing = buffer.trim();
82    if !trailing.is_empty() {
83        results.push(trailing.to_string());
84    }
85}
86
87fn is_sentence_boundary(chars: &[char], idx: usize) -> bool {
88    match chars[idx] {
89        '.' => is_dot_boundary(chars, idx),
90        '!' | '?' => true,
91        _ => false,
92    }
93}
94
95fn is_dot_boundary(chars: &[char], idx: usize) -> bool {
96    if is_decimal_middle(chars, idx) || is_ticker_middle(chars, idx) {
97        return false;
98    }
99    if idx + 1 < chars.len() && chars[idx + 1] == '.' {
100        return false;
101    }
102    true
103}
104
/// True when the character at `idx` has an ASCII digit directly on both sides
/// (e.g. the dot in `3.14`). Positions at either edge of `chars` return false.
fn is_decimal_middle(chars: &[char], idx: usize) -> bool {
    if idx == 0 {
        return false;
    }
    matches!(
        (chars.get(idx - 1), chars.get(idx + 1)),
        (Some(before), Some(after)) if before.is_ascii_digit() && after.is_ascii_digit()
    )
}
111
112fn is_ticker_middle(chars: &[char], idx: usize) -> bool {
113    idx > 0
114        && idx + 1 < chars.len()
115        && is_ticker_char(chars[idx - 1])
116        && is_ticker_char(chars[idx + 1])
117}
118
/// True for characters allowed next to the dot of a ticker symbol:
/// ASCII uppercase letters and ASCII digits.
fn is_ticker_char(ch: char) -> bool {
    matches!(ch, 'A'..='Z' | '0'..='9')
}
122
123// ---------------------------------------------------------------------------
124// Filesystem helpers
125// ---------------------------------------------------------------------------
126
/// True if the path's extension is `txt`, compared ASCII case-insensitively.
///
/// Paths without an extension (including dot-files such as `.txt`, which
/// `Path::extension` reports as having none) return false.
pub fn is_text_file(path: &Path) -> bool {
    path.extension()
        .map_or(false, |ext| ext.eq_ignore_ascii_case("txt"))
}
134
135/// Best-effort file modified time.
136pub fn file_mtime(path: &Path) -> Option<DateTime<Utc>> {
137    let metadata = fs::metadata(path).ok()?;
138    let modified = metadata.modified().ok()?;
139    Some(system_time_to_utc(modified))
140}
141
142/// Best-effort (created_at, updated_at) pair for a file.
143pub fn file_times(path: &Path) -> (DateTime<Utc>, DateTime<Utc>) {
144    let metadata = fs::metadata(path).ok();
145    let updated_at = metadata
146        .as_ref()
147        .and_then(|meta| meta.modified().ok())
148        .map(system_time_to_utc)
149        .unwrap_or_else(Utc::now);
150    let created_at = metadata
151        .and_then(|meta| meta.created().ok())
152        .map(system_time_to_utc)
153        .unwrap_or(updated_at);
154    (created_at, updated_at)
155}
156
157fn system_time_to_utc(time: std::time::SystemTime) -> DateTime<Utc> {
158    DateTime::<Utc>::from(time)
159}
160
#[cfg(test)]
mod tests {
    use super::*;

    /// Mixed runs of platform newlines, spaces, and tabs each become one space.
    #[test]
    fn normalize_inline_whitespace_collapses_runs() {
        let newline = platform_newline();
        let raw = ["Alpha", newline, newline, "  Beta\tGamma"].concat();
        assert_eq!(normalize_inline_whitespace(&raw), "Alpha Beta Gamma");
    }

    /// Whitespace-only input yields nothing; unpunctuated text comes back whole.
    #[test]
    fn sentences_falls_back_to_full_text_when_needed() {
        let blank = format!("   {}", platform_newline());
        assert!(sentences(&blank).is_empty());

        let unpunctuated = "Single block without punctuation";
        assert_eq!(
            sentences(unpunctuated),
            vec![String::from("Single block without punctuation")]
        );
    }

    /// `make_section` carries role and heading through and precomputes sentences.
    #[test]
    fn make_section_populates_sentences() {
        let built = make_section(SectionRole::Context, Some("Summary"), "Line one. Line two!");
        assert_eq!(built.role, SectionRole::Context);
        assert_eq!(built.heading.as_deref(), Some("Summary"));
        assert_eq!(built.sentences.len(), 2);
    }

    /// A dot between two digits does not end the sentence.
    #[test]
    fn sentences_keep_decimal_values_together() {
        let split = sentences("Price closed at 3.14. Outlook improved.");
        assert_eq!(split, vec!["Price closed at 3.14.", "Outlook improved."]);
    }

    /// A dot inside an uppercase ticker symbol does not end the sentence.
    #[test]
    fn sentences_keep_dot_tickers_together() {
        let split = sentences("BRK.B rallied while RDS.A lagged.");
        assert_eq!(split, vec!["BRK.B rallied while RDS.A lagged."]);
    }

    /// The time helpers work for a real file and degrade gracefully for a missing one.
    #[test]
    fn file_time_helpers_handle_existing_and_missing_paths() {
        let scratch = tempfile::tempdir().unwrap();

        let present = scratch.path().join("exists.txt");
        std::fs::write(&present, "hello").unwrap();
        assert!(file_mtime(&present).is_some());
        let (created_at, updated_at) = file_times(&present);
        assert!(updated_at >= created_at);

        let absent = scratch.path().join("missing.txt");
        assert!(file_mtime(&absent).is_none());
        let (absent_created, absent_updated) = file_times(&absent);
        assert!(absent_updated >= absent_created);
    }

    /// A blank line separates sentences even without terminal punctuation.
    #[test]
    fn sentences_treat_blank_line_as_boundary() {
        let newline = platform_newline();
        let text = [
            "First line without punctuation",
            newline,
            newline,
            "Second line with more context.",
        ]
        .concat();
        let expected = vec![
            String::from("First line without punctuation"),
            String::from("Second line with more context."),
        ];
        assert_eq!(sentences(&text), expected);
    }

    /// Only the last dot of an ellipsis closes the sentence.
    #[test]
    fn sentences_keep_ellipsis_together() {
        let split = sentences("Wait... really? Yes.");
        assert_eq!(split, vec!["Wait...", "really?", "Yes."]);
    }

    /// `.txt` is recognised in any ASCII case; other or missing extensions are rejected.
    #[test]
    fn is_text_file_matches_txt_case_insensitively() {
        for accepted in ["hello.txt", "hello.TXT", "hello.Txt"].iter() {
            assert!(is_text_file(Path::new(accepted)));
        }
        for rejected in ["hello.md", "hello"].iter() {
            assert!(!is_text_file(Path::new(rejected)));
        }
    }
}
triplets_core/utils.rs

triplets_core/
utils.rs