Skip to main content

oxios_markdown/
parser.rs

1//! Markdown text processing utilities.
2//!
3//! Ported from files.md (`server/txt/mod.rs`) by Artem Zakirullin.
4//! Provides string similarity, link extraction, and text normalization.
5
6use regex::Regex;
7
/// Convert every line ending in `s` to a single `\n`.
///
/// A `\r\n` pair becomes one `\n`; a carriage return that is not followed by
/// `\n` is replaced by `\n` as well. All other characters are copied as-is.
pub fn norm_new_lines(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(c) = chars.next() {
        if c == '\r' {
            // Swallow the `\n` of a CRLF pair so the pair yields one newline.
            if chars.peek() == Some(&'\n') {
                chars.next();
            }
            out.push('\n');
        } else {
            out.push(c);
        }
    }
    out
}
12
/// Return the first whitespace-delimited word of `s`.
///
/// Leading whitespace is skipped. If `s` contains no word at all (it is empty
/// or consists only of whitespace), `s` itself is returned unchanged.
pub fn first_word(s: &str) -> &str {
    match s.split_whitespace().next() {
        Some(word) => word,
        None => s,
    }
}
17
/// Calculate similarity between two strings (0.0 – 100.0) using Levenshtein distance.
///
/// The comparison is case-insensitive (both inputs are lowercased first) and
/// works on Unicode scalar values, so a multi-byte character counts as one
/// unit, exactly like an ASCII letter.
///
/// Returns `0.0` if either input is empty and `100.0` if the lowercased
/// inputs are equal. Otherwise the score is
/// `(max_chars - distance) / max_chars * 100`, where `max_chars` is the
/// character count of the longer lowercased input.
pub fn similar(a: &str, b: &str) -> f64 {
    if a.is_empty() || b.is_empty() {
        return 0.0;
    }
    let a_lower = a.to_lowercase();
    let b_lower = b.to_lowercase();
    if a_lower == b_lower {
        return 100.0;
    }
    // Both strings are non-empty here, so `max_len` is at least 1.
    let max_len = a_lower.chars().count().max(b_lower.chars().count());
    let distance = levenshtein(&a_lower, &b_lower);
    // The distance never exceeds the longer length when both are measured in
    // characters; `saturating_sub` merely guards the subtraction.
    (max_len.saturating_sub(distance) as f64 / max_len as f64) * 100.0
}

/// Compute Levenshtein edit distance between two strings.
///
/// The distance is the minimum number of single-character insertions,
/// deletions, and substitutions needed to turn `a` into `b`. Characters are
/// Unicode scalar values (`char`), not bytes. The comparison is
/// case-sensitive.
pub fn levenshtein(a: &str, b: &str) -> usize {
    let a_chars: Vec<char> = a.chars().collect();
    let b_chars: Vec<char> = b.chars().collect();
    if a_chars.is_empty() {
        return b_chars.len();
    }
    if b_chars.is_empty() {
        return a_chars.len();
    }

    // Only the previous and the current row of the DP table are ever read,
    // so two rows are enough instead of the full matrix.
    let mut prev: Vec<usize> = (0..=b_chars.len()).collect();
    let mut curr = vec![0usize; b_chars.len() + 1];

    for (i, &ca) in a_chars.iter().enumerate() {
        curr[0] = i + 1;
        for (j, &cb) in b_chars.iter().enumerate() {
            let cost = usize::from(ca != cb);
            curr[j + 1] = (prev[j + 1] + 1) // deletion
                .min(curr[j] + 1) // insertion
                .min(prev[j] + cost); // substitution / match
        }
        std::mem::swap(&mut prev, &mut curr);
    }
    // After the final swap the last computed row lives in `prev`.
    prev[b_chars.len()]
}
70
/// Truncate a string to `max_len`, appending "..." if truncated.
///
/// `max_len` is a budget in UTF-8 bytes. If `s` already fits it is returned
/// unchanged. Otherwise the first `max_len - 3` bytes are kept and `"..."` is
/// appended; when that cut would fall inside a multi-byte character it is
/// moved back to the previous character boundary, so the function never
/// panics on non-ASCII input and never exceeds the budget.
///
/// Note: for `max_len < 3` a truncated result is just `"..."`, which is
/// longer than `max_len`.
pub fn truncate(s: &str, max_len: usize) -> String {
    if s.len() <= max_len {
        return s.to_string();
    }
    // `cut < s.len()` here, and index 0 is always a boundary, so the loop
    // terminates without underflowing.
    let mut cut = max_len.saturating_sub(3);
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &s[..cut])
}
79
/// Uppercase the first character of `s`, leaving the rest untouched.
///
/// Uses Unicode case mapping, so the first character may expand to more than
/// one character. An empty input yields an empty string.
///
/// ```
/// use oxios_markdown::parser::ucfirst;
/// assert_eq!(ucfirst("hello"), "Hello");
/// assert_eq!(ucfirst(""), "");
/// assert_eq!(ucfirst("über"), "Über");
/// ```
pub fn ucfirst(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut rest = s.chars();
    if let Some(head) = rest.next() {
        out.extend(head.to_uppercase());
        // Whatever the iterator has not consumed is the unchanged tail.
        out.push_str(rest.as_str());
    }
    out
}
95
/// Lowercase the first character of `s`, leaving the rest untouched.
///
/// Uses Unicode case mapping. An empty input yields an empty string.
///
/// ```
/// use oxios_markdown::parser::lcfirst;
/// assert_eq!(lcfirst("Hello"), "hello");
/// assert_eq!(lcfirst(""), "");
/// ```
pub fn lcfirst(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut rest = s.chars();
    if let Some(head) = rest.next() {
        out.extend(head.to_lowercase());
        // Whatever the iterator has not consumed is the unchanged tail.
        out.push_str(rest.as_str());
    }
    out
}
110
/// Unicode-safe substring.
///
/// Returns up to `length` characters of `input`, starting at character index
/// `start`. If `start` is past the end the result is empty; if fewer than
/// `length` characters remain, everything up to the end is returned. Any
/// `length` is accepted, including `usize::MAX` for "the rest of the string".
///
/// Respects Unicode codepoints but is not grapheme-cluster aware
/// (combining characters like skin-tone modifiers count as separate codepoints).
///
/// ```
/// use oxios_markdown::parser::substr;
/// assert_eq!(substr("Hello", 0, 3), "Hel");
/// assert_eq!(substr("Hello", 3, 10), "lo");
/// assert_eq!(substr("Hello", 10, 2), "");
/// ```
pub fn substr(input: &str, start: usize, length: usize) -> String {
    // `skip`/`take` clamp to the available characters on their own, so no
    // `start + length` arithmetic (and no overflow) is involved.
    input.chars().skip(start).take(length).collect()
}
130
131/// Check if text has multiple lines.
132///
133/// ```
134/// use oxios_markdown::parser::is_multiline;
135/// assert!(is_multiline("line one\nline two"));
136/// assert!(!is_multiline("single line"));
137/// ```
138pub fn is_multiline(text: &str) -> bool {
139    let text = norm_new_lines(text);
140    text.lines().count() > 1
141}
142
/// Split text into chunks of at most `max_len` characters.
///
/// The input is trimmed first. While more than `max_len` characters remain,
/// the next `max_len` characters form a window and the text is cut at the
/// last newline inside that window, or, if there is none, at the last space,
/// or, if there is neither, right after the window (a hard split).
/// Each chunk is trimmed of leading/trailing whitespace and empty chunks are
/// dropped.
///
/// Lengths are counted in characters (`char`), not bytes. `max_len == 0`
/// disables splitting: the trimmed text is returned as a single chunk.
///
/// ```
/// use oxios_markdown::parser::split_text_into_chunks;
/// let chunks = split_text_into_chunks("Hello world how are you", 11);
/// assert!(chunks.len() > 1);
/// for chunk in &chunks {
///     assert!(chunk.len() <= 11);
/// }
/// ```
pub fn split_text_into_chunks(text: &str, max_len: usize) -> Vec<String> {
    let text = text.trim();

    if max_len == 0 {
        return vec![text.to_string()];
    }

    let runes: Vec<char> = text.chars().collect();
    let mut chunks = Vec::new();
    // Index of the first character that has not been emitted yet. Advancing a
    // cursor avoids re-allocating the whole remainder for every chunk.
    let mut start = 0;

    while runes.len() - start > max_len {
        let window = &runes[start..start + max_len];

        // Prefer the last newline, then the last space, else a hard split.
        let split = window
            .iter()
            .rposition(|&c| c == '\n')
            .or_else(|| window.iter().rposition(|&c| c == ' '))
            .unwrap_or(max_len);

        push_trimmed_chunk(&mut chunks, &runes[start..start + split]);
        start += split;

        // Drop the separator and any further leading whitespace. Because of
        // this (and the initial trim) a window never starts with whitespace,
        // so `split` is always > 0 and the loop makes progress.
        while start < runes.len() && runes[start].is_whitespace() {
            start += 1;
        }
    }

    // Whatever is left fits into one final chunk.
    push_trimmed_chunk(&mut chunks, &runes[start..]);

    chunks
}

/// Append `runes` to `chunks` as a trimmed string, skipping it if it is blank.
fn push_trimmed_chunk(chunks: &mut Vec<String>, runes: &[char]) {
    let piece: String = runes.iter().collect();
    let piece = piece.trim();
    if !piece.is_empty() {
        chunks.push(piece.to_string());
    }
}
210
/// Textual tag prefixes that `emoji_prefix` removes from the start of a
/// string before the emoji is put in front of it.
const EMOJI_STRIP_PREFIXES: &[&str] = &["WRK ", "UA ", "US ", "CY ", "HOB ", "SRB ", "PL "];

/// Put `emoji` in front of `s` after removing known textual prefixes.
///
/// The prefixes in `EMOJI_STRIP_PREFIXES` are tried once each, in list order;
/// every one is removed from the start of the string as often as it repeats
/// there. If `emoji` is empty, the stripped string is returned on its own;
/// otherwise the result is `"<emoji> <stripped>"`.
///
/// ```
/// use oxios_markdown::parser::emoji_prefix;
/// assert_eq!(emoji_prefix("📝", "WRK Task"), "📝 Task");
/// assert_eq!(emoji_prefix("", "Hello"), "Hello");
/// ```
pub fn emoji_prefix(emoji: &str, s: &str) -> String {
    // Stripping only narrows the borrowed slice; nothing is allocated until
    // the final string is built.
    let stripped = EMOJI_STRIP_PREFIXES
        .iter()
        .fold(s, |rest, prefix| rest.trim_start_matches(*prefix));

    if emoji.is_empty() {
        stripped.to_string()
    } else {
        format!("{emoji} {stripped}")
    }
}
233
234/// Check if text contains a markdown image.
235pub fn has_image(msg: &str) -> bool {
236    Regex::new(r"!\[.*?\]\(.*?\)").unwrap().is_match(msg)
237}
238
239/// Strip a leading `` `HH:MM` `` timestamp from chat entries.
240pub fn strip_chat_timestamp(s: &str) -> String {
241    Regex::new(r"^`\d{2}:\d{2}` ")
242        .unwrap()
243        .replace(s, "")
244        .to_string()
245}
246
247/// Extract all markdown links `[text](path)` from content.
248///
249/// Returns a list of (link_text, target_path) pairs.
250pub fn extract_markdown_links(content: &str) -> Vec<(String, String)> {
251    let re = Regex::new(r"\[([^\]]*)\]\(([^)]+)\)").unwrap();
252    re.captures_iter(content)
253        .filter_map(|cap| {
254            let text = cap.get(1)?.as_str().to_string();
255            let path = cap.get(2)?.as_str().to_string();
256            // Skip external links and images
257            if path.starts_with("http://") || path.starts_with("https://") {
258                return None;
259            }
260            Some((text, path))
261        })
262        .collect()
263}
264
265/// Extract all headings (`## Title`) from content.
266///
267/// Returns heading texts (without the `#` prefix).
268pub fn extract_headings(content: &str) -> Vec<String> {
269    let re = Regex::new(r"(?m)^(#{1,6})\s+(.+)$").unwrap();
270    re.captures_iter(content)
271        .filter_map(|cap| cap.get(2).map(|m| m.as_str().trim().to_string()))
272        .collect()
273}
274
/// Minimum similarity score (0-100) for fuzzy name search.
///
/// NOTE(review): presumably compared against the result of `similar`, which
/// returns an `f64` on the same 0–100 scale while this constant is an `i32` —
/// confirm how callers convert between the two.
pub const MIN_SEARCH_SIMILARITY: i32 = 70;
277
278/// 오늘 날짜의 Chat.md 헤더 문자열 (예: "#### 20 May, Tuesday").
279pub fn today_chat_header() -> String {
280    use chrono::Local;
281    let now = Local::now();
282    format!("#### {} {}", now.format("%d %B,"), now.format("%A"))
283}
284
285/// 오늘 날짜의 저널 파일 경로 (예: "journal/2026.05 May.md").
286pub fn today_journal_path() -> String {
287    use chrono::Local;
288    let now = Local::now();
289    format!("journal/{}.{}.md", now.format("%Y.%m"), now.format("%B"))
290}
291
/// Unit tests for the text utilities in this module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_norm_newlines() {
        assert_eq!(norm_new_lines("a\r\nb\r\nc"), "a\nb\nc");
        assert_eq!(norm_new_lines("a\rb\rc"), "a\nb\nc");
    }

    #[test]
    fn test_first_word() {
        assert_eq!(first_word("hello world"), "hello");
        assert_eq!(first_word("  leading space"), "leading");
        // No word at all: the input comes back unchanged.
        assert_eq!(first_word(""), "");
    }

    #[test]
    fn test_similar() {
        assert!(similar("hello", "helo") > 70.0);
        assert!(similar("test", "test") > 99.0);
        assert_eq!(similar("", ""), 0.0);
    }

    #[test]
    fn test_levenshtein() {
        assert_eq!(levenshtein("kitten", "sitting"), 3);
        assert_eq!(levenshtein("test", "test"), 0);
    }

    #[test]
    fn test_truncate() {
        assert_eq!(truncate("hello", 10), "hello");
        assert_eq!(truncate("hello world", 8), "hello...");
    }

    #[test]
    fn test_extract_links() {
        let md =
            "See [Rust](brain/Rust.md) and [Go](brain/Go.md) but not [ext](https://example.com)";
        let links = extract_markdown_links(md);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].0, "Rust");
        assert_eq!(links[0].1, "brain/Rust.md");
    }

    #[test]
    fn test_extract_headings() {
        let md = "# Title\n## Section\n### Sub\nsome text";
        let headings = extract_headings(md);
        assert_eq!(headings, vec!["Title", "Section", "Sub"]);
    }

    #[test]
    fn test_ucfirst() {
        assert_eq!(ucfirst("hello"), "Hello");
        assert_eq!(ucfirst(""), "");
        assert_eq!(ucfirst("Already"), "Already");
        assert_eq!(ucfirst("über"), "Über");
    }

    #[test]
    fn test_lcfirst() {
        assert_eq!(lcfirst("Hello"), "hello");
        assert_eq!(lcfirst(""), "");
        assert_eq!(lcfirst("lower"), "lower");
    }

    #[test]
    fn test_substr() {
        assert_eq!(substr("Hello", 0, 3), "Hel");
        assert_eq!(substr("Hello", 2, 3), "llo");
        assert_eq!(substr("Hello", 3, 10), "lo");
        assert_eq!(substr("Hello", 10, 2), "");
        assert_eq!(substr("", 0, 5), "");
        // Unicode
        assert_eq!(substr("안녕하세요", 0, 2), "안녕");
    }

    #[test]
    fn test_is_multiline() {
        assert!(is_multiline("line one\nline two"));
        assert!(!is_multiline("single line"));
        assert!(is_multiline("a\r\nb"));
        assert!(!is_multiline(""));
    }

    #[test]
    fn test_split_text_into_chunks() {
        // Exact fit
        let chunks = split_text_into_chunks("Hello", 5);
        assert_eq!(chunks, vec!["Hello"]);

        // Split at space (Go test: basic split with spaces)
        let chunks = split_text_into_chunks("This is a test to check the splitting of text", 10);
        for chunk in &chunks {
            assert!(
                chunk.len() <= 10,
                "chunk too long: '{}' ({})",
                chunk,
                chunk.len()
            );
        }

        // Split at newline (Go test: max_len=15)
        let chunks = split_text_into_chunks("Line one\nLine two\nLine three", 15);
        assert_eq!(chunks, vec!["Line one", "Line two", "Line three"]);

        // max_len == 0 returns everything as one chunk
        let chunks = split_text_into_chunks("Hello world", 0);
        assert_eq!(chunks, vec!["Hello world"]);
    }

    #[test]
    fn test_emoji_prefix() {
        assert_eq!(emoji_prefix("📝", "WRK Task"), "📝 Task");
        assert_eq!(emoji_prefix("✅", "Task"), "✅ Task");
        assert_eq!(emoji_prefix("", "Hello"), "Hello");
        assert_eq!(emoji_prefix("🎉", "UA Celebration"), "🎉 Celebration");
    }

    #[test]
    fn test_has_image() {
        assert!(has_image("look: ![alt](img.png)"));
        assert!(!has_image("just text"));
    }

    #[test]
    fn test_strip_chat_timestamp() {
        assert_eq!(strip_chat_timestamp("`12:34` hello"), "hello");
        // Only a timestamp at the very start is removed.
        assert_eq!(strip_chat_timestamp("hello `12:34` "), "hello `12:34` ");
        assert_eq!(strip_chat_timestamp("no timestamp"), "no timestamp");
    }

    #[test]
    fn test_today_chat_header() {
        // The date part depends on the clock; only the fixed prefix is pinned.
        assert!(today_chat_header().starts_with("#### "));
    }

    #[test]
    fn test_today_journal_path() {
        // The date part depends on the clock; only the fixed parts are pinned.
        let path = today_journal_path();
        assert!(path.starts_with("journal/"));
        assert!(path.ends_with(".md"));
    }
}