Skip to main content

talon_core/text/
processing.rs

//! Text utilities for markdown parsing and chunking.
//!
//! Line splitting, fence/heading detection, token estimation, wikilink parsing,
//! and keyword/path normalization. Ported from the TypeScript Talon implementation.
5
6use regex::Regex;
7
8use super::nfd;
9
/// Number of characters (bytes) assumed per token for rough token estimation.
pub const TOKEN_CHAR_RATIO: u8 = 4;

/// Byte length of an LF (`\n`) line break.
const LF_LENGTH: usize = 1;

/// Byte length of a CRLF (`\r\n`) line break.
const CRLF_LENGTH: usize = 2;

/// Minimum length for outer quote stripping.
const MIN_QUOTED_LENGTH: usize = 2;

/// Heading pattern: `# ` through `###### `.
const HEADING_PATTERN: &str = r"(?u)^#{1,6}\s+(.*)$";

/// Fence pattern: triple backtick or triple tilde lines.
const FENCE_PATTERN: &str = r"(?u)^(`{3,}|~{3,})\s*.*$";

/// A line span within the original content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LineSpan {
    /// Number of bytes consumed by the line break: 1 for LF, 2 for CRLF, and
    /// 0 for a final line that has no trailing newline.
    pub break_length: usize,
    /// Byte offset where the line text ends (exclusive, before the line break).
    pub end: usize,
    /// 1-indexed line number.
    pub line_number: u32,
    /// Byte offset where the line starts (inclusive).
    pub start: usize,
    /// The line text (without the line break).
    pub text: String,
}

/// Splits markdown content into line spans.
///
/// Handles both LF and CRLF line endings: a `\r` immediately before the `\n`
/// is treated as part of the line break, so it is excluded from `text` and
/// counted in `break_length`. A final line with no trailing newline gets
/// `break_length = 0` and `end = content.len()`. Content that ends with a
/// newline does not produce an extra empty trailing line, and empty content
/// produces no lines at all.
///
/// # Examples
///
/// ```
/// use talon_core::text::{split_lines, LineSpan};
///
/// let lines = split_lines("line1\nline2\nline3");
/// assert_eq!(lines.len(), 3);
/// assert_eq!(lines[0].text, "line1");
/// assert_eq!(lines[0].line_number, 1);
/// assert_eq!(lines[2].text, "line3");
/// assert_eq!(lines[2].break_length, 0);
///
/// let crlf = split_lines("a\r\nb");
/// assert_eq!(crlf[0].text, "a");
/// assert_eq!(crlf[0].break_length, 2);
/// assert_eq!(crlf[1].start, 3);
/// ```
#[must_use]
pub fn split_lines(content: &str) -> Vec<LineSpan> {
    let bytes = content.as_bytes();
    let mut lines = Vec::new();
    let mut start = 0;
    let mut line_number: u32 = 1;

    while start < bytes.len() {
        let Some(offset) = bytes[start..].iter().position(|&b| b == b'\n') else {
            // No further newline: the remainder is the final, unterminated line.
            lines.push(LineSpan {
                break_length: 0,
                end: content.len(),
                line_number,
                start,
                text: content[start..].to_string(),
            });
            break;
        };

        let newline = start + offset;
        // A `\r` directly before the `\n` belongs to the line break, not the text.
        // `\r` is ASCII, so slicing at its offset is always a valid char boundary.
        let (end, break_length) = if newline > start && bytes[newline - 1] == b'\r' {
            (newline - 1, CRLF_LENGTH)
        } else {
            (newline, LF_LENGTH)
        };

        lines.push(LineSpan {
            break_length,
            end,
            line_number,
            start,
            text: content[start..end].to_string(),
        });
        start = end + break_length;
        line_number += 1;
    }

    lines
}
92
93/// Cached regex patterns for fence and heading detection.
94struct Patterns {
95    fence: Regex,
96    heading: Regex,
97}
98
99impl Patterns {
100    fn new() -> Self {
101        Self {
102            fence: Regex::new(FENCE_PATTERN).unwrap_or_else(|_| panic!("valid fence regex")),
103            heading: Regex::new(HEADING_PATTERN).unwrap_or_else(|_| panic!("valid heading regex")),
104        }
105    }
106}
107
108thread_local! {
109    static PATTERNS: Patterns = Patterns::new();
110}
111
112/// Checks if a line is a fenced code block (3+ backticks or tildes).
113///
114/// # Examples
115///
116/// ```
117/// use talon_core::text::is_fence_line;
118///
119/// assert!(is_fence_line("```ts"));
120/// assert!(is_fence_line("~~~"));
121/// assert!(!is_fence_line("# heading"));
122/// assert!(!is_fence_line("not a fence"));
123/// ```
124#[must_use]
125pub fn is_fence_line(line: &str) -> bool {
126    PATTERNS.with(|p| p.fence.is_match(line.trim()))
127}
128
129/// Checks if a line is an ATX heading (1-6 hash characters followed by space).
130///
131/// # Examples
132///
133/// ```
134/// use talon_core::text::is_heading_line;
135///
136/// assert!(is_heading_line("# Title"));
137/// assert!(is_heading_line("###### Deep"));
138/// assert!(!is_heading_line("####### Too deep"));
139/// assert!(!is_heading_line("Not a heading"));
140/// ```
141#[must_use]
142pub fn is_heading_line(line: &str) -> bool {
143    PATTERNS.with(|p| p.heading.is_match(line.trim()))
144}
145
/// Extracts the text of a heading line by dropping its `#` markers.
///
/// After trimming the line, up to six leading `#` characters are removed,
/// then the whitespace that follows them, then every trailing `#`, and
/// finally any whitespace left at either end.
///
/// # Examples
///
/// ```
/// use talon_core::text::strip_heading_text;
///
/// assert_eq!(strip_heading_text("# Hello World"), "Hello World");
/// assert_eq!(strip_heading_text("### Nested ##"), "Nested");
/// assert_eq!(strip_heading_text("###### Deep heading"), "Deep heading");
/// ```
#[must_use]
pub fn strip_heading_text(line: &str) -> String {
    let heading = line.trim();
    // `#` is a single byte, so the marker count doubles as a byte offset.
    let marker_len = heading
        .bytes()
        .take_while(|&byte| byte == b'#')
        .take(6)
        .count();

    heading[marker_len..]
        .trim_start()
        .trim_end_matches('#')
        .trim()
        .to_owned()
}
171
172/// Estimates the number of tokens in text using a character ratio.
173///
174/// Uses `max(1, ceil(text.len() / TOKEN_CHAR_RATIO))` where
175/// `TOKEN_CHAR_RATIO = 4`.
176///
177/// # Examples
178///
179/// ```
180/// use talon_core::text::estimate_tokens;
181///
182/// assert_eq!(estimate_tokens(""), 1);
183/// assert_eq!(estimate_tokens("hello"), 2);  // ceil(5/4) = 2
184/// assert_eq!(estimate_tokens("hello world"), 3);  // ceil(11/4) = 3
185/// ```
186#[must_use]
187pub fn estimate_tokens(text: &str) -> usize {
188    if text.is_empty() {
189        return 1;
190    }
191    let len = text.len();
192    len.div_ceil(TOKEN_CHAR_RATIO as usize).max(1)
193}
194
195/// Normalizes a keyword for comparison: NFD normalization + lowercase + trim.
196///
197/// Matches the TypeScript `normalizeTalonKeyword` behavior exactly.
198///
199/// # Examples
200///
201/// ```
202/// use talon_core::text::normalize_keyword;
203///
204/// assert_eq!(normalize_keyword("Hello World"), "hello world");
205/// assert_eq!(normalize_keyword("  Test  "), "test");
206/// assert_eq!(normalize_keyword("CAFÉ"), "cafe\u{0301}");
207/// ```
208#[must_use]
209pub fn normalize_keyword(value: &str) -> String {
210    nfd::normalize(value.trim()).to_lowercase()
211}
212
213/// Normalizes a vault path: backslashes to forward slashes, NFD normalization.
214///
215/// Matches the TypeScript `normalizeTalonVaultPath` behavior.
216///
217/// # Examples
218///
219/// ```
220/// use talon_core::text::normalize_vault_path;
221///
222/// assert_eq!(normalize_vault_path("notes\\hello.md"), "notes/hello.md");
223/// assert_eq!(normalize_vault_path("notes/hello.md"), "notes/hello.md");
224/// ```
225#[must_use]
226pub fn normalize_vault_path(value: &str) -> String {
227    nfd::normalize(&value.replace('\\', "/"))
228}
229
/// Parsed components of a wikilink.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedWikiLink {
    /// Display alias (if `[[target|alias]]`), trimmed; `None` when absent or blank.
    pub alias: Option<String>,
    /// Section heading anchor (if `[[target#heading]]`), trimmed; `None` when
    /// absent or blank.
    pub heading: Option<String>,
    /// Trimmed text before the first `|`, still including any `#heading` suffix.
    pub raw_target: String,
    /// The resolved target (without alias or heading), trimmed.
    pub target: String,
}

/// Parses the inner text of a wikilink into its components.
///
/// `raw` is the text between the brackets; the `[[` / `]]` delimiters are not
/// stripped here. Handles the `target`, `target|alias`, and `target#heading`
/// forms, which may be combined as `target#heading|alias`.
///
/// The first `|` separates the target from the alias, and the first `#`
/// before that `|` separates the target from the heading. Every component is
/// trimmed, and an alias or heading that is empty after trimming is reported
/// as `None`.
///
/// # Examples
///
/// ```
/// use talon_core::text::parse_wikilink;
///
/// let link = parse_wikilink("My Note");
/// assert_eq!(link.target, "My Note");
/// assert_eq!(link.alias, None);
/// assert_eq!(link.heading, None);
///
/// let link = parse_wikilink("Target|alias");
/// assert_eq!(link.target, "Target");
/// assert_eq!(link.alias, Some("alias".to_string()));
///
/// let link = parse_wikilink("Target#heading");
/// assert_eq!(link.target, "Target");
/// assert_eq!(link.heading, Some("heading".to_string()));
///
/// let link = parse_wikilink("Target|   ");
/// assert_eq!(link.alias, None);
/// ```
#[must_use]
pub fn parse_wikilink(raw: &str) -> ParsedWikiLink {
    // The alias is split off first so a `#` inside the alias is never treated
    // as a heading separator.
    let (target_part, alias_part) = raw.split_once('|').unwrap_or((raw, ""));
    let target_part = target_part.trim();
    let (target, heading_part) = target_part
        .split_once('#')
        .unwrap_or((target_part, ""));

    ParsedWikiLink {
        alias: trimmed_non_empty(alias_part),
        heading: trimmed_non_empty(heading_part),
        raw_target: target_part.to_string(),
        target: target.trim().to_string(),
    }
}

/// Returns the trimmed text as an owned string, or `None` if nothing remains.
fn trimmed_non_empty(part: &str) -> Option<String> {
    let part = part.trim();
    (!part.is_empty()).then(|| part.to_string())
}
300
301/// Strips outer matching quotes from a string.
302///
303/// Only strips if the string starts and ends with the same quote character
304/// (`"` or `'`) and has at least 2 characters after trimming.
305///
306/// # Examples
307///
308/// ```
309/// use talon_core::text::strip_outer_quotes;
310///
311/// assert_eq!(strip_outer_quotes("\"hello\""), "hello");
312/// assert_eq!(strip_outer_quotes("'hello'"), "hello");
313/// assert_eq!(strip_outer_quotes("hello"), "hello");
314/// assert_eq!(strip_outer_quotes("\""), "\"");
315/// ```
316#[must_use]
317pub fn strip_outer_quotes(value: &str) -> String {
318    let trimmed = value.trim();
319    if trimmed.len() < MIN_QUOTED_LENGTH {
320        return trimmed.to_string();
321    }
322    let first = trimmed.chars().next().unwrap_or('\0');
323    let last = trimmed.chars().last().unwrap_or('\0');
324    if (first == '"' || first == '\'') && first == last {
325        trimmed[1..trimmed.len() - 1].to_string()
326    } else {
327        trimmed.to_string()
328    }
329}
330
331#[cfg(test)]
332mod tests;