talon_core/text/processing.rs
1//! Text utilities for markdown parsing and chunking.
2//!
3//! Line splitting, fence/heading detection, token estimation, wikilink parsing,
4//! and keyword/path normalization. Ported from the TypeScript Talon implementation.
5
6use regex::Regex;
7
8use super::nfd;
9
/// Approximate number of UTF-8 bytes per token, used as the divisor for rough
/// token estimation.
pub const TOKEN_CHAR_RATIO: u8 = 4;

/// Length in bytes of a line feed (`\n`) line break.
const LF_LENGTH: usize = 1;

/// Minimum byte length a trimmed string must have before outer quotes are
/// considered for stripping (one opening plus one closing quote).
const MIN_QUOTED_LENGTH: usize = 2;

/// Heading pattern: one to six `#` characters at the start of the text,
/// followed by at least one whitespace character and then the heading text.
const HEADING_PATTERN: &str = r"(?u)^#{1,6}\s+(.*)$";

/// Fence pattern: a run of three or more backticks, or three or more tildes,
/// at the start of the text, optionally followed by anything (e.g. an info string).
const FENCE_PATTERN: &str = r"(?u)^(`{3,}|~{3,})\s*.*$";
24
/// A single line located within the original content.
///
/// `start` and `end` are byte offsets into the string the span was produced
/// from, and `end + break_length` is the offset at which the next line begins.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LineSpan {
    /// Number of bytes consumed by the line break that terminates this line
    /// (0 when the content ends without a trailing line break).
    pub break_length: usize,
    /// Byte offset where the line text ends (exclusive).
    pub end: usize,
    /// 1-indexed line number.
    pub line_number: u32,
    /// Byte offset where the line text starts (inclusive).
    pub start: usize,
    /// Owned copy of the line text, i.e. the content between `start` and `end`.
    pub text: String,
}
39
40/// Splits markdown content into line spans.
41///
42/// Handles both LF and CRLF line endings. The last line (no trailing newline)
43/// gets `break_length = 0` and `end = content.len()`.
44///
45/// # Examples
46///
47/// ```
48/// use talon_core::text::{split_lines, LineSpan};
49///
50/// let lines = split_lines("line1\nline2\nline3");
51/// assert_eq!(lines.len(), 3);
52/// assert_eq!(lines[0].text, "line1");
53/// assert_eq!(lines[0].line_number, 1);
54/// assert_eq!(lines[2].text, "line3");
55/// assert_eq!(lines[2].break_length, 0);
56/// ```
57#[must_use]
58pub fn split_lines(content: &str) -> Vec<LineSpan> {
59 let mut lines = Vec::new();
60 let mut start = 0;
61 let mut line_number: u32 = 1;
62
63 let bytes = content.as_bytes();
64 while start < bytes.len() {
65 let end_of_line = bytes[start..].iter().position(|&b| b == b'\n');
66
67 if let Some(offset) = end_of_line {
68 let end = start + offset;
69 lines.push(LineSpan {
70 break_length: LF_LENGTH,
71 end,
72 line_number,
73 start,
74 text: content[start..end].to_string(),
75 });
76 start = end + LF_LENGTH;
77 line_number += 1;
78 } else {
79 lines.push(LineSpan {
80 break_length: 0,
81 end: content.len(),
82 line_number,
83 start,
84 text: content[start..].to_string(),
85 });
86 break;
87 }
88 }
89
90 lines
91}
92
93/// Cached regex patterns for fence and heading detection.
94struct Patterns {
95 fence: Regex,
96 heading: Regex,
97}
98
99impl Patterns {
100 fn new() -> Self {
101 Self {
102 fence: Regex::new(FENCE_PATTERN).unwrap_or_else(|_| panic!("valid fence regex")),
103 heading: Regex::new(HEADING_PATTERN).unwrap_or_else(|_| panic!("valid heading regex")),
104 }
105 }
106}
107
108thread_local! {
109 static PATTERNS: Patterns = Patterns::new();
110}
111
112/// Checks if a line is a fenced code block (3+ backticks or tildes).
113///
114/// # Examples
115///
116/// ```
117/// use talon_core::text::is_fence_line;
118///
119/// assert!(is_fence_line("```ts"));
120/// assert!(is_fence_line("~~~"));
121/// assert!(!is_fence_line("# heading"));
122/// assert!(!is_fence_line("not a fence"));
123/// ```
124#[must_use]
125pub fn is_fence_line(line: &str) -> bool {
126 PATTERNS.with(|p| p.fence.is_match(line.trim()))
127}
128
129/// Checks if a line is an ATX heading (1-6 hash characters followed by space).
130///
131/// # Examples
132///
133/// ```
134/// use talon_core::text::is_heading_line;
135///
136/// assert!(is_heading_line("# Title"));
137/// assert!(is_heading_line("###### Deep"));
138/// assert!(!is_heading_line("####### Too deep"));
139/// assert!(!is_heading_line("Not a heading"));
140/// ```
141#[must_use]
142pub fn is_heading_line(line: &str) -> bool {
143 PATTERNS.with(|p| p.heading.is_match(line.trim()))
144}
145
/// Extracts the text of a heading line by removing its markers.
///
/// After trimming the line, up to six leading `#` characters are dropped,
/// then every trailing `#` character, and the result is trimmed again.
///
/// Trailing `#` characters are removed even when they directly follow the
/// text, so `"# C#"` yields `"C"`. A seventh leading `#` is not part of the
/// marker and stays in the output.
///
/// # Examples
///
/// ```
/// use talon_core::text::strip_heading_text;
///
/// assert_eq!(strip_heading_text("# Hello World"), "Hello World");
/// assert_eq!(strip_heading_text("### Nested ##"), "Nested");
/// assert_eq!(strip_heading_text("###### Deep heading"), "Deep heading");
/// ```
#[must_use]
pub fn strip_heading_text(line: &str) -> String {
    // Deepest ATX heading level; further `#` characters are not marker text.
    const MAX_LEVEL: usize = 6;

    let trimmed = line.trim();
    // `#` is a single ASCII byte, so the marker length is also a valid byte index.
    let marker_len = trimmed
        .bytes()
        .take(MAX_LEVEL)
        .take_while(|&byte| byte == b'#')
        .count();

    trimmed[marker_len..]
        .trim_start()
        .trim_end_matches('#')
        .trim()
        .to_string()
}
171
172/// Estimates the number of tokens in text using a character ratio.
173///
174/// Uses `max(1, ceil(text.len() / TOKEN_CHAR_RATIO))` where
175/// `TOKEN_CHAR_RATIO = 4`.
176///
177/// # Examples
178///
179/// ```
180/// use talon_core::text::estimate_tokens;
181///
182/// assert_eq!(estimate_tokens(""), 1);
183/// assert_eq!(estimate_tokens("hello"), 2); // ceil(5/4) = 2
184/// assert_eq!(estimate_tokens("hello world"), 3); // ceil(11/4) = 3
185/// ```
186#[must_use]
187pub fn estimate_tokens(text: &str) -> usize {
188 if text.is_empty() {
189 return 1;
190 }
191 let len = text.len();
192 len.div_ceil(TOKEN_CHAR_RATIO as usize).max(1)
193}
194
195/// Normalizes a keyword for comparison: NFD normalization + lowercase + trim.
196///
197/// Matches the TypeScript `normalizeTalonKeyword` behavior exactly.
198///
199/// # Examples
200///
201/// ```
202/// use talon_core::text::normalize_keyword;
203///
204/// assert_eq!(normalize_keyword("Hello World"), "hello world");
205/// assert_eq!(normalize_keyword(" Test "), "test");
206/// assert_eq!(normalize_keyword("CAFÉ"), "cafe\u{0301}");
207/// ```
208#[must_use]
209pub fn normalize_keyword(value: &str) -> String {
210 nfd::normalize(value.trim()).to_lowercase()
211}
212
213/// Normalizes a vault path: backslashes to forward slashes, NFD normalization.
214///
215/// Matches the TypeScript `normalizeTalonVaultPath` behavior.
216///
217/// # Examples
218///
219/// ```
220/// use talon_core::text::normalize_vault_path;
221///
222/// assert_eq!(normalize_vault_path("notes\\hello.md"), "notes/hello.md");
223/// assert_eq!(normalize_vault_path("notes/hello.md"), "notes/hello.md");
224/// ```
225#[must_use]
226pub fn normalize_vault_path(value: &str) -> String {
227 nfd::normalize(&value.replace('\\', "/"))
228}
229
/// Parsed components of a wikilink.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedWikiLink {
    /// Display alias: the trimmed text after the first `|`, if one was given
    /// (`[[target|alias]]`).
    pub alias: Option<String>,
    /// Section heading anchor: the trimmed text after the first `#` of the
    /// target part (`[[target#heading]]`); `None` when absent or empty.
    pub heading: Option<String>,
    /// Trimmed text before the first `|`, still including any `#heading` suffix.
    pub raw_target: String,
    /// The link target with the alias and heading removed.
    pub target: String,
}
242
243/// Parses a raw wikilink string into components.
244///
245/// Handles `[[target]]`, `[[target|alias]]`, and `[[target#heading]]`.
246///
247/// # Examples
248///
249/// ```
250/// use talon_core::text::parse_wikilink;
251///
252/// let link = parse_wikilink("My Note");
253/// assert_eq!(link.target, "My Note");
254/// assert_eq!(link.alias, None);
255/// assert_eq!(link.heading, None);
256///
257/// let link = parse_wikilink("Target|alias");
258/// assert_eq!(link.target, "Target");
259/// assert_eq!(link.alias, Some("alias".to_string()));
260///
261/// let link = parse_wikilink("Target#heading");
262/// assert_eq!(link.target, "Target");
263/// assert_eq!(link.heading, Some("heading".to_string()));
264/// ```
265#[must_use]
266pub fn parse_wikilink(raw: &str) -> ParsedWikiLink {
267 // Split on | first to separate target from alias
268 let (target_part, alias_part) = raw
269 .find('|')
270 .map_or((raw, ""), |i| (&raw[..i], &raw[i + 1..]));
271 // Split target on # to separate target from heading
272 let (target, heading) = target_part.find('#').map_or_else(
273 || (target_part.trim(), None),
274 |i| {
275 let t = target_part[..i].trim();
276 let h = target_part[i + 1..].trim();
277 (
278 t,
279 if h.is_empty() {
280 None
281 } else {
282 Some(h.to_string())
283 },
284 )
285 },
286 );
287 let alias = if alias_part.is_empty() {
288 None
289 } else {
290 Some(alias_part.trim().to_string())
291 };
292
293 ParsedWikiLink {
294 alias,
295 heading,
296 raw_target: target_part.trim().to_string(),
297 target: target.to_string(),
298 }
299}
300
301/// Strips outer matching quotes from a string.
302///
303/// Only strips if the string starts and ends with the same quote character
304/// (`"` or `'`) and has at least 2 characters after trimming.
305///
306/// # Examples
307///
308/// ```
309/// use talon_core::text::strip_outer_quotes;
310///
311/// assert_eq!(strip_outer_quotes("\"hello\""), "hello");
312/// assert_eq!(strip_outer_quotes("'hello'"), "hello");
313/// assert_eq!(strip_outer_quotes("hello"), "hello");
314/// assert_eq!(strip_outer_quotes("\""), "\"");
315/// ```
316#[must_use]
317pub fn strip_outer_quotes(value: &str) -> String {
318 let trimmed = value.trim();
319 if trimmed.len() < MIN_QUOTED_LENGTH {
320 return trimmed.to_string();
321 }
322 let first = trimmed.chars().next().unwrap_or('\0');
323 let last = trimmed.chars().last().unwrap_or('\0');
324 if (first == '"' || first == '\'') && first == last {
325 trimmed[1..trimmed.len() - 1].to_string()
326 } else {
327 trimmed.to_string()
328 }
329}
330
331#[cfg(test)]
332mod tests;