mathypad_core/expression/
parser.rs

1//! Expression parsing and tokenization functions
2
3use super::chumsky_parser::parse_expression_chumsky;
4use super::tokens::Token;
5use crate::units::parse_unit;
6
/// Parse a line reference such as `"line1"` or `"LINE12"`.
///
/// The `line` prefix is matched ASCII case-insensitively and must be followed
/// by one or more ASCII digits and nothing else.
///
/// Returns the zero-based line index (`"line1"` -> `Some(0)`), or `None` when
/// `text` is not a line reference, the number is `0`, or it does not fit in a
/// `usize`.
pub fn parse_line_reference(text: &str) -> Option<usize> {
    const PREFIX: &str = "line";

    // `get` returns `None` for inputs that are too short or that would be
    // split in the middle of a multi-byte character.
    let prefix = text.get(..PREFIX.len())?;
    if !prefix.eq_ignore_ascii_case(PREFIX) {
        return None;
    }

    // `str::parse::<usize>` tolerates a leading '+', which would let
    // "line+1" through; insist on plain digits before parsing.
    let digits = &text[PREFIX.len()..];
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }

    // References are 1-based for the user; `checked_sub` rejects "line0".
    digits.parse::<usize>().ok()?.checked_sub(1)
}
19
/// Extract every `lineN` reference from `text`.
///
/// A reference is the word `line` (ASCII case-insensitive) immediately
/// followed by one or more ASCII digits, where the whole token is neither
/// preceded nor followed by an ASCII alphanumeric character. `line0` and
/// numbers that do not fit in a `usize` are skipped.
///
/// Returns `(start, end, line_index)` tuples in order of appearance. `start`
/// and `end` are byte offsets into `text` (end exclusive) and `line_index` is
/// zero-based (`line1` -> `0`).
pub fn extract_line_references(text: &str) -> Vec<(usize, usize, usize)> {
    const KEYWORD: &[u8] = b"line";

    // Work on the bytes of the original text: every byte that takes part in a
    // match is ASCII, so all reported offsets are valid char boundaries of
    // `text` even when the surrounding text is not ASCII.
    let bytes = text.as_bytes();
    let mut references = Vec::new();
    let mut pos = 0;

    while pos + KEYWORD.len() <= bytes.len() {
        if !bytes[pos..pos + KEYWORD.len()].eq_ignore_ascii_case(KEYWORD) {
            pos += 1;
            continue;
        }

        let digits_start = pos + KEYWORD.len();
        let digit_count = bytes[digits_start..]
            .iter()
            .take_while(|b| b.is_ascii_digit())
            .count();
        let end = digits_start + digit_count;

        let is_word_start = pos == 0 || !bytes[pos - 1].is_ascii_alphanumeric();
        let is_word_end = bytes
            .get(end)
            .map_or(true, |b| !b.is_ascii_alphanumeric());

        if is_word_start && digit_count > 0 && is_word_end {
            if let Ok(line_num) = text[digits_start..end].parse::<usize>() {
                if line_num > 0 {
                    references.push((pos, end, line_num - 1)); // Convert to 0-based
                }
            }
        }

        // Continue right after the keyword; "line" cannot overlap itself.
        pos = digits_start;
    }

    references
}
79
80/// Update line references in text by applying an offset to references >= threshold
81/// If offset is positive: increment references >= threshold
82/// If offset is negative: decrement references > threshold, mark deleted line refs as invalid
83pub fn update_line_references_in_text(text: &str, threshold: usize, offset: i32) -> String {
84    let references = extract_line_references(text);
85
86    if references.is_empty() {
87        return text.to_string();
88    }
89
90    let mut result = text.to_string();
91
92    // Process references in reverse order to maintain correct string positions
93    for (start_pos, end_pos, line_num) in references.into_iter().rev() {
94        if offset > 0 {
95            // Line insertion: increment ALL references >= insertion point
96            // This ensures that when a line is inserted, all subsequent references shift down
97            if line_num >= threshold {
98                let new_ref = format!("line{}", line_num + 2); // +1 for insertion shift, +1 for 1-based display
99                result.replace_range(start_pos..end_pos, &new_ref);
100            }
101        } else {
102            // Line deletion: handle references to deleted line and after
103            if line_num == threshold {
104                // Reference to the deleted line - mark as invalid
105                result.replace_range(start_pos..end_pos, "INVALID_REF");
106            } else if line_num > threshold {
107                // Reference after deleted line - decrement by 1
108                let new_ref = format!("line{}", line_num); // line_num already represents the new 1-based line number after shift
109                result.replace_range(start_pos..end_pos, &new_ref);
110            }
111            // References before deleted line stay unchanged
112        }
113    }
114
115    result
116}
117
118/// Tokenize any text into tokens - always succeeds, may include non-mathematical tokens
119pub fn tokenize_with_units(expr: &str) -> Option<Vec<Token>> {
120    // Use the chumsky parser - now accepts any input
121    match parse_expression_chumsky(expr) {
122        Ok(tokens) if tokens.is_empty() => None, // Only fail on truly empty input
123        Ok(tokens) => Some(tokens),              // Accept any non-empty token sequence
124        Err(_) => None,                          // Only fail on parse errors
125    }
126}
127
128/// Check if a sequence of tokens forms a valid mathematical expression
129pub fn is_valid_mathematical_expression(tokens: &[Token]) -> bool {
130    if tokens.is_empty() {
131        return false;
132    }
133
134    // Count different token types
135    let mut has_number_or_value = false;
136    let mut consecutive_operators = 0;
137    let mut consecutive_values = 0;
138
139    for (i, token) in tokens.iter().enumerate() {
140        match token {
141            Token::Number(_)
142            | Token::NumberWithUnit(_, _)
143            | Token::LineReference(_)
144            | Token::Variable(_) => {
145                has_number_or_value = true;
146                consecutive_values += 1;
147                consecutive_operators = 0;
148
149                // More than 1 consecutive value without operators is invalid (except for assignments and conversions)
150                if consecutive_values > 1 {
151                    // Allow if this is part of an assignment (Variable = Expression)
152                    if i >= 2
153                        && matches!(tokens[i - 1], Token::Assign)
154                        && matches!(tokens[i - 2], Token::Variable(_))
155                    {
156                        consecutive_values = 1; // Reset count after assignment
157                    } else {
158                        return false;
159                    }
160                }
161            }
162            Token::Plus | Token::Minus | Token::Multiply | Token::Divide | Token::Power => {
163                consecutive_operators += 1;
164                consecutive_values = 0;
165
166                // More than 1 consecutive operator is invalid (except minus for negation)
167                if consecutive_operators > 1 && !matches!(token, Token::Minus) {
168                    return false;
169                }
170            }
171            Token::LeftParen | Token::RightParen => {
172                consecutive_operators = 0;
173                consecutive_values = 0;
174            }
175            Token::To | Token::In | Token::Of => {
176                // These are OK for conversions and percentage operations
177                consecutive_operators = 0;
178                consecutive_values = 0;
179            }
180            Token::Assign => {
181                // Assignment is only valid after a variable
182                if i == 0 || !matches!(tokens[i - 1], Token::Variable(_)) {
183                    return false;
184                }
185                consecutive_operators = 0;
186                consecutive_values = 0;
187            }
188            Token::Function(_) => {
189                // Functions act like operators in terms of resetting counters
190                consecutive_operators = 0;
191                consecutive_values = 0;
192            }
193        }
194    }
195
196    // Must have at least one number/value to be a mathematical expression
197    has_number_or_value
198}
199
200/// Check if a string represents a valid mathematical expression
201pub fn is_valid_math_expression(expr: &str) -> bool {
202    let expr = expr.trim();
203    if expr.is_empty() {
204        return false;
205    }
206
207    let mut has_number = false;
208    // let mut has_operator = false;
209    let mut paren_count = 0;
210    let mut prev_was_operator = true; // Start as true to allow leading numbers
211
212    let chars: Vec<char> = expr.chars().collect();
213    let mut i = 0;
214
215    while i < chars.len() {
216        let ch = chars[i];
217        match ch {
218            ' ' => {
219                i += 1;
220                continue;
221            }
222            '0'..='9' => {
223                has_number = true;
224                prev_was_operator = false;
225                // Skip through the whole number (including commas and decimals)
226                while i < chars.len()
227                    && (chars[i].is_ascii_digit() || chars[i] == '.' || chars[i] == ',')
228                {
229                    i += 1;
230                }
231
232                // Skip whitespace
233                while i < chars.len() && chars[i] == ' ' {
234                    i += 1;
235                }
236
237                // Check for unit
238                if i < chars.len() && chars[i].is_ascii_alphabetic() {
239                    let unit_start = i;
240                    while i < chars.len() && (chars[i].is_ascii_alphabetic() || chars[i] == '/') {
241                        i += 1;
242                    }
243
244                    let unit_str: String = chars[unit_start..i].iter().collect();
245                    if parse_unit(&unit_str).is_none()
246                        && unit_str.to_lowercase() != "to"
247                        && unit_str.to_lowercase() != "in"
248                        && parse_line_reference(&unit_str).is_none()
249                    {
250                        // Not a recognized unit or line reference, rewind
251                        i = unit_start;
252                    }
253                }
254                continue;
255            }
256            '.' => {
257                if prev_was_operator {
258                    return false; // Can't start with decimal point
259                }
260                i += 1;
261            }
262            '+' | '-' | '*' | '/' => {
263                if prev_was_operator && ch != '-' {
264                    return false; // Two operators in a row (except minus for negation)
265                }
266                prev_was_operator = true;
267                i += 1;
268            }
269            '(' => {
270                paren_count += 1;
271                prev_was_operator = true;
272                i += 1;
273            }
274            ')' => {
275                paren_count -= 1;
276                if paren_count < 0 {
277                    return false;
278                }
279                prev_was_operator = false;
280                i += 1;
281            }
282            _ => {
283                if ch.is_ascii_alphabetic() {
284                    let unit_start = i;
285                    // For potential line references, also include digits
286                    while i < chars.len()
287                        && (chars[i].is_ascii_alphabetic()
288                            || chars[i].is_ascii_digit()
289                            || chars[i] == '/')
290                    {
291                        i += 1;
292                    }
293
294                    let word: String = chars[unit_start..i].iter().collect();
295                    if word.to_lowercase() == "to" || word.to_lowercase() == "in" {
296                        prev_was_operator = true;
297                    } else if parse_line_reference(&word).is_some() {
298                        // Valid line reference, acts like a number
299                        has_number = true;
300                        prev_was_operator = false;
301                    } else if parse_unit(&word).is_some() {
302                        // Valid unit, continue
303                        prev_was_operator = false;
304                    } else {
305                        // Unknown word - treat as the end of the expression
306                        // Check if what we have so far is valid
307                        break;
308                    }
309                } else {
310                    // If we encounter any other character, check if what we have so far is valid
311                    break;
312                }
313            }
314        }
315    }
316
317    // Must have balanced parentheses, at least one number, and if it has operators, must end properly
318    paren_count == 0 && has_number && !prev_was_operator
319}
320
#[cfg(test)]
mod parser_tests {
    use super::*;

    /// Asserts that `is_valid_math_expression` accepts every expression in `cases`.
    fn assert_all_valid(cases: &[&str]) {
        for expr in cases {
            assert!(
                is_valid_math_expression(expr),
                "expected {:?} to be accepted",
                expr
            );
        }
    }

    /// Asserts that `is_valid_math_expression` rejects every expression in `cases`.
    fn assert_all_invalid(cases: &[&str]) {
        for expr in cases {
            assert!(
                !is_valid_math_expression(expr),
                "expected {:?} to be rejected",
                expr
            );
        }
    }

    /// Checks `update_line_references_in_text` against
    /// `(text, threshold, offset, expected)` rows.
    fn assert_updates(cases: &[(&str, usize, i32, &str)]) {
        for &(text, threshold, offset, expected) in cases {
            assert_eq!(
                update_line_references_in_text(text, threshold, offset),
                expected,
                "text: {:?}, threshold: {}, offset: {}",
                text,
                threshold,
                offset
            );
        }
    }

    #[test]
    fn test_parse_line_reference() {
        let cases: &[(&str, Option<usize>)] = &[
            // Valid references map to zero-based indices
            ("line1", Some(0)),
            ("line2", Some(1)),
            ("line10", Some(9)),
            ("line999", Some(998)),
            // Matching is case insensitive
            ("LINE1", Some(0)),
            ("Line2", Some(1)),
            ("LiNe3", Some(2)),
            // Invalid references
            ("line0", None),    // 0 is invalid
            ("line", None),     // No number
            ("line-1", None),   // Negative
            ("linea", None),    // Not a number
            ("notline1", None), // Wrong prefix
            ("", None),         // Empty
            ("1line", None),    // Wrong order
        ];

        for &(input, expected) in cases {
            assert_eq!(parse_line_reference(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_tokenize_with_units_basic() {
        // A bare number
        let number = tokenize_with_units("42").unwrap();
        assert_eq!(number.len(), 1);
        assert!(matches!(number.as_slice(), [Token::Number(42.0)]));

        // A number with a unit collapses into a single token
        let with_unit = tokenize_with_units("5 GiB").unwrap();
        assert_eq!(with_unit.len(), 1);
        assert!(matches!(
            with_unit.as_slice(),
            [Token::NumberWithUnit(5.0, _)]
        ));

        // Simple arithmetic
        let sum = tokenize_with_units("2 + 3").unwrap();
        assert_eq!(sum.len(), 3);
        assert!(matches!(
            sum.as_slice(),
            [Token::Number(2.0), Token::Plus, Token::Number(3.0)]
        ));
    }

    #[test]
    fn test_tokenize_with_units_invalid() {
        // The tokenizer accepts free-form text
        assert!(tokenize_with_units("invalid text").is_some());

        // An unbalanced closing parenthesis is still rejected
        assert!(tokenize_with_units("1 + 2)").is_none());
        // Parses as [Number, Variable]
        assert!(tokenize_with_units("1 invalidunit").is_some());

        // The chumsky parser yields Ok([]) for an empty string, which
        // tokenize_with_units reports as None
        assert!(tokenize_with_units("").is_none());
    }

    #[test]
    fn test_is_valid_math_expression() {
        // Valid expressions
        assert_all_valid(&[
            "42",
            "2 + 3",
            "(1 + 2) * 3",
            "5 GiB + 10 MiB",
            "line1 * 2",
            "1 TiB to GiB",
            "24 MiB * 32 in KiB",
        ]);

        // Invalid expressions
        assert_all_invalid(&[
            "",
            "invalid text",
            "1 +",
            "+ 2",
            "1 + + 2",
            "(1 + 2",
            "1 + 2)",
        ]);

        // Note: "1 invalidunit" is considered valid by is_valid_math_expression
        // because it sees "1" as a valid number and stops there. Actual parsing
        // fails later; this function only performs syntax validation.

        // Edge cases
        assert_all_valid(&[
            "0",
            "-5", // Negative numbers
            "1.5",
            "1,000",
            "1,000,000.50",
        ]);
    }

    #[test]
    fn test_is_valid_math_expression_units() {
        // Various unit formats
        assert_all_valid(&[
            "5GiB",      // No space
            "5 GiB",     // With space
            "10.5 MB/s", // Compound unit
            "100 QPS",   // QPS unit
            "1 hour",    // Time unit
            "8 bit",     // Bit unit
        ]);

        // Conversions
        assert_all_valid(&["1 GiB to MiB", "24 MiB * 32 in KiB", "100 QPS to req/min"]);

        // Case variations
        assert_all_valid(&["1 gib TO mib", "1 GIB to MIB"]);
    }

    #[test]
    fn test_is_valid_math_expression_operators() {
        // Every operator on its own
        assert_all_valid(&["1 + 2", "5 - 3", "4 * 6", "8 / 2"]);

        // Operator combinations
        assert_all_valid(&["1 + 2 - 3", "2 * 3 + 4", "10 / 2 - 1"]);

        // With parentheses
        assert_all_valid(&["(1 + 2) * 3", "1 + (2 * 3)", "((1 + 2) * 3) - 4"]);

        // Invalid operator usage
        assert_all_invalid(&["1 + * 2", "* 1 + 2", "1 + 2 *"]);
    }

    #[test]
    fn test_is_valid_math_expression_line_references() {
        // Plain line references
        assert_all_valid(&[
            "line1",
            "line10",
            "line1 + line2",
            "line1 * 2",
            "(line1 + line2) / 2",
        ]);

        // Line references combined with units
        assert_all_valid(&["line1 + 5 GiB", "line1 to MiB", "line1 + line2 in KiB"]);

        // Case insensitive line references
        assert_all_valid(&["LINE1", "Line2", "LiNe3 + LiNe4"]);
    }

    #[test]
    fn test_whitespace_handling() {
        // Various whitespace scenarios
        assert_all_valid(&[
            "  1 + 2  ",
            "1   +   2",
            "1\t+\t2",
            "1+2", // No spaces
        ]);

        // Whitespace between number and unit
        assert_all_valid(&["5   GiB", "5GiB"]);

        // Whitespace around keywords
        assert_all_valid(&["1 GiB  to  MiB", "1 GiB to MiB"]);
    }

    #[test]
    fn test_extract_line_references() {
        let cases: Vec<(&str, Vec<(usize, usize, usize)>)> = vec![
            // Basic extraction
            ("line1 + 5", vec![(0, 5, 0)]),
            ("10 + line2", vec![(5, 10, 1)]),
            (
                "line1 + line2 * line3",
                vec![(0, 5, 0), (8, 13, 1), (16, 21, 2)],
            ),
            // Case insensitivity
            ("Line1 + Line2", vec![(0, 5, 0), (8, 13, 1)]),
            ("LINE1 + line2", vec![(0, 5, 0), (8, 13, 1)]),
            // Complex expressions
            ("(line1 + line2) * 2 to GiB", vec![(1, 6, 0), (9, 14, 1)]),
            // Multi-digit line numbers
            ("line10 + line123", vec![(0, 6, 9), (9, 16, 122)]),
            // No line references
            ("5 + 3 * 2", vec![]),
            ("hello world", vec![]),
            // Edge cases
            ("line0", vec![]),   // line0 is invalid
            ("line", vec![]),    // no number
            ("myline1", vec![]), // not starting with "line"
            // Surrounding text
            ("result: line1 + 2", vec![(8, 13, 0)]),
        ];

        for (input, expected) in cases {
            assert_eq!(extract_line_references(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_update_line_references_insertion() {
        assert_updates(&[
            // Insertion at the beginning: every reference is incremented
            ("line1 + line2", 0, 1, "line2 + line3"),
            ("line3 + 5", 0, 1, "line4 + 5"),
            // Insertion in the middle: only references >= insertion point change
            ("line1 + line3", 2, 1, "line1 + line4"),
            ("line1 + line2 + line3", 2, 1, "line1 + line2 + line4"),
            // Insertion at the end: nothing changes
            ("line1 + line2", 5, 1, "line1 + line2"),
            // No line references
            ("5 + 3", 1, 1, "5 + 3"),
            // Complex expressions
            (
                "(line2 + line4) * 2 to GiB",
                3,
                1,
                "(line2 + line5) * 2 to GiB",
            ),
            // User's reported scenario: splitting a line. When inserting at
            // position 1, "line1" (pointing to position 0) does not change
            // under position-based logic, although users might expect
            // content-based updates.
            ("line1 + 1", 1, 1, "line1 + 1"),
        ]);
    }

    #[test]
    fn test_update_line_references_deletion() {
        assert_updates(&[
            // Deletion at the beginning
            ("line1 + line2 + line3", 0, -1, "INVALID_REF + line1 + line2"),
            // Deletion in the middle
            ("line1 + line2 + line3", 1, -1, "line1 + INVALID_REF + line2"),
            // Deletion at the end
            ("line1 + line2 + line3", 2, -1, "line1 + line2 + INVALID_REF"),
            // References before the deleted line stay unchanged, later ones shift up
            ("line1 + line5", 3, -1, "line1 + line4"),
            // No line references
            ("5 + 3", 1, -1, "5 + 3"),
            // Complex scenarios
            ("line1 + line3 + line5", 2, -1, "line1 + INVALID_REF + line4"),
            // User's reported scenario: deleting an empty first line
            ("line2 + 1", 0, -1, "line1 + 1"),
        ]);
    }

    #[test]
    fn test_update_line_references_edge_cases() {
        assert_updates(&[
            // Multiple references to the same line
            (
                "line2 + line2 * line2",
                1,
                -1,
                "INVALID_REF + INVALID_REF * INVALID_REF",
            ),
            // Large line numbers
            ("line100 + line200", 150, 1, "line100 + line201"),
            // Case preservation in complex text
            ("Result: Line1 + LINE2", 1, -1, "Result: Line1 + INVALID_REF"),
            // Mixed content
            (
                "Memory usage: line3 * 1024 bytes",
                2,
                1,
                "Memory usage: line4 * 1024 bytes",
            ),
        ]);
    }
}
659}