rsonpath-syntax 0.4.1

A JSONPath parser.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
use super::display::UnicodeWidth as _;

/// A single line of input prepared for display as part of a syntax error report.
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct SyntaxErrorLine {
    /// Text of the line to display; may be only a fragment of the input line if context was truncated.
    pub(crate) line: String,
    /// Zero-based index of the input line this text comes from.
    pub(crate) line_number: usize,
    /// Underline to draw beneath the line; `None` when the error has zero display width on this line.
    pub(crate) underline: Option<SyntaxErrorUnderline>,
    /// Set when less than the full context preceding the error is shown.
    pub(crate) truncated_start: bool,
    /// Set when less than the full context following the error is shown.
    pub(crate) truncated_end: bool,
}

/// Description of the underline drawn beneath a [`SyntaxErrorLine`].
///
/// Both `start_pos` and `len` are computed from display widths of characters, not byte or char counts.
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct SyntaxErrorUnderline {
    /// Display-width offset of the underline from the start of the displayed line.
    pub(crate) start_pos: usize,
    /// Display width of the underline.
    pub(crate) len: usize,
    /// Message attached to the underline. For errors spanning multiple lines only the last line carries it.
    pub(crate) message: Option<String>,
}

/// Index over an input string that maps byte offsets to lines, characters, and display widths,
/// used to build [`SyntaxErrorLine`]s for error reports.
pub(super) struct ErrorFormatter<'a> {
    /// The original input being indexed.
    input: &'a str,
    /// Per-line index, in input order. Always contains at least one line, even for empty input.
    line_data: Vec<LineData>,
}

/// Index data for a single line of the input.
#[derive(Debug, PartialEq, Eq)]
struct LineData {
    /// Byte index in the input at which the line starts.
    start_idx: usize,
    /// Byte index one past the end of the line; the line terminator, if any, is part of the line.
    one_past_end_idx: usize,
    /// One entry per character of the line, followed by a dummy trailing whitespace entry.
    char_data: Vec<CharData>,
}

/// Index data for a single character of a line.
#[derive(Debug, PartialEq, Eq)]
struct CharData {
    /// Byte index of the character within the whole input (not within its line).
    byte_idx: usize,
    /// Accumulated display width of the line up to and including this character.
    acc_width: usize,
    /// The character itself.
    c: char,
}

impl<'a> ErrorFormatter<'a> {
    pub(super) fn new(input: &'a str) -> Self {
        // Special case for empty inputs - pretend we do have a single line with a whitespace so that we can display
        // any diagnostics at all.
        if input.is_empty() {
            return Self {
                input,
                line_data: vec![LineData::new(0, 0, vec![CharData::new(0, 1, ' ')])],
            };
        }

        // Go through input lines and construct the index of all characters.
        // We keep the accumulated length in bytes to have accurate byte indices for all chars.
        let mut acc_len = 0;
        let line_data = input
            .split_inclusive('\n')
            .map(|line| {
                let start_idx = acc_len;
                let one_past_end_idx = acc_len + line.len();
                // For each line accumulate the total display width of each prefix.
                let mut char_data = line
                    .char_indices()
                    .scan(0, |acc_width, (i, c)| {
                        *acc_width += c.width();
                        Some(CharData::new(i + start_idx, *acc_width, c))
                    })
                    .collect::<Vec<_>>();
                // Insert a dummy whitespace at the end.
                // This helps for errors that sometimes display an underline one past the line end, e.g. because
                // something is missing (`$.` as an example).
                if let Some(last) = char_data.last() {
                    char_data.push(CharData::new(last.byte_idx + 1, last.acc_width + 1, ' '));
                } else {
                    char_data.push(CharData::new(start_idx + 1, 1, ' '));
                }
                acc_len += line.len();
                LineData {
                    start_idx,
                    one_past_end_idx,
                    char_data,
                }
            })
            .collect::<Vec<_>>();

        Self { input, line_data }
    }

    pub(super) fn str(&self) -> &str {
        self.input
    }

    pub(super) fn len(&self) -> usize {
        self.input.len()
    }

    pub(super) fn is_multiline(&self) -> bool {
        self.line_data.len() > 1
    }

    /// Create [`SyntaxErrorLines`](SyntaxErrorLine) for display of an error occurring between
    /// byte `error_byte_start` and `error_byte_end` (inclusive).
    ///
    /// The amount of context displayed for the error is controlled by `min_context_width` and `soft_width_limit`.
    /// First ensures that some amount of pre- and post-context is always displayed. Second limits the total width
    /// of every line; it is a soft limit, because the error is always displayed in full alongside 2 times
    /// `min_context_width` context.
    pub(super) fn build_error_lines(
        &self,
        error_byte_start: usize,
        error_byte_end: usize,
        min_context_width: usize,
        soft_width_limit: usize,
        underline_message: String,
    ) -> Vec<SyntaxErrorLine> {
        let start_line_num = self.find_line_containing(error_byte_start);
        let end_line_num = self.find_line_containing(error_byte_end);
        let start_line = &self.line_data[start_line_num];
        let end_line = &self.line_data[end_line_num];

        let error_start_char_idx = start_line.find_char_at_idx(error_byte_start);
        let error_end_char_idx = end_line.find_char_at_idx(error_byte_end);

        let full_pre_context_width = if error_start_char_idx == 0 {
            0
        } else {
            start_line.width_to_char(error_start_char_idx - 1)
        };
        let full_post_context_width = if error_end_char_idx == end_line.char_data.len() - 1 {
            0
        } else {
            end_line.width_from_char(error_end_char_idx + 1)
        };

        // There are two cases - either the error is fully contained within a single line or it spans multiple lines.
        // In the first case we need to balance the width of both contexts.
        // In the second case the pre-context is fully within the first line and post-context fully within the last
        // line, so calculations are independent. The lines in between are always fully displayed.
        if start_line_num == end_line_num {
            let only_line = start_line; // Just rename to avoid confusion.
            let line_error_width = only_line.width_of_char_span(error_start_char_idx, error_end_char_idx);
            let total_width = only_line.total_width();
            let (pre_width, post_width) = if total_width <= soft_width_limit {
                // All context fits, no truncating needed.
                (full_pre_context_width, full_post_context_width)
            } else {
                // Need to truncate at least one side. Try to do it in a balanced manner.
                // First, allocate half of the total allowed width to each side.
                let allowed_total_context_width = soft_width_limit.saturating_sub(line_error_width);
                let pre_allocation = allowed_total_context_width / 2;
                let post_allocation = allowed_total_context_width - pre_allocation;
                // This allocation might be too much if the full width is actually shorter.
                let pre_overallocation = pre_allocation.saturating_sub(full_pre_context_width);
                let post_overallocation = post_allocation.saturating_sub(full_post_context_width);
                // Give back the overallocation to the other side and enforce the min width requirement.
                let pre_width = (pre_allocation + post_overallocation)
                    .max(min_context_width)
                    .min(full_pre_context_width);
                let post_width = (post_allocation + pre_overallocation)
                    .max(min_context_width)
                    .min(full_post_context_width);
                (pre_width, post_width)
            };
            let pre_start_idx = only_line.find_start_of_pre_context(error_start_char_idx, pre_width);
            // Get the width from pre-context start to the error start. This is the effective offset of the underline.
            let underline_offset = if error_start_char_idx == 0 {
                0
            } else {
                only_line.width_of_char_span(pre_start_idx, error_start_char_idx - 1)
            };
            let post_end_idx = only_line.find_end_of_post_context(error_end_char_idx, post_width);
            let display_line = self.slice_line(only_line, pre_start_idx, post_end_idx).to_string();
            vec![SyntaxErrorLine {
                truncated_start: full_pre_context_width != pre_width,
                truncated_end: full_post_context_width != post_width,
                line: display_line,
                line_number: start_line_num,
                underline: if line_error_width == 0 {
                    None
                } else {
                    Some(SyntaxErrorUnderline {
                        len: line_error_width,
                        start_pos: underline_offset,
                        message: Some(underline_message),
                    })
                },
            }]
        } else {
            // Calculate the allowed width of pre-context and post-context independently on each of their lines.
            let first_line_error_width = start_line.width_from_char(error_start_char_idx);
            let last_line_error_width = end_line.width_to_char(error_end_char_idx);
            let pre_width = soft_width_limit
                .saturating_sub(first_line_error_width)
                .max(min_context_width)
                .min(full_pre_context_width);
            let post_width = soft_width_limit
                .saturating_sub(last_line_error_width)
                .max(min_context_width)
                .min(full_post_context_width);
            let pre_start_idx = start_line.find_start_of_pre_context(error_start_char_idx, pre_width);
            let post_end_idx = end_line.find_end_of_post_context(error_end_char_idx, post_width);
            // There are three types of lines - the first line, middle lines, and the last line.
            let mut lines = Vec::with_capacity(end_line_num - start_line_num + 1);
            let first_line_display =
                self.input[start_line.char_data[pre_start_idx].byte_idx..start_line.one_past_end_idx].to_string();
            lines.push(SyntaxErrorLine {
                truncated_start: full_pre_context_width != pre_width,
                truncated_end: false,
                line: first_line_display,
                line_number: start_line_num,
                underline: Some(SyntaxErrorUnderline {
                    len: start_line.width_from_char(error_start_char_idx),
                    start_pos: pre_width,
                    message: None,
                }),
            });
            for line_num in start_line_num + 1..end_line_num {
                let line = &self.line_data[line_num];
                let display_line = &self.input[line.start_idx..line.one_past_end_idx];
                lines.push(SyntaxErrorLine {
                    truncated_start: false,
                    truncated_end: false,
                    line: display_line.to_string(),
                    line_number: line_num,
                    underline: Some(SyntaxErrorUnderline {
                        len: line.total_width(),
                        start_pos: 0,
                        message: None,
                    }),
                });
            }
            let display_line = self.slice_line(end_line, 0, post_end_idx).to_string();
            lines.push(SyntaxErrorLine {
                truncated_start: false,
                truncated_end: full_post_context_width != post_width,
                line: display_line,
                line_number: end_line_num,
                underline: Some(SyntaxErrorUnderline {
                    len: end_line.width_to_char(error_end_char_idx),
                    start_pos: 0,
                    message: Some(underline_message),
                }),
            });
            lines
        }
    }

    /// Returns the line number which contains the given byte index.
    fn find_line_containing(&self, idx: usize) -> usize {
        self.line_data
            .binary_search_by_key(&idx, |l| l.start_idx)
            .unwrap_or_else(|idx| idx - 1)
    }

    /// Get the input slice for a given line between two *char* indices (inclusive).
    fn slice_line(&self, line: &LineData, start_char_idx: usize, end_char_idx: usize) -> &str {
        let start = line.char_data[start_char_idx].byte_idx;
        // This looks overcomplicated but is correct. Because the char at end_char_idx might have byte-width
        // greater than one we need to ask the next character, if any, for its index. If it is the last char,
        // then it is the dummy whitespace we inserted at the end that we don't want to display anyway.
        let end = if end_char_idx == line.char_data.len() - 1 {
            line.char_data[end_char_idx].byte_idx
        } else {
            line.char_data[end_char_idx + 1].byte_idx
        };
        &self.input[start..end]
    }
}

impl CharData {
    /// Create the index entry for char `c` located at byte `idx` of the input, where `acc_width`
    /// is the accumulated display width of its line up to and including `c`.
    fn new(idx: usize, acc_width: usize, c: char) -> Self {
        let byte_idx = idx;
        Self { byte_idx, acc_width, c }
    }
}

impl LineData {
    fn new(start_idx: usize, one_past_end_idx: usize, char_data: Vec<CharData>) -> Self {
        Self {
            start_idx,
            one_past_end_idx,
            char_data,
        }
    }

    /// Get the char index of the char that contains the byte index.
    fn find_char_at_idx(&self, byte_idx: usize) -> usize {
        self.char_data
            .binary_search_by_key(&byte_idx, |c| c.byte_idx)
            .unwrap_or_else(|idx| idx - 1)
    }

    /// Get the char index at which pre-context should start if the error starts at the given
    /// index, and we are limited by a maximum `pre_width`.
    fn find_start_of_pre_context(&self, error_start_char_idx: usize, pre_width: usize) -> usize {
        let width_at_error_start = if error_start_char_idx == 0 {
            0
        } else {
            self.width_to_char(error_start_char_idx - 1)
        };
        let target_width = width_at_error_start.saturating_sub(pre_width);
        self.char_data
            .binary_search_by_key(&target_width, |d| d.acc_width - d.c.width())
            .unwrap_or_else(|idx| idx)
    }

    /// Get the char index at which post-context should end if the error ends at the given index,
    /// and we are limited by a maximum `post_width`.
    fn find_end_of_post_context(&self, error_end_char_idx: usize, post_width: usize) -> usize {
        let width_at_error_end = self.width_to_char(error_end_char_idx);
        let target_width = width_at_error_end + post_width;
        self.char_data
            .binary_search_by_key(&target_width, |c| c.acc_width)
            .unwrap_or_else(|idx| idx - 1)
    }

    /// Get the total width of characters between the two char indices (inclusive).
    fn width_of_char_span(&self, start_char_idx: usize, end_char_idx: usize) -> usize {
        self.char_data[end_char_idx].acc_width - self.char_data[start_char_idx].acc_width
            + self.char_data[start_char_idx].c.width()
    }

    /// Get the total width of the line suffix starting at the given char index.
    fn width_from_char(&self, start_char_idx: usize) -> usize {
        self.total_width() + self.char_data[start_char_idx].c.width() - self.char_data[start_char_idx].acc_width
    }

    /// Get the total width of the line prefix ending at the given char index (inclusive).
    fn width_to_char(&self, end_char_idx: usize) -> usize {
        self.char_data[end_char_idx].acc_width
    }

    /// Get the total width of all characters in this line.
    fn total_width(&self) -> usize {
        // Subtract one for the dummy whitespace we inserted.
        self.char_data.last().map_or(0, |c| c.acc_width) - 1
    }
}

// Unit tests for `ErrorFormatter`: construction of the line/char index and the context
// truncation logic of `build_error_lines`.
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    // Empty input is indexed as a single line holding only the dummy whitespace,
    // so an error at byte 0 produces an empty display line with a width-1 underline.
    #[test]
    fn empty_input() {
        let s = "";
        let expected_lines = vec![SyntaxErrorLine {
            line: String::new(),
            truncated_start: false,
            truncated_end: false,
            line_number: 0,
            underline: Some(SyntaxErrorUnderline {
                start_pos: 0,
                len: 1,
                message: Some("message".to_string()),
            }),
        }];
        let input = ErrorFormatter::new(s);
        assert_eq!(input.str(), "");
        assert_eq!(input.len(), 0);
        assert!(!input.is_multiline());
        let lines = input.build_error_lines(0, 0, 30, 80, "message".to_string());
        assert_eq!(expected_lines, lines);
    }

    // Single-line ASCII input, where byte index, char index, and display width all coincide.
    // The whole line fits within the width limit, so no truncation is expected.
    #[test]
    fn simple_ascii_input() {
        let s = "abc123...";
        // Nine real chars followed by the dummy trailing whitespace.
        let expected_char_data = vec![
            CharData::new(0, 1, 'a'),
            CharData::new(1, 2, 'b'),
            CharData::new(2, 3, 'c'),
            CharData::new(3, 4, '1'),
            CharData::new(4, 5, '2'),
            CharData::new(5, 6, '3'),
            CharData::new(6, 7, '.'),
            CharData::new(7, 8, '.'),
            CharData::new(8, 9, '.'),
            CharData::new(9, 10, ' '),
        ];
        let expected_line_data = vec![LineData::new(0, s.len(), expected_char_data)];
        // Error covering the entire line.
        let expected_error_data_1 = vec![SyntaxErrorLine {
            truncated_start: false,
            truncated_end: false,
            line: s.to_string(),
            underline: Some(SyntaxErrorUnderline {
                len: 9,
                start_pos: 0,
                message: Some("message".to_string()),
            }),
            line_number: 0,
        }];
        // Error covering bytes 5 and 6 only.
        let expected_error_data_2 = vec![SyntaxErrorLine {
            truncated_start: false,
            truncated_end: false,
            line: s.to_string(),
            underline: Some(SyntaxErrorUnderline {
                len: 2,
                start_pos: 5,
                message: Some("message".to_string()),
            }),
            line_number: 0,
        }];
        let input = ErrorFormatter::new(s);
        assert_eq!(input.str(), s);
        assert_eq!(input.len(), s.len());
        assert!(!input.is_multiline());
        assert_eq!(input.line_data, expected_line_data);
        let data = input.build_error_lines(0, 8, 30, 80, "message".to_string());
        assert_eq!(expected_error_data_1, data);
        let data = input.build_error_lines(5, 6, 30, 80, "message".to_string());
        assert_eq!(expected_error_data_2, data);
    }

    // Long single-line input mixing chars of different byte lengths and display widths,
    // exercising truncation at the end, at the start, and on both sides.
    #[test]
    fn variable_width_input() {
        const WIDTH_TO_TEST: usize = 80;
        let base_s = "🦀."; // This string has width 3 (2 for Ferris, 1 for the period) but byte-length of 5.
        assert_eq!(base_s.len(), 5);
        let s = base_s.repeat(100); // Total width 300, byte-length 500.
        let mut expected_char_data = vec![];
        for i in 0..100 {
            expected_char_data.push(CharData::new(5 * i, 3 * i + 2, '🦀'));
            expected_char_data.push(CharData::new(5 * i + 4, 3 * i + 3, '.'));
        }
        // The dummy trailing whitespace sits one past the end of the line.
        expected_char_data.push(CharData::new(500, 301, ' '));
        let expected_data = vec![LineData::new(0, s.len(), expected_char_data)];
        let input = ErrorFormatter::new(&s);
        assert_eq!(input.str(), s);
        assert_eq!(input.len(), s.len());
        assert!(!input.is_multiline());
        assert_eq!(input.line_data, expected_data);

        // We select the Ferris at byte index 50.
        // There are 10 copies of base_s to the left, with a total width of 30. All of them should be included.
        // Ferris has width 2 so we are left with 48 width for the post-context.
        // There is a period and then 89 copies of base_s to the right, with a total width of 268.
        // We are limited by 48 width, so we can select the period and up to 15 copies of base_s, leaving us with
        // two width remaining; we can use that width to select the next Ferris.
        // This is a total of 20 + 1 + 1 + 30 + 1 = 53 characters.
        let iter = input.build_error_lines(50, 53, 5, WIDTH_TO_TEST, "message".to_string());
        let expected_lines = vec![SyntaxErrorLine {
            truncated_start: false,
            truncated_end: true,
            line_number: 0,
            line: s.chars().take(53).collect::<String>(),
            underline: Some(SyntaxErrorUnderline {
                len: 2,
                start_pos: 30,
                message: Some("message".to_string()),
            }),
        }];
        assert_eq!(expected_lines, iter);

        // Now select the Ferris at byte index 445.
        // There are 89 copies of base_s to the left and 10 copies to the right.
        // The situation is symmetrical, only that we take the period and all 10 copies to the right
        // and 15 copies to the left.
        // We spent 45 + 2 + 31 = 78 width and we have 2 remaining.
        // Now we can spend one of the 2 remaining width for the period to the left, but we cannot
        // include another Ferris.
        // We need to skip the first 89 - 15 = 74 groups without skipping the last period,
        // skipping 2 * 74 - 1 = 147 characters.
        let iter = input.build_error_lines(445, 448, 5, WIDTH_TO_TEST, "message".to_string());
        let expected_lines = vec![SyntaxErrorLine {
            truncated_start: true,
            truncated_end: false,
            line_number: 0,
            line: s.chars().skip(147).collect::<String>(),
            underline: Some(SyntaxErrorUnderline {
                len: 2,
                start_pos: 46,
                message: Some("message".to_string()),
            }),
        }];
        assert_eq!(expected_lines, iter);

        // Now we select Ferris at byte index 270.
        // There are 54 copies of base_s to the left and 45 to the right.
        // We can include 13 copies to the left, the period, 12 copies, and then a Ferris
        // to the right.
        // We thus skip 28 groups to the left for a total of 2 * 28 = 56 characters;
        // take 13 groups and target Ferris, coming up to 2 * 13 + 1 = 27 characters;
        // and to the right we take 26 characters.
        let iter = input.build_error_lines(270, 273, 5, WIDTH_TO_TEST, "message".to_string());
        let expected_lines = vec![SyntaxErrorLine {
            truncated_start: true,
            truncated_end: true,
            line_number: 0,
            line: s.chars().skip(56).take(27 + 26).collect::<String>(),
            underline: Some(SyntaxErrorUnderline {
                len: 2,
                start_pos: 39,
                message: Some("message".to_string()),
            }),
        }];
        assert_eq!(expected_lines, iter);
    }
}