doc_chunks/
util.rs

1use crate::errors::*;
2use crate::{LineColumn, Range, Span};
3use core::ops::{Bound, RangeBounds};
4use fs_err as fs;
5use std::io::Read;
6use std::path::Path;
7
/// Occurrence statistics gathered for one candidate line separator.
#[derive(Debug, PartialEq, Eq)]
struct LineSepStat {
    /// Position of the first occurrence, as reported by the search iterator.
    first_appearance: usize,
    /// Number of occurrences attributed to this separator.
    count: usize,
    /// The separator literal these statistics describe.
    newline: &'static str,
}
14
15#[inline(always)]
16fn extract_delimiter_inner<'a>(
17    mut iter: impl Iterator<Item = usize>,
18    newline: &'static str,
19) -> Option<LineSepStat> {
20    if let Some(first) = iter.next() {
21        let n = iter.count() + 1;
22        Some(LineSepStat {
23            first_appearance: first,
24            count: n,
25            newline,
26        })
27    } else {
28        None
29    }
30}
31
32/// Extract line delimiter of a string.
33pub fn extract_delimiter(s: &str) -> Option<&'static str> {
34    // TODO lots of room for optimizations here
35    let lf = memchr::memchr_iter(b'\n', s.as_bytes());
36    let cr = memchr::memchr_iter(b'\r', s.as_bytes());
37    let crlf = memchr::memmem::find_iter(s.as_bytes(), "\r\n");
38    let lfcr = memchr::memmem::find_iter(s.as_bytes(), "\n\r");
39    // first look for two letter line delimiters
40    let lfcr = extract_delimiter_inner(lfcr, "\n\r");
41    let crlf = extract_delimiter_inner(crlf, "\r\n");
42
43    // remove the 2 line line delimiters from the single line line delimiters, since they overlap
44    let lf = extract_delimiter_inner(lf, "\n").map(|mut stat| {
45        stat.count = stat.count.saturating_sub(std::cmp::max(
46            crlf.as_ref().map(|stat| stat.count).unwrap_or_default(),
47            lfcr.as_ref().map(|stat| stat.count).unwrap_or_default(),
48        ));
49        stat
50    });
51    let cr = extract_delimiter_inner(cr, "\r").map(|mut stat| {
52        stat.count = stat.count.saturating_sub(std::cmp::max(
53            crlf.as_ref().map(|stat| stat.count).unwrap_or_default(),
54            lfcr.as_ref().map(|stat| stat.count).unwrap_or_default(),
55        ));
56        stat
57    });
58
59    // order is important, `max_by` prefers the latter ones over the earlier ones on equality
60    vec![cr, lf, crlf, lfcr]
61        .into_iter()
62        .flatten()
63        .max_by(|b, a| {
64            if a.count == b.count {
65                a.first_appearance.cmp(&b.first_appearance)
66            } else {
67                b.count.cmp(&a.count)
68            }
69        })
70        .map(|x| x.newline)
71}
72
73/// Iterate over a str and annotate with line and column.
74///
75/// Assumes `s` is content starting from point `start_point`.
76pub fn iter_with_line_column_from(
77    s: &str,
78    start_point: LineColumn,
79) -> impl Iterator<Item = (char, usize, usize, LineColumn)> + '_ {
80    #[derive(Clone)]
81    struct State {
82        cursor: LineColumn,
83        previous_char_was_newline: bool,
84    }
85
86    let initial = State {
87        cursor: start_point,
88        previous_char_was_newline: false,
89    };
90
91    s.char_indices()
92        .enumerate()
93        .map(|(idx, (byte_offset, c))| (idx, byte_offset, c))
94        .scan(initial, |state, (idx, byte_offset, c)| -> Option<_> {
95            let cursor = state.cursor;
96            state.previous_char_was_newline = c == '\n';
97            if state.previous_char_was_newline {
98                state.cursor.line += 1;
99                state.cursor.column = 0;
100            } else {
101                state.cursor.column += 1;
102            }
103            Some((c, byte_offset, idx, cursor))
104        })
105}
106
107/// Iterate over annotated chars starting from line 1 and column 0 assuming `s`
108/// starts there.
109pub fn iter_with_line_column(
110    s: &str,
111) -> impl Iterator<Item = (char, usize, usize, LineColumn)> + '_ {
112    iter_with_line_column_from(s, LineColumn { line: 1, column: 0 })
113}
114
115/// Extract `span` from a `Read`-able source as `String`.
116///
117/// # Errors
118/// Returns an Error if `span` describes a impossible range.
119pub fn load_span_from<R>(mut source: R, span: Span) -> Result<String>
120where
121    R: Read,
122{
123    log::trace!("Loading {span:?} from source");
124    if span.start.line < 1 {
125        return Err(Error::Span(
126            "Lines are 1-indexed, can't be less than 1".to_string(),
127        ));
128    }
129    if span.end.line < span.start.line {
130        return Err(Error::Span(
131            "Line range would be negative, bail".to_string(),
132        ));
133    }
134    if span.end.line == span.start.line && span.end.column < span.start.column {
135        return Err(Error::Span(
136            "Column range would be negative, bail".to_string(),
137        ));
138    }
139    let mut s = String::with_capacity(256);
140    source
141        .read_to_string(&mut s)
142        .expect("Must read successfully");
143
144    let extraction = iter_with_line_column(s.as_str())
145        .skip_while(|(_c, _byte_offset, _idx, cursor)| {
146            cursor.line < span.start.line
147                || (cursor.line == span.start.line && cursor.column < span.start.column)
148        })
149        .take_while(|(_c, _byte_offset, _idx, cursor)| {
150            cursor.line < span.end.line
151                || (cursor.line == span.end.line && cursor.column <= span.end.column)
152        })
153        .fuse()
154        .map(|(c, _byte_offset, _idx, _cursor)| c)
155        .collect::<String>();
156    // log::trace!("Loading {range:?} from line >{line}<");
157    Ok(extraction)
158}
159
160/// Extract span from a file as `String`.
161///
162/// Helpful to validate bandaids against what's actually in the file.
163#[allow(unused)]
164pub(crate) fn load_span_from_file(path: impl AsRef<Path>, span: Span) -> Result<String> {
165    let path = path.as_ref();
166    let path = fs::canonicalize(path)?;
167
168    let ro = fs::OpenOptions::new().read(true).open(&path)?;
169
170    let mut reader = std::io::BufReader::new(ro);
171
172    load_span_from(reader, span)
173}
174
175/// Extract a subset of chars by iterating. Range must be in characters.
176pub fn sub_chars(s: &str, range: Range) -> String {
177    s.chars()
178        .skip(range.start)
179        .take(range.len())
180        .collect::<String>()
181}
182
183/// Convert a given byte range of a string, that is known to be at valid char
184/// bounds, to a character range.
185///
186/// If the bounds are not bounded by a character, it will take the bounding
187/// characters that are intersected inclusive.
188pub fn byte_range_to_char_range<R>(s: &str, byte_range: R) -> Option<Range>
189where
190    R: RangeBounds<usize>,
191{
192    let mut peekable = s.char_indices().enumerate().peekable();
193    let mut range = Range { start: 0, end: 0 };
194    let mut started = false;
195    while let Some((idx, (byte_offset, _c))) = peekable.next() {
196        match byte_range.start_bound() {
197            Bound::Included(&start) if byte_offset == start => {
198                started = true;
199                range.start = idx;
200            }
201            Bound::Included(&start) if byte_offset > start && !started => {
202                started = true;
203                range.start = idx.saturating_sub(1);
204            }
205            Bound::Excluded(_start) => {
206                unreachable!("Exclusive start bounds do not exist. qed");
207            }
208            _ => {}
209        }
210
211        match byte_range.end_bound() {
212            Bound::Included(&end) if byte_offset > end => {
213                range.end = idx;
214                return Some(range);
215            }
216            Bound::Excluded(&end) if byte_offset >= end => {
217                range.end = idx;
218                return Some(range);
219            }
220            _ => {}
221        }
222        if peekable.peek().is_none() && started {
223            range.end = idx + 1;
224            return Some(range);
225        }
226    }
227    None
228}
229
230/// Convert many byte ranges to character ranges.
231///
232/// Attention: All byte ranges most NOT overlap and be relative to the same `s`.
233pub fn byte_range_to_char_range_many<R>(s: &str, byte_ranges: &[R]) -> Vec<Range>
234where
235    R: std::ops::RangeBounds<usize> + std::fmt::Debug,
236{
237    let mut peekable = s.char_indices().enumerate().peekable();
238    let mut cursor = 0usize;
239    let mut acc = Vec::with_capacity(byte_ranges.len());
240    for byte_range in byte_ranges {
241        let mut range = Range { start: 0, end: 0 };
242        let mut started = false;
243        'inner: while let Some((idx, (byte_offset, _c))) = peekable.peek() {
244            cursor = *idx;
245            let byte_offset = *byte_offset;
246            match byte_range.start_bound() {
247                Bound::Included(&start) if byte_offset == start => {
248                    started = true;
249                    range.start = cursor;
250                }
251                Bound::Included(&start) if byte_offset > start && !started => {
252                    started = true;
253                    range.start = cursor.saturating_sub(1);
254                }
255                Bound::Excluded(_start) => {
256                    unreachable!("Exclusive start bounds do not exist. qed");
257                }
258                _ => {}
259            }
260
261            match byte_range.end_bound() {
262                Bound::Included(&end) if byte_offset > end => {
263                    range.end = cursor;
264                    acc.push(range.clone());
265                    started = false;
266                    break 'inner;
267                }
268                Bound::Excluded(&end) if byte_offset >= end => {
269                    range.end = cursor;
270                    acc.push(range.clone());
271                    started = false;
272                    break 'inner;
273                }
274                _ => {}
275            }
276
277            let _ = peekable.next();
278        }
279        if started {
280            range.end = cursor + 1;
281            acc.push(range);
282        }
283    }
284    acc
285}
286
287/// Extract a subset of chars by iterating. Range must be in characters.
288pub fn sub_char_range<R>(s: &str, range: R) -> &str
289where
290    R: RangeBounds<usize>,
291{
292    let mut peekable = s.char_indices().enumerate().peekable();
293    let mut byte_range = Range { start: 0, end: 0 };
294    let mut started = false;
295    'loopy: while let Some((idx, (byte_offset_start, _c))) = peekable.next() {
296        match range.start_bound() {
297            Bound::Included(&start) if idx == start => {
298                started = true;
299                byte_range.start = byte_offset_start;
300            }
301            Bound::Excluded(_start) => {
302                unreachable!("Exclusive start bounds do not exist. qed");
303            }
304            _ => {}
305        }
306
307        match range.end_bound() {
308            Bound::Included(&end) if idx > end => {
309                byte_range.end = byte_offset_start;
310                break 'loopy;
311            }
312            Bound::Excluded(&end) if idx >= end => {
313                byte_range.end = byte_offset_start;
314                break 'loopy;
315            }
316            _ => {}
317        }
318        if peekable.peek().is_none() && started {
319            byte_range.end = s.len();
320        }
321    }
322    &s[byte_range]
323}
324
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds one `(LineColumn, char)` entry of an expectation table.
    macro_rules! lcc {
        ($line:literal, $column:literal, $c:literal) => {
            (
                LineColumn {
                    line: $line,
                    column: $column,
                },
                $c,
            )
        };
    }

    /// Positions must follow line breaks, and a span must extract exactly
    /// the characters it covers.
    #[test]
    fn iter_chars() {
        const S: &str = "\nabc\nd\n";
        const S2: &str = "c\nd";
        const EXPECT: &[(LineColumn, char)] = &[
            lcc!(1, 0, '\n'),
            lcc!(2, 0, 'a'),
            lcc!(2, 1, 'b'),
            lcc!(2, 2, 'c'),
            lcc!(2, 3, '\n'),
            lcc!(3, 0, 'd'),
            lcc!(3, 1, '\n'),
        ];

        for ((c, _byte_offset, _idx, lc), (expected_lc, expected_c)) in
            iter_with_line_column(S).zip(EXPECT.iter())
        {
            assert_eq!(lc, expected_lc.clone());
            assert_eq!(c, expected_c.clone());
        }

        const SPAN: Span = Span {
            start: LineColumn { line: 2, column: 2 },
            end: LineColumn { line: 3, column: 0 },
        };

        let loaded = load_span_from(&mut S.as_bytes(), SPAN).expect("Must succeed");
        assert_eq!(loaded, S2.to_owned());
    }

    /// A span covering the content of a multi-line `#[doc=..]` attribute.
    #[test]
    fn iter_span_doc_0_trivial() {
        const SOURCE: &str = "#[doc=r#\"Zebra\nSchlupfwespe,\nGrünfink\"#]";
        const S2: &str = "Zebra\nSchlupfwespe,\nGrünfink";

        const SPAN: Span = Span {
            // the prefix `#[doc=r#"` is 9 characters wide
            start: LineColumn { line: 1, column: 9 },
            // the suffix is pointless
            end: LineColumn { line: 3, column: 7 },
        };

        let loaded = load_span_from(&mut SOURCE.as_bytes(), SPAN).expect("Must succeed");
        assert_eq!(loaded, S2.to_owned());
    }

    /// Same as above, but the span ends on a trailing newline.
    #[test]
    fn iter_span_doc_1_trailing_newline() {
        const SOURCE: &str = "#[doc=r#\"Zebra\nSchlupfwespe,\n\"#]";
        const S2: &str = "Zebra\nSchlupfwespe,\n";

        const SPAN: Span = Span {
            // the prefix `#[doc=r#"` is 9 characters wide
            start: LineColumn { line: 1, column: 9 },
            // the suffix is pointless
            end: LineColumn {
                line: 2,
                column: 13,
            },
        };

        let loaded = load_span_from(&mut SOURCE.as_bytes(), SPAN).expect("Must succeed");
        assert_eq!(loaded, S2.to_owned());
    }

    #[test]
    fn sub_a() {
        const A: &str = "a🐲o🌡i🡴f🕧aodnferntkng";
        const A_EXPECTED: &str = "a🐲o";

        assert_eq!(sub_char_range(A, 0..3), A_EXPECTED);
        assert_eq!(sub_char_range(A, ..3), A_EXPECTED);
        assert_eq!(sub_chars(A, 0..3), A_EXPECTED.to_owned());
    }

    #[test]
    fn sub_b() {
        const B: &str = "fff🦦🡴🕧";
        const B_EXPECTED: &str = "🦦🡴🕧";

        assert_eq!(sub_char_range(B, 3..=5), B_EXPECTED);
        assert_eq!(sub_char_range(B, 3..), B_EXPECTED);
    }

    #[test]
    fn sub_c() {
        const B: &str = "fff🦦🡴🕧";
        const B_EXPECTED: &str = "";

        assert_eq!(sub_char_range(B, 10..), B_EXPECTED);
        assert_eq!(sub_char_range(B, 15..16), B_EXPECTED);
    }

    #[test]
    fn range_bytes_to_chars() {
        // byte widths: 4 3 4
        assert_eq!(byte_range_to_char_range("🕱™🐡", 4..7), Some(1..2));
        // byte widths: 4 1 1 3 4
        assert_eq!(byte_range_to_char_range("🕱12™🐡", 6..13), Some(3..5));
        assert_eq!(byte_range_to_char_range("🕱12™🐡", 0..0), Some(0..0));
        assert_eq!(byte_range_to_char_range("🕱12™🐡", 25..26), None);
    }

    #[test]
    fn range_bytes_to_chars_many() {
        // byte widths: 4 3 4
        let adjacent = byte_range_to_char_range_many("🕱™🐡", &[4..7, 7..11]);
        assert_eq!(adjacent, vec![1..2, 2..3]);

        let with_empty = byte_range_to_char_range_many("🕱™🐡", &[0..0, 4..11]);
        assert_eq!(with_empty, vec![0..0, 1..3]);
    }
}