1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/*
 * text.rs
 *
 * ftml - Library to parse Wikidot text
 * Copyright (C) 2019-2022 Wikijump Team
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

use crate::parsing::ExtractedToken;

/// Wrapper for the input string that was tokenized.
///
/// This structure does not expose the internal string (preventing weird ad-hoc
/// or hack parsing), but permits joining adjacent `ExtractedToken` string slices
/// by selecting from the original text source.
#[derive(Debug, Copy, Clone)]
pub struct FullText<'t> {
    // Borrowed source text. Only reachable through the slicing methods
    // (and `inner()` for the rare caller that needs the raw string).
    text: &'t str,
}

impl<'t> FullText<'t> {
    #[inline]
    pub fn new(text: &'t str) -> Self {
        FullText { text }
    }

    /// Returns the entire inner string. This should not be used.
    ///
    /// If you wish to slice between tokens, use the other methods instead.
    /// This is for very unusual cases where you need the entire input string
    /// as-is, with no tokenization.
    #[inline]
    #[doc(hidden)]
    pub(crate) fn inner(&self) -> &'t str {
        self.text
    }

    /// Slices from the given start to end token.
    ///
    /// This is performed inclusively, capturing both tokens on each side,
    /// and all the tokens which lie in the middle.
    ///
    /// # Panics
    /// If the ending token does not come after the first, or if
    /// the slices specified are out of range for the string (unlikely),
    /// this function will panic.
    pub fn slice(
        &self,
        start_token: &ExtractedToken,
        end_token: &ExtractedToken,
    ) -> &'t str {
        let start = start_token.span.start;
        let end = end_token.span.end;
        self.slice_impl("full", start, end)
    }

    /// Slices from the given start, but before the end token.
    ///
    /// This is performed exclusively, capturing the full starting token,
    /// but terminating at the start of the end token (capturing none of it).
    ///
    /// This function is provided specifically because it easier to bump
    /// the start token should you wish to exclude it, but doing so
    /// for the end token is not trivial while observing lifetime safety rules.
    ///
    /// # Panics
    /// If the ending token does not come after the first, or if
    /// the slices specified are out of range for the string (unlikely),
    /// this function will panic.
    pub fn slice_partial(
        &self,
        start_token: &ExtractedToken,
        end_token: &ExtractedToken,
    ) -> &'t str {
        let start = start_token.span.start;
        let end = end_token.span.start;
        self.slice_impl("partial", start, end)
    }

    fn slice_impl(&self, slice_kind: &'static str, start: usize, end: usize) -> &'t str {
        info!("Extracting {slice_kind} slice ({start}-{end}) from full text");

        assert!(
            start <= end,
            "Starting index is later than the ending index: {start} > {end}",
        );

        &self.text[start..end]
    }
}

#[test]
fn slice() {
    use crate::parsing::Token;

    let text = "Apple banana!";
    let full_text = FullText::new(text);

    // Builds a throwaway token covering the given span of `text`.
    let token = |span: std::ops::Range<usize>| ExtractedToken {
        token: Token::Other,
        slice: &text[span.clone()],
        span,
    };

    let full = full_text.slice(&token(0..1), &token(4..5));
    assert_eq!(full, "Apple", "Full slice didn't match expected");

    let partial = full_text.slice_partial(&token(6..9), &token(12..13));
    assert_eq!(partial, "banana", "Partial slice didn't match expected");
}

#[test]
#[should_panic]
fn slice_invalid() {
    use crate::parsing::Token;

    let text = "Durian...";
    let full_text = FullText::new(text);

    // Builds a throwaway token covering the given span of `text`.
    let token = |span: std::ops::Range<usize>| ExtractedToken {
        token: Token::Other,
        slice: &text[span.clone()],
        span,
    };

    // The end token precedes the start token, so this must panic.
    let _ = full_text.slice(&token(6..7), &token(0..1));
}