Skip to main content

perl_module_token_core/
lib.rs

//! Shared primitives for Perl module token parsing and boundary detection.
//!
//! This microcrate owns the grammar-level mechanics used by token parsing and
//! standalone token matching:
//!
//! - module token shape parsing (`Foo::Bar`, `Foo'Bar`, etc.)
//! - boundary checks to avoid partial identifier matches
8
9#![deny(unsafe_code)]
10#![warn(rust_2018_idioms)]
11#![warn(missing_docs)]
12#![warn(clippy::all)]
13
/// Half-open byte range `[start, end)` covering one parsed module token.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ModuleTokenSpan {
    /// Byte offset of the token's first byte in the source text (inclusive).
    pub start: usize,
    /// Byte offset one past the token's last byte in the source text (exclusive).
    pub end: usize,
}
22
23/// Parse a module token that starts at `start` in `text`.
24///
25/// A module token is one or more identifier segments separated by either
26/// `::` (canonical) or `'` (legacy) separators.
27#[must_use]
28pub fn parse_module_token(text: &str, start: usize) -> Option<ModuleTokenSpan> {
29    let bytes = text.as_bytes();
30    if start >= bytes.len() || !is_identifier_start(bytes[start]) {
31        return None;
32    }
33
34    let token_start = start;
35    let mut index = parse_identifier_segment(bytes, start)?;
36
37    while let Some(next) = next_separator(bytes, index) {
38        index = match next {
39            Separator::Canonical => index + 2,
40            Separator::Legacy => index + 1,
41        };
42
43        index = parse_identifier_segment(bytes, index)?;
44    }
45
46    Some(ModuleTokenSpan { start: token_start, end: index })
47}
48
49/// Check if a span from `start` to `end` is bounded as a standalone token.
50#[must_use]
51pub fn has_standalone_module_token_boundaries(line: &str, start: usize, end: usize) -> bool {
52    let left_ok = !left_context_is_module_char(line, start);
53    let right_ok = !right_context_is_module_char(line, end);
54
55    left_ok && right_ok
56}
57
/// Report whether `ch` is in the module token character class: an ASCII
/// letter, an ASCII digit, `_`, or `:`.
#[must_use]
pub fn is_module_token_char(ch: char) -> bool {
    matches!(ch, 'A'..='Z' | 'a'..='z' | '0'..='9' | '_' | ':')
}
63
/// Report whether `ch` may appear inside a Perl module identifier segment:
/// an ASCII letter, an ASCII digit, or `_`.
#[must_use]
pub fn is_module_identifier_char(ch: char) -> bool {
    matches!(ch, 'A'..='Z' | 'a'..='z' | '0'..='9' | '_')
}
69
/// Kind of separator found between two identifier segments of a module token.
#[derive(Clone, Copy, Debug)]
enum Separator {
    /// The two-byte `::` separator.
    Canonical,
    /// The single-byte `'` separator.
    Legacy,
}
75
76fn next_separator(bytes: &[u8], index: usize) -> Option<Separator> {
77    if text_starts_with(bytes, index, "::") {
78        return Some(Separator::Canonical);
79    }
80
81    if index < bytes.len() && bytes[index] == b'\'' {
82        return Some(Separator::Legacy);
83    }
84
85    None
86}
87
88fn parse_identifier_segment(bytes: &[u8], start: usize) -> Option<usize> {
89    if start >= bytes.len() || !is_identifier_start(bytes[start]) {
90        return None;
91    }
92
93    let mut index = start + 1;
94    while index < bytes.len() && is_identifier_byte(bytes[index]) {
95        index += 1;
96    }
97
98    Some(index)
99}
100
/// Report whether `byte` may begin an identifier segment: an ASCII letter or `_`.
fn is_identifier_start(byte: u8) -> bool {
    matches!(byte, b'A'..=b'Z' | b'a'..=b'z' | b'_')
}
104
/// Report whether `byte` may continue an identifier segment: an ASCII letter,
/// an ASCII digit, or `_`.
fn is_identifier_byte(byte: u8) -> bool {
    matches!(byte, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_')
}
108
109fn left_context_is_module_char(line: &str, start: usize) -> bool {
110    if start == 0 {
111        return false;
112    }
113
114    let mut left = line[..start].char_indices();
115    let Some((left_idx, ch)) = left.next_back() else {
116        return false;
117    };
118
119    if ch != '\'' {
120        return is_module_token_char(ch);
121    }
122
123    if left_idx == 0 {
124        return false;
125    }
126
127    line[..left_idx].chars().next_back().is_some_and(is_module_identifier_char)
128}
129
130fn right_context_is_module_char(line: &str, end: usize) -> bool {
131    if end >= line.len() {
132        return false;
133    }
134
135    let mut right = line[end..].chars();
136    let Some(ch) = right.next() else {
137        return false;
138    };
139
140    if ch != '\'' {
141        return is_module_token_char(ch);
142    }
143
144    right.next().is_some_and(is_module_identifier_char)
145}
146
/// Report whether `bytes` contains `needle` beginning exactly at offset `start`.
///
/// Returns `false` when `start` is past the end of `bytes` or when too few
/// bytes remain. No offset arithmetic is performed, so arbitrarily large
/// `start` values cannot overflow.
fn text_starts_with(bytes: &[u8], start: usize, needle: &str) -> bool {
    bytes.get(start..).is_some_and(|tail| tail.starts_with(needle.as_bytes()))
}
156
#[cfg(test)]
mod tests {
    use super::{
        ModuleTokenSpan, has_standalone_module_token_boundaries, is_module_identifier_char,
        is_module_token_char, parse_module_token,
    };

    // Happy path: both separator styles, alone and mixed in one token.
    #[test]
    fn parses_canonical_and_legacy_tokens() {
        assert_eq!(
            parse_module_token("use Foo::Bar;", 4),
            Some(ModuleTokenSpan { start: 4, end: 12 })
        );
        assert_eq!(
            parse_module_token("use Foo'Bar;", 4),
            Some(ModuleTokenSpan { start: 4, end: 11 })
        );
        assert_eq!(
            parse_module_token("Foo::Bar'Baz", 0),
            Some(ModuleTokenSpan { start: 0, end: 12 })
        );
        assert_eq!(parse_module_token("Foo", 0), Some(ModuleTokenSpan { start: 0, end: 3 }));
    }

    // Negative path: offsets that cannot begin a token and a dangling `::`.
    #[test]
    fn rejects_input_that_is_not_a_module_token() {
        assert_eq!(parse_module_token("1Foo", 0), None);
        assert_eq!(parse_module_token("use Foo;", 3), None);
        assert_eq!(parse_module_token("Foo", 3), None);
        assert_eq!(parse_module_token("", 0), None);
        assert_eq!(parse_module_token("Foo::", 0), None);
    }

    #[test]
    fn detects_standalone_token_boundaries() {
        assert!(has_standalone_module_token_boundaries("use Foo::Bar;", 4, 12));
        assert!(!has_standalone_module_token_boundaries("use Foo::Bar::Extra;", 4, 12));
        assert!(has_standalone_module_token_boundaries("Foo", 0, 3));
        assert!(!has_standalone_module_token_boundaries("xFoo", 1, 4));
    }

    // A `'` only continues a token when an identifier character sits on its
    // far side; a bare quote is a boundary.
    #[test]
    fn treats_quotes_as_boundaries_unless_they_join_identifiers() {
        assert!(has_standalone_module_token_boundaries("'Foo'", 1, 4));
        assert!(!has_standalone_module_token_boundaries("Foo'Bar", 0, 3));
        assert!(!has_standalone_module_token_boundaries("Foo'Bar", 4, 7));
    }

    #[test]
    fn exports_token_character_classes() {
        assert!(is_module_token_char(':'));
        assert!(is_module_token_char('_'));
        assert!(!is_module_token_char('\''));
        assert!(!is_module_token_char('-'));
        assert!(is_module_identifier_char('_'));
        assert!(is_module_identifier_char('7'));
        assert!(!is_module_identifier_char(':'));
    }
}