Skip to main content

perl_module/token_core/
mod.rs

1//! Shared primitives for Perl module token parsing and boundary detection.
2
/// Half-open byte range `[start, end)` locating a module token in its source text.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ModuleTokenSpan {
    /// Byte offset of the token's first byte (inclusive).
    pub start: usize,
    /// Byte offset one past the token's last byte (exclusive).
    pub end: usize,
}
11
12/// Parse a module token that starts at `start` in `text`.
13///
14/// A module token is one or more identifier segments separated by either
15/// `::` (canonical) or `'` (legacy) separators.
16#[must_use]
17pub fn parse_module_token(text: &str, start: usize) -> Option<ModuleTokenSpan> {
18    let bytes = text.as_bytes();
19    if start >= bytes.len() || !is_identifier_start(bytes[start]) {
20        return None;
21    }
22
23    let token_start = start;
24    let mut index = parse_identifier_segment(bytes, start)?;
25
26    while let Some(next) = next_separator(bytes, index) {
27        index = match next {
28            Separator::Canonical => index + 2,
29            Separator::Legacy => index + 1,
30        };
31
32        index = parse_identifier_segment(bytes, index)?;
33    }
34
35    Some(ModuleTokenSpan { start: token_start, end: index })
36}
37
38/// Check if a span from `start` to `end` is bounded as a standalone token.
39#[must_use]
40pub fn has_standalone_module_token_boundaries(line: &str, start: usize, end: usize) -> bool {
41    let left_ok = !left_context_is_module_char(line, start);
42    let right_ok = !right_context_is_module_char(line, end);
43
44    left_ok && right_ok
45}
46
/// Report whether `ch` is in the module token character class.
///
/// The class is ASCII letters, ASCII digits, `_`, and `:`. The legacy `'`
/// separator is not part of it.
#[must_use]
pub fn is_module_token_char(ch: char) -> bool {
    matches!(ch, 'A'..='Z' | 'a'..='z' | '0'..='9' | '_' | ':')
}
52
/// Report whether `ch` may appear inside a Perl module identifier segment.
///
/// Accepts ASCII letters, ASCII digits, and `_`; separators are excluded.
#[must_use]
pub fn is_module_identifier_char(ch: char) -> bool {
    matches!(ch, 'A'..='Z' | 'a'..='z' | '0'..='9' | '_')
}
58
/// Kind of separator found between two identifier segments of a module token.
#[derive(Clone, Copy, Debug)]
enum Separator {
    /// The two-byte `::` separator.
    Canonical,
    /// The one-byte `'` separator.
    Legacy,
}
64
65fn next_separator(bytes: &[u8], index: usize) -> Option<Separator> {
66    if text_starts_with(bytes, index, "::") {
67        return Some(Separator::Canonical);
68    }
69
70    if index < bytes.len() && bytes[index] == b'\'' {
71        return Some(Separator::Legacy);
72    }
73
74    None
75}
76
77fn parse_identifier_segment(bytes: &[u8], start: usize) -> Option<usize> {
78    if start >= bytes.len() || !is_identifier_start(bytes[start]) {
79        return None;
80    }
81
82    let mut index = start + 1;
83    while index < bytes.len() && is_identifier_byte(bytes[index]) {
84        index += 1;
85    }
86
87    Some(index)
88}
89
/// Report whether `byte` may begin an identifier segment: an ASCII letter or `_`.
fn is_identifier_start(byte: u8) -> bool {
    matches!(byte, b'A'..=b'Z' | b'a'..=b'z' | b'_')
}
93
/// Report whether `byte` may continue an identifier segment: an ASCII letter,
/// an ASCII digit, or `_`.
fn is_identifier_byte(byte: u8) -> bool {
    matches!(byte, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_')
}
97
98fn left_context_is_module_char(line: &str, start: usize) -> bool {
99    if start == 0 {
100        return false;
101    }
102
103    let mut left = line[..start].char_indices();
104    let Some((left_idx, ch)) = left.next_back() else {
105        return false;
106    };
107
108    if ch != '\'' {
109        return is_module_token_char(ch);
110    }
111
112    if left_idx == 0 {
113        return false;
114    }
115
116    line[..left_idx].chars().next_back().is_some_and(is_module_identifier_char)
117}
118
119fn right_context_is_module_char(line: &str, end: usize) -> bool {
120    if end >= line.len() {
121        return false;
122    }
123
124    let mut right = line[end..].chars();
125    let Some(ch) = right.next() else {
126        return false;
127    };
128
129    if ch != '\'' {
130        return is_module_token_char(ch);
131    }
132
133    right.next().is_some_and(is_module_identifier_char)
134}
135
/// Report whether `needle` occurs in `bytes` beginning exactly at byte offset
/// `start`.
///
/// Returns `false` when `start` lies beyond the end of `bytes` or when fewer
/// than `needle.len()` bytes remain from `start`.
fn text_starts_with(bytes: &[u8], start: usize, needle: &str) -> bool {
    // Checked slicing avoids computing `start + needle.len()`, which can
    // overflow for very large `start` values.
    bytes
        .get(start..)
        .is_some_and(|rest| rest.starts_with(needle.as_bytes()))
}