Skip to main content

perl_lexer/tokenizer/
util.rs

1//! Tokenization utilities shared by parser-facing entry points.
2//!
3//! Helpers in this module identify Perl data-section markers (`__DATA__` and
4//! `__END__`) using lexer tokens, so callers can safely split executable code
5//! from trailing payload without matching markers embedded in strings,
6//! heredocs, or POD content.
7
/// Marker keywords that can introduce a Perl data section.
const DATA_SECTION_MARKERS: [&str; 2] = ["__DATA__", "__END__"];

/// Cheap substring prefilter for data-section markers.
///
/// Returns `true` when `text` contains either marker string anywhere,
/// regardless of lexical context. A `true` result is therefore only a hint
/// that a marker *may* be present; `false` guarantees there is none.
fn may_contain_data_section_marker(text: &str) -> bool {
    for marker in &DATA_SECTION_MARKERS {
        if text.contains(*marker) {
            return true;
        }
    }
    false
}
13
/// Reports whether the marker at byte offset `marker_start` begins at column
/// zero of its line.
///
/// That is the case when the marker is the very first thing in `source`, or
/// when the character immediately before it is a line terminator (`\n` or
/// `\r`). Any other preceding character, including indentation, yields `false`.
///
/// `marker_start` is used to slice `source`, so it must be an in-bounds offset
/// on a character boundary; otherwise the slice panics.
fn marker_is_unindented_line_start(source: &str, marker_start: usize) -> bool {
    match source[..marker_start].chars().next_back() {
        // Nothing precedes the marker: it opens the source text.
        None => true,
        Some(previous) => previous == '\n' || previous == '\r',
    }
}
18
19/// Find the byte offset of a __DATA__ or __END__ marker in the source text.
20/// Uses the lexer to avoid false positives in heredocs/POD.
21/// Returns the byte offset of the start of the marker, or None if not found.
22pub fn find_data_marker_byte_lexed(s: &str) -> Option<usize> {
23    // Cheap prefilter: avoid constructing the lexer when marker substrings are absent.
24    if !may_contain_data_section_marker(s) {
25        return None;
26    }
27
28    use crate::{PerlLexer, TokenType};
29    let mut lx = PerlLexer::new(s);
30    while let Some(tok) = lx.next_token() {
31        match tok.token_type {
32            TokenType::DataMarker(_) if marker_is_unindented_line_start(s, tok.start) => {
33                return Some(tok.start);
34            }
35            TokenType::EOF => break,
36            _ => {}
37        }
38    }
39    None
40}
41
42/// Helper to get the code portion of text (before __DATA__/__END__)
43pub fn code_slice(text: &str) -> &str {
44    split_code_and_data(text).0
45}
46
47/// Split source text into executable code and optional trailing data section.
48///
49/// The data section starts at a lexed `__DATA__` or `__END__` marker and includes
50/// the marker line itself.
51pub fn split_code_and_data(text: &str) -> (&str, Option<&str>) {
52    if !may_contain_data_section_marker(text) {
53        return (text, None);
54    }
55
56    if let Some(marker_start) = find_data_marker_byte_lexed(text) {
57        (&text[..marker_start], Some(&text[marker_start..]))
58    } else {
59        (text, None)
60    }
61}
62
63/// Find the byte offset of a __DATA__ or __END__ marker in the source text.
64/// Returns the byte offset of the start of the marker line, or None if not found.
65#[deprecated(note = "Use find_data_marker_byte_lexed to avoid false positives in heredocs/POD")]
66pub fn find_data_marker_byte(s: &str) -> Option<usize> {
67    find_data_marker_byte_lexed(s)
68}
69
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_find_data_marker_lexed() {
        // Source that contains no marker text at all.
        assert_eq!(find_data_marker_byte_lexed("print 'hello';\n"), None);

        // (source, expected marker offset)
        let cases: [(&str, Option<usize>); 3] = [
            // `__DATA__` on its own line.
            ("print 'hello';\n__DATA__\ndata here", Some(15)),
            // `__END__` on its own line.
            ("code;\n__END__\ndata", Some(6)),
            // Marker text that is not at the start of a line must not match.
            ("print '__DATA__';\n", None),
        ];
        for &(source, expected) in cases.iter() {
            assert_eq!(find_data_marker_byte_lexed(source), expected, "source: {:?}", source);
        }
    }

    #[test]
    fn test_may_contain_data_section_marker_prefilter() {
        for &hit in ["__DATA__", "prefix __END__ suffix"].iter() {
            assert!(may_contain_data_section_marker(hit), "text: {:?}", hit);
        }
        assert!(!may_contain_data_section_marker("print 'hello';\n"));
    }

    #[test]
    fn test_find_data_marker_handles_crlf_and_leading_whitespace() {
        // A CRLF terminator before the marker contributes two bytes to the offset.
        let with_crlf = find_data_marker_byte_lexed("print 'hello';\r\n__DATA__\r\nvalue");
        assert_eq!(with_crlf, Some(16));

        // An indented marker is not reported.
        let with_indent = find_data_marker_byte_lexed("print 'hello';\n  __DATA__\nvalue");
        assert_eq!(with_indent, None);
    }

    #[test]
    fn test_split_code_and_data_handles_crlf_line_endings() {
        let (code, data) = split_code_and_data("print 'ok';\r\n__DATA__\r\nvalue");
        assert_eq!(code, "print 'ok';\r\n");
        assert_eq!(data, Some("__DATA__\r\nvalue"));
    }

    #[test]
    fn test_find_data_marker_ignores_marker_inside_regex_literal() {
        let source = "my $re = qr/__DATA__/;\nprint 'ok';\n";
        let found = find_data_marker_byte_lexed(source);
        assert_eq!(found, None);
    }

    #[test]
    fn test_find_data_marker_with_cr_only_line_endings() {
        let source = "print 'hello';\r__DATA__\rpayload";

        let offset = find_data_marker_byte_lexed(source);
        assert_eq!(offset, Some(15));

        let (code, data) = split_code_and_data(source);
        assert_eq!(code, "print 'hello';\r");
        assert_eq!(data, Some("__DATA__\rpayload"));
    }

    #[test]
    fn test_code_slice() {
        // (source, expected code portion)
        let cases: [(&str, &str); 3] = [
            // Without a marker the full text is returned.
            ("print 'hello';\n", "print 'hello';\n"),
            // Cut at `__DATA__`.
            ("print 'hello';\n__DATA__\ndata here", "print 'hello';\n"),
            // Cut at `__END__`.
            ("code;\n__END__\ndata", "code;\n"),
        ];
        for &(source, expected) in cases.iter() {
            assert_eq!(code_slice(source), expected, "source: {:?}", source);
        }
    }

    #[test]
    fn test_split_code_and_data_prefers_first_marker() {
        let (code, data) = split_code_and_data("print 'a';\n__DATA__\none\n__END__\ntwo");
        assert_eq!(code, "print 'a';\n");
        assert_eq!(data, Some("__DATA__\none\n__END__\ntwo"));
    }

    #[test]
    fn test_find_data_marker_ignores_markers_inside_heredoc_and_pod() {
        let sources = [
            // Marker text inside a single-quoted heredoc body.
            "my $x = <<'TXT';\n__DATA__\nTXT\nprint $x;\n",
            // Marker text inside a POD block.
            "=pod\n__END__\n=cut\nprint 'ok';\n",
        ];
        for &source in sources.iter() {
            assert_eq!(find_data_marker_byte_lexed(source), None, "source: {:?}", source);
        }
    }

    #[test]
    fn test_split_code_and_data() {
        let plain = "print 'hello';\n";
        assert_eq!(split_code_and_data(plain), (plain, None));

        // (source, expected code, expected data section)
        let cases: [(&str, &str, &str); 2] = [
            ("print 'hello';\n__DATA__\nvalue", "print 'hello';\n", "__DATA__\nvalue"),
            ("code;\n__END__\nvalue", "code;\n", "__END__\nvalue"),
        ];
        for &(source, code, data) in cases.iter() {
            assert_eq!(split_code_and_data(source), (code, Some(data)), "source: {:?}", source);
        }
    }

    #[test]
    fn test_find_data_marker_ignores_pod_and_heredoc_content() {
        let sources = [
            // Marker text inside POD opened with `=head1`.
            "=head1 NAME\n__DATA__\n=cut\nprint 'done';\n",
            // Marker text inside a double-quoted heredoc body.
            "my $text = <<\"TXT\";\n__END__\nTXT\nprint $text;\n",
        ];
        for &source in sources.iter() {
            assert_eq!(find_data_marker_byte_lexed(source), None, "source: {:?}", source);
        }
    }

    #[test]
    fn test_split_code_and_data_prefers_first_lexed_marker() {
        let source = "print 'prelude';\n__DATA__\nchunk\n__END__\nignored";
        let (code, data) = split_code_and_data(source);
        assert_eq!(code, "print 'prelude';\n");
        assert_eq!(data, Some("__DATA__\nchunk\n__END__\nignored"));
    }

    #[test]
    fn test_find_data_marker_uses_byte_offsets_with_unicode_prefix() {
        // The emoji is four bytes long, so the byte offset exceeds the char count.
        let source = "say '\u{1F600}';\n__DATA__\npayload";
        assert_eq!(find_data_marker_byte_lexed(source), Some(12));

        let (code, data) = split_code_and_data(source);
        assert_eq!(code, "say '\u{1F600}';\n");
        assert_eq!(data, Some("__DATA__\npayload"));
    }

    #[test]
    // The deprecated wrapper is called on purpose to check it matches the lexed helper.
    #[allow(deprecated)]
    fn test_find_data_marker_deprecated_matches_lexed_helper() {
        let source = "say 1;\n__END__\ntrailer";
        let via_wrapper = find_data_marker_byte(source);
        let via_lexed = find_data_marker_byte_lexed(source);
        assert_eq!(via_wrapper, via_lexed);
    }
}