Skip to main content

perl_lexer/tokenizer/
util.rs

1//! Tokenization utilities shared by parser-facing entry points.
2//!
3//! Helpers in this module identify Perl data-section markers (`__DATA__` and
4//! `__END__`) using lexer tokens, so callers can safely split executable code
5//! from trailing payload without matching markers embedded in strings,
6//! heredocs, or POD content.
7
8/// Find the byte offset of a __DATA__ or __END__ marker in the source text.
9/// Uses the lexer to avoid false positives in heredocs/POD.
10/// Returns the byte offset of the start of the marker, or None if not found.
11pub fn find_data_marker_byte_lexed(s: &str) -> Option<usize> {
12    // Cheap prefilter: avoid constructing the lexer when marker substrings are absent.
13    const MARKERS: [&str; 2] = ["__DATA__", "__END__"];
14    if !MARKERS.iter().any(|marker| s.contains(marker)) {
15        return None;
16    }
17
18    use crate::{PerlLexer, TokenType};
19    let mut lx = PerlLexer::new(s);
20    while let Some(tok) = lx.next_token() {
21        match tok.token_type {
22            TokenType::DataMarker(_) => return Some(tok.start),
23            TokenType::EOF => break,
24            _ => {}
25        }
26    }
27    None
28}
29
30/// Helper to get the code portion of text (before __DATA__/__END__)
31pub fn code_slice(text: &str) -> &str {
32    split_code_and_data(text).0
33}
34
35/// Split source text into executable code and optional trailing data section.
36///
37/// The data section starts at a lexed `__DATA__` or `__END__` marker and includes
38/// the marker line itself.
39pub fn split_code_and_data(text: &str) -> (&str, Option<&str>) {
40    if let Some(marker_start) = find_data_marker_byte_lexed(text) {
41        (&text[..marker_start], Some(&text[marker_start..]))
42    } else {
43        (text, None)
44    }
45}
46
47/// Find the byte offset of a __DATA__ or __END__ marker in the source text.
48/// Returns the byte offset of the start of the marker line, or None if not found.
49#[deprecated(note = "Use find_data_marker_byte_lexed to avoid false positives in heredocs/POD")]
50pub fn find_data_marker_byte(s: &str) -> Option<usize> {
51    find_data_marker_byte_lexed(s)
52}
53
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_find_data_marker_lexed() {
        // Each entry pairs a source snippet with the expected marker offset.
        let cases: [(&str, Option<usize>); 4] = [
            // No marker anywhere in the text.
            ("print 'hello';\n", None),
            // `__DATA__` on its own line after one statement.
            ("print 'hello';\n__DATA__\ndata here", Some(15)),
            // `__END__` on its own line.
            ("code;\n__END__\ndata", Some(6)),
            // Marker text inside a string literal must not be reported.
            ("print '__DATA__';\n", None),
        ];

        for &(source, expected) in &cases {
            assert_eq!(find_data_marker_byte_lexed(source), expected);
        }
    }

    #[test]
    fn test_code_slice() {
        // Each entry pairs a source snippet with the expected code portion.
        let cases: [(&str, &str); 3] = [
            // Without a marker the input comes back untouched.
            ("print 'hello';\n", "print 'hello';\n"),
            // Everything from `__DATA__` onwards is dropped.
            ("print 'hello';\n__DATA__\ndata here", "print 'hello';\n"),
            // Everything from `__END__` onwards is dropped.
            ("code;\n__END__\ndata", "code;\n"),
        ];

        for &(source, expected_code) in &cases {
            assert_eq!(code_slice(source), expected_code);
        }
    }

    #[test]
    fn test_split_code_and_data_prefers_first_marker() {
        // Two markers: the split must happen at the earlier one.
        let source = "print 'a';\n__DATA__\none\n__END__\ntwo";
        let expected = ("print 'a';\n", Some("__DATA__\none\n__END__\ntwo"));
        assert_eq!(split_code_and_data(source), expected);
    }

    #[test]
    fn test_find_data_marker_ignores_markers_inside_heredoc_and_pod() {
        let inputs = [
            // `__DATA__` as the body of a single-quoted heredoc.
            "my $x = <<'TXT';\n__DATA__\nTXT\nprint $x;\n",
            // `__END__` inside a `=pod` ... `=cut` block.
            "=pod\n__END__\n=cut\nprint 'ok';\n",
        ];

        for &source in &inputs {
            assert_eq!(find_data_marker_byte_lexed(source), None);
        }
    }

    #[test]
    fn test_split_code_and_data() {
        let plain = "print 'hello';\n";
        assert_eq!(split_code_and_data(plain), (plain, None));

        // Each entry is (source, expected code, expected data section).
        let marked: [(&str, &str, &str); 2] = [
            ("print 'hello';\n__DATA__\nvalue", "print 'hello';\n", "__DATA__\nvalue"),
            ("code;\n__END__\nvalue", "code;\n", "__END__\nvalue"),
        ];

        for &(source, code, data) in &marked {
            assert_eq!(split_code_and_data(source), (code, Some(data)));
        }
    }

    #[test]
    fn test_find_data_marker_ignores_pod_and_heredoc_content() {
        let inputs = [
            // `__DATA__` inside a `=head1` ... `=cut` block.
            "=head1 NAME\n__DATA__\n=cut\nprint 'done';\n",
            // `__END__` as the body of a double-quoted heredoc.
            "my $text = <<\"TXT\";\n__END__\nTXT\nprint $text;\n",
        ];

        for &source in &inputs {
            assert_eq!(find_data_marker_byte_lexed(source), None);
        }
    }

    #[test]
    fn test_split_code_and_data_prefers_first_lexed_marker() {
        // A later `__END__` sits inside the data section and must be left there.
        let source = "print 'prelude';\n__DATA__\nchunk\n__END__\nignored";
        let (code, data) = split_code_and_data(source);
        assert_eq!(
            (code, data),
            ("print 'prelude';\n", Some("__DATA__\nchunk\n__END__\nignored"))
        );
    }

    #[test]
    // The deprecated wrapper is called on purpose to confirm it still agrees
    // with the lexed helper.
    #[allow(deprecated)]
    fn test_find_data_marker_deprecated_matches_lexed_helper() {
        let source = "say 1;\n__END__\ntrailer";
        let via_wrapper = find_data_marker_byte(source);
        let via_lexed = find_data_marker_byte_lexed(source);
        assert_eq!(via_wrapper, via_lexed);
    }
}