/// Literal spellings of the tokens that open a data section.
const DATA_SECTION_MARKERS: [&str; 2] = ["__DATA__", "__END__"];

/// Cheap substring prefilter: reports whether `text` contains any of
/// [`DATA_SECTION_MARKERS`] anywhere at all.
///
/// This is a plain substring test, so a `true` result only means a marker
/// *might* be present; a `false` result means there is definitely none.
fn may_contain_data_section_marker(text: &str) -> bool {
    for candidate in &DATA_SECTION_MARKERS {
        if text.contains(*candidate) {
            return true;
        }
    }
    false
}
/// Reports whether byte offset `marker_start` is the first column of a line in `source`.
///
/// An offset counts as a line start when it is `0` or when the byte immediately before it is
/// a line-feed or carriage-return, so LF, CRLF and CR-only line endings are all accepted. Any
/// other preceding byte (including indentation) yields `false`.
///
/// The check looks at a single byte through a bounds-checked access, so an offset that lies
/// past the end of `source` or inside a multi-byte character returns `false` instead of
/// panicking.
fn marker_is_unindented_line_start(source: &str, marker_start: usize) -> bool {
    match marker_start.checked_sub(1) {
        // Offset 0 is the start of the first line.
        None => true,
        // `\n` and `\r` are single ASCII bytes that never occur inside a multi-byte UTF-8
        // sequence, so comparing raw bytes is exact.
        Some(prev) => matches!(source.as_bytes().get(prev), Some(b'\n' | b'\r')),
    }
}
/// Finds the byte offset at which the data section of `s` begins, using the lexer.
///
/// The source is tokenized with `PerlLexer`, and the start offset of the first `DataMarker`
/// token that sits at the very beginning of a line (no indentation) is returned.
///
/// Returns `None` when:
/// - the substring prefilter finds neither marker spelling in `s` (the lexer is not run),
/// - the lexer yields no further tokens, or
/// - an `EOF` token is reached before a qualifying marker.
pub fn find_data_marker_byte_lexed(s: &str) -> Option<usize> {
    use crate::{PerlLexer, TokenType};

    // Skip tokenizing entirely when the marker text cannot possibly be present.
    if !may_contain_data_section_marker(s) {
        return None;
    }

    let mut lexer = PerlLexer::new(s);
    loop {
        // A `None` from the lexer ends the search with no result.
        let token = lexer.next_token()?;
        match token.token_type {
            TokenType::EOF => return None,
            TokenType::DataMarker(_) if marker_is_unindented_line_start(s, token.start) => {
                return Some(token.start);
            }
            // Any other token, or an indented marker token, is skipped.
            _ => continue,
        }
    }
}
/// Returns only the code portion of `text`.
///
/// This is the first element of [`split_code_and_data`]: everything before the data-section
/// marker, or all of `text` when no marker is found.
pub fn code_slice(text: &str) -> &str {
    let (code, _data_section) = split_code_and_data(text);
    code
}
/// Splits `text` into its code part and, if present, its data section.
///
/// Returns `(code, Some(data))` when a data-section marker is found, where `code` is everything
/// before the marker and `data` starts at the marker itself (the marker line is included).
/// Returns `(text, None)` when there is no marker.
pub fn split_code_and_data(text: &str) -> (&str, Option<&str>) {
    // Only consult the lexer-based search when the cheap substring check says it is worthwhile.
    let marker_start = if may_contain_data_section_marker(text) {
        find_data_marker_byte_lexed(text)
    } else {
        None
    };

    match marker_start {
        Some(offset) => {
            let (code, data) = text.split_at(offset);
            (code, Some(data))
        }
        None => (text, None),
    }
}
/// Deprecated alias for [`find_data_marker_byte_lexed`].
///
/// Forwards `s` unchanged and returns exactly what the lexer-based search returns, so the two
/// functions always agree. Presumably retained for backward compatibility with older callers.
#[deprecated(note = "Use find_data_marker_byte_lexed to avoid false positives in heredocs/POD")]
pub fn find_data_marker_byte(s: &str) -> Option<usize> {
find_data_marker_byte_lexed(s)
}
#[cfg(test)]
mod tests {
    //! Unit tests for data-section marker detection and code/data splitting.

    use super::*;

    /// Basic lookups: no marker, `__DATA__`, `__END__`, and marker text inside a string literal.
    #[test]
    fn test_find_data_marker_lexed() {
        assert_eq!(find_data_marker_byte_lexed("print 'hello';\n"), None);

        let with_data = "print 'hello';\n__DATA__\ndata here";
        assert_eq!(find_data_marker_byte_lexed(with_data), Some(15));

        let with_end = "code;\n__END__\ndata";
        assert_eq!(find_data_marker_byte_lexed(with_end), Some(6));

        // Marker text inside a quoted string must not be reported.
        let quoted = "print '__DATA__';\n";
        assert_eq!(find_data_marker_byte_lexed(quoted), None);
    }

    /// The substring prefilter accepts any occurrence of a marker and rejects marker-free text.
    #[test]
    fn test_may_contain_data_section_marker_prefilter() {
        for hit in ["__DATA__", "prefix __END__ suffix"] {
            assert!(may_contain_data_section_marker(hit));
        }
        assert!(!may_contain_data_section_marker("print 'hello';\n"));
    }

    /// CRLF line endings are accepted; an indented marker is not.
    #[test]
    fn test_find_data_marker_handles_crlf_and_leading_whitespace() {
        let windows_newlines = "print 'hello';\r\n__DATA__\r\nvalue";
        assert_eq!(find_data_marker_byte_lexed(windows_newlines), Some(16));

        let indented = "print 'hello';\n __DATA__\nvalue";
        assert_eq!(find_data_marker_byte_lexed(indented), None);
    }

    /// Splitting keeps the CRLF terminator with the code part.
    #[test]
    fn test_split_code_and_data_handles_crlf_line_endings() {
        let (code, data) = split_code_and_data("print 'ok';\r\n__DATA__\r\nvalue");
        assert_eq!(code, "print 'ok';\r\n");
        assert_eq!(data, Some("__DATA__\r\nvalue"));
    }

    /// Marker text inside a `qr//` regex literal is not a data marker.
    #[test]
    fn test_find_data_marker_ignores_marker_inside_regex_literal() {
        let source = "my $re = qr/__DATA__/;\nprint 'ok';\n";
        assert_eq!(find_data_marker_byte_lexed(source), None);
    }

    /// Bare carriage returns also count as line terminators.
    #[test]
    fn test_find_data_marker_with_cr_only_line_endings() {
        let source = "print 'hello';\r__DATA__\rpayload";
        assert_eq!(find_data_marker_byte_lexed(source), Some(15));

        let (code, data) = split_code_and_data(source);
        assert_eq!(code, "print 'hello';\r");
        assert_eq!(data, Some("__DATA__\rpayload"));
    }

    /// `code_slice` returns the whole input without a marker and the prefix with one.
    #[test]
    fn test_code_slice() {
        assert_eq!(code_slice("print 'hello';\n"), "print 'hello';\n");

        let with_data = "print 'hello';\n__DATA__\ndata here";
        assert_eq!(code_slice(with_data), "print 'hello';\n");

        let with_end = "code;\n__END__\ndata";
        assert_eq!(code_slice(with_end), "code;\n");
    }

    /// When both markers appear, the split happens at the earlier one.
    #[test]
    fn test_split_code_and_data_prefers_first_marker() {
        let (code, data) = split_code_and_data("print 'a';\n__DATA__\none\n__END__\ntwo");
        assert_eq!(code, "print 'a';\n");
        assert_eq!(data, Some("__DATA__\none\n__END__\ntwo"));
    }

    /// Markers inside a single-quoted heredoc body or a `=pod` block are ignored.
    #[test]
    fn test_find_data_marker_ignores_markers_inside_heredoc_and_pod() {
        let heredoc_source = "my $x = <<'TXT';\n__DATA__\nTXT\nprint $x;\n";
        assert_eq!(find_data_marker_byte_lexed(heredoc_source), None);

        let pod_source = "=pod\n__END__\n=cut\nprint 'ok';\n";
        assert_eq!(find_data_marker_byte_lexed(pod_source), None);
    }

    /// Tuple-returning split: no marker, `__DATA__`, and `__END__`.
    #[test]
    fn test_split_code_and_data() {
        let plain = "print 'hello';\n";
        let (code, data) = split_code_and_data(plain);
        assert_eq!(code, plain);
        assert_eq!(data, None);

        let (code, data) = split_code_and_data("print 'hello';\n__DATA__\nvalue");
        assert_eq!(code, "print 'hello';\n");
        assert_eq!(data, Some("__DATA__\nvalue"));

        let (code, data) = split_code_and_data("code;\n__END__\nvalue");
        assert_eq!(code, "code;\n");
        assert_eq!(data, Some("__END__\nvalue"));
    }

    /// Markers inside a `=head1` POD block or a double-quoted heredoc body are ignored.
    #[test]
    fn test_find_data_marker_ignores_pod_and_heredoc_content() {
        let pod_source = "=head1 NAME\n__DATA__\n=cut\nprint 'done';\n";
        assert_eq!(find_data_marker_byte_lexed(pod_source), None);

        let heredoc_source = "my $text = <<\"TXT\";\n__END__\nTXT\nprint $text;\n";
        assert_eq!(find_data_marker_byte_lexed(heredoc_source), None);
    }

    /// A later `__END__` stays inside the data section opened by an earlier `__DATA__`.
    #[test]
    fn test_split_code_and_data_prefers_first_lexed_marker() {
        let source = "print 'prelude';\n__DATA__\nchunk\n__END__\nignored";
        let expected_code = "print 'prelude';\n";
        let expected_data = Some("__DATA__\nchunk\n__END__\nignored");
        assert_eq!(split_code_and_data(source), (expected_code, expected_data));
    }

    /// Offsets are in bytes: the 4-byte emoji moves the marker to offset 12.
    #[test]
    fn test_find_data_marker_uses_byte_offsets_with_unicode_prefix() {
        let source = "say '\u{1F600}';\n__DATA__\npayload";
        assert_eq!(find_data_marker_byte_lexed(source), Some(12));

        let (code, data) = split_code_and_data(source);
        assert_eq!(code, "say '\u{1F600}';\n");
        assert_eq!(data, Some("__DATA__\npayload"));
    }

    /// The deprecated entry point returns the same result as the lexer-based helper.
    #[test]
    #[allow(deprecated)]
    fn test_find_data_marker_deprecated_matches_lexed_helper() {
        let source = "say 1;\n__END__\ntrailer";
        let via_deprecated = find_data_marker_byte(source);
        let via_lexed = find_data_marker_byte_lexed(source);
        assert_eq!(via_deprecated, via_lexed);
    }
}