use crate::ast::{Node, NodeKind};
use perl_semantic_facts::{
AnchorFact, AnchorId, Confidence, EntityFact, EntityId, EntityKind, FileId, OccurrenceFact,
OccurrenceId, OccurrenceKind, Provenance,
};
/// Collects dynamic-boundary facts for `sub` declarations found inside
/// string-argument `eval` nodes of the tree rooted at `ast`.
///
/// The tree is traversed by [`walk`]; every match contributes one
/// `(EntityFact, AnchorFact, OccurrenceFact)` triple, in traversal order.
/// `file_id` is attached to the produced anchors and mixed into the generated
/// ids. An empty vector is returned when nothing matches.
pub fn extract_eval_sub_boundaries(
    ast: &Node,
    file_id: FileId,
) -> Vec<(EntityFact, AnchorFact, OccurrenceFact)> {
    let mut triples = Vec::new();
    walk(ast, file_id, &mut triples);
    triples
}
/// Depth-first traversal that feeds every string-argument `eval` to
/// [`extract_from_eval_string`].
///
/// * For an `Eval` node whose `block` is a `String` node, the string's value
///   is scanned and any facts are anchored to the span of the `eval` node
///   itself (`node.location`), not to the string.
/// * For any `Eval` node, only `block` is descended into; `node.children()`
///   is not consulted for that node.
/// * Every other node simply recurses into each of its children.
fn walk(node: &Node, file_id: FileId, out: &mut Vec<(EntityFact, AnchorFact, OccurrenceFact)>) {
    match &node.kind {
        NodeKind::Eval { block } => {
            if let NodeKind::String { value, .. } = &block.kind {
                let span = &node.location;
                extract_from_eval_string(value, span.start, span.end, file_id, out);
            }
            // The evaluated operand may itself contain nested evals.
            walk(block, file_id, out);
        }
        _ => {
            for child in node.children() {
                walk(child, file_id, out);
            }
        }
    }
}
/// Scans the text of a string `eval` for named `sub` declarations and pushes
/// one fact triple per plausible declaration onto `out`.
///
/// This is a textual heuristic, not a parse. A candidate is accepted when:
///
/// 1. [`find_sub_keyword`] locates a standalone `sub` keyword,
/// 2. the token after the following ASCII whitespace is not `{` (anonymous
///    sub) and does not start with a sigil (`$ @ % & *`, i.e. an interpolated
///    or dynamic name),
/// 3. that token is a run of ASCII alphanumerics/underscores whose first byte
///    is a letter or underscore, and
/// 4. the next non-whitespace character is `{`, `;` or `(`.
///
/// # Parameters
/// * `eval_string` - the string value; surrounding `"` and `'` characters are
///   trimmed from both ends before scanning.
/// * `node_start_byte` / `node_end_byte` - span forwarded unchanged to
///   [`emit_triple`] for every declaration found.
/// * `file_id` - forwarded to [`emit_triple`].
/// * `out` - accumulator the triples are appended to.
fn extract_from_eval_string(
    eval_string: &str,
    node_start_byte: usize,
    node_end_byte: usize,
    file_id: FileId,
    out: &mut Vec<(EntityFact, AnchorFact, OccurrenceFact)>,
) {
    /// Byte length of the `sub` keyword.
    const KEYWORD_LEN: usize = 3;

    let content = eval_string
        .trim_start_matches('"')
        .trim_end_matches('"')
        .trim_start_matches('\'')
        .trim_end_matches('\'');

    let mut search = content;
    while !search.is_empty() {
        let Some(sub_pos) = find_sub_keyword(search) else {
            break;
        };

        let after_sub = &search[sub_pos + KEYWORD_LEN..];
        let after_ws = after_sub.trim_start_matches(|c: char| c.is_ascii_whitespace());
        let ws_len = after_sub.len() - after_ws.len();

        // Anonymous sub (`sub {`) or a sigil-prefixed/interpolated name: there
        // is no static name to record, so skip past the keyword and go on.
        if after_ws.starts_with(['{', '$', '@', '%', '&', '*']) {
            let advance = sub_pos + KEYWORD_LEN + ws_len.max(1);
            if advance >= search.len() {
                break;
            }
            search = &search[advance..];
            continue;
        }

        let name_len = after_ws
            .find(|c: char| !c.is_ascii_alphanumeric() && c != '_')
            .unwrap_or(after_ws.len());

        if name_len > 0 {
            let name = &after_ws[..name_len];
            let starts_like_identifier =
                name.as_bytes().first().is_some_and(|&b| b.is_ascii_alphabetic() || b == b'_');
            if starts_like_identifier {
                // Require a body, forward-declaration terminator, or
                // prototype/signature so prose such as "sub here really" is
                // not mistaken for a declaration.
                let after_name = after_ws[name_len..].trim_start();
                if after_name.starts_with(['{', ';', '(']) {
                    emit_triple(name, node_start_byte, node_end_byte, file_id, out);
                }
            }
        }

        // Step past the name. When there is no name, step past exactly one
        // *character*: the offending character may be multi-byte (e.g.
        // `sub é`), and advancing by a single byte would put the cursor inside
        // a UTF-8 sequence and make the slice below panic.
        let step = if name_len > 0 {
            name_len
        } else {
            after_ws.chars().next().map_or(1, char::len_utf8)
        };
        let advance = sub_pos + KEYWORD_LEN + ws_len + step;
        if advance >= search.len() {
            break;
        }
        search = &search[advance..];
    }
}
/// Returns the byte offset of the first standalone `sub` keyword in `text`.
///
/// An occurrence of `sub` counts as standalone when:
/// * it is at the start of `text`, or the byte before it is neither an ASCII
///   alphanumeric nor `_`; and
/// * it is at the end of `text`, or the byte after it is ASCII whitespace.
///
/// Returns `None` when no occurrence satisfies both conditions.
fn find_sub_keyword(text: &str) -> Option<usize> {
    const KEYWORD: &str = "sub";

    let bytes = text.as_bytes();
    let is_ident_byte = |b: u8| b.is_ascii_alphanumeric() || b == b'_';

    text.match_indices(KEYWORD).map(|(offset, _)| offset).find(|&offset| {
        let left_ok = match offset.checked_sub(1) {
            Some(prev) => !is_ident_byte(bytes[prev]),
            None => true,
        };
        let right_ok = match bytes.get(offset + KEYWORD.len()) {
            Some(next) => next.is_ascii_whitespace(),
            None => true,
        };
        left_ok && right_ok
    })
}
/// Appends one `(EntityFact, AnchorFact, OccurrenceFact)` triple describing a
/// subroutine named `name` to `out`.
///
/// The three ids are consecutive: the entity takes the value returned by
/// [`stable_id`], the anchor that value plus one, the occurrence that value
/// plus two. All three facts carry `Provenance::DynamicBoundary`,
/// `Confidence::Low` and no scope.
///
/// The anchor spans `node_start_byte..node_end_byte`; when the end does not
/// lie strictly after the start, the span is widened to a single byte.
/// Offsets are narrowed to `u32` with `as`.
fn emit_triple(
    name: &str,
    node_start_byte: usize,
    node_end_byte: usize,
    file_id: FileId,
    out: &mut Vec<(EntityFact, AnchorFact, OccurrenceFact)>,
) {
    let seed = stable_id(file_id.0, node_start_byte as u64, name);
    let entity_id = EntityId(seed);
    let anchor_id = AnchorId(seed + 1);
    let occurrence_id = OccurrenceId(seed + 2);

    // Guarantee a non-empty span even for a degenerate node location.
    let span_end = node_end_byte.max(node_start_byte + 1);

    out.push((
        EntityFact {
            id: entity_id,
            canonical_name: name.to_string(),
            kind: EntityKind::Subroutine,
            anchor_id: Some(anchor_id),
            scope_id: None,
            provenance: Provenance::DynamicBoundary,
            confidence: Confidence::Low,
        },
        AnchorFact {
            id: anchor_id,
            file_id,
            span_start_byte: node_start_byte as u32,
            span_end_byte: span_end as u32,
            scope_id: None,
            provenance: Provenance::DynamicBoundary,
            confidence: Confidence::Low,
        },
        OccurrenceFact {
            id: occurrence_id,
            kind: OccurrenceKind::DynamicBoundary,
            entity_id: Some(entity_id),
            anchor_id,
            scope_id: None,
            provenance: Provenance::DynamicBoundary,
            confidence: Confidence::Low,
        },
    ));
}
/// Derives a deterministic 64-bit id from a file id, a node start offset and
/// a name.
///
/// The inputs are hashed FNV-1a style (xor each byte, then multiply by the
/// prime) in this order: `file_id` as 8 little-endian bytes, `node_start` as
/// 8 little-endian bytes, then the UTF-8 bytes of `name`. The digest is
/// shifted left by three bits and added (wrapping) to `0xE_0000_0000`, so the
/// low three bits of the result are always zero.
fn stable_id(file_id: u64, node_start: u64, name: &str) -> u64 {
    const FNV_OFFSET: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
    const ID_BASE: u64 = 0xE_0000_0000;

    let file_bytes = file_id.to_le_bytes();
    let start_bytes = node_start.to_le_bytes();

    let digest = file_bytes
        .iter()
        .chain(start_bytes.iter())
        .chain(name.as_bytes())
        .fold(FNV_OFFSET, |acc, &byte| (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME));

    ID_BASE.wrapping_add(digest.wrapping_shl(3))
}
#[cfg(test)]
mod tests {
    use super::*;
    use perl_semantic_facts::FileId;

    /// The keyword is found at the start, after leading whitespace, and in the
    /// middle of surrounding text.
    #[test]
    fn find_sub_keyword_basic() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(find_sub_keyword("sub foo { 1 }"), Some(0));
        // One leading space puts the keyword at byte offset 1.
        assert_eq!(find_sub_keyword(" sub bar { }"), Some(1));
        assert_eq!(find_sub_keyword("no sub here really sub baz"), Some(3));
        Ok(())
    }

    /// `sub` embedded in a longer word is not a keyword; the first of two
    /// consecutive keywords wins.
    #[test]
    fn find_sub_keyword_rejects_suburb() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(find_sub_keyword("suburb"), None);
        assert_eq!(find_sub_keyword("sub sub foo"), Some(0));
        Ok(())
    }

    /// Text without the keyword, including the empty string, yields `None`.
    #[test]
    fn find_sub_keyword_none_when_absent() -> Result<(), Box<dyn std::error::Error>> {
        assert_eq!(find_sub_keyword("hello world"), None);
        assert_eq!(find_sub_keyword(""), None);
        Ok(())
    }

    /// Parses `code` and runs the extractor over the resulting AST.
    ///
    /// NOTE(review): a parse failure is reported as an empty result, so the
    /// negative tests below cannot tell "no evidence" apart from "input failed
    /// to parse". Consider surfacing the parse error instead.
    fn parse_and_extract(
        code: &str,
        file_id: FileId,
    ) -> Vec<(EntityFact, AnchorFact, OccurrenceFact)> {
        let mut parser = crate::Parser::new(code);
        let ast = match parser.parse() {
            Ok(a) => a,
            Err(_) => return vec![],
        };
        extract_eval_sub_boundaries(&ast, file_id)
    }

    /// A single named sub in a string eval yields one low-confidence
    /// dynamic-boundary triple whose occurrence points at the entity.
    #[test]
    fn extracts_single_sub_from_eval_string() -> Result<(), Box<dyn std::error::Error>> {
        let file_id = FileId(1);
        let triples = parse_and_extract(r#"eval "sub generated_from_string { 1 }";"#, file_id);
        assert_eq!(triples.len(), 1, "should extract exactly one sub");
        let (entity, _anchor, occurrence) = &triples[0];
        assert_eq!(entity.canonical_name, "generated_from_string");
        assert_eq!(entity.kind, EntityKind::Subroutine);
        assert_eq!(entity.provenance, Provenance::DynamicBoundary);
        assert_eq!(entity.confidence, Confidence::Low);
        assert_eq!(occurrence.kind, OccurrenceKind::DynamicBoundary);
        assert_eq!(occurrence.entity_id, Some(entity.id));
        Ok(())
    }

    /// Every named sub in the same eval string is reported.
    #[test]
    fn extracts_multiple_subs_from_eval_string() -> Result<(), Box<dyn std::error::Error>> {
        let file_id = FileId(2);
        let triples = parse_and_extract(r#"eval "sub foo { 1 } sub bar { 2 }";"#, file_id);
        assert_eq!(triples.len(), 2, "should extract two subs");
        let names: Vec<&str> = triples.iter().map(|(e, _, _)| e.canonical_name.as_str()).collect();
        assert!(names.contains(&"foo"), "should include 'foo'");
        assert!(names.contains(&"bar"), "should include 'bar'");
        Ok(())
    }

    /// `eval $code` has no literal text to scan.
    #[test]
    fn non_literal_eval_does_not_produce_evidence() -> Result<(), Box<dyn std::error::Error>> {
        let file_id = FileId(3);
        let triples = parse_and_extract(r#"eval $code;"#, file_id);
        assert!(triples.is_empty(), "non-literal eval must not produce evidence");
        Ok(())
    }

    /// `eval { ... }` is a block, not a string.
    #[test]
    fn eval_block_does_not_produce_evidence() -> Result<(), Box<dyn std::error::Error>> {
        let file_id = FileId(4);
        let triples = parse_and_extract(r#"eval { die "oops" };"#, file_id);
        assert!(triples.is_empty(), "block eval must not produce evidence");
        Ok(())
    }

    /// An anonymous sub has no name to record.
    #[test]
    fn anonymous_sub_in_eval_does_not_produce_named_evidence()
    -> Result<(), Box<dyn std::error::Error>> {
        let file_id = FileId(5);
        let triples = parse_and_extract(r#"eval "sub { 1 }";"#, file_id);
        assert!(triples.is_empty(), "anonymous sub in eval must not produce named evidence");
        Ok(())
    }

    /// The word `sub` in prose, with no `{`, `;` or `(` after the following
    /// word, is ignored.
    #[test]
    fn prose_sub_in_eval_does_not_produce_evidence() -> Result<(), Box<dyn std::error::Error>> {
        let file_id = FileId(6);
        let triples = {
            let mut out = Vec::new();
            extract_from_eval_string("no sub here really sub baz", 0, 26, file_id, &mut out);
            out
        };
        assert!(
            triples.is_empty(),
            "prose containing 'sub' without delimiters must not produce evidence, got: {:?}",
            triples.iter().map(|(e, _, _)| e.canonical_name.as_str()).collect::<Vec<_>>()
        );
        Ok(())
    }

    /// A forward declaration (`sub NAME;`) counts as a declaration.
    #[test]
    fn sub_with_semicolon_delimiter_is_accepted() -> Result<(), Box<dyn std::error::Error>> {
        let file_id = FileId(7);
        let triples = {
            let mut out = Vec::new();
            extract_from_eval_string("sub forward_decl;", 0, 18, file_id, &mut out);
            out
        };
        assert_eq!(triples.len(), 1, "sub NAME; (forward decl) should produce evidence");
        assert_eq!(triples[0].0.canonical_name, "forward_decl");
        Ok(())
    }

    /// A name followed by a parenthesised prototype counts as a declaration.
    #[test]
    fn sub_with_prototype_is_accepted() -> Result<(), Box<dyn std::error::Error>> {
        let file_id = FileId(8);
        let triples = {
            let mut out = Vec::new();
            extract_from_eval_string("sub proto_sub ($$) { 1 }", 0, 24, file_id, &mut out);
            out
        };
        assert_eq!(triples.len(), 1, "sub NAME (proto) should produce evidence");
        assert_eq!(triples[0].0.canonical_name, "proto_sub");
        Ok(())
    }

    /// A sigil-prefixed name is not recorded.
    #[test]
    fn interpolated_name_sub_does_not_produce_evidence() -> Result<(), Box<dyn std::error::Error>> {
        let file_id = FileId(9);
        let triples = {
            let mut out = Vec::new();
            extract_from_eval_string("sub $dynamic_name { 1 }", 0, 23, file_id, &mut out);
            out
        };
        assert!(triples.is_empty(), "sub with sigil-prefixed name must not produce evidence");
        Ok(())
    }

    /// Identical inputs hash identically; a different name changes the id.
    #[test]
    fn stable_id_is_deterministic() -> Result<(), Box<dyn std::error::Error>> {
        let id1 = stable_id(1, 42, "foo");
        let id2 = stable_id(1, 42, "foo");
        assert_eq!(id1, id2, "stable_id must be deterministic");
        let id3 = stable_id(1, 42, "bar");
        assert_ne!(id1, id3, "different names must produce different IDs");
        Ok(())
    }
}