use std::time::{Instant, SystemTime, UNIX_EPOCH};
use tree_sitter::{Node as TsNode, Parser, Tree};
use crate::types::{
generate_node_id, Edge, EdgeKind, ExtractionResult, Node, NodeKind, UnresolvedRef, Visibility,
};
/// Extractor that parses COBOL source with tree-sitter and emits graph nodes,
/// containment edges and unresolved call references.
///
/// The type carries no data of its own; per-file working state is kept in
/// `ExtractionState` for the duration of a single extraction.
pub struct CobolExtractor;
/// Mutable working state accumulated while walking one file's syntax tree.
struct ExtractionState {
// Graph nodes emitted so far, in discovery order.
nodes: Vec<Node>,
// Edges emitted so far.
edges: Vec<Edge>,
// References recorded by name only; they are not resolved in this file.
unresolved_refs: Vec<UnresolvedRef>,
// Human-readable error messages (e.g. parser set-up or parse failures).
errors: Vec<String>,
// Enclosing scopes as `(name, node id)` pairs, innermost last.
node_stack: Vec<(String, String)>,
// Path of the file being extracted, copied into every emitted node.
file_path: String,
// Raw bytes of the source text; node text is sliced out of this buffer.
source: Vec<u8>,
// Seconds since the Unix epoch captured at construction; used as `updated_at`.
timestamp: u64,
}
impl ExtractionState {
    /// Creates an empty state for `file_path`, copying `source` into an owned
    /// byte buffer and capturing the current wall-clock time in whole seconds.
    fn new(file_path: &str, source: &str) -> Self {
        // `duration_since` fails when the clock is before the Unix epoch; the
        // default (zero) duration is used in that case, giving a timestamp of 0.
        let timestamp = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs();
        Self {
            nodes: Vec::new(),
            edges: Vec::new(),
            unresolved_refs: Vec::new(),
            errors: Vec::new(),
            node_stack: Vec::new(),
            file_path: file_path.to_string(),
            source: source.as_bytes().to_vec(),
            timestamp,
        }
    }

    /// Returns the file path followed by every name on `node_stack`, joined
    /// with `::`.
    ///
    /// NOTE(review): the stack names are appended verbatim, so if the file
    /// itself has been pushed onto the stack its path appears twice.
    fn qualified_prefix(&self) -> String {
        let mut parts = vec![self.file_path.clone()];
        for (name, _) in &self.node_stack {
            parts.push(name.clone());
        }
        parts.join("::")
    }

    /// Returns the id of the innermost enclosing node, or `None` when the
    /// stack is empty.
    fn parent_node_id(&self) -> Option<&str> {
        self.node_stack.last().map(|(_, id)| id.as_str())
    }

    /// Returns the source text covered by `node`, or the placeholder
    /// `"<invalid utf8>"` when the covered bytes are not valid UTF-8.
    fn node_text(&self, node: TsNode<'_>) -> String {
        node.utf8_text(&self.source)
            .unwrap_or("<invalid utf8>")
            .to_string()
    }

    /// Returns the complete source line that contains `byte_offset`, without
    /// the terminating `\n`. Invalid UTF-8 is replaced lossily.
    ///
    /// An offset beyond the end of the buffer is clamped to the buffer length
    /// (yielding the last line) instead of panicking on the slice.
    fn full_line_at(&self, byte_offset: usize) -> String {
        let offset = byte_offset.min(self.source.len());
        let (before, after) = self.source.split_at(offset);
        // The line starts just after the previous newline, or at the buffer start.
        let line_start = before
            .iter()
            .rposition(|&b| b == b'\n')
            .map_or(0, |p| p + 1);
        // The line ends at the next newline, or at the buffer end.
        let line_end = after
            .iter()
            .position(|&b| b == b'\n')
            .map_or(self.source.len(), |p| offset + p);
        String::from_utf8_lossy(&self.source[line_start..line_end]).into_owned()
    }
}
impl CobolExtractor {
/// Extracts nodes, edges and unresolved references from one COBOL file.
///
/// If the parser cannot be set up or produces no tree, the returned result
/// contains only the error message. Otherwise a `File` node spanning the
/// whole source is emitted, pushed as the outermost scope, and the children
/// of the tree root are visited.
pub fn extract_cobol(file_path: &str, source: &str) -> ExtractionResult {
    let start = Instant::now();
    let mut state = ExtractionState::new(file_path, source);

    let tree = match Self::parse_source(source) {
        Err(message) => {
            state.errors.push(message);
            return Self::build_result(state, start);
        }
        Ok(parsed) => parsed,
    };

    let file_id = generate_node_id(file_path, &NodeKind::File, file_path, 0);
    // Zero-based index of the last line; an empty source yields 0.
    let last_line = source.lines().count().saturating_sub(1) as u32;
    let owned_path = file_path.to_string();

    state.nodes.push(Node {
        id: file_id.clone(),
        kind: NodeKind::File,
        name: owned_path.clone(),
        qualified_name: owned_path.clone(),
        file_path: owned_path.clone(),
        start_line: 0,
        end_line: last_line,
        start_column: 0,
        end_column: 0,
        signature: None,
        docstring: None,
        visibility: Visibility::Pub,
        is_async: false,
        branches: 0,
        loops: 0,
        returns: 0,
        max_nesting: 0,
        unsafe_blocks: 0,
        unchecked_calls: 0,
        assertions: 0,
        updated_at: state.timestamp,
    });

    // The file is the parent of everything discovered during the walk.
    state.node_stack.push((owned_path, file_id));
    Self::visit_children(&mut state, tree.root_node());
    state.node_stack.pop();

    Self::build_result(state, start)
}
/// Parses `source` with the grammar registered under the name `"cobol"`.
///
/// Returns a descriptive message when the grammar cannot be installed on the
/// parser or when the parser yields no tree.
fn parse_source(source: &str) -> Result<Tree, String> {
    let mut parser = Parser::new();
    let language = crate::extraction::ts_provider::language("cobol");
    if let Err(e) = parser.set_language(&language) {
        return Err(format!("failed to load COBOL grammar: {e}"));
    }
    match parser.parse(source, None) {
        Some(tree) => Ok(tree),
        None => Err("tree-sitter parse returned None".to_string()),
    }
}
/// Dispatches every direct child of `node` to `visit_node`, in document order.
fn visit_children(state: &mut ExtractionState, node: TsNode<'_>) {
    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        Self::visit_node(state, child);
    }
}
/// Handles a single top-level node; only `program_definition` nodes are
/// processed, every other kind is ignored.
fn visit_node(state: &mut ExtractionState, node: TsNode<'_>) {
    if node.kind() == "program_definition" {
        Self::visit_program_definition(state, node);
    }
}
/// Visits the identification, data and procedure divisions of one program.
///
/// `visit_identification_division` pushes the program onto `node_stack` so
/// that the later divisions are attributed to it. The stack depth is restored
/// before returning so that a following program in the same file is not
/// treated as a child of this one.
fn visit_program_definition(state: &mut ExtractionState, node: TsNode<'_>) {
    let depth_on_entry = state.node_stack.len();

    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        match child.kind() {
            "identification_division" => Self::visit_identification_division(state, child),
            "data_division" => Self::visit_data_division(state, child),
            "procedure_division" => Self::visit_procedure_division(state, child),
            _ => {}
        }
    }

    // Drop whatever scope this program pushed (none if it had no program name).
    state.node_stack.truncate(depth_on_entry);
}
/// Emits a `Module` node for the program named in this identification
/// division and makes it the current scope.
///
/// Does nothing when the division has no `program_name` child. The node's
/// span is that of the division itself; its signature is the first line of
/// the division text containing `PROGRAM-ID` (case-insensitive), trimmed.
/// The program is pushed onto `node_stack` and is not popped here.
fn visit_identification_division(state: &mut ExtractionState, node: TsNode<'_>) {
    let name = match Self::find_child_by_kind(node, "program_name") {
        Some(pn) => state.node_text(pn),
        None => return,
    };

    let begin = node.start_position();
    let finish = node.end_position();
    let start_line = begin.row as u32;

    let qualified_name = format!("{}::{}", state.qualified_prefix(), name);
    let id = generate_node_id(&state.file_path, &NodeKind::Module, &name, start_line);

    // Only the first matching line is considered, whether or not it is kept.
    let division_text = state.node_text(node);
    let mut signature = None;
    for line in division_text.lines() {
        if line.to_uppercase().contains("PROGRAM-ID") {
            let trimmed = line.trim();
            if !trimmed.is_empty() {
                signature = Some(trimmed.to_string());
            }
            break;
        }
    }

    state.nodes.push(Node {
        id: id.clone(),
        kind: NodeKind::Module,
        name: name.clone(),
        qualified_name,
        file_path: state.file_path.clone(),
        start_line,
        end_line: finish.row as u32,
        start_column: begin.column as u32,
        end_column: finish.column as u32,
        signature,
        docstring: None,
        visibility: Visibility::Pub,
        is_async: false,
        branches: 0,
        loops: 0,
        returns: 0,
        max_nesting: 0,
        unsafe_blocks: 0,
        unchecked_calls: 0,
        assertions: 0,
        updated_at: state.timestamp,
    });

    let parent = state.parent_node_id().map(String::from);
    if let Some(source) = parent {
        state.edges.push(Edge {
            source,
            target: id.clone(),
            kind: EdgeKind::Contains,
            line: Some(start_line),
        });
    }

    state.node_stack.push((name, id));
}
/// Visits every `working_storage_section` directly under the data division;
/// other children are skipped.
fn visit_data_division(state: &mut ExtractionState, node: TsNode<'_>) {
    let mut cursor = node.walk();
    let sections = node
        .children(&mut cursor)
        .filter(|child| child.kind() == "working_storage_section");
    for section in sections {
        Self::visit_working_storage(state, section);
    }
}
/// Walks the entries of a working-storage section, attaching the most recent
/// comment line as the docstring of the data description that follows it.
///
/// Only the latest comment is remembered. It is consumed by the next
/// `data_description` and discarded when any other kind of node intervenes.
fn visit_working_storage(state: &mut ExtractionState, node: TsNode<'_>) {
    let mut pending_comment: Option<String> = None;
    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        let kind = child.kind();
        if kind == "data_description" {
            Self::visit_data_description(state, child, pending_comment.take());
        } else if kind == "comment" {
            // Use the whole source line, then drop one leading `*` marker if present.
            let line = state.full_line_at(child.start_byte());
            let trimmed = line.trim();
            let body = trimmed.strip_prefix('*').map_or(trimmed, str::trim);
            pending_comment = Some(body.to_string());
        } else {
            pending_comment = None;
        }
    }
}
/// Emits a node for a level-01 data description and links it to the current
/// parent scope.
///
/// Entries are skipped when they have no `level_number` child, when the level
/// is not numerically 1, or when they have no `entry_name` child. The level is
/// compared as a number so that the single-digit spelling `1` is treated the
/// same as `01`. An entry with a `value_clause` child becomes a `Const`, any
/// other a `Field`. The trimmed entry text is used as the signature.
fn visit_data_description(
    state: &mut ExtractionState,
    node: TsNode<'_>,
    docstring: Option<String>,
) {
    let level = Self::find_child_by_kind(node, "level_number")
        .and_then(|ln| state.node_text(ln).trim().parse::<u32>().ok());
    if level != Some(1) {
        return;
    }

    let name = match Self::find_child_by_kind(node, "entry_name") {
        Some(n) => state.node_text(n),
        None => return,
    };

    let kind = if Self::find_child_by_kind(node, "value_clause").is_some() {
        NodeKind::Const
    } else {
        NodeKind::Field
    };

    let begin = node.start_position();
    let finish = node.end_position();
    let start_line = begin.row as u32;
    let text = state.node_text(node);
    let qualified_name = format!("{}::{}", state.qualified_prefix(), name);
    let id = generate_node_id(&state.file_path, &kind, &name, start_line);

    state.nodes.push(Node {
        id: id.clone(),
        kind,
        name,
        qualified_name,
        file_path: state.file_path.clone(),
        start_line,
        end_line: finish.row as u32,
        start_column: begin.column as u32,
        end_column: finish.column as u32,
        signature: Some(text.trim().to_string()),
        docstring,
        visibility: Visibility::Pub,
        is_async: false,
        branches: 0,
        loops: 0,
        returns: 0,
        max_nesting: 0,
        unsafe_blocks: 0,
        unchecked_calls: 0,
        assertions: 0,
        updated_at: state.timestamp,
    });

    let parent = state.parent_node_id().map(String::from);
    if let Some(source) = parent {
        state.edges.push(Edge {
            source,
            target: id,
            kind: EdgeKind::Contains,
            line: Some(start_line),
        });
    }
}
/// Splits the direct children of the procedure division into paragraphs and
/// visits each one.
///
/// A paragraph starts at a `paragraph_header` and extends up to, but not
/// including, the next `paragraph_header`. The run of `comment` nodes that
/// immediately precedes the next header (or trails at the end of the
/// division) is excluded from the paragraph: those comments are collected as
/// the following paragraph's docstring by `gather_preceding_comments`, so they
/// must not also be counted in this paragraph's span. Comments followed by
/// further statements of the same paragraph remain part of it.
fn visit_procedure_division(state: &mut ExtractionState, node: TsNode<'_>) {
    // Collect the children once via a cursor rather than indexing each child.
    let mut cursor = node.walk();
    let children: Vec<TsNode<'_>> = node.children(&mut cursor).collect();

    let mut idx = 0;
    while idx < children.len() {
        if children[idx].kind() != "paragraph_header" {
            idx += 1;
            continue;
        }

        let docstring = Self::gather_preceding_comments(state, &children, idx);

        // Exclusive end: index of the next header, or the end of the division.
        let mut para_end = children[idx + 1..]
            .iter()
            .position(|c| c.kind() == "paragraph_header")
            .map_or(children.len(), |offset| idx + 1 + offset);
        // Back off over the trailing comment run; it never goes past the header.
        while para_end > idx + 1 && children[para_end - 1].kind() == "comment" {
            para_end -= 1;
        }

        Self::visit_paragraph(state, &children, idx, para_end, docstring);
        // `para_end` is always greater than `idx`, so the loop makes progress.
        idx = para_end;
    }
}
/// Collects the unbroken run of `comment` nodes directly before
/// `children[header_idx]` and returns their text, one comment per line in
/// source order. Returns `None` when there is no such comment.
///
/// Each comment contributes its full source line, trimmed, with one leading
/// `*` marker (and the whitespace around the remainder) removed when present.
fn gather_preceding_comments(
    state: &ExtractionState,
    children: &[TsNode<'_>],
    header_idx: usize,
) -> Option<String> {
    // Walk backwards from the header until the first non-comment sibling.
    let mut comments: Vec<String> = children[..header_idx]
        .iter()
        .rev()
        .take_while(|sibling| sibling.kind() == "comment")
        .map(|sibling| {
            let line = state.full_line_at(sibling.start_byte());
            let trimmed = line.trim();
            trimmed
                .strip_prefix('*')
                .map_or(trimmed, str::trim)
                .to_string()
        })
        .collect();

    if comments.is_empty() {
        return None;
    }
    // Collected nearest-first; restore source order.
    comments.reverse();
    Some(comments.join("\n"))
}
/// Emits a `Function` node for the paragraph whose header is
/// `children[start_idx]` and whose body is `children[start_idx + 1..end_idx]`.
///
/// The name is the header text, trimmed, without trailing `.` characters.
/// The span runs from the header to the last body node (or the header itself
/// when the body is empty). Branch, loop and return counts consider only the
/// direct body nodes. After the node and its `Contains` edge are recorded,
/// every body node is scanned for call sites.
fn visit_paragraph(
    state: &mut ExtractionState,
    children: &[TsNode<'_>],
    start_idx: usize,
    end_idx: usize,
    docstring: Option<String>,
) {
    let header = children[start_idx];
    let body = &children[(start_idx + 1)..end_idx];

    let header_text = state.node_text(header);
    let name = header_text.trim().trim_end_matches('.').to_string();

    let last_child = body.last().copied().unwrap_or(header);
    let begin = header.start_position();
    let finish = last_child.end_position();
    let start_line = begin.row as u32;

    let qualified_name = format!("{}::{}", state.qualified_prefix(), name);
    let id = generate_node_id(&state.file_path, &NodeKind::Function, &name, start_line);

    let branches = body.iter().filter(|c| c.kind() == "if_header").count() as u32;
    let loops = body
        .iter()
        .filter(|c| c.kind() == "perform_statement_loop")
        .count() as u32;
    let returns = body
        .iter()
        .filter(|c| matches!(c.kind(), "stop_statement" | "goback_statement"))
        .count() as u32;

    state.nodes.push(Node {
        id: id.clone(),
        kind: NodeKind::Function,
        name: name.clone(),
        qualified_name,
        file_path: state.file_path.clone(),
        start_line,
        end_line: finish.row as u32,
        start_column: begin.column as u32,
        end_column: finish.column as u32,
        signature: Some(header_text.trim().to_string()),
        docstring,
        visibility: Visibility::Pub,
        is_async: false,
        branches,
        loops,
        returns,
        max_nesting: 0,
        unsafe_blocks: 0,
        unchecked_calls: 0,
        assertions: 0,
        updated_at: state.timestamp,
    });

    let parent = state.parent_node_id().map(String::from);
    if let Some(source) = parent {
        state.edges.push(Edge {
            source,
            target: id.clone(),
            kind: EdgeKind::Contains,
            line: Some(start_line),
        });
    }

    for &statement in body {
        Self::extract_call_sites_from_node(state, statement, &id);
    }
}
/// Records unresolved `Calls` references found in `node`, attributed to the
/// node id `fn_node_id`.
///
/// * `perform_statement_call_proc`: the label name under the `procedure`
///   field, if one can be extracted, is recorded; the node is not descended
///   into further.
/// * `call_statement`: every direct `string` child, stripped of surrounding
///   double and then single quotes, is recorded when non-empty.
/// * any other kind (including `perform_statement_loop`): the children are
///   scanned recursively.
///
/// Each reference carries the start position of the statement node.
fn extract_call_sites_from_node(state: &mut ExtractionState, node: TsNode<'_>, fn_node_id: &str) {
    let at = node.start_position();
    match node.kind() {
        "perform_statement_call_proc" => {
            let callee = node
                .child_by_field_name("procedure")
                .and_then(|target| Self::extract_label_name(state, target));
            if let Some(reference_name) = callee {
                state.unresolved_refs.push(UnresolvedRef {
                    from_node_id: fn_node_id.to_string(),
                    reference_name,
                    reference_kind: EdgeKind::Calls,
                    line: at.row as u32,
                    column: at.column as u32,
                    file_path: state.file_path.clone(),
                });
            }
        }
        "call_statement" => {
            let mut cursor = node.walk();
            let literals = node
                .children(&mut cursor)
                .filter(|child| child.kind() == "string");
            for literal in literals {
                let raw = state.node_text(literal);
                let callee = raw.trim_matches('"').trim_matches('\'');
                if callee.is_empty() {
                    continue;
                }
                state.unresolved_refs.push(UnresolvedRef {
                    from_node_id: fn_node_id.to_string(),
                    reference_name: callee.to_string(),
                    reference_kind: EdgeKind::Calls,
                    line: at.row as u32,
                    column: at.column as u32,
                    file_path: state.file_path.clone(),
                });
            }
        }
        _ => Self::recurse_call_sites(state, node, fn_node_id),
    }
}
/// Runs `extract_call_sites_from_node` on every direct child of `node`, in
/// document order.
fn recurse_call_sites(state: &mut ExtractionState, node: TsNode<'_>, fn_node_id: &str) {
    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        Self::extract_call_sites_from_node(state, child, fn_node_id);
    }
}
/// Returns the text of the `WORD` found by descending
/// `label` -> `qualified_word` -> `WORD` from `node`, taking the first child
/// of each kind. Returns `None` if any step of that chain is missing.
fn extract_label_name(state: &ExtractionState, node: TsNode<'_>) -> Option<String> {
    Self::find_child_by_kind(node, "label")
        .and_then(|label| Self::find_child_by_kind(label, "qualified_word"))
        .and_then(|qualified| Self::find_child_by_kind(qualified, "WORD"))
        .map(|word| state.node_text(word))
}
/// Returns the first direct child of `node` whose kind equals `kind`, or
/// `None` when no direct child matches. Descendants deeper than one level are
/// not searched.
fn find_child_by_kind<'a>(node: TsNode<'a>, kind: &str) -> Option<TsNode<'a>> {
    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        if child.kind() == kind {
            return Some(child);
        }
    }
    None
}
fn build_result(state: ExtractionState, start: Instant) -> ExtractionResult {
ExtractionResult {
nodes: state.nodes,
edges: state.edges,
unresolved_refs: state.unresolved_refs,
errors: state.errors,
duration_ms: start.elapsed().as_millis() as u64,
}
}
}
impl crate::extraction::LanguageExtractor for CobolExtractor {
fn extensions(&self) -> &[&str] {
&["cob", "cbl", "cpy"]
}
fn language_name(&self) -> &str {
"COBOL"
}
fn extract(&self, file_path: &str, source: &str) -> ExtractionResult {
Self::extract_cobol(file_path, source)
}
}