cruxlines 0.2.0

Ranks symbol definitions by cross-file references using tree-sitter.
Documentation
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};

use tree_sitter::{Node, Parser, Tree};

/// A named position inside a source file.
///
/// Values built by `location_from_node` carry the node's start row and column
/// with one added to each, and the node's source text as `name`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Location {
    /// File the position belongs to.
    pub path: PathBuf,
    /// Line of the position (start row plus one when built by `location_from_node`).
    pub line: usize,
    /// Column of the position (start column plus one when built by `location_from_node`).
    pub column: usize,
    /// Text found at the position; definitions and usages are matched on this value.
    pub name: String,
}

/// A link from a usage of a name to one definition carrying the same name.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ReferenceEdge {
    /// Where the name is defined.
    pub definition: Location,
    /// Where the name is used.
    pub usage: Location,
    /// Ecosystem whose files produced both ends of the edge.
    pub ecosystem: crate::languages::Ecosystem,
}

/// A source file that has been matched to a language and parsed successfully.
struct FileInput {
    /// Path the file was supplied under.
    path: PathBuf,
    /// Full text of the file; `tree` was parsed from this text.
    source: String,
    /// Language detected from `path`.
    language: crate::languages::Language,
    /// Syntax tree produced by parsing `source`.
    tree: Tree,
}

pub fn find_references<I, P>(files: I) -> impl Iterator<Item = ReferenceEdge>
where
    I: IntoIterator<Item = (P, String)>,
    P: Into<PathBuf>,
{
    let mut inputs_by_ecosystem: HashMap<crate::languages::Ecosystem, Vec<FileInput>> =
        HashMap::new();
    for (path, source) in files {
        let path = path.into();
        let Some(language) = crate::languages::language_for_path(&path) else {
            continue;
        };
        let Some(tree) = parse_tree(&language, &source) else {
            continue;
        };
        let ecosystem = crate::languages::ecosystem_for_language(language);
        inputs_by_ecosystem
            .entry(ecosystem)
            .or_default()
            .push(FileInput {
                path,
                source,
                language,
                tree,
            });
    }

    let mut edges = Vec::new();
    for (ecosystem, inputs) in &inputs_by_ecosystem {
        let mut definitions: HashMap<String, Vec<Location>> = HashMap::new();
        let mut definition_positions: HashSet<(PathBuf, usize, usize)> = HashSet::new();

        for input in inputs {
            match input.language {
                crate::languages::Language::Python => crate::languages::python::emit_definitions(
                    &input.path,
                    &input.source,
                    &input.tree,
                    |location| record_definition(
                        location,
                        &mut definitions,
                        &mut definition_positions,
                    ),
                ),
                crate::languages::Language::JavaScript
                | crate::languages::Language::TypeScript
                | crate::languages::Language::TypeScriptReact => {
                    crate::languages::javascript::emit_definitions(
                        &input.path,
                        &input.source,
                        &input.tree,
                        |location| record_definition(
                            location,
                            &mut definitions,
                            &mut definition_positions,
                        ),
                    )
                }
                crate::languages::Language::Rust => crate::languages::rust::emit_definitions(
                    &input.path,
                    &input.source,
                    &input.tree,
                    |location| record_definition(
                        location,
                        &mut definitions,
                        &mut definition_positions,
                    ),
                ),
            }
        }

        for input in inputs {
            match input.language {
                crate::languages::Language::Python => crate::languages::python::emit_references(
                    &input.path,
                    &input.source,
                    &input.tree,
                    |location| record_reference(
                        location,
                        *ecosystem,
                        &definitions,
                        &definition_positions,
                        &mut edges,
                    ),
                ),
                crate::languages::Language::JavaScript
                | crate::languages::Language::TypeScript
                | crate::languages::Language::TypeScriptReact => {
                    crate::languages::javascript::emit_references(
                        &input.path,
                        &input.source,
                        &input.tree,
                        |location| record_reference(
                            location,
                            *ecosystem,
                            &definitions,
                            &definition_positions,
                            &mut edges,
                        ),
                    )
                }
                crate::languages::Language::Rust => crate::languages::rust::emit_references(
                    &input.path,
                    &input.source,
                    &input.tree,
                    |location| record_reference(
                        location,
                        *ecosystem,
                        &definitions,
                        &definition_positions,
                        &mut edges,
                    ),
                ),
            }
        }
    }

    edges.into_iter()
}

/// Parses `source` with a fresh parser configured for `language`.
///
/// Returns `None` when the grammar is rejected by the parser or when parsing
/// produces no tree.
fn parse_tree(language: &crate::languages::Language, source: &str) -> Option<Tree> {
    let grammar = crate::languages::tree_sitter_language(*language);
    let mut parser = Parser::new();
    if parser.set_language(&grammar).is_err() {
        return None;
    }
    parser.parse(source, None)
}

/// Calls `visit` once for every node of `tree`, starting with the root.
///
/// The traversal is iterative and uses an explicit LIFO stack. A node is always
/// visited before its descendants. Children are pushed first-to-last, so among
/// siblings the last child (and its whole subtree) is visited first.
pub(crate) fn walk_tree(tree: &Tree, mut visit: impl FnMut(Node)) {
    let root = tree.root_node();
    // A single cursor is reused for every node; `Node::children` walks the
    // children sequentially instead of looking each one up by index.
    let mut cursor = root.walk();
    let mut pending = vec![root];
    while let Some(node) = pending.pop() {
        visit(node);
        pending.extend(node.children(&mut cursor));
    }
}

/// Reports the `identifier` nodes found at or below `node` to `on_ident`.
///
/// A node counts as an identifier when its kind is `"identifier"`; it is only
/// reported when its text in `source` is valid UTF-8. `node` itself is checked
/// first. Identifier children are reported but not descended into, while every
/// other child is searched in turn.
///
/// Identifiers that are direct children of the same node are reported in
/// source order. Non-identifier siblings are searched last-to-first because
/// they are taken from a LIFO stack.
pub(crate) fn collect_identifier_nodes<F>(node: Node, source: &str, mut on_ident: F)
where
    F: FnMut(Node),
{
    let bytes = source.as_bytes();
    // Single place for the "text must decode as UTF-8" rule.
    let mut report = |candidate: Node| {
        if candidate.utf8_text(bytes).is_ok() {
            on_ident(candidate);
        }
    };

    if node.kind() == "identifier" {
        report(node);
    }

    // One cursor is reused for all nodes; `Node::children` walks the children
    // sequentially instead of looking each one up by index.
    let mut cursor = node.walk();
    let mut pending = vec![node];
    while let Some(current) = pending.pop() {
        for child in current.children(&mut cursor) {
            if child.kind() == "identifier" {
                report(child);
            } else {
                pending.push(child);
            }
        }
    }
}

/// Builds the [`Location`] describing `node` within the file at `path`.
///
/// The location's name is the node's text taken from `source`, and its line and
/// column come from `position`. Returns `None` when the node's text is not
/// valid UTF-8.
pub(crate) fn location_from_node(path: &Path, source: &str, node: Node) -> Option<Location> {
    let text = node.utf8_text(source.as_bytes()).ok()?;
    let (line, column) = position(node);
    let location = Location {
        path: path.to_path_buf(),
        line,
        column,
        name: text.to_owned(),
    };
    Some(location)
}

/// Registers `location` as a definition.
///
/// The location is appended to the list stored under its name unless that list
/// already holds an entry with the same path, line and column. Its position is
/// added to `definition_positions` in either case.
fn record_definition(
    location: Location,
    definitions: &mut HashMap<String, Vec<Location>>,
    definition_positions: &mut HashSet<(PathBuf, usize, usize)>,
) {
    let same_name = definitions.entry(location.name.clone()).or_default();
    let already_known = same_name.iter().any(|known| {
        known.path == location.path
            && known.line == location.line
            && known.column == location.column
    });

    // Capture the position before `location` is possibly moved into the list.
    let position = (location.path.clone(), location.line, location.column);
    if !already_known {
        same_name.push(location);
    }
    definition_positions.insert(position);
}

/// Turns a usage at `location` into edges towards every definition of the same name.
///
/// Nothing is recorded when the usage sits exactly on a known definition
/// position, or when no definition carries the usage's name. Otherwise one
/// [`ReferenceEdge`] per matching definition is appended to `edges`, in the
/// order the definitions are stored.
fn record_reference(
    location: Location,
    ecosystem: crate::languages::Ecosystem,
    definitions: &HashMap<String, Vec<Location>>,
    definition_positions: &HashSet<(PathBuf, usize, usize)>,
    edges: &mut Vec<ReferenceEdge>,
) {
    let position = (location.path.clone(), location.line, location.column);
    if definition_positions.contains(&position) {
        return;
    }

    let Some(candidates) = definitions.get(&location.name) else {
        return;
    };
    edges.extend(candidates.iter().map(|definition| ReferenceEdge {
        definition: definition.clone(),
        usage: location.clone(),
        ecosystem,
    }));
}

/// Returns the `(line, column)` pair for the start of `node`.
///
/// One is added to both the row and the column of the node's start point,
/// presumably to turn tree-sitter's zero-based coordinates into one-based ones.
fn position(node: Node) -> (usize, usize) {
    let start = node.start_position();
    let line = start.row + 1;
    let column = start.column + 1;
    (line, column)
}

#[cfg(test)]
mod tests {
    use super::walk_tree;
    use tree_sitter::Parser;

    /// Parses a one-line Python assignment and checks that the traversal
    /// reports both a `module` node and an `identifier` node.
    #[test]
    fn walk_tree_visits_nodes() {
        let grammar = tree_sitter_python::LANGUAGE;
        let mut parser = Parser::new();
        parser.set_language(&grammar.into()).expect("set language");
        let tree = parser.parse("x = 1\n", None).expect("parse");

        let mut seen_kinds: Vec<&str> = Vec::new();
        walk_tree(&tree, |node| seen_kinds.push(node.kind()));

        for expected in ["module", "identifier"] {
            assert!(seen_kinds.contains(&expected));
        }
    }
}