Skip to main content

lynx_parser/symbol_extraction/
python.rs

1use anyhow::Result;
2use lynx_protocol::{CodeChunk, SymbolRecord};
3use std::path::Path;
4use tree_sitter::{Parser, Query, QueryCursor, StreamingIterator};
5use tree_sitter_python::LANGUAGE;
6
7pub fn extract(path: &Path, content: &str) -> Result<(Vec<CodeChunk>, Vec<SymbolRecord>)> {
8    let mut parser = Parser::new();
9    parser.set_language(&LANGUAGE.into())?;
10
11    let tree = parser
12        .parse(content, None)
13        .ok_or_else(|| anyhow::anyhow!("Failed to parse Python file"))?;
14    let root_node = tree.root_node();
15
16    let mut chunks = Vec::new();
17    let mut symbols = Vec::new();
18
19    let query_str = r#"
20        (function_definition name: (identifier) @func_name) @func
21        (class_definition name: (identifier) @class_name) @class
22    "#;
23
24    let query = Query::new(&LANGUAGE.into(), query_str)?;
25    let mut cursor = QueryCursor::new();
26    let mut captures = cursor.captures(&query, root_node, content.as_bytes());
27
28    while let Some(&(ref mat, capture_index)) = captures.next() {
29        let capture = mat.captures[capture_index];
30        let capture_name = query.capture_names()[capture.index as usize];
31
32        if !["func", "class"].contains(&capture_name) {
33            continue;
34        }
35
36        let node = capture.node;
37        let start_line = node.start_position().row + 1;
38        let end_line = node.end_position().row + 1;
39        let raw_content = node.utf8_text(content.as_bytes())?.to_string();
40
41        let symbol_name = match resolve_symbol_name(mat, node, &query, content.as_bytes()) {
42            Some(name) => name,
43            None => continue,
44        };
45
46        let file_path = path.to_string_lossy().replace('\\', "/");
47        let symbol_id = format!("{}:{}:{}", capture_name, file_path, symbol_name);
48
49        symbols.push(SymbolRecord {
50            symbol_id: symbol_id.clone(),
51            symbol_name: symbol_name.clone(),
52            file_path: file_path.clone(),
53            start_line,
54            end_line,
55        });
56
57        chunks.push(CodeChunk {
58            id: blake3::hash(raw_content.as_bytes()).to_string(),
59            file_path: file_path.clone(),
60            start_line,
61            end_line,
62            raw_content,
63            symbols_defined: vec![symbol_id],
64        });
65    }
66
67    Ok((chunks, symbols))
68}
69
70fn resolve_symbol_name(
71    mat: &tree_sitter::QueryMatch,
72    node: tree_sitter::Node,
73    query: &Query,
74    content: &[u8],
75) -> Option<String> {
76    if let Some(capture) = mat.captures.iter().find(|c| {
77        let name = query.capture_names()[c.index as usize];
78        name.ends_with("_name")
79    }) {
80        if let Ok(text) = capture.node.utf8_text(content) {
81            return Some(text.to_string());
82        }
83    }
84
85    if let Some(name_node) = node
86        .child_by_field_name("name")
87        .or_else(|| node.child_by_field_name("type"))
88    {
89        if let Ok(text) = name_node.utf8_text(content) {
90            return Some(text.to_string());
91        }
92    }
93
94    find_identifier_in_node(node, content)
95}
96
97fn find_identifier_in_node(node: tree_sitter::Node, content: &[u8]) -> Option<String> {
98    let mut cursor = node.walk();
99    for child in node.named_children(&mut cursor) {
100        if matches!(
101            child.kind(),
102            "identifier" | "type_identifier" | "field_identifier" | "property_identifier"
103        ) {
104            if let Ok(text) = child.utf8_text(content) {
105                return Some(text.to_string());
106            }
107        }
108        if let Some(name) = find_identifier_in_node(child, content) {
109            return Some(name);
110        }
111    }
112    None
113}