Skip to main content

deagle_parse/
lib.rs

1//! deagle-parse — tree-sitter based code parser.
2//!
3//! Extracts code entities (functions, structs, traits, impls, imports)
4//! from source files using tree-sitter grammars.
5//!
6//! ## Feature Flags
7//!
//! - `pattern` — structural pattern matching via [ast-grep-core](https://crates.io/crates/ast-grep-core)
//! - `text-search` — enables the `text_search` module
9
10pub mod rust_parser;
11pub mod python_parser;
12pub mod go_parser;
13pub mod typescript_parser;
14pub mod java_parser;
15pub mod c_parser;
16pub mod cpp_parser;
17
18#[cfg(feature = "pattern")]
19pub mod pattern;
20
21#[cfg(feature = "text-search")]
22pub mod text_search;
23
24use deagle_core::{Language, Node, Result};
25use std::path::Path;
26
27pub use rust_parser::ParseResult;
28
/// Shorten `s` so that the retained prefix is no longer than `max_bytes` bytes.
///
/// A string that already fits is returned unchanged. Otherwise the cut point
/// is moved back to the nearest UTF-8 character boundary at or before
/// `max_bytes` and `"..."` is appended, so the returned string may be up to
/// three bytes longer than `max_bytes`.
pub(crate) fn truncate_content(s: &str, max_bytes: usize) -> String {
    if s.len() > max_bytes {
        // Offset 0 is always a char boundary, so the search cannot come up empty.
        let cut = (0..=max_bytes)
            .rev()
            .find(|&idx| s.is_char_boundary(idx))
            .unwrap_or(0);
        format!("{}...", &s[..cut])
    } else {
        s.to_string()
    }
}
41
42/// Parse a source file and extract code entities.
43pub fn parse_file(path: &Path, content: &str, language: Language) -> Result<Vec<Node>> {
44    match language {
45        Language::Rust => rust_parser::parse(path, content),
46        Language::Python => python_parser::parse(path, content),
47        Language::Go => go_parser::parse(path, content),
48        Language::TypeScript | Language::JavaScript => typescript_parser::parse(path, content),
49        Language::Java => java_parser::parse(path, content),
50        Language::C => c_parser::parse(path, content),
51        Language::Cpp => cpp_parser::parse(path, content),
52        _ => Ok(Vec::new()),
53    }
54}
55
56/// Parse with edge extraction — returns nodes and relationship tuples.
57pub fn parse_file_with_edges(path: &Path, content: &str, language: Language) -> Result<ParseResult> {
58    match language {
59        Language::Rust => rust_parser::parse_with_edges(path, content),
60        Language::Python => python_parser::parse_with_edges(path, content),
61        Language::Go => go_parser::parse_with_edges(path, content),
62        Language::TypeScript | Language::JavaScript => typescript_parser::parse_with_edges(path, content),
63        Language::Java => java_parser::parse_with_edges(path, content),
64        Language::C => c_parser::parse_with_edges(path, content),
65        Language::Cpp => cpp_parser::parse_with_edges(path, content),
66        _ => Ok(ParseResult { nodes: Vec::new(), edges: Vec::new() }),
67    }
68}
69
#[cfg(test)]
mod tests {
    //! Unit tests for `truncate_content`.
    //!
    //! Each truncating case asserts the complete output (kept prefix plus the
    //! `"..."` marker) so that a regression in the cut position is caught, not
    //! just the presence of the marker.

    use super::truncate_content;

    /// Expected output for a truncated input: the kept prefix plus the marker.
    fn cut(prefix: &str) -> String {
        format!("{}...", prefix)
    }

    #[test]
    fn test_truncate_ascii_short() {
        assert_eq!(truncate_content("hello", 500), "hello");
    }

    #[test]
    fn test_truncate_ascii_exact() {
        // Exactly at the limit: returned unchanged, no marker appended.
        let s = "a".repeat(500);
        assert_eq!(truncate_content(&s, 500), s);
    }

    #[test]
    fn test_truncate_ascii_long() {
        let s = "a".repeat(600);
        // 500 kept bytes + "..."
        assert_eq!(truncate_content(&s, 500), cut(&"a".repeat(500)));
    }

    #[test]
    fn test_truncate_multibyte_at_boundary() {
        // "→" is 3 bytes (E2 86 92). Place it so byte 500 falls inside it.
        let mut s = "x".repeat(499); // 499 ASCII bytes
        s.push('→'); // bytes 499..502
        s.push_str("after");
        // Truncating at 500 would split "→", so the cut backs up to 499.
        assert_eq!(truncate_content(&s, 500), cut(&"x".repeat(499)));
    }

    #[test]
    fn test_truncate_emoji_boundary() {
        // "🦀" is 4 bytes. Place it at the cut point.
        let mut s = "a".repeat(498);
        s.push('🦀'); // bytes 498..502
        s.push_str("tail");
        // The cut backs up to byte 498, dropping the whole emoji.
        assert_eq!(truncate_content(&s, 500), cut(&"a".repeat(498)));
    }

    #[test]
    fn test_truncate_all_multibyte() {
        // All 2-byte chars: "é" = 2 bytes, 300 of them = 600 bytes.
        let s = "é".repeat(300);
        // 500 bytes / 2 bytes per char = 250 chars, perfectly aligned.
        assert_eq!(truncate_content(&s, 500), cut(&"é".repeat(250)));
    }

    #[test]
    fn test_truncate_empty() {
        assert_eq!(truncate_content("", 500), "");
    }

    #[test]
    fn test_truncate_zero_max() {
        // Nothing can be kept, so only the marker remains.
        assert_eq!(truncate_content("hello", 0), "...");
    }
}
138
139/// Detect language from file path and parse.
140pub fn parse_auto(path: &Path, content: &str) -> Result<Vec<Node>> {
141    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
142    let lang = Language::from_extension(ext);
143    parse_file(path, content, lang)
144}