Skip to main content

deagle_parse/
lib.rs

//! deagle-parse — tree-sitter based code parser.
//!
//! Extracts code entities (functions, structs, traits, impls, imports)
//! from source files using tree-sitter grammars.
//!
//! ## Feature Flags
//!
//! - `pattern` — structural pattern matching via [ast-grep-core](https://crates.io/crates/ast-grep-core)
//! - `text-search` — enables the `text_search` module
9
10pub mod rust_parser;
11pub mod python_parser;
12pub mod go_parser;
13pub mod typescript_parser;
14pub mod java_parser;
15pub mod c_parser;
16pub mod cpp_parser;
17pub mod ruby_parser;
18
19#[cfg(feature = "pattern")]
20pub mod pattern;
21
22#[cfg(feature = "text-search")]
23pub mod text_search;
24
25use deagle_core::{Language, Node, Result};
26use std::path::Path;
27
28pub use rust_parser::ParseResult;
29
/// Shorten `s` so that no more than `max_bytes` bytes of the original text are kept.
///
/// A string that already fits is returned unchanged. Otherwise the cut point is
/// moved back to the nearest UTF-8 char boundary at or before `max_bytes`, and
/// `"..."` is appended to mark the truncation — so a truncated result can be up
/// to three bytes longer than `max_bytes`.
pub(crate) fn truncate_content(s: &str, max_bytes: usize) -> String {
    if s.len() > max_bytes {
        // Walk backwards from `max_bytes`; index 0 is always a char boundary,
        // so the search cannot come up empty.
        let cut = (0..=max_bytes)
            .rev()
            .find(|&idx| s.is_char_boundary(idx))
            .unwrap_or(0);

        let mut shortened = String::with_capacity(cut + 3);
        shortened.push_str(&s[..cut]);
        shortened.push_str("...");
        shortened
    } else {
        s.to_owned()
    }
}
42
43/// Parse a source file and extract code entities.
44pub fn parse_file(path: &Path, content: &str, language: Language) -> Result<Vec<Node>> {
45    match language {
46        Language::Rust => rust_parser::parse(path, content),
47        Language::Python => python_parser::parse(path, content),
48        Language::Go => go_parser::parse(path, content),
49        Language::TypeScript | Language::JavaScript => typescript_parser::parse(path, content),
50        Language::Java => java_parser::parse(path, content),
51        Language::C => c_parser::parse(path, content),
52        Language::Cpp => cpp_parser::parse(path, content),
53        Language::Ruby => ruby_parser::parse(path, content),
54        _ => Ok(Vec::new()),
55    }
56}
57
58/// Parse with edge extraction — returns nodes and relationship tuples.
59pub fn parse_file_with_edges(path: &Path, content: &str, language: Language) -> Result<ParseResult> {
60    match language {
61        Language::Rust => rust_parser::parse_with_edges(path, content),
62        Language::Python => python_parser::parse_with_edges(path, content),
63        Language::Go => go_parser::parse_with_edges(path, content),
64        Language::TypeScript | Language::JavaScript => typescript_parser::parse_with_edges(path, content),
65        Language::Java => java_parser::parse_with_edges(path, content),
66        Language::C => c_parser::parse_with_edges(path, content),
67        Language::Cpp => cpp_parser::parse_with_edges(path, content),
68        Language::Ruby => ruby_parser::parse_with_edges(path, content),
69        _ => Ok(ParseResult { nodes: Vec::new(), edges: Vec::new() }),
70    }
71}
72
73#[cfg(test)]
74mod tests {
75    use super::truncate_content;
76
77    #[test]
78    fn test_truncate_ascii_short() {
79        assert_eq!(truncate_content("hello", 500), "hello");
80    }
81
82    #[test]
83    fn test_truncate_ascii_exact() {
84        let s = "a".repeat(500);
85        assert_eq!(truncate_content(&s, 500), s);
86    }
87
88    #[test]
89    fn test_truncate_ascii_long() {
90        let s = "a".repeat(600);
91        let result = truncate_content(&s, 500);
92        assert!(result.ends_with("..."));
93        assert!(result.len() <= 503); // 500 + "..."
94    }
95
96    #[test]
97    fn test_truncate_multibyte_at_boundary() {
98        // "→" is 3 bytes (E2 86 92). Place it so byte 500 falls inside it.
99        let mut s = "x".repeat(499); // 499 ASCII bytes
100        s.push('→'); // bytes 499..502
101        s.push_str("after");
102        // Truncating at 500 would split "→". Should back up to 499.
103        let result = truncate_content(&s, 500);
104        assert!(result.ends_with("..."));
105        assert!(!result.contains('→'), "should not include partial char");
106        assert_eq!(&result[..499], &"x".repeat(499));
107    }
108
109    #[test]
110    fn test_truncate_emoji_boundary() {
111        // "🦀" is 4 bytes. Place it at the cut point.
112        let mut s = "a".repeat(498);
113        s.push('🦀'); // bytes 498..502
114        s.push_str("tail");
115        let result = truncate_content(&s, 500);
116        assert!(result.ends_with("..."));
117        // Should back up to byte 498
118        assert_eq!(&result[..498], &"a".repeat(498));
119    }
120
121    #[test]
122    fn test_truncate_all_multibyte() {
123        // All 2-byte chars: "é" = 2 bytes
124        let s: String = std::iter::repeat('é').take(300).collect(); // 600 bytes
125        let result = truncate_content(&s, 500);
126        assert!(result.ends_with("..."));
127        // 500 bytes / 2 bytes per char = 250 chars, perfectly aligned
128        assert_eq!(result.chars().filter(|c| *c == 'é').count(), 250);
129    }
130
131    #[test]
132    fn test_truncate_empty() {
133        assert_eq!(truncate_content("", 500), "");
134    }
135
136    #[test]
137    fn test_truncate_zero_max() {
138        assert_eq!(truncate_content("hello", 0), "...");
139    }
140}
141
142/// Detect language from file path and parse.
143pub fn parse_auto(path: &Path, content: &str) -> Result<Vec<Node>> {
144    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
145    let lang = Language::from_extension(ext);
146    parse_file(path, content, lang)
147}