codesearch/chunker/
mod.rs1#![allow(dead_code)]
2
3use anyhow::Result;
4use sha2::{Digest, Sha256};
5use std::path::Path;
6
7mod dedup;
8mod extractor;
9mod fallback;
10mod grammar;
11mod parser;
12mod semantic;
13mod tree_sitter;
14
15pub use semantic::SemanticChunker;
16
/// Default number of context lines, fixed at 3.
///
/// NOTE(review): nothing in this file reads the constant; presumably the
/// chunker implementations in the submodules consume it — confirm against
/// its callers.
pub const DEFAULT_CONTEXT_LINES: usize = 3;
19
20#[derive(Debug, Clone)]
22pub struct Chunk {
23 pub content: String,
25
26 pub start_line: usize,
28
29 pub end_line: usize,
31
32 pub kind: ChunkKind,
34
35 pub context: Vec<String>,
37
38 pub path: String,
40
41 pub signature: Option<String>,
44
45 pub docstring: Option<String>,
47
48 pub is_complete: bool,
50
51 pub split_index: Option<usize>,
53
54 pub hash: String,
56
57 pub context_prev: Option<String>,
59
60 pub context_next: Option<String>,
62}
63
64impl Chunk {
65 pub fn new(
67 content: String,
68 start_line: usize,
69 end_line: usize,
70 kind: ChunkKind,
71 path: String,
72 ) -> Self {
73 let hash = Self::compute_hash(&content);
74
75 Self {
76 content,
77 start_line,
78 end_line,
79 kind,
80 context: Vec::new(),
81 path,
82 signature: None,
83 docstring: None,
84 is_complete: true,
85 split_index: None,
86 hash,
87 context_prev: None,
88 context_next: None,
89 }
90 }
91
92 pub fn compute_hash(content: &str) -> String {
94 let mut hasher = Sha256::new();
95 hasher.update(content.as_bytes());
96 format!("{:x}", hasher.finalize())
97 }
98
99 pub fn estimate_memory_usage(&self) -> usize {
101 let content_size = self.content.len();
102 let context_size = self.context.iter().map(|s| s.len()).sum::<usize>();
103 let signature_size = self.signature.as_ref().map_or(0, |s| s.len());
104 let docstring_size = self.docstring.as_ref().map_or(0, |s| s.len());
105 let context_prev_size = self.context_prev.as_ref().map_or(0, |s| s.len());
106 let context_next_size = self.context_next.as_ref().map_or(0, |s| s.len());
107
108 content_size
109 + context_size
110 + signature_size
111 + docstring_size
112 + context_prev_size
113 + context_next_size
114 }
115
116 pub fn contains_keyword(&self, keyword: &str) -> bool {
118 self.content.contains(keyword)
119 || self.signature.as_ref().is_some_and(|s| s.contains(keyword))
120 || self.docstring.as_ref().is_some_and(|s| s.contains(keyword))
121 }
122
123 pub fn is_duplicate_of(&self, other: &Chunk) -> bool {
125 self.hash == other.hash
126 }
127
128 pub fn line_count(&self) -> usize {
130 self.end_line.saturating_sub(self.start_line)
131 }
132
133 pub fn size_bytes(&self) -> usize {
135 self.content.len()
136 }
137}
138
/// Category tag stored in a chunk's `kind` field.
///
/// NOTE(review): what each variant covers is not visible in this file; the
/// names suggest language constructs (functions, classes, impl blocks, ...)
/// plus a few catch-all buckets — confirm against the code that assigns them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChunkKind {
    Function,
    Class,
    Method,
    Struct,
    Enum,
    Trait,
    Interface,
    Impl,
    Mod,
    TypeAlias,
    Const,
    Static,
    Block,
    Anchor,
    Comment,
    Imports,
    ModuleDocs,
    Other,
}
160
161pub trait Chunker: Send + Sync {
163 fn chunk_file(&self, path: &Path, content: &str) -> Result<Vec<Chunk>>;
165}
166
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a chunk with a fixed kind and path so each test only varies
    /// the inputs it actually checks.
    fn sample(content: &str, start_line: usize, end_line: usize) -> Chunk {
        Chunk::new(
            content.to_string(),
            start_line,
            end_line,
            ChunkKind::Function,
            "src/lib.rs".to_string(),
        )
    }

    #[test]
    fn compute_hash_matches_known_sha256_vectors() {
        assert_eq!(
            Chunk::compute_hash(""),
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
        assert_eq!(
            Chunk::compute_hash("abc"),
            "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
        );
    }

    #[test]
    fn new_fills_hash_and_defaults() {
        let chunk = sample("fn a() {}", 3, 7);

        assert_eq!(chunk.content, "fn a() {}");
        assert_eq!(chunk.start_line, 3);
        assert_eq!(chunk.end_line, 7);
        assert_eq!(chunk.kind, ChunkKind::Function);
        assert_eq!(chunk.path, "src/lib.rs");
        assert_eq!(chunk.hash, Chunk::compute_hash("fn a() {}"));
        assert!(chunk.context.is_empty());
        assert!(chunk.signature.is_none());
        assert!(chunk.docstring.is_none());
        assert!(chunk.is_complete);
        assert!(chunk.split_index.is_none());
        assert!(chunk.context_prev.is_none());
        assert!(chunk.context_next.is_none());
    }

    #[test]
    fn duplicates_are_decided_by_content_hash_only() {
        let first = sample("let x = 1;", 0, 1);
        let same_text_elsewhere = sample("let x = 1;", 40, 41);
        let different_text = sample("let x = 2;", 0, 1);

        assert!(first.is_duplicate_of(&same_text_elsewhere));
        assert!(!first.is_duplicate_of(&different_text));
    }

    #[test]
    fn contains_keyword_checks_content_signature_and_docstring() {
        let mut chunk = sample("body text", 0, 1);
        assert!(chunk.contains_keyword("body"));
        assert!(!chunk.contains_keyword("sig"));
        assert!(!chunk.contains_keyword("doc"));

        chunk.signature = Some("fn sig()".to_string());
        assert!(chunk.contains_keyword("sig"));

        chunk.docstring = Some("doc comment".to_string());
        assert!(chunk.contains_keyword("doc"));

        // Matching is case-sensitive.
        assert!(!chunk.contains_keyword("BODY"));
    }

    #[test]
    fn line_count_is_difference_and_never_underflows() {
        assert_eq!(sample("", 10, 15).line_count(), 5);
        assert_eq!(sample("", 15, 10).line_count(), 0);
    }

    #[test]
    fn size_and_memory_estimate_count_text_bytes() {
        let mut chunk = sample("abc", 0, 1);
        assert_eq!(chunk.size_bytes(), 3);
        // Only `content` contributes while every other text field is empty.
        assert_eq!(chunk.estimate_memory_usage(), 3);

        chunk.context.push("ctx".to_string());
        chunk.signature = Some("sg".to_string());
        chunk.docstring = Some("d".to_string());
        chunk.context_prev = Some("pp".to_string());
        chunk.context_next = Some("n".to_string());
        assert_eq!(chunk.estimate_memory_usage(), 3 + 3 + 2 + 1 + 2 + 1);
    }
}