#![allow(dead_code)]
use anyhow::Result;
use sha2::{Digest, Sha256};
use std::path::Path;
mod dedup;
mod extractor;
mod fallback;
mod grammar;
mod parser;
mod semantic;
mod tree_sitter;
pub use semantic::SemanticChunker;
/// Default number of context lines.
// NOTE(review): not referenced anywhere in this section; presumably the number
// of neighbouring lines captured around a chunk — confirm against its users.
pub const DEFAULT_CONTEXT_LINES: usize = 3;
/// A piece of a source file together with its location, classification and
/// optional metadata.
///
/// [`Chunk::new`] fills in `content`, the line span, `kind`, `path` and the
/// derived `hash`; every other field starts empty/`None` (and `is_complete`
/// starts `true`) and is left for the caller to populate.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Text of the chunk. `hash` is derived from exactly these bytes.
    pub content: String,
    /// Line where the chunk starts.
    // NOTE(review): indexing base (0 or 1) is not visible here — confirm.
    pub start_line: usize,
    /// Line where the chunk ends.
    // NOTE(review): inclusive vs. exclusive is not visible here — confirm;
    // `line_count` computes `end_line - start_line`.
    pub end_line: usize,
    /// Classification of the chunk (see [`ChunkKind`]).
    pub kind: ChunkKind,
    /// Context strings attached to the chunk; empty after [`Chunk::new`].
    // NOTE(review): presumably enclosing-scope breadcrumbs — confirm.
    pub context: Vec<String>,
    /// Path string supplied to [`Chunk::new`].
    // NOTE(review): presumably the path of the file the chunk came from.
    pub path: String,
    /// Optional signature text; searched by `contains_keyword`.
    pub signature: Option<String>,
    /// Optional docstring text; searched by `contains_keyword`.
    pub docstring: Option<String>,
    /// Set to `true` by [`Chunk::new`].
    // NOTE(review): presumably `false` marks a fragment of a split unit — confirm.
    pub is_complete: bool,
    /// `None` after [`Chunk::new`].
    // NOTE(review): presumably the position of this piece within a split unit.
    pub split_index: Option<usize>,
    /// Lowercase hex SHA-256 digest of `content`, as produced by
    /// [`Chunk::compute_hash`]; compared by `is_duplicate_of`.
    pub hash: String,
    /// Optional text preceding the chunk; `None` after [`Chunk::new`].
    // NOTE(review): exact contents are decided elsewhere — confirm.
    pub context_prev: Option<String>,
    /// Optional text following the chunk; `None` after [`Chunk::new`].
    // NOTE(review): exact contents are decided elsewhere — confirm.
    pub context_next: Option<String>,
}
impl Chunk {
    /// Builds a chunk from its text, line span, kind and source path.
    ///
    /// `hash` is computed from `content` with [`Chunk::compute_hash`]. All
    /// remaining fields start at a neutral value: `context` is empty, the
    /// optional fields are `None`, and `is_complete` is `true`.
    pub fn new(
        content: String,
        start_line: usize,
        end_line: usize,
        kind: ChunkKind,
        path: String,
    ) -> Self {
        // Must be derived before `content` is moved into the struct below.
        let digest = Self::compute_hash(&content);

        Self {
            // Caller-supplied data.
            content,
            path,
            kind,
            start_line,
            end_line,
            // Derived data.
            hash: digest,
            // Defaults the caller may fill in afterwards.
            is_complete: true,
            split_index: None,
            signature: None,
            docstring: None,
            context: Vec::new(),
            context_prev: None,
            context_next: None,
        }
    }

    /// Returns the SHA-256 digest of `content`'s UTF-8 bytes as a lowercase
    /// hexadecimal string.
    pub fn compute_hash(content: &str) -> String {
        let digest = Sha256::digest(content.as_bytes());
        format!("{digest:x}")
    }

    /// Approximates the heap footprint of the chunk's text, in bytes.
    ///
    /// Adds up the `len()` of `content`, of every entry in `context`, and of
    /// `signature`, `docstring`, `context_prev` and `context_next` when they
    /// are present. `path`, `hash`, spare capacity and the struct itself are
    /// not counted.
    pub fn estimate_memory_usage(&self) -> usize {
        // Absent optional strings contribute nothing.
        let optional_len = |text: &Option<String>| text.as_deref().map_or(0, str::len);
        let context_bytes: usize = self.context.iter().map(String::len).sum();

        self.content.len()
            + context_bytes
            + optional_len(&self.signature)
            + optional_len(&self.docstring)
            + optional_len(&self.context_prev)
            + optional_len(&self.context_next)
    }

    /// Reports whether `keyword` occurs as a substring of `content`, of the
    /// signature, or of the docstring (checked in that order).
    ///
    /// The match is an exact substring test; missing signature/docstring are
    /// skipped.
    pub fn contains_keyword(&self, keyword: &str) -> bool {
        if self.content.contains(keyword) {
            return true;
        }

        [self.signature.as_deref(), self.docstring.as_deref()]
            .iter()
            .flatten()
            .any(|text| text.contains(keyword))
    }

    /// Reports whether both chunks carry the same `hash` value.
    ///
    /// Only the stored hashes are compared; path, span and kind are ignored.
    pub fn is_duplicate_of(&self, other: &Chunk) -> bool {
        other.hash == self.hash
    }

    /// Returns `end_line - start_line`, or `0` when `end_line` is smaller
    /// than `start_line`.
    // NOTE(review): if both bounds are inclusive this is one less than the
    // number of lines spanned — confirm the intended convention.
    pub fn line_count(&self) -> usize {
        let Self {
            start_line,
            end_line,
            ..
        } = self;
        end_line.saturating_sub(*start_line)
    }

    /// Returns the length of `content` in bytes (not characters).
    pub fn size_bytes(&self) -> usize {
        let Self { content, .. } = self;
        content.len()
    }
}
/// Classification tag stored in a chunk's `kind` field.
///
/// The tag is a plain, copyable value that supports equality comparison.
/// Which variant a given piece of source receives is decided by the chunker
/// implementations, which are not part of this section.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChunkKind {
    Function,
    Class,
    Method,
    Struct,
    Enum,
    Trait,
    Interface,
    Impl,
    Mod,
    TypeAlias,
    Const,
    Static,
    Block,
    Anchor,
    Comment,
    Imports,
    ModuleDocs,
    Other,
}
/// Common interface for anything that can split a file into [`Chunk`]s.
///
/// Implementors must be `Send + Sync`, so a chunker can be shared across
/// threads.
pub trait Chunker: Send + Sync {
    /// Splits `content` into chunks.
    ///
    /// `path` is passed alongside the text; how it is used is up to the
    /// implementation (presumably language detection and the chunk's `path`
    /// field — confirm against the implementors).
    ///
    /// # Errors
    ///
    /// Returns an `anyhow` error when the implementation fails to chunk the
    /// input; the specific failure modes are implementation-defined.
    fn chunk_file(&self, path: &Path, content: &str) -> Result<Vec<Chunk>>;
}
#[cfg(test)]
mod tests {
    // Nothing from `super` is used yet because the only test below is empty;
    // the `allow` silences the resulting unused-import warning.
    #[allow(unused_imports)]
    use super::*;

    // TODO: placeholder — the body is empty, so this test always passes and
    // verifies nothing about chunking.
    #[test]
    fn test_chunker() {
    }
}