rusty_files/indexer/
content.rs

1use crate::core::error::Result;
2use crate::core::types::ContentPreview;
3use crate::utils::encoding::{detect_encoding, is_likely_text, read_file_with_encoding};
4use std::fs::File;
5use std::io::Read;
6use std::path::Path;
7
/// Produces previews, snippets and basic statistics for text files.
///
/// Construct with [`ContentAnalyzer::new`] or [`Default::default`].
#[derive(Debug, Clone)]
pub struct ContentAnalyzer {
    /// Size limit in bytes: `analyze` skips larger files, and the same value
    /// is passed to `read_file_with_encoding` on every read.
    max_file_size: u64,
    /// Maximum number of characters kept in a generated preview.
    preview_length: usize,
}
12
13impl ContentAnalyzer {
14    pub fn new(max_file_size: u64) -> Self {
15        Self {
16            max_file_size,
17            preview_length: 1000,
18        }
19    }
20
21    pub fn analyze<P: AsRef<Path>>(&self, path: P) -> Result<Option<ContentPreview>> {
22        let path = path.as_ref();
23        let metadata = std::fs::metadata(path)?;
24
25        if metadata.len() > self.max_file_size {
26            return Ok(None);
27        }
28
29        if !self.is_text_file(path)? {
30            return Ok(None);
31        }
32
33        let content = read_file_with_encoding(path, self.max_file_size)?;
34
35        let preview = if content.len() > self.preview_length {
36            content.chars().take(self.preview_length).collect()
37        } else {
38            content.clone()
39        };
40
41        let word_count = content.split_whitespace().count();
42        let line_count = content.lines().count();
43
44        let mut file = File::open(path)?;
45        let mut buffer = vec![0u8; 8192.min(metadata.len() as usize)];
46        file.read_exact(&mut buffer)?;
47
48        let encoding = detect_encoding(&buffer);
49
50        Ok(Some(ContentPreview {
51            preview,
52            word_count,
53            line_count,
54            encoding: encoding.name().to_string(),
55        }))
56    }
57
58    pub fn analyze_batch<P: AsRef<Path> + Sync>(
59        &self,
60        paths: &[P],
61    ) -> Vec<(usize, Result<Option<ContentPreview>>)> {
62        use rayon::prelude::*;
63
64        paths
65            .par_iter()
66            .enumerate()
67            .map(|(idx, path)| (idx, self.analyze(path.as_ref())))
68            .collect()
69    }
70
71    fn is_text_file<P: AsRef<Path>>(&self, path: P) -> Result<bool> {
72        let mut file = File::open(path)?;
73        let mut buffer = vec![0u8; 8192];
74
75        let bytes_read = file.read(&mut buffer)?;
76        buffer.truncate(bytes_read);
77
78        Ok(is_likely_text(&buffer))
79    }
80
81    pub fn extract_text<P: AsRef<Path>>(&self, path: P, max_length: usize) -> Result<String> {
82        let content = read_file_with_encoding(path, self.max_file_size)?;
83
84        if content.len() > max_length {
85            Ok(content.chars().take(max_length).collect())
86        } else {
87            Ok(content)
88        }
89    }
90
91    pub fn get_snippet<P: AsRef<Path>>(
92        &self,
93        path: P,
94        query: &str,
95        context_chars: usize,
96    ) -> Result<Option<String>> {
97        let content = read_file_with_encoding(path, self.max_file_size)?;
98
99        if let Some(pos) = content.to_lowercase().find(&query.to_lowercase()) {
100            let start = pos.saturating_sub(context_chars);
101            let end = (pos + query.len() + context_chars).min(content.len());
102
103            let snippet: String = content.chars().skip(start).take(end - start).collect();
104            Ok(Some(snippet))
105        } else {
106            Ok(None)
107        }
108    }
109}
110
111impl Default for ContentAnalyzer {
112    fn default() -> Self {
113        Self::new(10 * 1024 * 1024)
114    }
115}
116
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes `contents` to a file called `name` in a fresh temporary directory.
    ///
    /// The `TempDir` guard is handed back so the directory stays alive for the
    /// duration of the calling test.
    fn fixture(name: &str, contents: impl AsRef<[u8]>) -> (TempDir, std::path::PathBuf) {
        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join(name);
        fs::write(&file_path, contents).unwrap();
        (dir, file_path)
    }

    /// A three-line text file yields a preview with matching word and line counts.
    #[test]
    fn test_analyze_text_file() {
        let (_dir, file_path) = fixture(
            "test.txt",
            "Hello world\nThis is a test\nWith multiple lines",
        );

        let outcome = ContentAnalyzer::default().analyze(&file_path).unwrap();
        assert!(outcome.is_some());

        let summary = outcome.unwrap();
        // Hello, world, This, is, a, test, With, multiple, lines -> 9 words.
        assert_eq!(summary.word_count, 9);
        assert_eq!(summary.line_count, 3);
    }

    /// A file consisting only of NUL bytes is rejected as non-text.
    #[test]
    fn test_analyze_binary_file() {
        let (_dir, file_path) = fixture("binary.bin", vec![0u8; 100]);

        let outcome = ContentAnalyzer::default().analyze(&file_path).unwrap();

        assert!(outcome.is_none());
    }

    /// The snippet returned for a query includes the matched word.
    #[test]
    fn test_get_snippet() {
        let (_dir, file_path) = fixture(
            "test.txt",
            "The quick brown fox jumps over the lazy dog",
        );

        let found = ContentAnalyzer::default()
            .get_snippet(&file_path, "brown", 10)
            .unwrap();
        assert!(found.is_some());

        let text = found.unwrap();
        assert!(text.contains("brown"));
    }
}