// rusty_files/indexer/content.rs
use crate::core::error::Result;
use crate::core::types::ContentPreview;
use crate::utils::encoding::{detect_encoding, is_likely_text, read_file_with_encoding};
use std::fs::File;
use std::io::Read;
use std::path::Path;

/// Produces lightweight content summaries (preview text, word/line counts,
/// detected encoding) and search snippets for files on disk.
#[derive(Debug, Clone)]
pub struct ContentAnalyzer {
    /// Size limit in bytes: `analyze` skips files whose metadata reports a
    /// larger length, and the same value is passed to
    /// `read_file_with_encoding` whenever file content is loaded.
    max_file_size: u64,
    /// Maximum number of characters kept in a generated preview.
    preview_length: usize,
}

13impl ContentAnalyzer {
14 pub fn new(max_file_size: u64) -> Self {
15 Self {
16 max_file_size,
17 preview_length: 1000,
18 }
19 }
20
21 pub fn analyze<P: AsRef<Path>>(&self, path: P) -> Result<Option<ContentPreview>> {
22 let path = path.as_ref();
23 let metadata = std::fs::metadata(path)?;
24
25 if metadata.len() > self.max_file_size {
26 return Ok(None);
27 }
28
29 if !self.is_text_file(path)? {
30 return Ok(None);
31 }
32
33 let content = read_file_with_encoding(path, self.max_file_size)?;
34
35 let preview = if content.len() > self.preview_length {
36 content.chars().take(self.preview_length).collect()
37 } else {
38 content.clone()
39 };
40
41 let word_count = content.split_whitespace().count();
42 let line_count = content.lines().count();
43
44 let mut file = File::open(path)?;
45 let mut buffer = vec![0u8; 8192.min(metadata.len() as usize)];
46 file.read_exact(&mut buffer)?;
47
48 let encoding = detect_encoding(&buffer);
49
50 Ok(Some(ContentPreview {
51 preview,
52 word_count,
53 line_count,
54 encoding: encoding.name().to_string(),
55 }))
56 }
57
58 pub fn analyze_batch<P: AsRef<Path> + Sync>(
59 &self,
60 paths: &[P],
61 ) -> Vec<(usize, Result<Option<ContentPreview>>)> {
62 use rayon::prelude::*;
63
64 paths
65 .par_iter()
66 .enumerate()
67 .map(|(idx, path)| (idx, self.analyze(path.as_ref())))
68 .collect()
69 }
70
71 fn is_text_file<P: AsRef<Path>>(&self, path: P) -> Result<bool> {
72 let mut file = File::open(path)?;
73 let mut buffer = vec![0u8; 8192];
74
75 let bytes_read = file.read(&mut buffer)?;
76 buffer.truncate(bytes_read);
77
78 Ok(is_likely_text(&buffer))
79 }
80
81 pub fn extract_text<P: AsRef<Path>>(&self, path: P, max_length: usize) -> Result<String> {
82 let content = read_file_with_encoding(path, self.max_file_size)?;
83
84 if content.len() > max_length {
85 Ok(content.chars().take(max_length).collect())
86 } else {
87 Ok(content)
88 }
89 }
90
91 pub fn get_snippet<P: AsRef<Path>>(
92 &self,
93 path: P,
94 query: &str,
95 context_chars: usize,
96 ) -> Result<Option<String>> {
97 let content = read_file_with_encoding(path, self.max_file_size)?;
98
99 if let Some(pos) = content.to_lowercase().find(&query.to_lowercase()) {
100 let start = pos.saturating_sub(context_chars);
101 let end = (pos + query.len() + context_chars).min(content.len());
102
103 let snippet: String = content.chars().skip(start).take(end - start).collect();
104 Ok(Some(snippet))
105 } else {
106 Ok(None)
107 }
108 }
109}
110
111impl Default for ContentAnalyzer {
112 fn default() -> Self {
113 Self::new(10 * 1024 * 1024)
114 }
115}
116
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// A plain-text file yields a preview with the expected word and line totals.
    #[test]
    fn test_analyze_text_file() {
        let workspace = TempDir::new().unwrap();
        let text_path = workspace.path().join("test.txt");
        fs::write(&text_path, "Hello world\nThis is a test\nWith multiple lines").unwrap();

        let outcome = ContentAnalyzer::default().analyze(&text_path).unwrap();
        assert!(outcome.is_some());

        let summary = outcome.unwrap();
        assert_eq!(summary.word_count, 9);
        assert_eq!(summary.line_count, 3);
    }

    /// A file made only of NUL bytes is not analyzed and produces no preview.
    #[test]
    fn test_analyze_binary_file() {
        let workspace = TempDir::new().unwrap();
        let binary_path = workspace.path().join("binary.bin");
        fs::write(&binary_path, vec![0u8; 100]).unwrap();

        let outcome = ContentAnalyzer::default().analyze(&binary_path).unwrap();

        assert!(outcome.is_none());
    }

    /// The snippet returned for a query that occurs in the file contains that query.
    #[test]
    fn test_get_snippet() {
        let workspace = TempDir::new().unwrap();
        let text_path = workspace.path().join("test.txt");
        fs::write(&text_path, "The quick brown fox jumps over the lazy dog").unwrap();

        let found = ContentAnalyzer::default()
            .get_snippet(&text_path, "brown", 10)
            .unwrap();
        assert!(found.is_some());

        let excerpt = found.unwrap();
        assert!(excerpt.contains("brown"));
    }
}