1pub mod complexity;
21
22use std::fs::File;
23use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
24use std::path::Path;
25
26use anyhow::{Context, Result};
27
28fn read_head_from_file(file: &mut File, max_bytes: usize) -> Result<Vec<u8>> {
29 use std::io::Read as _;
30 let mut buf = Vec::with_capacity(max_bytes);
31 file.take(max_bytes as u64).read_to_end(&mut buf)?;
32 Ok(buf)
33}
34
35pub fn read_head(path: &Path, max_bytes: usize) -> Result<Vec<u8>> {
36 let mut file =
37 File::open(path).with_context(|| format!("Failed to open {}", path.display()))?;
38 read_head_from_file(&mut file, max_bytes)
39}
40
41pub fn read_head_tail(path: &Path, max_bytes: usize) -> Result<Vec<u8>> {
42 if max_bytes == 0 {
43 return Ok(Vec::new());
44 }
45 let mut file =
46 File::open(path).with_context(|| format!("Failed to open {}", path.display()))?;
47 let size = file
48 .metadata()
49 .with_context(|| format!("Failed to get metadata for {}", path.display()))?
50 .len();
51 if size as usize <= max_bytes {
52 return read_head_from_file(&mut file, max_bytes);
53 }
54
55 let half = max_bytes / 2;
56 let head_len = half.max(1);
57 let tail_len = max_bytes.saturating_sub(head_len);
58
59 let mut head = vec![0u8; head_len];
60 file.read_exact(&mut head)?;
61
62 if tail_len == 0 {
63 return Ok(head);
64 }
65
66 let tail_start = size.saturating_sub(tail_len as u64);
67 file.seek(SeekFrom::Start(tail_start))?;
68 let mut tail = vec![0u8; tail_len];
69 file.read_exact(&mut tail)?;
70
71 head.extend_from_slice(&tail);
72 Ok(head)
73}
74
75pub fn read_lines(path: &Path, max_lines: usize, max_bytes: usize) -> Result<Vec<String>> {
76 if max_lines == 0 || max_bytes == 0 {
77 return Ok(Vec::new());
78 }
79 let file = File::open(path).with_context(|| format!("Failed to open {}", path.display()))?;
80 let reader = BufReader::new(file);
81 let mut lines = Vec::new();
82 let mut bytes = 0usize;
83
84 for line in reader.lines() {
85 let line = line?;
86 bytes += line.len();
87 lines.push(line);
88 if lines.len() >= max_lines || bytes >= max_bytes {
89 break;
90 }
91 }
92
93 Ok(lines)
94}
95
96pub fn read_text_capped(path: &Path, max_bytes: usize) -> Result<String> {
97 let bytes = read_head(path, max_bytes)?;
98 Ok(String::from_utf8_lossy(&bytes).to_string())
99}
100
/// Heuristically decides whether `bytes` look like text.
///
/// Returns `true` only when the slice contains no NUL byte and is valid
/// UTF-8. An empty slice counts as text.
pub fn is_text_like(bytes: &[u8]) -> bool {
    // The NUL check runs first so the UTF-8 validation is skipped for
    // obviously binary data.
    !bytes.contains(&0) && std::str::from_utf8(bytes).is_ok()
}
107
108pub fn hash_bytes(bytes: &[u8]) -> String {
109 blake3::hash(bytes).to_hex().to_string()
110}
111
112pub fn hash_file(path: &Path, max_bytes: usize) -> Result<String> {
113 let bytes = read_head(path, max_bytes)?;
114 Ok(hash_bytes(&bytes))
115}
116
/// Counts case-insensitive, non-overlapping occurrences of each tag in `text`.
///
/// Both the text and the tags are upper-cased before matching. The result
/// has one `(tag, count)` entry per input tag, in input order, with the tag
/// spelled exactly as it was passed in.
///
/// An empty tag always counts as `0`: an empty pattern would otherwise match
/// at every character boundary and report a meaningless `chars + 1`.
pub fn count_tags(text: &str, tags: &[&str]) -> Vec<(String, usize)> {
    // Upper-case the haystack once; it is shared by every tag.
    let haystack = text.to_uppercase();
    tags.iter()
        .map(|&tag| {
            let needle = tag.to_uppercase();
            let count = if needle.is_empty() {
                0
            } else {
                haystack.matches(needle.as_str()).count()
            };
            (tag.to_owned(), count)
        })
        .collect()
}
127
/// Computes the Shannon entropy of `bytes`, in bits per byte.
///
/// The result ranges from `0.0` (empty input or a single repeated value) to
/// `8.0` (all 256 byte values equally frequent).
pub fn entropy_bits_per_byte(bytes: &[u8]) -> f32 {
    if bytes.is_empty() {
        return 0.0;
    }

    // `usize` counters cannot overflow: no bucket can exceed the slice length.
    // Narrower `u32` counters would wrap (release) or panic (debug) on inputs
    // larger than 4 GiB.
    let mut histogram = [0usize; 256];
    for &byte in bytes {
        histogram[usize::from(byte)] += 1;
    }

    let total = bytes.len() as f32;
    let mut entropy = 0.0f32;
    // Buckets are visited in ascending byte order; empty buckets are skipped
    // because `0 * log2(0)` is taken to be zero.
    for &count in histogram.iter().filter(|&&count| count != 0) {
        let p = count as f32 / total;
        entropy -= p * p.log2();
    }
    entropy
}
147
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Creates a file called `name` inside `dir`, fills it with `contents`
    /// and returns its path. The file is closed before the path is returned.
    fn write_fixture(dir: &tempfile::TempDir, name: &str, contents: &[u8]) -> std::path::PathBuf {
        let path = dir.path().join(name);
        File::create(&path).unwrap().write_all(contents).unwrap();
        path
    }

    // ---- read_head -------------------------------------------------------

    #[test]
    fn test_read_head_empty() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(&tmp, "empty", b"");

        assert!(read_head(&path, 10).unwrap().is_empty());
    }

    #[test]
    fn test_read_head_small() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(&tmp, "small", b"hello");

        assert_eq!(read_head(&path, 10).unwrap(), b"hello");
    }

    #[test]
    fn test_read_head_limit() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(&tmp, "limit", b"hello world");

        assert_eq!(read_head(&path, 5).unwrap(), b"hello");
    }

    // ---- read_head_tail --------------------------------------------------

    #[test]
    fn test_read_head_tail_small() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(&tmp, "small", b"hello");

        assert_eq!(read_head_tail(&path, 10).unwrap(), b"hello");
    }

    #[test]
    fn test_read_head_tail_large() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(&tmp, "large", b"0123456789");

        // Even budget: two bytes from each end.
        assert_eq!(read_head_tail(&path, 4).unwrap(), b"0189");
    }

    #[test]
    fn test_read_head_tail_odd() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(&tmp, "odd", b"0123456789");

        // Odd budget: the extra byte goes to the tail.
        assert_eq!(read_head_tail(&path, 5).unwrap(), b"01789");
    }

    // ---- read_lines ------------------------------------------------------

    #[test]
    fn test_read_lines_returns_actual_content() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(
            &tmp,
            "lines.txt",
            b"first line\nsecond line\nthird line\n",
        );

        let lines = read_lines(&path, 10, 10000).unwrap();
        assert_eq!(lines.len(), 3);
        assert_eq!(lines[0], "first line");
        assert_eq!(lines[1], "second line");
        assert_eq!(lines[2], "third line");
    }

    #[test]
    fn test_read_lines_respects_max_lines_limit() {
        let tmp = tempfile::tempdir().unwrap();
        let body: String = (0..10).map(|i| format!("line {}\n", i)).collect();
        let path = write_fixture(&tmp, "many_lines.txt", body.as_bytes());

        let lines = read_lines(&path, 3, 10000).unwrap();
        assert_eq!(lines.len(), 3);
        assert_eq!(lines[0], "line 0");
        assert_eq!(lines[1], "line 1");
        assert_eq!(lines[2], "line 2");
    }

    #[test]
    fn test_read_lines_respects_max_bytes_limit() {
        let tmp = tempfile::tempdir().unwrap();
        let body: String = (0..10).map(|i| format!("line {:04}\n", i)).collect();
        let path = write_fixture(&tmp, "bytes_limited.txt", body.as_bytes());

        let lines = read_lines(&path, 100, 25).unwrap();
        assert!(
            (2..=4).contains(&lines.len()),
            "Expected 2-4 lines, got {}",
            lines.len()
        );
        assert_eq!(lines[0], "line 0000");
    }

    #[test]
    fn test_read_lines_bytes_accumulate_correctly() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(&tmp, "accumulate.txt", b"12345\n67890\nabcde\nfghij\n");

        let lines = read_lines(&path, 100, 10).unwrap();
        assert_eq!(lines.len(), 2, "Should stop after reaching 10 bytes");
        assert_eq!(lines[0], "12345");
        assert_eq!(lines[1], "67890");
    }

    #[test]
    fn test_read_lines_single_line_at_limit() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(&tmp, "single.txt", b"exactlyten\n");

        let lines = read_lines(&path, 1, 10000).unwrap();
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0], "exactlyten");
    }

    #[test]
    fn test_read_lines_bytes_limit_stops_after_reaching_threshold() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(&tmp, "threshold.txt", b"aaaaa\nbbbbb\nccccc\n");

        // The second line pushes the running total past 9 bytes and is kept.
        let lines = read_lines(&path, 100, 9).unwrap();
        assert_eq!(lines.len(), 2);
    }

    // ---- read_text_capped ------------------------------------------------

    #[test]
    fn test_read_text_capped_returns_actual_content() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(&tmp, "text.txt", b"Hello, World!");

        assert_eq!(read_text_capped(&path, 100).unwrap(), "Hello, World!");
    }

    #[test]
    fn test_read_text_capped_respects_limit() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(
            &tmp,
            "long_text.txt",
            b"The quick brown fox jumps over the lazy dog",
        );

        assert_eq!(read_text_capped(&path, 9).unwrap(), "The quick");
    }

    // ---- hash_file -------------------------------------------------------

    #[test]
    fn test_hash_file_returns_correct_blake3_hash() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(&tmp, "hash_test.txt", b"test content");

        let hash = hash_file(&path, 1000).unwrap();

        assert!(!hash.is_empty());
        assert_eq!(hash.len(), 64);
        assert_eq!(hash, hash_bytes(b"test content"));
    }

    #[test]
    fn test_hash_file_deterministic() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(&tmp, "deterministic.txt", b"same content every time");

        let first = hash_file(&path, 1000).unwrap();
        let second = hash_file(&path, 1000).unwrap();
        assert_eq!(first, second);
    }

    #[test]
    fn test_hash_file_different_content_different_hash() {
        let tmp = tempfile::tempdir().unwrap();
        let path_a = write_fixture(&tmp, "file1.txt", b"content A");
        let path_b = write_fixture(&tmp, "file2.txt", b"content B");

        let hash_a = hash_file(&path_a, 1000).unwrap();
        let hash_b = hash_file(&path_b, 1000).unwrap();
        assert_ne!(hash_a, hash_b);
    }

    #[test]
    fn test_hash_file_respects_max_bytes() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_fixture(&tmp, "long_file.txt", b"abcdefghij");

        // Only the first five bytes take part in the limited hash.
        let hash_limited = hash_file(&path, 5).unwrap();
        assert_eq!(hash_limited, hash_bytes(b"abcde"));

        let hash_full = hash_file(&path, 1000).unwrap();
        assert_ne!(hash_limited, hash_full);
    }
}