Skip to main content

tokmd_content/
lib.rs

1//! # tokmd-content
2//!
3//! **Tier 2 (Utilities)**
4//!
5//! Content scanning helpers for analysis. Provides file content inspection
6//! capabilities including reading, hashing, and entropy calculation.
7//!
8//! ## What belongs here
9//! * File content reading (head, tail, lines)
10//! * Text detection
11//! * File integrity hashing (BLAKE3)
12//! * Tag counting (TODOs, FIXMEs)
13//! * Entropy calculation
14//! * Function-level complexity metrics
15//!
16//! ## What does NOT belong here
17//! * File listing (use tokmd-walk)
18//! * File modification
19
20pub mod complexity;
21
22use std::fs::File;
23use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
24use std::path::Path;
25
26use anyhow::{Context, Result};
27
28fn read_head_from_file(file: &mut File, max_bytes: usize) -> Result<Vec<u8>> {
29    use std::io::Read as _;
30    let mut buf = Vec::with_capacity(max_bytes);
31    file.take(max_bytes as u64).read_to_end(&mut buf)?;
32    Ok(buf)
33}
34
35pub fn read_head(path: &Path, max_bytes: usize) -> Result<Vec<u8>> {
36    let mut file =
37        File::open(path).with_context(|| format!("Failed to open {}", path.display()))?;
38    read_head_from_file(&mut file, max_bytes)
39}
40
41pub fn read_head_tail(path: &Path, max_bytes: usize) -> Result<Vec<u8>> {
42    if max_bytes == 0 {
43        return Ok(Vec::new());
44    }
45    let mut file =
46        File::open(path).with_context(|| format!("Failed to open {}", path.display()))?;
47    let size = file
48        .metadata()
49        .with_context(|| format!("Failed to get metadata for {}", path.display()))?
50        .len();
51    if size as usize <= max_bytes {
52        return read_head_from_file(&mut file, max_bytes);
53    }
54
55    let half = max_bytes / 2;
56    let head_len = half.max(1);
57    let tail_len = max_bytes.saturating_sub(head_len);
58
59    let mut head = vec![0u8; head_len];
60    file.read_exact(&mut head)?;
61
62    if tail_len == 0 {
63        return Ok(head);
64    }
65
66    let tail_start = size.saturating_sub(tail_len as u64);
67    file.seek(SeekFrom::Start(tail_start))?;
68    let mut tail = vec![0u8; tail_len];
69    file.read_exact(&mut tail)?;
70
71    head.extend_from_slice(&tail);
72    Ok(head)
73}
74
75pub fn read_lines(path: &Path, max_lines: usize, max_bytes: usize) -> Result<Vec<String>> {
76    if max_lines == 0 || max_bytes == 0 {
77        return Ok(Vec::new());
78    }
79    let file = File::open(path).with_context(|| format!("Failed to open {}", path.display()))?;
80    let reader = BufReader::new(file);
81    let mut lines = Vec::new();
82    let mut bytes = 0usize;
83
84    for line in reader.lines() {
85        let line = line?;
86        bytes += line.len();
87        lines.push(line);
88        if lines.len() >= max_lines || bytes >= max_bytes {
89            break;
90        }
91    }
92
93    Ok(lines)
94}
95
96pub fn read_text_capped(path: &Path, max_bytes: usize) -> Result<String> {
97    let bytes = read_head(path, max_bytes)?;
98    Ok(String::from_utf8_lossy(&bytes).to_string())
99}
100
/// Reports whether `bytes` looks like text: it must contain no NUL byte and
/// must be valid UTF-8 in its entirety. An empty slice counts as text.
pub fn is_text_like(bytes: &[u8]) -> bool {
    let has_nul = bytes.iter().any(|&byte| byte == 0);
    !has_nul && std::str::from_utf8(bytes).is_ok()
}
107
108pub fn hash_bytes(bytes: &[u8]) -> String {
109    blake3::hash(bytes).to_hex().to_string()
110}
111
112pub fn hash_file(path: &Path, max_bytes: usize) -> Result<String> {
113    let bytes = read_head(path, max_bytes)?;
114    Ok(hash_bytes(&bytes))
115}
116
/// Counts case-insensitive, non-overlapping occurrences of each tag in `text`.
///
/// Both the text and every tag are upper-cased before matching. The result
/// has one `(tag, count)` entry per input tag, in input order, with the tag
/// spelled exactly as the caller supplied it. Matching is by substring, so a
/// tag embedded in a longer word is counted as well.
pub fn count_tags(text: &str, tags: &[&str]) -> Vec<(String, usize)> {
    // Upper-case the haystack once rather than once per tag.
    let haystack = text.to_uppercase();

    let mut tally = Vec::with_capacity(tags.len());
    for &tag in tags {
        let needle = tag.to_uppercase();
        let hits = haystack.matches(needle.as_str()).count();
        tally.push((tag.to_owned(), hits));
    }
    tally
}
127
/// Computes the Shannon entropy of `bytes` in bits per byte.
///
/// The result is `0.0` for empty input and for input made of a single
/// repeated value; it grows towards `8.0` as the byte values become more
/// evenly distributed.
pub fn entropy_bits_per_byte(bytes: &[u8]) -> f32 {
    if bytes.is_empty() {
        return 0.0;
    }

    // Histogram of byte values.
    let mut histogram = [0u32; 256];
    for &byte in bytes {
        histogram[usize::from(byte)] += 1;
    }

    let total = bytes.len() as f32;

    // Sum -p * log2(p) over the byte values that occur, in ascending order.
    histogram
        .iter()
        .filter(|&&occurrences| occurrences != 0)
        .fold(0.0f32, |acc, &occurrences| {
            let p = occurrences as f32 / total;
            acc - p * p.log2()
        })
}
147
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates a fresh temporary directory containing a file `name` filled
    /// with `contents`, and returns the directory guard together with the
    /// file's path.
    ///
    /// Callers must keep the returned guard alive for as long as they use the
    /// path: `tempfile` deletes the directory when the guard is dropped.
    fn fixture(name: &str, contents: &[u8]) -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join(name);
        std::fs::write(&path, contents).unwrap();
        (dir, path)
    }

    // ========================
    // read_head tests
    // ========================

    #[test]
    fn test_read_head_empty() {
        let (_tmp, path) = fixture("empty", b"");

        let bytes = read_head(&path, 10).unwrap();
        assert!(bytes.is_empty());
    }

    #[test]
    fn test_read_head_small() {
        let (_tmp, path) = fixture("small", b"hello");

        let bytes = read_head(&path, 10).unwrap();
        assert_eq!(bytes, b"hello");
    }

    #[test]
    fn test_read_head_limit() {
        let (_tmp, path) = fixture("limit", b"hello world");

        let bytes = read_head(&path, 5).unwrap();
        assert_eq!(bytes, b"hello");
    }

    // ========================
    // read_head_tail tests
    // ========================

    #[test]
    fn test_read_head_tail_small() {
        let (_tmp, path) = fixture("small", b"hello");

        // File fits within the budget, so it is returned whole.
        let bytes = read_head_tail(&path, 10).unwrap();
        assert_eq!(bytes, b"hello");
    }

    #[test]
    fn test_read_head_tail_large() {
        let (_tmp, path) = fixture("large", b"0123456789");

        // max_bytes = 4. half=2. head=2 ("01"), tail=2 ("89").
        let bytes = read_head_tail(&path, 4).unwrap();
        assert_eq!(bytes, b"0189");
    }

    #[test]
    fn test_read_head_tail_odd() {
        let (_tmp, path) = fixture("odd", b"0123456789");

        // max_bytes = 5. half=2. head=2 ("01"), tail=3 ("789").
        let bytes = read_head_tail(&path, 5).unwrap();
        assert_eq!(bytes, b"01789");
    }

    // ========================
    // read_lines tests
    // ========================

    #[test]
    fn test_read_lines_returns_actual_content() {
        let (_tmp, path) = fixture("lines.txt", b"first line\nsecond line\nthird line\n");

        let lines = read_lines(&path, 10, 10000).unwrap();
        // Verify actual content, not empty or dummy values
        assert_eq!(lines.len(), 3);
        assert_eq!(lines[0], "first line");
        assert_eq!(lines[1], "second line");
        assert_eq!(lines[2], "third line");
    }

    #[test]
    fn test_read_lines_respects_max_lines_limit() {
        let contents: String = (0..10).map(|i| format!("line {}\n", i)).collect();
        let (_tmp, path) = fixture("many_lines.txt", contents.as_bytes());

        // Request only 3 lines
        let lines = read_lines(&path, 3, 10000).unwrap();
        assert_eq!(lines.len(), 3);
        assert_eq!(lines[0], "line 0");
        assert_eq!(lines[1], "line 1");
        assert_eq!(lines[2], "line 2");
    }

    #[test]
    fn test_read_lines_respects_max_bytes_limit() {
        // Each line is "line NNNN": 9 bytes once the newline is stripped.
        let contents: String = (0..10).map(|i| format!("line {:04}\n", i)).collect();
        let (_tmp, path) = fixture("bytes_limited.txt", contents.as_bytes());

        // Running totals are 9, 18, 27; the 25-byte limit is first reached
        // after the third line, which is still included.
        let lines = read_lines(&path, 100, 25).unwrap();
        assert_eq!(lines.len(), 3, "Expected 3 lines, got {}", lines.len());
        // Verify first line content
        assert_eq!(lines[0], "line 0000");
    }

    #[test]
    fn test_read_lines_bytes_accumulate_correctly() {
        // Four lines of 5 bytes each (newline not counted): 5, 10, 15, 20.
        let (_tmp, path) = fixture("accumulate.txt", b"12345\n67890\nabcde\nfghij\n");

        // Stop at exactly 10 bytes - should get 2 lines
        let lines = read_lines(&path, 100, 10).unwrap();
        assert_eq!(lines.len(), 2, "Should stop after reaching 10 bytes");
        assert_eq!(lines[0], "12345");
        assert_eq!(lines[1], "67890");
    }

    #[test]
    fn test_read_lines_single_line_at_limit() {
        let (_tmp, path) = fixture("single.txt", b"exactlyten\n"); // 10 chars

        // max_lines = 1 should stop after first line
        let lines = read_lines(&path, 1, 10000).unwrap();
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0], "exactlyten");
    }

    #[test]
    fn test_read_lines_bytes_limit_stops_after_reaching_threshold() {
        // 5 bytes per line; the third line must not be read with a limit of 9.
        let (_tmp, path) = fixture("threshold.txt", b"aaaaa\nbbbbb\nccccc\n");

        // With limit of 9 bytes, we should get exactly 2 lines
        // because after first line (5 bytes), bytes=5 < 9, continue
        // after second line (5 bytes), bytes=10 >= 9, break
        let lines = read_lines(&path, 100, 9).unwrap();
        assert_eq!(lines.len(), 2);
    }

    // ========================
    // read_text_capped tests
    // ========================

    #[test]
    fn test_read_text_capped_returns_actual_content() {
        let (_tmp, path) = fixture("text.txt", b"Hello, World!");

        let text = read_text_capped(&path, 100).unwrap();
        // Verify we get actual content, not empty or "xyzzy"
        assert_eq!(text, "Hello, World!");
    }

    #[test]
    fn test_read_text_capped_respects_limit() {
        let (_tmp, path) = fixture(
            "long_text.txt",
            b"The quick brown fox jumps over the lazy dog",
        );

        let text = read_text_capped(&path, 9).unwrap();
        assert_eq!(text, "The quick");
    }

    // ========================
    // hash_file tests
    // ========================

    #[test]
    fn test_hash_file_returns_correct_blake3_hash() {
        let (_tmp, path) = fixture("hash_test.txt", b"test content");

        let hash = hash_file(&path, 1000).unwrap();

        // Verify it's not empty
        assert!(!hash.is_empty());
        // Verify it's 64 hex chars (BLAKE3 output)
        assert_eq!(hash.len(), 64);
        // Verify it matches expected BLAKE3 hash
        let expected = hash_bytes(b"test content");
        assert_eq!(hash, expected);
    }

    #[test]
    fn test_hash_file_deterministic() {
        let (_tmp, path) = fixture("deterministic.txt", b"same content every time");

        let hash1 = hash_file(&path, 1000).unwrap();
        let hash2 = hash_file(&path, 1000).unwrap();
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_hash_file_different_content_different_hash() {
        let (_tmp1, path1) = fixture("file1.txt", b"content A");
        let (_tmp2, path2) = fixture("file2.txt", b"content B");

        let hash1 = hash_file(&path1, 1000).unwrap();
        let hash2 = hash_file(&path2, 1000).unwrap();
        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_hash_file_respects_max_bytes() {
        let (_tmp, path) = fixture("long_file.txt", b"abcdefghij");

        // Hash only first 5 bytes
        let hash_limited = hash_file(&path, 5).unwrap();
        let expected = hash_bytes(b"abcde");
        assert_eq!(hash_limited, expected);

        // Full hash should be different
        let hash_full = hash_file(&path, 1000).unwrap();
        assert_ne!(hash_limited, hash_full);
    }
}