tokmd-analysis 1.10.0

Analysis logic and enrichers for tokmd receipts.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
//! Content I/O helpers for analysis.
//!
//! These helpers provide file content inspection capabilities including
//! reading, hashing, tag counting, and entropy calculation.
//!
//! ## What belongs here
//! * File content reading (head, tail, lines)
//! * Text detection
//! * File integrity hashing (BLAKE3)
//! * Tag counting (TODOs, FIXMEs)
//! * Entropy calculation
//!
//! ## What does NOT belong here
//! * File listing (use tokmd-scan::walk)
//! * File modification

#![allow(dead_code)]

use std::fs::File;
use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
use std::path::Path;

use anyhow::{Context, Result};

/// Reads up to `max_bytes` bytes from `file`, starting at its current position.
///
/// Fewer bytes are returned when the file ends before the cap is reached.
/// The buffer is pre-sized from the file's reported length (bounded by
/// `max_bytes`) instead of from `max_bytes` itself, so a very large cap such
/// as `usize::MAX` neither over-allocates nor panics with a capacity overflow.
///
/// # Errors
/// Returns an error if reading from `file` fails.
fn read_head_from_file(file: &mut File, max_bytes: usize) -> Result<Vec<u8>> {
    use std::io::Read as _;

    // Widening usize -> u64 cannot lose information on supported targets.
    let limit = max_bytes as u64;
    // Capacity hint only: if the length is unavailable or stale, the vector
    // simply grows while reading.
    let reported_len = file.metadata().map(|meta| meta.len()).unwrap_or(0);
    // `min` keeps the value <= max_bytes, so narrowing back cannot truncate.
    let capacity = reported_len.min(limit) as usize;

    let mut buf = Vec::with_capacity(capacity);
    file.take(limit).read_to_end(&mut buf)?;
    Ok(buf)
}

/// Opens `path` and returns at most its first `max_bytes` bytes.
///
/// # Errors
/// Fails with a "Failed to open <path>" context when the file cannot be
/// opened, or with the underlying error when reading fails.
pub fn read_head(path: &Path, max_bytes: usize) -> Result<Vec<u8>> {
    let opened = File::open(path);
    let mut handle = opened.with_context(|| format!("Failed to open {}", path.display()))?;
    read_head_from_file(&mut handle, max_bytes)
}

/// Samples at most `max_bytes` bytes from `path`, keeping both ends of large files.
///
/// * `max_bytes == 0` returns an empty vector without touching the file.
/// * A file no larger than `max_bytes` is returned in full.
/// * Otherwise the result is the first `max(max_bytes / 2, 1)` bytes followed
///   by the last `max_bytes - head` bytes, concatenated with no separator.
///   When `max_bytes` is odd the tail gets the extra byte.
///
/// # Errors
/// Returns an error if the file cannot be opened, its metadata cannot be
/// read, or a read/seek fails (for example because the file shrank after its
/// size was queried).
pub fn read_head_tail(path: &Path, max_bytes: usize) -> Result<Vec<u8>> {
    if max_bytes == 0 {
        return Ok(Vec::new());
    }
    let mut file =
        File::open(path).with_context(|| format!("Failed to open {}", path.display()))?;
    let size = file
        .metadata()
        .with_context(|| format!("Failed to get metadata for {}", path.display()))?
        .len();
    // Widen the cap rather than narrowing the size: `size as usize` would
    // truncate on 32-bit targets and misclassify files over 4 GiB as small.
    if size <= max_bytes as u64 {
        return read_head_from_file(&mut file, max_bytes);
    }

    // The file is strictly larger than `max_bytes` here, so exactly
    // `max_bytes` bytes are produced and a single allocation is enough.
    let head_len = (max_bytes / 2).max(1);
    let mut sample = vec![0u8; max_bytes];
    let (head, tail) = sample.split_at_mut(head_len);

    file.read_exact(head)
        .with_context(|| format!("Failed to read head of {}", path.display()))?;

    // `tail` is empty only when `max_bytes == 1`; the head is the whole sample then.
    if !tail.is_empty() {
        let tail_start = size.saturating_sub(tail.len() as u64);
        file.seek(SeekFrom::Start(tail_start))
            .with_context(|| format!("Failed to seek in {}", path.display()))?;
        file.read_exact(tail)
            .with_context(|| format!("Failed to read tail of {}", path.display()))?;
    }

    Ok(sample)
}

/// Reads leading lines of `path` until a line or byte budget is reached.
///
/// Lines are returned without their terminators. The byte budget counts line
/// contents only (terminators excluded) and is checked *after* a line has been
/// stored, so the line that crosses `max_bytes` is still returned in full.
/// If either budget is zero the file is not opened and the result is empty.
///
/// # Errors
/// Returns an error if the file cannot be opened or if reading a line fails
/// (which includes lines that are not valid UTF-8).
pub fn read_lines(path: &Path, max_lines: usize, max_bytes: usize) -> Result<Vec<String>> {
    if max_lines == 0 || max_bytes == 0 {
        return Ok(Vec::new());
    }

    let file = File::open(path).with_context(|| format!("Failed to open {}", path.display()))?;
    let mut collected = Vec::new();
    let mut consumed = 0usize;

    // `take` enforces the line budget; the byte budget is tracked by hand.
    for entry in BufReader::new(file).lines().take(max_lines) {
        let text = entry?;
        consumed += text.len();
        collected.push(text);
        if consumed >= max_bytes {
            break;
        }
    }

    Ok(collected)
}

/// Reads at most the first `max_bytes` bytes of `path` and decodes them as text.
///
/// Decoding is lossy: invalid UTF-8 sequences become U+FFFD. The cap is
/// applied to raw bytes, so it can cut a multi-byte character in half, in
/// which case the text ends with a replacement character.
///
/// # Errors
/// Propagates any error from [`read_head`].
pub fn read_text_capped(path: &Path, max_bytes: usize) -> Result<String> {
    let raw = read_head(path, max_bytes)?;
    let decoded = String::from_utf8_lossy(&raw);
    Ok(decoded.into_owned())
}

/// Heuristically decides whether `bytes` look like text.
///
/// A buffer counts as text when it contains no NUL byte and is valid UTF-8.
/// An empty buffer therefore counts as text.
pub fn is_text_like(bytes: &[u8]) -> bool {
    let has_nul = bytes.iter().any(|&b| b == 0);
    !has_nul && std::str::from_utf8(bytes).is_ok()
}

/// Returns the BLAKE3 digest of `bytes` as a hex-encoded string.
pub fn hash_bytes(bytes: &[u8]) -> String {
    let digest = blake3::hash(bytes);
    digest.to_hex().to_string()
}

/// Hashes the first `max_bytes` bytes of the file at `path`.
///
/// Only the bytes returned by [`read_head`] are hashed, so two files that
/// share the same leading `max_bytes` bytes produce the same digest.
///
/// # Errors
/// Propagates any error from [`read_head`].
pub fn hash_file(path: &Path, max_bytes: usize) -> Result<String> {
    read_head(path, max_bytes).map(|head| hash_bytes(&head))
}

/// Counts case-insensitive occurrences of each tag in `text`.
///
/// Both the text and the tags are upper-cased before matching. Matches are
/// plain, non-overlapping substring hits, so `"TODO"` also matches inside
/// `"TODOS"`. The result keeps the order and original spelling of `tags`.
///
/// An empty tag always counts as `0`: an empty pattern would otherwise match
/// at every character boundary and report a meaningless number.
pub fn count_tags(text: &str, tags: &[&str]) -> Vec<(String, usize)> {
    // Upper-case the haystack once; it is shared by every tag.
    let haystack = text.to_uppercase();
    tags.iter()
        .map(|&tag| {
            let count = if tag.is_empty() {
                0
            } else {
                let needle = tag.to_uppercase();
                haystack.matches(needle.as_str()).count()
            };
            (tag.to_owned(), count)
        })
        .collect()
}

/// Computes the Shannon entropy of `bytes` in bits per byte.
///
/// The result lies between `0.0` (every byte identical, or empty input) and
/// `8.0` (all 256 byte values equally frequent).
///
/// Occurrences are tallied in 64-bit counters so that inputs containing
/// 2^32 or more copies of one byte value cannot overflow the histogram.
pub fn entropy_bits_per_byte(bytes: &[u8]) -> f32 {
    if bytes.is_empty() {
        return 0.0;
    }

    let mut histogram = [0u64; 256];
    for &byte in bytes {
        histogram[usize::from(byte)] += 1;
    }

    let total = bytes.len() as f32;
    // Sum -p * log2(p) over the byte values that occur, in byte order.
    histogram
        .iter()
        .filter(|&&occurrences| occurrences != 0)
        .fold(0.0f32, |entropy, &occurrences| {
            let p = occurrences as f32 / total;
            entropy - p * p.log2()
        })
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use std::path::PathBuf;

    /// Creates a file called `name` with `contents` inside a fresh temporary directory.
    ///
    /// The directory guard is returned alongside the path and must be kept
    /// alive for as long as the path is used (`tempfile::TempDir` cleans up
    /// when it is dropped).
    fn fixture(name: &str, contents: &[u8]) -> (tempfile::TempDir, PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join(name);
        let mut file = File::create(&path).unwrap();
        file.write_all(contents).unwrap();
        (dir, path)
    }

    // ========================
    // read_head tests
    // ========================

    #[test]
    fn test_read_head_empty() {
        let (_tmp, path) = fixture("empty", b"");

        let bytes = read_head(&path, 10).unwrap();
        assert!(bytes.is_empty());
    }

    #[test]
    fn test_read_head_small() {
        let (_tmp, path) = fixture("small", b"hello");

        let bytes = read_head(&path, 10).unwrap();
        assert_eq!(bytes, b"hello");
    }

    #[test]
    fn test_read_head_limit() {
        let (_tmp, path) = fixture("limit", b"hello world");

        let bytes = read_head(&path, 5).unwrap();
        assert_eq!(bytes, b"hello");
    }

    #[test]
    fn test_read_head_missing_file_reports_path() {
        let tmp = tempfile::tempdir().unwrap();
        let missing = tmp.path().join("missing");

        let err = read_head(&missing, 10).unwrap_err();
        assert!(err.to_string().contains("Failed to open"));
    }

    // ========================
    // read_head_tail tests
    // ========================

    #[test]
    fn test_read_head_tail_small() {
        let (_tmp, path) = fixture("small", b"hello");

        let bytes = read_head_tail(&path, 10).unwrap();
        assert_eq!(bytes, b"hello");
    }

    #[test]
    fn test_read_head_tail_large() {
        let (_tmp, path) = fixture("large", b"0123456789");

        // max_bytes = 4. half=2. head=2 ("01"), tail=2 ("89").
        let bytes = read_head_tail(&path, 4).unwrap();
        assert_eq!(bytes, b"0189");
    }

    #[test]
    fn test_read_head_tail_odd() {
        let (_tmp, path) = fixture("odd", b"0123456789");

        // max_bytes = 5. half=2. head=2 ("01"), tail=3 ("789").
        let bytes = read_head_tail(&path, 5).unwrap();
        assert_eq!(bytes, b"01789");
    }

    #[test]
    fn test_read_head_tail_zero_budget() {
        // A zero budget returns before the file is opened, so the path need not exist.
        let bytes = read_head_tail(Path::new("does-not-exist"), 0).unwrap();
        assert!(bytes.is_empty());
    }

    #[test]
    fn test_read_head_tail_single_byte_budget() {
        let (_tmp, path) = fixture("single_byte", b"0123456789");

        // max_bytes = 1. half=0, so head is forced to 1 byte and there is no tail.
        let bytes = read_head_tail(&path, 1).unwrap();
        assert_eq!(bytes, b"0");
    }

    // ========================
    // read_lines tests
    // ========================

    #[test]
    fn test_read_lines_returns_actual_content() {
        let (_tmp, path) = fixture("lines.txt", b"first line\nsecond line\nthird line\n");

        let lines = read_lines(&path, 10, 10000).unwrap();
        // Verify actual content, not empty or dummy values
        assert_eq!(lines.len(), 3);
        assert_eq!(lines[0], "first line");
        assert_eq!(lines[1], "second line");
        assert_eq!(lines[2], "third line");
    }

    #[test]
    fn test_read_lines_respects_max_lines_limit() {
        let contents: String = (0..10).map(|i| format!("line {}\n", i)).collect();
        let (_tmp, path) = fixture("many_lines.txt", contents.as_bytes());

        // Request only 3 lines
        let lines = read_lines(&path, 3, 10000).unwrap();
        assert_eq!(lines.len(), 3);
        assert_eq!(lines[0], "line 0");
        assert_eq!(lines[1], "line 1");
        assert_eq!(lines[2], "line 2");
    }

    #[test]
    fn test_read_lines_respects_max_bytes_limit() {
        // Each line is 10 bytes on disk ("line 0000\n"); 9 bytes count towards
        // the budget because the newline is stripped.
        let contents: String = (0..10).map(|i| format!("line {:04}\n", i)).collect();
        let (_tmp, path) = fixture("bytes_limited.txt", contents.as_bytes());

        // Limit to 25 bytes - reading stops once the accumulated line
        // contents reach the limit, which happens after a few lines.
        let lines = read_lines(&path, 100, 25).unwrap();
        assert!(
            lines.len() >= 2 && lines.len() <= 4,
            "Expected 2-4 lines, got {}",
            lines.len()
        );
        // Verify first line content
        assert_eq!(lines[0], "line 0000");
    }

    #[test]
    fn test_read_lines_bytes_accumulate_correctly() {
        // Four lines of 5 counted bytes each: 5, 10, 15, 20 running total.
        let (_tmp, path) = fixture("accumulate.txt", b"12345\n67890\nabcde\nfghij\n");

        // Stop at exactly 10 bytes - should get 2 lines
        let lines = read_lines(&path, 100, 10).unwrap();
        assert_eq!(lines.len(), 2, "Should stop after reaching 10 bytes");
        assert_eq!(lines[0], "12345");
        assert_eq!(lines[1], "67890");
    }

    #[test]
    fn test_read_lines_single_line_at_limit() {
        let (_tmp, path) = fixture("single.txt", b"exactlyten\n");

        // max_lines = 1 should stop after first line
        let lines = read_lines(&path, 1, 10000).unwrap();
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0], "exactlyten");
    }

    #[test]
    fn test_read_lines_bytes_limit_stops_after_reaching_threshold() {
        let (_tmp, path) = fixture("threshold.txt", b"aaaaa\nbbbbb\nccccc\n");

        // With limit of 9 bytes, we should get exactly 2 lines
        // because after first line (5 bytes), bytes=5 < 9, continue
        // after second line (5 bytes), bytes=10 >= 9, break
        let lines = read_lines(&path, 100, 9).unwrap();
        assert_eq!(lines.len(), 2);
    }

    #[test]
    fn test_read_lines_zero_limits() {
        let (_tmp, path) = fixture("zero.txt", b"one\ntwo\n");

        assert!(read_lines(&path, 0, 100).unwrap().is_empty());
        assert!(read_lines(&path, 100, 0).unwrap().is_empty());
    }

    // ========================
    // read_text_capped tests
    // ========================

    #[test]
    fn test_read_text_capped_returns_actual_content() {
        let (_tmp, path) = fixture("text.txt", b"Hello, World!");

        let text = read_text_capped(&path, 100).unwrap();
        // Verify we get actual content, not empty or "xyzzy"
        assert_eq!(text, "Hello, World!");
    }

    #[test]
    fn test_read_text_capped_respects_limit() {
        let (_tmp, path) = fixture(
            "long_text.txt",
            b"The quick brown fox jumps over the lazy dog",
        );

        let text = read_text_capped(&path, 9).unwrap();
        assert_eq!(text, "The quick");
    }

    // ========================
    // is_text_like tests
    // ========================

    #[test]
    fn test_is_text_like() {
        assert!(is_text_like(b"plain text"));
        assert!(is_text_like(b""));
        assert!(!is_text_like(b"nul\0byte"));
        assert!(!is_text_like(&[0xff, 0xfe]));
    }

    // ========================
    // hash_file tests
    // ========================

    #[test]
    fn test_hash_file_returns_correct_blake3_hash() {
        let (_tmp, path) = fixture("hash_test.txt", b"test content");

        let hash = hash_file(&path, 1000).unwrap();

        // Verify it's not empty
        assert!(!hash.is_empty());
        // Verify it's 64 hex chars (BLAKE3 output)
        assert_eq!(hash.len(), 64);
        // Verify it matches expected BLAKE3 hash
        let expected = hash_bytes(b"test content");
        assert_eq!(hash, expected);
    }

    #[test]
    fn test_hash_file_deterministic() {
        let (_tmp, path) = fixture("deterministic.txt", b"same content every time");

        let hash1 = hash_file(&path, 1000).unwrap();
        let hash2 = hash_file(&path, 1000).unwrap();
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_hash_file_different_content_different_hash() {
        let (_tmp1, path1) = fixture("file1.txt", b"content A");
        let (_tmp2, path2) = fixture("file2.txt", b"content B");

        let hash1 = hash_file(&path1, 1000).unwrap();
        let hash2 = hash_file(&path2, 1000).unwrap();
        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_hash_file_respects_max_bytes() {
        let (_tmp, path) = fixture("long_file.txt", b"abcdefghij");

        // Hash only first 5 bytes
        let hash_limited = hash_file(&path, 5).unwrap();
        let expected = hash_bytes(b"abcde");
        assert_eq!(hash_limited, expected);

        // Full hash should be different
        let hash_full = hash_file(&path, 1000).unwrap();
        assert_ne!(hash_limited, hash_full);
    }

    // ========================
    // count_tags tests
    // ========================

    #[test]
    fn test_count_tags_is_case_insensitive() {
        let counts = count_tags("todo one\nTODO two\nFixMe three", &["TODO", "FIXME", "HACK"]);
        assert_eq!(
            counts,
            vec![
                ("TODO".to_string(), 2),
                ("FIXME".to_string(), 1),
                ("HACK".to_string(), 0),
            ]
        );
    }

    // ========================
    // entropy_bits_per_byte tests
    // ========================

    #[test]
    fn test_entropy_bits_per_byte_known_values() {
        assert_eq!(entropy_bits_per_byte(&[]), 0.0);
        assert_eq!(entropy_bits_per_byte(&[42u8; 16]), 0.0);
        assert!((entropy_bits_per_byte(&[0u8, 1, 2, 3]) - 2.0).abs() < 1e-6);
    }
}