lindisfarner 0.1.0

A digital scriptorium: illuminate plain text with ASCII-art initials, rubrics, and ornate borders.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
//! The magnifica modes — an art project on the use of AI tools, named for the
//! encyclical *Magnifica Humanitas*.
//!
//! Point lindisfarner at a codebase; it finds where AI is used — hosted-vendor
//! API calls and the ML lifecycle (via the embedded Semgrep rules), plus model
//! weight files on disk — and answers with the words of the encyclical, in one
//! of two modes. **Both write the changes to the files on disk**, then print an
//! illuminated report:
//!
//! - **witness** — annotate the source: insert the encyclical as a comment
//!   beside every AI invocation. The code is not broken.
//! - **relinquish** — strike each AI block out of the source and leave the
//!   encyclical's words in its place, breaking what it touches.
//!
//! The writes go straight to disk with no safety net — keeping a backup (or a
//! clean commit) is the caller's responsibility. Weight files are reported but
//! never modified.

use std::collections::BTreeMap;
use std::fs;
use std::io;
use std::path::Path;

use clap::ValueEnum;
use lindisfarner::{render, render_glossed, Config};

use crate::search::{self, Finding};

/// The detection rules and the encyclical passages, baked into the binary.
const AI_RULES: &str = include_str!("../rules/ai.yml");
const ML_RULES: &str = include_str!("../rules/ml.yml");
const QUOTES: &str = include_str!("../assets/quotes.txt");

/// File extensions that denote trained model weights — found by walking the
/// filesystem, since Semgrep reads only source code, not binary artifacts.
const WEIGHT_EXTENSIONS: &[&str] = &[
    "pt",
    "pth",
    "safetensors",
    "ckpt",
    "h5",
    "hdf5",
    "onnx",
    "gguf",
    "ggml",
    "tflite",
    "joblib",
    "npz",
    "caffemodel",
    "mlmodel",
    "pdparams",
    "pkl",
];

/// Directories not worth walking for weight files.
const SKIP_DIRS: &[&str] = &[
    ".git",
    "target",
    "node_modules",
    ".venv",
    "venv",
    "__pycache__",
    "dist",
    "build",
];

#[derive(Copy, Clone, Debug, ValueEnum)]
pub(crate) enum Mode {
    /// Annotate the source with the encyclical beside every AI invocation.
    Witness,
    /// Strike each AI block out of the source, leaving the encyclical in its place.
    Relinquish,
}

/// Everything a magnifica run needs.
pub(crate) struct Plan<'a> {
    pub mode: Mode,
    pub path: &'a Path,
    pub quotes: Option<&'a Path>,
    pub seed: u64,
    pub cfg: Config,
}

/// Carry out the plan: write the encyclical into the files, then return the
/// illuminated report to print.
pub(crate) fn run(plan: &Plan) -> io::Result<String> {
    let quotes = load_quotes(plan.quotes)?;
    if quotes.is_empty() {
        return Err(io::Error::other("no encyclical passages to draw from"));
    }

    let findings = search::findings(plan.path, &[AI_RULES, ML_RULES])?;
    let weights = weight_files(plan.path);

    // The matched lines — signature, what they are, and where — captured before
    // we rewrite the files (which would change what `signature` reads back).
    let located: Vec<(String, String, String)> = findings
        .iter()
        .map(|f| {
            let label = f
                .message
                .clone()
                .unwrap_or_else(|| "an AI invocation".into());
            (f.signature(), label, f.location())
        })
        .collect();

    if findings.is_empty() {
        if weights.is_empty() {
            return Ok(render(
                "No AI tools were found here. For now, this remains the work of human hands.",
                &plan.cfg,
            ));
        }
        // Only weights — nothing in the source to write.
        return Ok(report(plan, &quotes, &[], &weights, "found"));
    }

    let verb = match plan.mode {
        Mode::Witness => {
            annotate(plan, &quotes, &findings)?;
            "annotated"
        }
        Mode::Relinquish => {
            strike(plan, &quotes, &findings)?;
            "relinquished"
        }
    };
    Ok(report(plan, &quotes, &located, &weights, verb))
}

/// The illuminated report printed after writing: each location with the action
/// taken, the (untouched) weight files, and a reading from the encyclical.
fn report(
    plan: &Plan,
    quotes: &[String],
    located: &[(String, String, String)],
    weights: &[(String, u64)],
    verb: &str,
) -> String {
    let mut rows: Vec<(String, String)> = located
        .iter()
        .map(|(sig, label, loc)| (sig.clone(), format!("{label} · {verb} · {loc}")))
        .collect();
    for (file, size) in weights {
        rows.push((
            file.clone(),
            format!("model weights · {} (left in place)", human_size(*size)),
        ));
    }
    let usage = render_glossed(&rows, &plan.cfg);
    let sermon = render(
        &sermon_text(quotes, plan.seed, rows.len().max(1)),
        &plan.cfg,
    );
    let note = if located.is_empty() {
        String::new()
    } else {
        format!(
            "\n\n  {} location(s) {verb}. Review with `git diff`; undo with `git checkout`.",
            located.len()
        )
    };
    format!("{usage}\n\n{sermon}{note}")
}

// ---- writing the files -----------------------------------------------------

/// witness: insert the encyclical as a comment above every AI invocation,
/// leaving the code itself in place.
fn annotate(plan: &Plan, quotes: &[String], findings: &[Finding]) -> io::Result<()> {
    for (file, group) in group_by_file(findings) {
        let marker = comment_marker(file);
        let mut lines = read_lines(file)?;
        // Insert from the bottom up so earlier line numbers stay valid.
        let mut ordered: Vec<&&Finding> = group.iter().collect();
        ordered.sort_by_key(|f| std::cmp::Reverse(f.line));
        for f in ordered {
            let at = (f.line as usize).saturating_sub(1).min(lines.len());
            let indent = leading_whitespace(lines.get(at).map(String::as_str).unwrap_or(""));
            let block = comment_block(quotes, plan, f.line, &indent, marker);
            lines.splice(at..at, block);
        }
        write_lines(file, &lines)?;
    }
    Ok(())
}

/// relinquish: replace each AI block with the encyclical's words (as comments),
/// breaking the code where it called out.
fn strike(plan: &Plan, quotes: &[String], findings: &[Finding]) -> io::Result<()> {
    for (file, group) in group_by_file(findings) {
        let marker = comment_marker(file);
        let mut lines = read_lines(file)?;
        let mut ordered: Vec<&&Finding> = group.iter().collect();
        ordered.sort_by_key(|f| std::cmp::Reverse(f.line));
        for f in ordered {
            let start = (f.line as usize).saturating_sub(1).min(lines.len());
            let end = (f.end_line as usize).min(lines.len());
            let indent = leading_whitespace(lines.get(start).map(String::as_str).unwrap_or(""));
            let mut block = comment_block(quotes, plan, f.line, &indent, marker);
            block.push(format!("{indent}{marker} (an AI invocation, relinquished)"));
            lines.splice(start..end, block);
        }
        write_lines(file, &lines)?;
    }
    Ok(())
}

/// A passage wrapped into comment lines at the given indentation.
fn comment_block(
    quotes: &[String],
    plan: &Plan,
    n: u64,
    indent: &str,
    marker: &str,
) -> Vec<String> {
    let quote = pick(quotes, plan.seed, n);
    let avail = plan
        .cfg
        .width
        .saturating_sub(indent.len() + marker.len() + 2)
        .max(20);
    word_wrap(quote, avail)
        .into_iter()
        .map(|l| format!("{indent}{marker} {l}"))
        .collect()
}

/// The line-comment marker for a file, by extension.
fn comment_marker(file: &str) -> &'static str {
    match Path::new(file).extension().and_then(|e| e.to_str()) {
        Some("py" | "rb" | "sh" | "bash" | "yaml" | "yml" | "toml" | "pl" | "r") => "#",
        Some("lua" | "sql" | "hs") => "--",
        _ => "//",
    }
}

fn write_lines(file: &str, lines: &[String]) -> io::Result<()> {
    fs::write(file, lines.join("\n")).map_err(|e| io::Error::new(e.kind(), format!("{file}: {e}")))
}

// ---- helpers ---------------------------------------------------------------

fn load_quotes(path: Option<&Path>) -> io::Result<Vec<String>> {
    let text = match path {
        Some(p) => fs::read_to_string(p)
            .map_err(|e| io::Error::new(e.kind(), format!("{}: {e}", p.display())))?,
        None => QUOTES.to_string(),
    };
    Ok(parse_quotes(&text))
}

/// Passages are separated by blank lines; `#` lines are comments.
fn parse_quotes(text: &str) -> Vec<String> {
    text.replace("\r\n", "\n")
        .split("\n\n")
        .map(|block| {
            block
                .lines()
                .filter(|l| !l.trim_start().starts_with('#'))
                .collect::<Vec<_>>()
                .join(" ")
                .split_whitespace()
                .collect::<Vec<_>>()
                .join(" ")
        })
        .filter(|q| !q.is_empty())
        .collect()
}

/// A short reading: up to `want` distinct passages, chosen by the seed.
fn sermon_text(quotes: &[String], seed: u64, want: usize) -> String {
    let want = want.clamp(1, quotes.len().min(6));
    let start = (pick_index(seed, 0) % quotes.len() as u64) as usize;
    (0..want)
        .map(|i| quotes[(start + i) % quotes.len()].clone())
        .collect::<Vec<_>>()
        .join("\n\n")
}

/// Deterministically choose a passage for a given line/index.
fn pick(quotes: &[String], seed: u64, n: u64) -> &str {
    &quotes[(pick_index(seed, n) % quotes.len() as u64) as usize]
}

/// A small deterministic hash (splitmix64).
fn pick_index(seed: u64, n: u64) -> u64 {
    let mut z = seed
        .wrapping_add(n.wrapping_mul(0x9E37_79B9_7F4A_7C15))
        .wrapping_add(0x6D2B_79F5);
    z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
    z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
    z ^ (z >> 31)
}

/// Group findings by their file, preserving order within each file.
fn group_by_file(findings: &[Finding]) -> BTreeMap<&str, Vec<&Finding>> {
    let mut map: BTreeMap<&str, Vec<&Finding>> = BTreeMap::new();
    for f in findings {
        map.entry(f.file.as_str()).or_default().push(f);
    }
    map
}

fn read_lines(file: &str) -> io::Result<Vec<String>> {
    let text =
        fs::read_to_string(file).map_err(|e| io::Error::new(e.kind(), format!("{file}: {e}")))?;
    Ok(text
        .replace("\r\n", "\n")
        .split('\n')
        .map(String::from)
        .collect())
}

/// The leading whitespace of a line, so a replacement sits at its indentation.
fn leading_whitespace(line: &str) -> String {
    line.chars().take_while(|c| c.is_whitespace()).collect()
}

/// Greedily wrap `text` to `width` columns (the passages are plain text).
fn word_wrap(text: &str, width: usize) -> Vec<String> {
    let width = width.max(1);
    let mut lines = Vec::new();
    let mut cur = String::new();
    for word in text.split_whitespace() {
        if cur.is_empty() {
            cur.push_str(word);
        } else if cur.len() + 1 + word.len() <= width {
            cur.push(' ');
            cur.push_str(word);
        } else {
            lines.push(std::mem::take(&mut cur));
            cur.push_str(word);
        }
    }
    if !cur.is_empty() {
        lines.push(cur);
    }
    if lines.is_empty() {
        lines.push(String::new());
    }
    lines
}

// ---- model weights (the filesystem half Semgrep can't do) ------------------

/// Walk `path` for trained-model weight files, returning `(display path, size)`
/// sorted by path. Common heavy directories are skipped.
fn weight_files(path: &Path) -> Vec<(String, u64)> {
    let mut out = Vec::new();
    collect_weights(path, &mut out);
    out.sort();
    out
}

fn collect_weights(path: &Path, out: &mut Vec<(String, u64)>) {
    if path.is_file() {
        if is_weight(path) {
            if let Ok(meta) = path.metadata() {
                out.push((path.display().to_string(), meta.len()));
            }
        }
        return;
    }
    let Ok(entries) = fs::read_dir(path) else {
        return;
    };
    for entry in entries.flatten() {
        let p = entry.path();
        if p.is_dir() {
            let skip = p
                .file_name()
                .and_then(|n| n.to_str())
                .is_some_and(|n| SKIP_DIRS.contains(&n));
            if !skip {
                collect_weights(&p, out);
            }
        } else if is_weight(&p) {
            if let Ok(meta) = p.metadata() {
                out.push((p.display().to_string(), meta.len()));
            }
        }
    }
}

/// Whether a path has a model-weight extension.
fn is_weight(path: &Path) -> bool {
    path.extension()
        .and_then(|e| e.to_str())
        .map(str::to_ascii_lowercase)
        .is_some_and(|e| WEIGHT_EXTENSIONS.contains(&e.as_str()))
}

/// Format a byte count as a human-readable size.
fn human_size(bytes: u64) -> String {
    const UNITS: [&str; 4] = ["B", "KB", "MB", "GB"];
    let mut size = bytes as f64;
    let mut unit = 0;
    while size >= 1024.0 && unit < UNITS.len() - 1 {
        size /= 1024.0;
        unit += 1;
    }
    if unit == 0 {
        format!("{bytes} B")
    } else {
        format!("{size:.1} {}", UNITS[unit])
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn embedded_quotes_parse_cleanly() {
        let quotes = parse_quotes(QUOTES);
        assert!(quotes.len() >= 10, "expected the curated passages");
        assert!(
            quotes.iter().all(|q| !q.starts_with('#')),
            "no comment lines"
        );
        assert!(
            quotes.iter().all(|q| !q.contains("  ")),
            "whitespace normalised"
        );
        assert!(quotes.iter().any(|q| q.contains("Magnifica Humanitas")));
    }

    #[test]
    fn pick_is_deterministic() {
        let quotes = parse_quotes(QUOTES);
        assert_eq!(pick(&quotes, 5, 3), pick(&quotes, 5, 3));
    }

    #[test]
    fn word_wrap_respects_width() {
        let lines = word_wrap("the quick brown fox jumps over the lazy dog", 12);
        assert!(lines.iter().all(|l| l.len() <= 12), "lines fit the width");
        assert_eq!(
            lines.join(" "),
            "the quick brown fox jumps over the lazy dog"
        );
    }

    #[test]
    fn weight_extensions_recognised() {
        assert!(is_weight(Path::new("model.safetensors")));
        assert!(is_weight(Path::new("ckpt/epoch.PT"))); // case-insensitive
        assert!(is_weight(Path::new("a/b/weights.onnx")));
        assert!(!is_weight(Path::new("train.py")));
        assert!(!is_weight(Path::new("README.md")));
    }

    #[test]
    fn human_size_scales() {
        assert_eq!(human_size(512), "512 B");
        assert_eq!(human_size(2048), "2.0 KB");
        assert_eq!(human_size(5 * 1024 * 1024), "5.0 MB");
        assert!(human_size(3 * 1024 * 1024 * 1024).ends_with("GB"));
    }
}