1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
//! # Module: outline
//!
//! ## Spec
//! - Parses a markdown document (after stripping YAML frontmatter) into a list of heading-delimited sections.
//! - Uses pulldown-cmark for CommonMark-compliant heading detection: ATX (`# …`) and setext (`===`/`---`) headings are recognized; headings inside fenced code blocks are silently ignored.
//! - Each section records: heading text (ATX-normalized), depth (1–6), 1-based start line, content line count, and approximate token count (bytes ÷ 4).
//! - Content before the first heading is emitted as a synthetic `(preamble)` section (depth 0) when non-empty.
//! - `run` outputs either a human-readable table (`--json` false) or compact JSON array (`--json` true) to stdout.
//! - Text output: indented by heading depth, padded columns for lines and tokens, with a `Total` summary row.
//! - JSON output: array of `{"heading","depth","line","lines","tokens"}` objects, no pretty-printing.
//!
//! ## Agentic Contracts
//! - `run(file, json)` — reads the file, returns `Err` if missing; otherwise prints section table/JSON and returns `Ok(())`.
//! - Callers may rely on stable JSON field names and column ordering for downstream parsing.
//! - Token counts are an approximation; callers must not treat them as exact.
//! - Headings inside code fences are guaranteed to be excluded from section output.
//!
//! ## Evals
//! - atx_headings: ATX `#`/`##`/`###` body → correct depth and text per section
//! - setext_headings: `===`/`---` underlined body → ATX-normalized heading strings
//! - code_block_ignored: `# heading` inside triple-backtick fence → not emitted as section
//! - preamble: body with content before first heading → `(preamble)` section at depth 0
//! - empty_doc: empty body → empty section list, no output rows
//! - json_output: single section → valid JSON array with all five fields
use anyhow::Result;
use pulldown_cmark::{Event, HeadingLevel, Options, Parser, Tag};
use std::path::Path;
/// A heading-delimited section of a markdown document.
///
/// Values are produced by `parse_sections`; the synthetic leading section for
/// content before the first heading uses the heading `(preamble)` and depth 0.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Section {
    /// Display heading in ATX form (e.g. `## User`), or `(preamble)` for the
    /// synthetic section that precedes the first heading.
    heading: String,
    /// Heading depth (1 for `#`, 2 for `##`, …, 6); 0 for the preamble.
    depth: usize,
    /// 1-based line number, within the parsed body, where the section starts.
    line: usize,
    /// Number of lines the section spans. As computed by `parse_sections`
    /// this is the distance to the next heading (or to the end of the body),
    /// so it counts the heading line itself.
    lines: usize,
    /// Approximate token count: byte length of the text after the heading
    /// line, divided by 4 and rounded up.
    tokens: usize,
}
/// Print the section outline of the markdown document at `file`.
///
/// The file is read, passed through `crate::frontmatter::parse`, and the
/// resulting body is split into sections. With `json == true` the sections
/// are printed as a compact JSON array, otherwise as a text table.
///
/// # Errors
/// Returns an error when `file` does not exist, cannot be read as UTF-8 text,
/// or when frontmatter parsing fails.
pub fn run(file: &Path, json: bool) -> Result<()> {
    // Checked up front so a missing file yields a clear message rather than
    // a bare OS error from `read_to_string`.
    if !file.exists() {
        anyhow::bail!("file not found: {}", file.display());
    }
    let content = std::fs::read_to_string(file)?;
    // Only the body is outlined; the frontmatter part is discarded here.
    let (_fm, body) = crate::frontmatter::parse(&content)?;
    let sections = parse_sections(body);
    if json {
        print_json(&sections);
    } else {
        print_text(&sections);
    }
    Ok(())
}
/// Collect `(byte_offset, depth, heading_text)` for every heading in `body`
/// using pulldown-cmark.
///
/// Heading detection is delegated entirely to the parser, so ATX headings
/// (`# …`), setext headings (`===` / `---` underlines) and `#` lines inside
/// code blocks are classified by pulldown-cmark rather than by pattern
/// matching here.
///
/// * `byte_offset` is the start of the source range the parser reports for
///   the heading.
/// * `heading_text` is the concatenation of the heading's text and inline-code
///   events; line breaks inside a (multi-line setext) heading become a single
///   space so adjacent words are not glued together.
fn collect_headings(body: &str) -> Vec<(usize, usize, String)> {
    let mut headings = Vec::new();
    // The heading currently being read, if any: (offset, depth, text so far).
    let mut open: Option<(usize, usize, String)> = None;

    for (event, range) in Parser::new_ext(body, Options::empty()).into_offset_iter() {
        match event {
            Event::Start(Tag::Heading { level, .. }) => {
                open = Some((range.start, heading_level_to_depth(level), String::new()));
            }
            Event::End(pulldown_cmark::TagEnd::Heading(_)) => {
                if let Some(done) = open.take() {
                    headings.push(done);
                }
            }
            Event::Text(t) | Event::Code(t) => {
                if let Some((_, _, text)) = open.as_mut() {
                    text.push_str(&t);
                }
            }
            // A setext heading may span several source lines; keep the words
            // on either side of the break separated.
            Event::SoftBreak | Event::HardBreak => {
                if let Some((_, _, text)) = open.as_mut() {
                    text.push(' ');
                }
            }
            _ => {}
        }
    }

    // Defensive: should the event stream end inside a heading, keep what was read.
    if let Some(unfinished) = open {
        headings.push(unfinished);
    }
    headings
}
fn heading_level_to_depth(level: HeadingLevel) -> usize {
match level {
HeadingLevel::H1 => 1,
HeadingLevel::H2 => 2,
HeadingLevel::H3 => 3,
HeadingLevel::H4 => 4,
HeadingLevel::H5 => 5,
HeadingLevel::H6 => 6,
}
}
/// Split `body` into heading-delimited [`Section`]s.
///
/// Headings come from `collect_headings`. Each section runs from its heading
/// line up to (not including) the next heading line, or to the end of `body`.
/// Non-empty content before the first heading becomes a synthetic
/// `(preamble)` section with depth 0.
///
/// For every section:
/// * `heading` is the trimmed source line when it starts with `#`, otherwise
///   (setext headings) a reconstructed `#`-prefixed form of the heading text;
/// * `line` is the 1-based line of the heading within `body`;
/// * `lines` is the number of lines spanned, heading line included;
/// * `tokens` is the byte length of the lines after the heading (joined with
///   `\n`), divided by 4 and rounded up.
///
/// Line lookup uses the real positions of `\n` in `body`, so documents with
/// `\r\n` line endings map heading offsets to the correct line.
fn parse_sections(body: &str) -> Vec<Section> {
    /// Approximate token count for a run of lines: the byte length they would
    /// have when joined with `\n`, divided by four and rounded up.
    fn approx_tokens(chunk: &[&str]) -> usize {
        let text_bytes: usize = chunk.iter().map(|l| l.len()).sum();
        let separators = chunk.len().saturating_sub(1);
        (text_bytes + separators).div_ceil(4)
    }

    /// Finish `open`, whose `line` is still a 0-based index, at line index
    /// `end` (exclusive).
    fn close_section(open: &mut Section, lines: &[&str], end: usize) {
        let start = open.line;
        open.lines = end.saturating_sub(start);
        // `get` instead of indexing: an inverted range yields no content
        // rather than a panic.
        let content = lines.get(start + 1..end).unwrap_or_default();
        open.tokens = approx_tokens(content);
    }

    let lines: Vec<&str> = body.lines().collect();
    let headings = collect_headings(body);

    // Byte offset at which each line begins. Derived from the actual '\n'
    // positions because `str::lines` strips both "\n" and "\r\n", so summing
    // `line.len() + 1` would drift on CRLF input.
    let line_starts: Vec<usize> = std::iter::once(0)
        .chain(body.match_indices('\n').map(|(pos, _)| pos + 1))
        .collect();
    let last_line = lines.len().saturating_sub(1);

    // Map a byte offset to the 0-based index of the line containing it,
    // clamped so the result always indexes into `lines`.
    let line_of = |byte_off: usize| -> usize {
        line_starts
            .partition_point(|&start| start <= byte_off)
            .saturating_sub(1)
            .min(last_line)
    };

    // `Section::line` holds a 0-based index while the list is being built and
    // is converted to 1-based at the end.
    let mut sections: Vec<Section> = Vec::with_capacity(headings.len() + 1);
    for (byte_off, depth, text) in &headings {
        let line_idx = line_of(*byte_off);

        // ATX headings are shown as written; anything else (setext headings)
        // is rendered in an equivalent `#`-prefixed form so the display
        // format is stable.
        let src_line = lines.get(line_idx).copied().unwrap_or("").trim();
        let heading = if src_line.starts_with('#') {
            src_line.to_string()
        } else {
            format!("{} {}", "#".repeat(*depth), text)
        };

        // The previous section ends where this heading begins.
        if let Some(prev) = sections.last_mut() {
            close_section(prev, &lines, line_idx);
        }
        sections.push(Section {
            heading,
            depth: *depth,
            line: line_idx,
            lines: 0,
            tokens: 0,
        });
    }

    // The final section runs to the end of the document.
    if let Some(last) = sections.last_mut() {
        close_section(last, &lines, lines.len());
    }

    // Preamble: everything before the first heading (the whole document when
    // there are no headings). Emitted only when it has a non-zero token count.
    let first_heading_line = sections.first().map_or(lines.len(), |s| s.line);
    let preamble_tokens = approx_tokens(&lines[..first_heading_line]);
    if preamble_tokens > 0 {
        sections.insert(
            0,
            Section {
                heading: "(preamble)".to_string(),
                depth: 0,
                line: 0,
                lines: first_heading_line,
                tokens: preamble_tokens,
            },
        );
    }

    // Convert 0-based line indices to 1-based for display.
    for section in &mut sections {
        section.line += 1;
    }
    sections
}
/// Print `sections` to stdout as a human-readable table.
///
/// One row per section: the heading with its leading `#` markers stripped
/// (the raw heading is shown if nothing would remain), indented by one space
/// per depth level beyond the first, followed by the line and token counts.
/// A `---` separator and a `Total` row close the table.
fn print_text(sections: &[Section]) {
    let (total_lines, total_tokens) = sections
        .iter()
        .fold((0usize, 0usize), |(l, t), s| (l + s.lines, t + s.tokens));

    for section in sections {
        // Depth 0 (preamble) and depth 1 are both flush left.
        let indent = " ".repeat(section.depth.saturating_sub(1));
        let label = match section.heading.trim_start_matches('#').trim() {
            "" => section.heading.as_str(),
            stripped => stripped,
        };
        println!(
            "{}{:<40} {:>4} lines ~{:>5} tokens",
            indent, label, section.lines, section.tokens
        );
    }

    println!("---");
    println!(
        "{:<40} {:>4} lines ~{:>5} tokens",
        "Total", total_lines, total_tokens
    );
}
/// Escape `raw` so it can be embedded between double quotes in JSON.
///
/// Quotes and backslashes are backslash-escaped, and control characters
/// (below U+0020) use their short escape or a `\u00XX` sequence; everything
/// else is passed through unchanged.
fn json_escape(raw: &str) -> String {
    let mut out = String::with_capacity(raw.len());
    for ch in raw.chars() {
        match ch {
            '"' => out.push_str("\\\""),
            '\\' => out.push_str("\\\\"),
            '\n' => out.push_str("\\n"),
            '\r' => out.push_str("\\r"),
            '\t' => out.push_str("\\t"),
            c if (c as u32) < 0x20 => out.push_str(&format!("\\u{:04x}", c as u32)),
            c => out.push(c),
        }
    }
    out
}

/// Print `sections` to stdout as a compact (single-line) JSON array of
/// `{"heading","depth","line","lines","tokens"}` objects, followed by a newline.
///
/// Heading text is escaped so that quotes, backslashes and control characters
/// cannot produce invalid JSON.
fn print_json(sections: &[Section]) {
    let objects: Vec<String> = sections
        .iter()
        .map(|s| {
            format!(
                r#"{{"heading":"{}","depth":{},"line":{},"lines":{},"tokens":{}}}"#,
                json_escape(&s.heading),
                s.depth,
                s.line,
                s.lines,
                s.tokens
            )
        })
        .collect();
    println!("[{}]", objects.join(","));
}
/// Unit tests for heading collection, section parsing and JSON output.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_collect_headings_atx() {
        let body = "# Title\n\n## Section\n\n### Sub\n";
        let headings = collect_headings(body);
        assert_eq!(headings.len(), 3);
        assert_eq!(headings[0].1, 1);
        assert_eq!(headings[0].2, "Title");
        assert_eq!(headings[1].1, 2);
        assert_eq!(headings[1].2, "Section");
        assert_eq!(headings[2].1, 3);
        assert_eq!(headings[2].2, "Sub");
    }

    #[test]
    fn test_collect_headings_no_space_not_heading() {
        // `#NoSpace` is not a valid ATX heading per CommonMark
        let body = "#NoSpace\n\n# Real\n";
        let headings = collect_headings(body);
        assert_eq!(headings.len(), 1);
        assert_eq!(headings[0].2, "Real");
    }

    #[test]
    fn test_collect_headings_inside_code_block_ignored() {
        let body = "```\n# Not a heading\n```\n\n# Real\n";
        let headings = collect_headings(body);
        assert_eq!(headings.len(), 1);
        assert_eq!(headings[0].2, "Real");
    }

    #[test]
    fn test_parse_sections_basic() {
        let body = "## User\n\nHello world\n\n## Assistant\n\nResponse here\n";
        let sections = parse_sections(body);
        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].heading, "## User");
        assert_eq!(sections[0].depth, 2);
        assert_eq!(sections[1].heading, "## Assistant");
        assert_eq!(sections[1].depth, 2);
    }

    #[test]
    fn test_parse_sections_with_preamble() {
        let body = "Some intro text\n\n## First\n\nContent\n";
        let sections = parse_sections(body);
        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].heading, "(preamble)");
        assert_eq!(sections[0].depth, 0);
        assert_eq!(sections[1].heading, "## First");
    }

    #[test]
    fn test_parse_sections_empty() {
        let body = "";
        let sections = parse_sections(body);
        assert!(sections.is_empty());
    }

    #[test]
    fn test_setext_headings() {
        // Setext-style: underlined with === (H1) or --- (H2)
        let body = "Title\n=====\n\nSome content here\n\nSection\n-------\n\nMore content\n";
        let sections = parse_sections(body);
        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].depth, 1);
        assert_eq!(sections[0].heading, "# Title");
        assert_eq!(sections[1].depth, 2);
        assert_eq!(sections[1].heading, "## Section");
    }

    #[test]
    fn test_heading_inside_code_block_ignored() {
        // A `#` heading inside a fenced code block must not create a section
        let body = "## Real\n\nContent\n\n```\n## Fake\n```\n\nmore\n";
        let sections = parse_sections(body);
        assert_eq!(sections.len(), 1);
        assert_eq!(sections[0].heading, "## Real");
    }

    #[test]
    fn test_json_output() {
        // Just ensure it doesn't panic
        let sections = vec![Section {
            heading: "## Test".to_string(),
            depth: 2,
            line: 1,
            lines: 5,
            tokens: 20,
        }];
        print_json(&sections);
    }
}