use std::env;
/// Threshold that [`chunk_threshold`] returns when the environment override is
/// missing, unparsable, or zero.
pub const DEFAULT_CHUNK_THRESHOLD: usize = 28_000;
/// Name of the environment variable that [`chunk_threshold`] reads to override
/// [`DEFAULT_CHUNK_THRESHOLD`].
pub const CHUNK_THRESHOLD_ENV: &str = "MX_WAKE_CHUNK_BYTES";
/// Returns the chunk threshold currently in effect.
///
/// The value of the `CHUNK_THRESHOLD_ENV` environment variable is used when it
/// is set, parses as a `usize`, and is greater than zero. In every other case
/// (unset, not valid Unicode, not a number, or `0`) the function falls back to
/// `DEFAULT_CHUNK_THRESHOLD`.
pub fn chunk_threshold() -> usize {
    env::var(CHUNK_THRESHOLD_ENV)
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        // A zero threshold is treated as "not configured".
        .filter(|&bytes| bytes > 0)
        .unwrap_or(DEFAULT_CHUNK_THRESHOLD)
}
/// Describes how a piece of content is cut into consecutive chunks.
///
/// The plan stores offsets only; the content itself is passed back in when a
/// chunk is requested.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkPlan {
    /// Number of chunks described by the plan.
    pub total: u16,
    /// Byte offsets at which one chunk ends and the next one starts.
    /// Chunk `i` ends at `boundaries[i]`; the last chunk ends at the end of
    /// the content instead.
    pub boundaries: Vec<usize>,
    /// Per-chunk flag, looked up by chunk index via [`ChunkPlan::is_oversized`].
    pub oversized: Vec<bool>,
}

impl ChunkPlan {
    /// Returns the text of chunk `idx` as a sub-slice of `content`.
    ///
    /// An index at or beyond `total` yields the empty string.
    ///
    /// # Panics
    ///
    /// Panics if the stored boundaries do not fit `content` (out of range or
    /// not on a UTF-8 character boundary), or if `boundaries` holds fewer
    /// entries than `total - 1`.
    pub fn chunk<'a>(&self, content: &'a str, idx: u16) -> &'a str {
        let (start, end) = self.chunk_range(content, idx);
        &content[start..end]
    }

    /// Returns the half-open byte range `(start, end)` of chunk `idx`.
    ///
    /// An index at or beyond `total` yields the empty range at the end of
    /// `content`.
    ///
    /// # Panics
    ///
    /// Panics if `boundaries` holds fewer entries than `total - 1`.
    pub fn chunk_range(&self, content: &str, idx: u16) -> (usize, usize) {
        let idx = usize::from(idx);
        let total = usize::from(self.total);
        if idx >= total {
            return (content.len(), content.len());
        }
        let start = match idx {
            0 => 0,
            _ => self.boundaries[idx - 1],
        };
        // The last chunk always runs to the end of the content, regardless of
        // how many boundaries are stored.
        let end = if idx + 1 == total {
            content.len()
        } else {
            self.boundaries[idx]
        };
        (start, end)
    }

    /// Returns the oversized flag of chunk `idx`, or `false` when no flag is
    /// stored for that index.
    pub fn is_oversized(&self, idx: u16) -> bool {
        self.oversized.get(usize::from(idx)).copied().unwrap_or(false)
    }

    /// Iterates over all chunks of `content` in order, yielding
    /// `(index, chunk text, oversized flag)`.
    pub fn iter<'a>(&'a self, content: &'a str) -> ChunkPlanIter<'a> {
        ChunkPlanIter {
            plan: self,
            content,
            idx: 0,
        }
    }
}

/// Iterator returned by [`ChunkPlan::iter`].
pub struct ChunkPlanIter<'a> {
    plan: &'a ChunkPlan,
    content: &'a str,
    /// Index of the next chunk to yield.
    idx: u16,
}

impl<'a> Iterator for ChunkPlanIter<'a> {
    type Item = (u16, &'a str, bool);

    fn next(&mut self) -> Option<Self::Item> {
        if self.idx >= self.plan.total {
            return None;
        }
        let idx = self.idx;
        let chunk = self.plan.chunk(self.content, idx);
        let flag = self.plan.is_oversized(idx);
        // Cannot overflow: idx < total <= u16::MAX.
        self.idx += 1;
        Some((idx, chunk, flag))
    }

    /// The number of chunks left is known exactly, which lets `collect` and
    /// friends allocate once.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = usize::from(self.plan.total.saturating_sub(self.idx));
        (remaining, Some(remaining))
    }
}

impl<'a> ExactSizeIterator for ChunkPlanIter<'a> {}
/// Plans how to cut `content` into chunks of at most `threshold` bytes.
///
/// Content that already fits is returned as a single chunk. Otherwise the
/// content is walked front to back; for each window of `threshold` bytes the
/// cut position comes from `find_break`, and when that finds nothing usable,
/// from `recover_past_block`. If neither yields a cut that makes progress and
/// leaves something behind it, the whole remainder becomes the final chunk.
///
/// Guarantees of the returned plan:
/// * `boundaries` is strictly increasing and every boundary lies strictly
///   inside `content`, so no chunk is empty;
/// * `total == boundaries.len() + 1`; if more than `u16::MAX` chunks would be
///   needed, the surplus boundaries are dropped and the last chunk absorbs
///   the rest;
/// * `oversized[i]` is `true` exactly when chunk `i` is longer than
///   `threshold` bytes.
pub fn compute_chunks(content: &str, threshold: usize) -> ChunkPlan {
    if content.len() <= threshold {
        return ChunkPlan {
            total: 1,
            boundaries: Vec::new(),
            oversized: vec![false],
        };
    }

    let fences = find_fence_starts(content);
    let mut boundaries: Vec<usize> = Vec::new();
    let mut cursor = 0;

    // Loop invariant: more than `threshold` bytes remain after `cursor`, so
    // `cursor + threshold` stays below `content.len()` and cannot overflow.
    while content.len() - cursor > threshold {
        let window_end = cursor + threshold;
        let cut = find_break(content, cursor, window_end, &fences)
            .filter(|&pos| pos > cursor)
            .or_else(|| recover_past_block(content, cursor, window_end, &fences))
            .unwrap_or(content.len());

        // No progress, or the cut would sit at the very end of the content:
        // recording it would only append an empty chunk, so the remainder is
        // kept as the (oversized) final chunk instead.
        if cut <= cursor || cut >= content.len() {
            break;
        }
        boundaries.push(cut);
        cursor = cut;
    }

    let total = u16::try_from(boundaries.len() + 1).unwrap_or(u16::MAX);
    // Keep the plan self-consistent when the chunk count saturates.
    boundaries.truncate(usize::from(total) - 1);

    // Derive the flags from the final layout so they can never disagree
    // with the boundaries.
    let mut oversized = Vec::with_capacity(usize::from(total));
    let mut chunk_start = 0;
    for chunk_end in boundaries
        .iter()
        .copied()
        .chain(std::iter::once(content.len()))
    {
        oversized.push(chunk_end - chunk_start > threshold);
        chunk_start = chunk_end;
    }

    ChunkPlan {
        total,
        boundaries,
        oversized,
    }
}
/// Collects the byte offset of every line that begins with three backticks.
///
/// Lines are delimited by `\n` only, and the backticks must sit at the very
/// start of the line (no leading whitespace). Offsets are returned in
/// ascending order.
fn find_fence_starts(content: &str) -> Vec<usize> {
    let mut starts = Vec::new();
    let mut line_offset = 0;
    for line in content.as_bytes().split_inclusive(|&b| b == b'\n') {
        if line.starts_with(b"```") {
            starts.push(line_offset);
        }
        // Each piece includes its trailing newline, so lengths add up to
        // absolute offsets.
        line_offset += line.len();
    }
    starts
}
/// Reports whether `pos` falls inside a fenced block.
///
/// Counts the leading run of entries in `fences` that are strictly below
/// `pos`; an odd count means a fence has been opened but not yet closed.
fn is_inside_fence(fences: &[usize], pos: usize) -> bool {
    let opened_before = fences
        .iter()
        .position(|&fence| fence >= pos)
        .unwrap_or(fences.len());
    opened_before % 2 != 0
}
/// Picks a split position inside `content[start..window_end]`.
///
/// Break patterns are tried from strongest to weakest: horizontal rule,
/// `## ` heading, `### ` heading, blank line, plain newline. For each pattern
/// only its last occurrence in the window is considered, and the split is
/// placed on the byte right after the pattern's leading newline. A candidate
/// is accepted when it lies strictly between `start` and the window end, is
/// not inside a fenced block, and is a character boundary.
///
/// If no pattern qualifies, the window end itself (moved back to a character
/// boundary) is used, provided it is past `start` and outside any fence.
/// Returns `None` when nothing qualifies.
fn find_break(content: &str, start: usize, window_end: usize, fences: &[usize]) -> Option<usize> {
    const LADDER: [&[u8]; 5] = [b"\n---\n", b"\n## ", b"\n### ", b"\n\n", b"\n"];

    let limit = window_end.min(content.len());
    if limit <= start {
        return None;
    }

    let semantic = LADDER.iter().find_map(|pattern| {
        let split = rfind_in_range(content, start, limit, pattern)? + 1;
        let usable = split > start
            && split < limit
            && !is_inside_fence(fences, split)
            && content.is_char_boundary(split);
        if usable {
            Some(split)
        } else {
            None
        }
    });
    if semantic.is_some() {
        return semantic;
    }

    // Hard cut at the window end as a last resort.
    let hard_cut = safe_utf8_fallback(content, limit);
    if hard_cut > start && !is_inside_fence(fences, hard_cut) {
        Some(hard_cut)
    } else {
        None
    }
}
/// Finds the last occurrence of `needle` that lies entirely within the byte
/// range `start..end` of `content` and returns its absolute start offset.
///
/// `end` is clamped to the content length. Returns `None` for an empty
/// needle, an empty or inverted range, or a range shorter than the needle.
fn rfind_in_range(content: &str, start: usize, end: usize, needle: &[u8]) -> Option<usize> {
    let end = end.min(content.len());
    if needle.is_empty() || end <= start || end - start < needle.len() {
        return None;
    }
    content.as_bytes()[start..end]
        .windows(needle.len())
        .rposition(|window| window == needle)
        .map(|offset| start + offset)
}
/// Returns the largest character boundary of `content` that is not greater
/// than `pos`; `pos` is first clamped to the content length.
fn safe_utf8_fallback(content: &str, pos: usize) -> usize {
    let clamped = pos.min(content.len());
    (0..=clamped)
        .rev()
        .find(|&offset| content.is_char_boundary(offset))
        // Offset 0 is always a boundary, so the search cannot come up empty.
        .unwrap_or(0)
}
/// Finds a cut position just past the next fence line at or after
/// `window_end`.
///
/// `fences` holds the ascending byte offsets of fence lines. When an odd
/// number of fences precede `window_end`, the window ends inside a fenced
/// block and the block's closing fence is the first entry at or after
/// `window_end`. A closing fence that starts exactly at `window_end` must be
/// included: cutting there would still separate the block from its closing
/// line. When the window does not end inside a block, only fences strictly
/// after `window_end` are considered.
///
/// The returned offset is the byte after the fence line's newline, or the end
/// of `content` if the fence line is the last line. Returns `None` when there
/// is no such fence or the offset would not lie past `start`.
fn recover_past_block(
    content: &str,
    start: usize,
    window_end: usize,
    fences: &[usize],
) -> Option<usize> {
    let fences_before = fences.iter().take_while(|&&f| f < window_end).count();
    let inside_block = fences_before % 2 == 1;
    let next_fence = fences
        .iter()
        .copied()
        .find(|&f| f > window_end || (inside_block && f == window_end))?;

    let cut = content.as_bytes()[next_fence..]
        .iter()
        .position(|&b| b == b'\n')
        .map_or(content.len(), |nl| next_fence + nl + 1);
    // The cut is either right after an ASCII newline or at the end of the
    // string, so it is always a character boundary.
    debug_assert!(content.is_char_boundary(cut));

    if cut > start {
        Some(cut)
    } else {
        None
    }
}
/// Character budget passed to `cap_chars` for heading-derived phrases and for
/// the title fallback.
const PHRASE_MAX_CHARS: usize = 120;
/// Character budget passed to `cap_chars` for sentence-derived phrases.
const SENTENCE_MAX_CHARS: usize = 100;
/// Character budget passed to `cap_chars` for first-line fallback phrases.
const LINE_FALLBACK_MAX_CHARS: usize = 80;
/// Character budget for the content prefix appended to a synthetic
/// "Part i/n" phrase.
const SYNTHETIC_PREFIX_CHARS: usize = 40;
/// Derives a short, never-empty label for a chunk of content.
///
/// Sources are tried in this order, each capped to its own character budget:
/// 1. the first heading outside fenced blocks;
/// 2. the first sentence, if it is non-blank;
/// 3. the first non-empty line outside fenced blocks;
/// 4. a synthetic "Part i/n" label built from `chunk_idx` and `total`.
pub fn extract_salient_phrase(content: &str, chunk_idx: u16, total: u16) -> String {
    first_heading(content)
        .map(|heading| cap_chars(heading.trim(), PHRASE_MAX_CHARS))
        .or_else(|| {
            first_sentence(content)
                .filter(|sentence| !sentence.trim().is_empty())
                .map(|sentence| cap_chars(sentence.trim(), SENTENCE_MAX_CHARS))
        })
        .or_else(|| {
            first_non_empty_line(content)
                .map(|line| cap_chars(line.trim(), LINE_FALLBACK_MAX_CHARS))
        })
        .unwrap_or_else(|| synthetic_phrase(content, chunk_idx, total))
}
/// Returns the text of the first `#`-prefixed line outside fenced blocks.
///
/// Lines whose first non-whitespace characters are three backticks toggle the
/// fenced state and are never headings themselves. From a heading line all
/// leading `#` characters and at most one following space are removed;
/// headings that are blank after that are skipped. The returned text is not
/// trimmed any further.
fn first_heading(content: &str) -> Option<String> {
    let mut fenced = false;
    content.lines().find_map(|raw| {
        let line = raw.trim_start();
        if line.starts_with("```") {
            fenced = !fenced;
            return None;
        }
        if fenced || !line.starts_with('#') {
            return None;
        }
        let title = line.trim_start_matches('#');
        let title = title.strip_prefix(' ').unwrap_or(title);
        if title.trim().is_empty() {
            None
        } else {
            Some(title.to_owned())
        }
    })
}
/// Returns the leading text of `content` up to whichever comes first: the
/// first blank line (`"\n\n"`) or the first period followed by a space (the
/// period is kept).
///
/// Returns `None` when neither terminator occurs or when the text before it
/// is blank. Fenced blocks are not treated specially here.
fn first_sentence(content: &str) -> Option<String> {
    let paragraph_break = content.find("\n\n");
    let full_stop = content.find(". ").map(|dot| dot + 1);
    let cut = match (paragraph_break, full_stop) {
        (Some(para), Some(stop)) => para.min(stop),
        (Some(only), None) | (None, Some(only)) => only,
        (None, None) => return None,
    };
    let candidate = content[..cut].trim();
    if candidate.is_empty() {
        None
    } else {
        Some(candidate.to_owned())
    }
}
/// Returns the first line outside fenced blocks that is not blank, with
/// surrounding whitespace removed.
///
/// Lines whose first non-whitespace characters are three backticks toggle the
/// fenced state and are skipped, as is everything between them.
fn first_non_empty_line(content: &str) -> Option<String> {
    let mut fenced = false;
    content.lines().find_map(|raw| {
        if raw.trim_start().starts_with("```") {
            fenced = !fenced;
            return None;
        }
        let text = raw.trim();
        if fenced || text.is_empty() {
            None
        } else {
            Some(text.to_owned())
        }
    })
}
/// Builds a positional "Part i/n" label, optionally followed by a short
/// excerpt of the content's first line.
///
/// The displayed index is one-based (`chunk_idx + 1`, saturating), and the
/// displayed total is never smaller than the displayed index. The excerpt is
/// the text before the first `\n` or `\r`, with whitespace runs collapsed and
/// capped to `SYNTHETIC_PREFIX_CHARS`; it is omitted when empty.
fn synthetic_phrase(content: &str, chunk_idx: u16, total: u16) -> String {
    let first_line = content
        .split(|c: char| c == '\n' || c == '\r')
        .next()
        .unwrap_or("");
    let squeezed = first_line.split_whitespace().collect::<Vec<_>>().join(" ");
    let excerpt = cap_chars(squeezed.trim(), SYNTHETIC_PREFIX_CHARS);

    let part = chunk_idx.saturating_add(1);
    let of = total.max(part);
    if excerpt.is_empty() {
        format!("Part {}/{}", part, of)
    } else {
        format!("Part {}/{} — {}", part, of, excerpt)
    }
}
/// Shortens `s` to at most `max` characters and appends `…` when anything was
/// cut off.
///
/// Strings of `max` characters or fewer are returned unchanged. Otherwise the
/// first `max` characters are kept; if the last whitespace among them sits at
/// or beyond the halfway point (measured in characters), the text is cut at
/// that whitespace so the result ends on a word. Trailing whitespace is
/// removed before the ellipsis is added, so the result can be up to
/// `max + 1` characters long.
fn cap_chars(s: &str, max: usize) -> String {
    // Byte offset of the first character that does not fit; `None` means the
    // whole string fits.
    let cutoff = match s.char_indices().nth(max) {
        Some((offset, _)) => offset,
        None => return s.to_string(),
    };
    let head = &s[..cutoff];
    let cut = match head.rfind(char::is_whitespace) {
        // `rfind` yields a byte offset; convert it to a character position
        // before comparing it with the character budget.
        Some(i) if head[..i].chars().count() >= max / 2 => &head[..i],
        _ => head,
    };
    format!("{}…", cut.trim_end())
}
/// Selects how strictly [`compare_phrase`] compares its two arguments.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PhraseMode {
/// Only an exact match (after trimming surrounding whitespace) is accepted.
Authored,
/// Phrases that differ textually are still accepted when their
/// [`normalize_derived`] forms are equal.
Derived,
}
/// Outcome of [`compare_phrase`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PhraseMatch {
/// The trimmed phrases are identical.
Exact,
/// The phrases differ but are equal after [`normalize_derived`]; only
/// produced in [`PhraseMode::Derived`].
Tolerant,
/// The phrases do not match under the selected mode.
Mismatch,
}
/// Compares `input` against `target`.
///
/// Both strings are trimmed first. Identical results are `Exact` in either
/// mode. Otherwise `Authored` mode reports `Mismatch`, while `Derived` mode
/// reports `Tolerant` if the two strings are equal after `normalize_derived`
/// and `Mismatch` if not.
pub fn compare_phrase(input: &str, target: &str, mode: PhraseMode) -> PhraseMatch {
    let (given, wanted) = (input.trim(), target.trim());
    if given == wanted {
        return PhraseMatch::Exact;
    }
    let tolerated = match mode {
        PhraseMode::Authored => false,
        PhraseMode::Derived => normalize_derived(given) == normalize_derived(wanted),
    };
    if tolerated {
        PhraseMatch::Tolerant
    } else {
        PhraseMatch::Mismatch
    }
}
/// Reduces a phrase to a canonical form for tolerant comparison.
///
/// Steps:
/// * lowercase the text;
/// * map curly single quotes and the prime to `'`, curly double quotes and
///   the double prime to `"`, and en/em dashes to `-`;
/// * drop leading whitespace and collapse every inner whitespace run to one
///   space;
/// * remove the whole trailing run of sentence punctuation
///   (`. ! ? … , ; :`) and whitespace, in any interleaving, so that
///   `"done. "` and `"done"` normalize identically.
pub fn normalize_derived(s: &str) -> String {
    let lowered = s.to_lowercase();
    let mut out = String::with_capacity(lowered.len());
    for ch in lowered.chars() {
        if ch.is_whitespace() {
            // `out` only ever contains plain spaces as whitespace, so this
            // check is enough to collapse runs and to skip leading blanks.
            if !out.is_empty() && !out.ends_with(' ') {
                out.push(' ');
            }
            continue;
        }
        out.push(match ch {
            '\u{2018}' | '\u{2019}' | '\u{2032}' => '\'',
            '\u{201C}' | '\u{201D}' | '\u{2033}' => '"',
            '\u{2013}' | '\u{2014}' => '-',
            other => other,
        });
    }

    let kept = out
        .trim_end_matches(|c: char| {
            c.is_whitespace() || matches!(c, '.' | '!' | '?' | '…' | ',' | ';' | ':')
        })
        .len();
    out.truncate(kept);
    out
}
/// Derives a phrase for `content`, falling back to `title` when the content
/// offers nothing usable.
///
/// Sources are tried in this order, each capped to its own character budget:
/// 1. the first heading outside fenced blocks;
/// 2. one of the extracted sentences, chosen by `select_sentence_index` from
///    the content itself so the choice is repeatable;
/// 3. the first non-empty line outside fenced blocks;
/// 4. the trimmed `title`.
pub fn extract_auto_phrase(content: &str, title: &str) -> String {
    if let Some(heading) = first_heading(content) {
        return cap_chars(heading.trim(), PHRASE_MAX_CHARS);
    }

    let sentences = extract_sentences(content);
    let candidates: Vec<&str> = sentences.iter().map(String::as_str).collect();
    if !candidates.is_empty() {
        let picked = candidates[select_sentence_index(&candidates, content)].trim();
        if !picked.is_empty() {
            return cap_chars(picked, SENTENCE_MAX_CHARS);
        }
    }

    let line = first_non_empty_line(content);
    match line.as_deref().map(str::trim) {
        Some(text) if !text.is_empty() => cap_chars(text, LINE_FALLBACK_MAX_CHARS),
        _ => cap_chars(title.trim(), PHRASE_MAX_CHARS),
    }
}
/// Splits the prose of `content` into sentences.
///
/// Fenced blocks (delimited by lines starting with three backticks after
/// optional indentation) are skipped. Remaining lines are trimmed, a leading
/// `- ` or `* ` list marker is removed, and consecutive lines are joined with
/// a single space. A sentence ends at a period followed by a space (the
/// period is kept); a blank line or the end of input also closes whatever
/// text has accumulated. Blank results are never emitted.
fn extract_sentences(content: &str) -> Vec<String> {
    /// Emits the pending text, if any, and resets the buffer.
    fn flush(pending: &mut String, found: &mut Vec<String>) {
        let text = pending.trim();
        if !text.is_empty() {
            found.push(text.to_owned());
        }
        pending.clear();
    }

    let mut found = Vec::new();
    let mut pending = String::new();
    let mut fenced = false;

    for raw in content.lines() {
        if raw.trim_start().starts_with("```") {
            fenced = !fenced;
            continue;
        }
        if fenced {
            continue;
        }

        let text = raw.trim();
        if text.is_empty() {
            // A blank line ends the current paragraph.
            flush(&mut pending, &mut found);
            continue;
        }
        let text = text
            .strip_prefix("- ")
            .or_else(|| text.strip_prefix("* "))
            .unwrap_or(text);

        if !pending.is_empty() {
            pending.push(' ');
        }
        pending.push_str(text);

        // Peel off every complete sentence now sitting in the buffer.
        while let Some(dot) = pending.find(". ") {
            let sentence = pending[..=dot].trim();
            if !sentence.is_empty() {
                found.push(sentence.to_owned());
            }
            pending.drain(..dot + 2);
        }
    }

    flush(&mut pending, &mut found);
    found
}
/// Picks an index into `sentences` from a hash of `content`.
///
/// The hash is a wrapping base-31 polynomial over the bytes of `content`, so
/// the same content always selects the same index. The reduction is done in
/// `u64` so that 32-bit and 64-bit targets agree on the result.
///
/// Returns `0` when `sentences` is empty; callers must check for emptiness
/// before using the result as an index.
fn select_sentence_index(sentences: &[&str], content: &str) -> usize {
    if sentences.is_empty() {
        return 0;
    }
    let seed = content
        .bytes()
        .fold(0u64, |acc, b| acc.wrapping_mul(31).wrapping_add(u64::from(b)));
    // The remainder is below `sentences.len()`, so it always fits in `usize`.
    (seed % sentences.len() as u64) as usize
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn below_threshold_is_one_chunk() {
let plan = compute_chunks("tiny content", 28_000);
assert_eq!(plan.total, 1);
assert!(plan.boundaries.is_empty());
assert_eq!(plan.chunk("tiny content", 0), "tiny content");
assert!(!plan.is_oversized(0));
}
#[test]
fn exact_threshold_is_one_chunk() {
let content = "a".repeat(100);
let plan = compute_chunks(&content, 100);
assert_eq!(plan.total, 1);
}
#[test]
fn just_over_threshold_chunks_to_size_bound() {
let content = "a".repeat(201);
let plan = compute_chunks(&content, 100);
assert!(plan.total >= 2);
let joined: String = plan
.iter(&content)
.map(|(_, s, _)| s)
.collect::<Vec<_>>()
.join("");
assert_eq!(joined, content);
for (_, chunk, oversized) in plan.iter(&content) {
if !oversized {
assert!(chunk.len() <= 100, "chunk {} over threshold", chunk.len());
}
}
}
#[test]
fn horizontal_rule_preferred_over_paragraph() {
let mut content = String::new();
content.push_str(&"x".repeat(50));
content.push_str("\n---\n");
content.push_str(&"y".repeat(15));
content.push_str("\n\n");
content.push_str(&"z".repeat(80));
let plan = compute_chunks(&content, 100);
assert!(plan.total >= 2);
let chunk0 = plan.chunk(&content, 0);
assert!(
chunk0.ends_with('\n'),
"chunk 0 should end at HR newline: {:?}",
chunk0
);
}
#[test]
fn falls_through_to_h2_header() {
let mut content = String::new();
content.push_str(&"x".repeat(50));
content.push_str("\n## Section Two\n");
content.push_str(&"y".repeat(200));
let plan = compute_chunks(&content, 100);
assert!(plan.total >= 2);
let chunk1 = plan.chunk(&content, 1);
assert!(
chunk1.starts_with("## Section Two"),
"chunk 1 starts with H2: {:?}",
&chunk1[..30.min(chunk1.len())]
);
}
#[test]
fn falls_through_to_paragraph_break() {
let mut content = String::new();
content.push_str(&"x".repeat(50));
content.push_str("\n\n");
content.push_str(&"y".repeat(150));
let plan = compute_chunks(&content, 100);
assert!(plan.total >= 2);
}
#[test]
fn no_semantic_breaks_uses_utf8_fallback() {
let content = "A".repeat(250);
let plan = compute_chunks(&content, 100);
assert!(plan.total >= 2);
let joined: String = plan
.iter(&content)
.map(|(_, s, _)| s)
.collect::<Vec<_>>()
.join("");
assert_eq!(joined, content);
}
#[test]
fn utf8_boundary_safety_emoji() {
let prefix = "a".repeat(98);
let mut content = prefix.clone();
content.push('\u{1F41F}'); content.push_str(&"b".repeat(200));
let plan = compute_chunks(&content, 100);
for &b in &plan.boundaries {
assert!(
content.is_char_boundary(b),
"boundary {} not on char boundary",
b
);
}
let joined: String = plan
.iter(&content)
.map(|(_, s, _)| s)
.collect::<Vec<_>>()
.join("");
assert_eq!(joined, content);
}
#[test]
fn code_block_not_split_mid_block() {
let mut content = String::new();
content.push_str(&"x".repeat(20));
content.push_str("\n```rust\n");
content.push_str(&"fn a() {}\n".repeat(20)); content.push_str("```\n");
content.push_str(&"y".repeat(300));
let threshold = 120;
let plan = compute_chunks(&content, threshold);
let fence_open = content.find("```").unwrap();
let fence_close = content.rfind("```").unwrap();
for &b in &plan.boundaries {
assert!(
b <= fence_open || b > fence_close,
"boundary {} landed inside code block [{}, {}]",
b,
fence_open,
fence_close
);
}
}
#[test]
fn oversized_code_block_flagged() {
let mut content = String::new();
content.push_str("intro\n\n");
content.push_str("```\n");
content.push_str(&"A".repeat(500));
content.push_str("\n```\n");
content.push_str(&"tail".repeat(50));
let plan = compute_chunks(&content, 100);
assert!(
plan.oversized.iter().any(|&f| f),
"expected at least one oversized chunk, got {:?}",
plan.oversized
);
}
#[test]
fn chunk_count_beyond_u8_is_not_truncated() {
let threshold = 25;
let content = "a".repeat(15_000);
let plan = compute_chunks(&content, threshold);
assert_eq!(
plan.total as usize,
plan.boundaries.len() + 1,
"total {} does not match boundaries.len() + 1 = {}",
plan.total,
plan.boundaries.len() + 1,
);
assert!(
plan.total > 255,
"expected >255 chunks to exercise the u8-saturation regression, got {}",
plan.total
);
let joined: String = plan
.iter(&content)
.map(|(_, s, _)| s)
.collect::<Vec<_>>()
.join("");
assert_eq!(joined, content);
let last_idx = plan.total - 1;
let last = plan.chunk(&content, last_idx);
assert!(last.len() <= threshold + 8, "last chunk over threshold");
}
/// Checks that `chunk_threshold` honours a valid override and falls back to
/// the default for an unparsable value and for zero. The previous value of the
/// variable is captured first and restored at the end.
#[test]
fn env_var_overrides_threshold() {
let prev = env::var(CHUNK_THRESHOLD_ENV).ok();
// SAFETY: modifying the process environment is only sound while no other
// thread reads or writes it concurrently.
// NOTE(review): the test harness may run tests on several threads; confirm
// that no other test touches the environment while this one runs.
unsafe {
env::set_var(CHUNK_THRESHOLD_ENV, "50");
}
assert_eq!(chunk_threshold(), 50);
// SAFETY: same single-writer assumption as above.
unsafe {
env::set_var(CHUNK_THRESHOLD_ENV, "not-a-number");
}
assert_eq!(chunk_threshold(), DEFAULT_CHUNK_THRESHOLD);
// SAFETY: same single-writer assumption as above.
unsafe {
env::set_var(CHUNK_THRESHOLD_ENV, "0");
}
assert_eq!(chunk_threshold(), DEFAULT_CHUNK_THRESHOLD);
// Put the variable back the way it was found. This is skipped if an assertion
// above panics.
// SAFETY: same single-writer assumption as above.
unsafe {
match prev {
Some(v) => env::set_var(CHUNK_THRESHOLD_ENV, v),
None => env::remove_var(CHUNK_THRESHOLD_ENV),
}
}
}
#[test]
fn phrase_from_heading_preferred() {
let content = "\n## Token semantics\n\nThe token signs (session_id, step).";
let p = extract_salient_phrase(content, 0, 1);
assert_eq!(p, "Token semantics");
}
#[test]
fn phrase_heading_strips_all_hash_levels() {
let content = "#### Deep heading\n\nbody";
let p = extract_salient_phrase(content, 0, 1);
assert_eq!(p, "Deep heading");
}
#[test]
fn phrase_from_first_sentence_when_no_heading() {
let content = "The wake ritual walks a cascade. It uses chunks now.";
let p = extract_salient_phrase(content, 0, 1);
assert_eq!(p, "The wake ritual walks a cascade.");
}
#[test]
fn phrase_from_first_line_truncated() {
let content = "word ".repeat(50); let p = extract_salient_phrase(&content, 0, 1);
assert!(
p.chars().count() <= LINE_FALLBACK_MAX_CHARS + 1,
"got {} chars",
p.chars().count()
);
assert!(!p.is_empty());
}
#[test]
fn phrase_synthetic_for_empty_input() {
let p = extract_salient_phrase("", 0, 3);
assert_eq!(p, "Part 1/3");
}
#[test]
fn phrase_synthetic_for_whitespace_only() {
let p = extract_salient_phrase(" \n\n\n ", 2, 5);
assert_eq!(p, "Part 3/5");
}
#[test]
fn phrase_never_empty() {
let cases = ["", " ", "\n", "\n\n", "\t\t", "a", ".", "\u{200B}"];
for c in cases {
let p = extract_salient_phrase(c, 0, 1);
assert!(!p.is_empty(), "empty phrase for input {:?}", c);
}
}
#[test]
fn phrase_abbreviation_splits_at_period_space_known_limitation() {
let content = "See Dr. Smith for details. He prescribes two aspirin.";
let p = extract_salient_phrase(content, 0, 1);
assert_eq!(p, "See Dr.");
}
#[test]
fn phrase_heading_skips_inside_fenced_code_block() {
let content = "Intro paragraph without a heading. More prose here.\n\n\
```markdown\n\
## fake heading in code\n\
more code lines\n\
```\n\
trailing prose line.";
let p = extract_salient_phrase(content, 0, 1);
assert!(
!p.contains("fake heading in code"),
"heading extractor descended into fenced block: {:?}",
p
);
assert!(
p.starts_with("Intro paragraph"),
"expected first-sentence fallback, got {:?}",
p
);
}
#[test]
fn phrase_heading_after_fenced_block_is_picked() {
let content = "Intro prose.\n\n\
```rust\n\
// ## not a heading\n\
fn x() {}\n\
```\n\n\
## Real Heading\n\n\
body text.";
let p = extract_salient_phrase(content, 0, 1);
assert_eq!(p, "Real Heading");
}
#[test]
fn phrase_heading_between_two_fenced_blocks_is_picked() {
let content = "\
```\n\
## fake one\n\
```\n\n\
## Real Heading\n\n\
body\n\n\
```\n\
## fake two\n\
```\n";
let p = extract_salient_phrase(content, 0, 1);
assert_eq!(p, "Real Heading");
}
#[test]
fn phrase_pure_fenced_chunk_has_no_extractable_heading() {
let content = "\
```markdown\n\
## fake heading inside code\n\
more code\n\
```\n";
let p = extract_salient_phrase(content, 0, 1);
assert!(
!p.contains("fake heading inside code"),
"fence-only chunk returned fake heading: {:?}",
p
);
assert!(!p.is_empty());
}
#[test]
fn compare_authored_exact_match() {
let r = compare_phrase(
"Rust is memory-safe",
"Rust is memory-safe",
PhraseMode::Authored,
);
assert_eq!(r, PhraseMatch::Exact);
}
#[test]
fn compare_authored_case_mismatch_is_reject() {
let r = compare_phrase(
"rust is memory-safe",
"Rust is memory-safe",
PhraseMode::Authored,
);
assert_eq!(r, PhraseMatch::Mismatch);
}
#[test]
fn compare_derived_case_tolerant() {
let r = compare_phrase("Token semantics", "token semantics", PhraseMode::Derived);
assert_eq!(r, PhraseMatch::Tolerant);
}
#[test]
fn compare_derived_trailing_punct_stripped() {
let r = compare_phrase(
"The wake ritual walks a cascade",
"The wake ritual walks a cascade.",
PhraseMode::Derived,
);
assert_eq!(r, PhraseMatch::Tolerant);
}
/// A run of mixed whitespace inside the input must be treated like a single
/// space in derived mode. The input has to differ textually from the target;
/// with identical strings the comparison would short-circuit to `Exact` and
/// never exercise the whitespace collapsing.
#[test]
fn compare_derived_whitespace_collapsed() {
    let r = compare_phrase("token \t semantics", "token semantics", PhraseMode::Derived);
    assert_eq!(r, PhraseMatch::Tolerant);
}
#[test]
fn compare_derived_smart_quotes_normalized() {
let r = compare_phrase(
"it's \u{201C}alive\u{201D}",
"it\u{2019}s \"alive\"",
PhraseMode::Derived,
);
assert_eq!(r, PhraseMatch::Tolerant);
}
#[test]
fn compare_derived_mismatch_still_mismatches() {
let r = compare_phrase("totally different", "token semantics", PhraseMode::Derived);
assert_eq!(r, PhraseMatch::Mismatch);
}
use proptest::prelude::*;
fn threshold_strategy() -> impl Strategy<Value = usize> {
prop_oneof![
Just(50usize),
Just(100usize),
Just(256usize),
Just(1024usize),
Just(4096usize),
]
}
proptest! {
#[test]
fn prop_reconstitution(
content in "\\PC{0,8192}", threshold in threshold_strategy(),
) {
let plan = compute_chunks(&content, threshold);
let joined: String = plan.iter(&content).map(|(_, s, _)| s).collect::<Vec<_>>().join("");
prop_assert_eq!(joined, content);
}
#[test]
fn prop_all_boundaries_are_char_boundaries(
content in "\\PC{0,8192}",
threshold in threshold_strategy(),
) {
let plan = compute_chunks(&content, threshold);
for &b in &plan.boundaries {
prop_assert!(content.is_char_boundary(b), "byte {} is not a char boundary", b);
}
}
#[test]
fn prop_boundaries_strictly_increasing(
content in "\\PC{0,8192}",
threshold in threshold_strategy(),
) {
let plan = compute_chunks(&content, threshold);
for pair in plan.boundaries.windows(2) {
prop_assert!(pair[0] < pair[1], "non-monotonic boundaries: {:?}", plan.boundaries);
}
}
#[test]
fn prop_determinism(
content in "\\PC{0,4096}",
threshold in threshold_strategy(),
) {
let a = compute_chunks(&content, threshold);
let b = compute_chunks(&content, threshold);
prop_assert_eq!(a, b);
}
#[test]
fn prop_chunk_size_bound(
content in "\\PC{0,8192}",
threshold in threshold_strategy(),
) {
let plan = compute_chunks(&content, threshold);
for (_, chunk, oversized) in plan.iter(&content) {
if oversized {
prop_assert!(
chunk.len() <= content.len(),
"oversized chunk larger than input: {} vs {}",
chunk.len(), content.len()
);
} else {
prop_assert!(
chunk.len() <= threshold,
"non-oversized chunk {} > threshold {}",
chunk.len(), threshold
);
}
}
}
#[test]
fn prop_phrase_never_empty(content in "\\PC{0,4096}", idx in 0u16..10, total in 1u16..10) {
let p = extract_salient_phrase(&content, idx, total);
prop_assert!(!p.is_empty(), "empty phrase for content len {}", content.len());
}
#[test]
fn prop_phrase_deterministic(content in "\\PC{0,4096}") {
let a = extract_salient_phrase(&content, 0, 1);
let b = extract_salient_phrase(&content, 0, 1);
prop_assert_eq!(a, b);
}
#[test]
fn prop_compare_derived_tolerant_to_case_and_trailing_punct(
word1 in "[a-zA-Z]{2,20}",
word2 in "[a-zA-Z]{2,20}",
) {
let base = format!("{} {}", word1, word2);
let variant = format!("{} {}.", base.to_lowercase(), ""); let variant = variant.trim().to_string();
let r = compare_phrase(&variant, &base, PhraseMode::Derived);
prop_assert!(matches!(r, PhraseMatch::Exact | PhraseMatch::Tolerant),
"derived compare rejected trivial variant: {:?} vs {:?}", variant, base);
}
#[test]
fn prop_compare_same_string_is_exact(s in "[\\PC]{1,64}") {
let trimmed = s.trim().to_string();
prop_assume!(!trimmed.is_empty());
prop_assert_eq!(
compare_phrase(&trimmed, &trimmed, PhraseMode::Authored),
PhraseMatch::Exact
);
prop_assert_eq!(
compare_phrase(&trimmed, &trimmed, PhraseMode::Derived),
PhraseMatch::Exact
);
}
}
#[test]
fn auto_phrase_from_heading() {
let content = "Some intro text.\n\n## The Spark\n\nBody text here.";
let p = extract_auto_phrase(content, "Fallback Title");
assert_eq!(p, "The Spark");
}
#[test]
fn auto_phrase_from_sentence() {
let content = "The warmth accumulator stores relational bricks. Each brick records a moment of connection.";
let p = extract_auto_phrase(content, "Warmth Accumulator");
assert!(
p.contains("warmth accumulator") || p.contains("brick records"),
"expected a sentence from the content, got {:?}",
p
);
assert!(!p.is_empty());
}
#[test]
fn auto_phrase_from_line() {
let content = "- brick one: kautau noticed the pattern\n- brick two: something else";
let p = extract_auto_phrase(content, "Fallback");
assert!(
p.contains("brick one"),
"expected first line as phrase, got {:?}",
p
);
}
#[test]
fn auto_phrase_from_title_fallback() {
let p = extract_auto_phrase("", "Warmth Accumulator");
assert_eq!(p, "Warmth Accumulator");
}
#[test]
fn auto_phrase_never_empty() {
let cases = ["", " ", "\n", "\n\n", "\t\t"];
for c in cases {
let p = extract_auto_phrase(c, "Title");
assert!(!p.is_empty(), "empty auto-phrase for content {:?}", c);
}
}
#[test]
fn auto_phrase_deterministic() {
let content = "Some paragraph with multiple sentences. Another one here. And a third.";
let a = extract_auto_phrase(content, "Title");
let b = extract_auto_phrase(content, "Title");
assert_eq!(a, b, "auto-phrase must be deterministic");
}
#[test]
fn auto_phrase_skips_fenced_headings() {
let content = "```markdown\n## Fake Heading\n```\n\nReal first sentence here.";
let p = extract_auto_phrase(content, "Title");
assert!(
!p.contains("Fake Heading"),
"auto-phrase picked heading inside fenced block: {:?}",
p
);
}
#[test]
fn auto_phrase_warmth_bricks() {
let content = "- kautau noticed the pattern and said so\n- Q remembered the first wake\n- Semvii brought coffee";
let p = extract_auto_phrase(content, "Warmth Accumulator");
assert!(
p.contains("kautau") || p.contains("Q remembered") || p.contains("Semvii"),
"expected a brick line, got {:?}",
p
);
assert!(!p.is_empty());
}
#[test]
fn auto_phrase_content_hash_varies_across_blooms() {
let content_a = "First sentence here. Second sentence here. Third sentence here.";
let content_b = "Alpha sentence here. Beta sentence here. Gamma sentence here.";
let p_a = extract_auto_phrase(content_a, "A");
let p_b = extract_auto_phrase(content_b, "B");
assert_ne!(
p_a, p_b,
"different content should produce different phrases"
);
}
#[test]
fn extract_sentences_basic() {
let content = "First sentence. Second sentence. Third.";
let sentences = extract_sentences(content);
assert!(
sentences.len() >= 2,
"expected >=2 sentences, got {:?}",
sentences
);
assert!(sentences[0].contains("First sentence."));
}
#[test]
fn extract_sentences_paragraph_break() {
let content = "Paragraph one\n\nParagraph two";
let sentences = extract_sentences(content);
assert_eq!(sentences.len(), 2);
assert_eq!(sentences[0], "Paragraph one");
assert_eq!(sentences[1], "Paragraph two");
}
#[test]
fn extract_sentences_skips_fenced_blocks() {
let content = "Real sentence.\n\n```\nFake sentence inside code.\n```\n\nAnother real one.";
let sentences = extract_sentences(content);
for s in &sentences {
assert!(
!s.contains("Fake sentence"),
"sentence extractor should skip fenced blocks, got {:?}",
s
);
}
}
#[test]
fn select_sentence_index_deterministic() {
let sentences = vec!["a", "b", "c", "d", "e"];
let content = "some content for hashing";
let idx1 = select_sentence_index(&sentences, content);
let idx2 = select_sentence_index(&sentences, content);
assert_eq!(idx1, idx2);
assert!(idx1 < sentences.len());
}
#[test]
fn select_sentence_index_varies_with_content() {
let sentences = vec!["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"];
let idx1 = select_sentence_index(&sentences, "content alpha");
let idx2 = select_sentence_index(&sentences, "content beta");
assert_ne!(
idx1, idx2,
"different content should usually select different indices"
);
}
#[test]
fn first_non_empty_line_skips_fenced_code_block() {
let content = "```rust\nfn main() {}\n```\n\nActual first line.";
let line = first_non_empty_line(content);
assert_eq!(
line.as_deref(),
Some("Actual first line."),
"should skip fenced code block, got {:?}",
line
);
}
#[test]
fn first_non_empty_line_all_fenced_returns_none() {
let content = "```\nonly code here\nmore code\n```";
let line = first_non_empty_line(content);
assert_eq!(line, None, "all-fenced content should return None");
}
#[test]
fn first_non_empty_line_between_fences() {
let content = "```\ncode\n```\n\nSandwiched line\n\n```\nmore code\n```";
let line = first_non_empty_line(content);
assert_eq!(line.as_deref(), Some("Sandwiched line"));
}
#[test]
fn extract_sentences_strips_dash_list_prefix() {
let content = "- first brick line\n- second brick line";
let sentences = extract_sentences(content);
for s in &sentences {
assert!(
!s.starts_with("- "),
"sentence should not keep `- ` prefix: {:?}",
s
);
}
assert!(!sentences.is_empty());
assert!(
sentences[0].contains("first brick line"),
"expected content without prefix, got {:?}",
sentences
);
}
#[test]
fn extract_sentences_strips_star_list_prefix() {
let content = "* item alpha\n* item beta";
let sentences = extract_sentences(content);
for s in &sentences {
assert!(
!s.starts_with("* "),
"sentence should not keep `* ` prefix: {:?}",
s
);
}
assert!(!sentences.is_empty());
assert!(sentences[0].contains("item alpha"));
}
#[test]
fn extract_sentences_list_items_with_paragraph_breaks() {
let content = "- first item\n\n- second item";
let sentences = extract_sentences(content);
assert_eq!(sentences.len(), 2);
assert_eq!(sentences[0], "first item");
assert_eq!(sentences[1], "second item");
}
}