use std::collections::{BTreeMap, HashMap};
use serde::{Deserialize, Serialize};
use crate::error::{Error, Result};
/// A piece of text submitted to the chunk linter, optionally tagged with an identifier.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
/// Optional identifier. Defaults to `None` when absent on deserialization and is
/// left out of serialized output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub id: Option<String>,
/// The chunk's text content.
pub text: String,
}
impl Chunk {
    /// Creates a chunk that carries the given identifier alongside its text.
    pub fn new(id: impl Into<String>, text: impl Into<String>) -> Self {
        let id = Some(id.into());
        let text = text.into();
        Self { id, text }
    }

    /// Creates a chunk without an identifier (`id` is `None`).
    pub fn unidentified(text: impl Into<String>) -> Self {
        let text = text.into();
        Self { id: None, text }
    }
}
/// Thresholds and switches that control `lint_chunks` and `lint_chunks_strict`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ChunkLintConfig {
/// Non-empty chunks with fewer than this many characters are counted as tiny.
pub tiny_chars: usize,
/// Chunks with more than this many characters are counted as giant.
pub giant_chars: usize,
/// Non-empty chunks with at most this many characters are counted as near-empty.
pub near_empty_chars: usize,
/// A tiny-chunk warning is raised when the tiny fraction of all chunks exceeds this value.
pub max_tiny_fraction: f64,
/// When `true`, `lint_chunks_strict` returns an error if the report contains any warning.
pub fatal: bool,
/// Settings for the language-detection checks.
pub language: LanguageLintConfig,
/// Settings for the near-duplicate detection pass.
pub near_duplicates: NearDuplicateLintConfig,
}
impl Default for ChunkLintConfig {
fn default() -> Self {
Self {
tiny_chars: 32,
giant_chars: 4096,
near_empty_chars: 4,
max_tiny_fraction: 0.10,
fatal: false,
language: LanguageLintConfig::default(),
near_duplicates: NearDuplicateLintConfig::default(),
}
}
}
/// Settings for the near-duplicate detection pass run by `lint_chunks`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct NearDuplicateLintConfig {
/// Near-duplicate detection only runs when this is `true`.
pub enabled: bool,
/// Number of consecutive tokens joined into one shingle; values below 1 are treated as 1.
pub shingle_tokens: usize,
/// Number of minimum-hash values kept per chunk signature; 0 disables detection.
pub signature_len: usize,
/// Minimum fraction of matching signature positions for two chunks to count as a
/// near-duplicate pair; values above 1.0 disable detection.
pub threshold: f64,
/// Maximum number of example pairs recorded in the warning (all pairs are still counted).
pub max_examples: usize,
}
impl Default for NearDuplicateLintConfig {
fn default() -> Self {
Self {
enabled: false,
shingle_tokens: 5,
signature_len: 64,
threshold: 0.85,
max_examples: 8,
}
}
}
/// Settings for the language-detection checks run by `lint_chunks`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct LanguageLintConfig {
/// Language detection only runs when this is `true`.
pub enabled: bool,
/// Language codes that are acceptable; compared after ASCII-lowercasing against the
/// detector's lowercased code. An empty list disables the disallowed-language check.
pub allowed_languages: Vec<String>,
/// An unknown-language warning is raised when the fraction of chunks whose language
/// could not be detected (relative to all chunks) exceeds this value.
pub max_unknown_fraction: f64,
}
impl Default for LanguageLintConfig {
fn default() -> Self {
Self {
enabled: false,
allowed_languages: Vec::new(),
max_unknown_fraction: 0.20,
}
}
}
/// Number of chunks detected as a particular language.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct LanguageCount {
/// Lowercased language code as reported by the detector.
pub language: String,
/// How many chunks were detected as this language.
pub count: u64,
}
/// One example pair of chunks whose signatures met the near-duplicate threshold.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct NearDuplicatePair {
/// Identifier of the earlier chunk in the pair, if it had one.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub left_id: Option<String>,
/// Identifier of the later chunk in the pair, if it had one.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub right_id: Option<String>,
/// Fraction of signature positions on which the two chunks agree.
pub similarity: f64,
}
/// Aggregate counters produced by `lint_chunks`. Lengths are measured in `char`s.
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkStats {
/// Total number of chunks inspected.
pub count: u64,
/// Chunks whose text has zero characters.
pub empty: u64,
/// Non-empty chunks at or below the configured near-empty length.
pub near_empty: u64,
/// Non-empty chunks shorter than the configured tiny length.
pub tiny: u64,
/// Chunks longer than the configured giant length.
pub giant: u64,
/// Chunks without an identifier.
pub missing_ids: u64,
/// Number of chunks whose exact text is shared with at least one other chunk.
pub duplicate_text: u64,
/// Number of chunk pairs flagged by near-duplicate detection.
pub near_duplicate_pairs: u64,
/// Total control characters found across all chunks, excluding `\n`, `\r` and `\t`.
pub control_chars: u64,
/// Chunks containing U+FEFF anywhere in their text.
pub bom_chunks: u64,
/// Length of the shortest chunk.
pub min_chars: u64,
/// Length of the longest chunk.
pub max_chars: u64,
/// Mean chunk length, truncated by integer division.
pub mean_chars: u64,
}
/// A single finding emitted by `lint_chunks`.
///
/// Serialized as an internally tagged object whose `kind` field holds the
/// snake_case variant name.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
#[non_exhaustive]
pub enum ChunkLintWarning {
/// At least one chunk has no characters at all.
EmptyChunks {
/// Number of empty chunks.
count: u64,
},
/// At least one non-empty chunk is at or below the near-empty length.
NearEmptyChunks {
/// Number of near-empty chunks.
count: u64,
/// The configured near-empty length.
threshold: usize,
},
/// The fraction of tiny chunks exceeds the configured maximum.
TooManyTinyChunks {
/// Number of tiny chunks.
count: u64,
/// Total number of chunks inspected.
total: u64,
/// The configured tiny length.
tiny_chars: usize,
/// The configured maximum tiny fraction.
max_fraction: f64,
},
/// At least one chunk is longer than the giant length.
GiantChunks {
/// Number of giant chunks.
count: u64,
/// The configured giant length.
threshold: usize,
},
/// At least one text occurs verbatim in more than one chunk.
DuplicateChunks {
/// Number of chunks that share their text with another chunk.
count: u64,
/// Number of distinct texts that occur more than once.
groups: u64,
},
/// At least one chunk has no identifier.
MissingIds {
/// Number of chunks without an identifier.
count: u64,
},
/// An encoding artifact was found; see the nested warning for details.
Encoding {
/// The specific encoding finding.
warning: EncodingLintWarning,
},
/// The fraction of chunks with undetectable language exceeds the configured maximum.
UnknownLanguage {
/// Number of chunks whose language could not be detected.
count: u64,
/// Total number of chunks inspected.
total: u64,
/// The configured maximum unknown fraction.
max_fraction: f64,
},
/// Chunks were detected in languages outside the allowed list.
DisallowedLanguages {
/// Detected languages that are not allowed, with their chunk counts.
languages: Vec<LanguageCount>,
/// The allowed language codes, ASCII-lowercased.
allowed: Vec<String>,
},
/// At least one pair of chunks met the near-duplicate threshold.
NearDuplicateChunks {
/// Number of near-duplicate pairs found.
pairs: u64,
/// The configured similarity threshold.
threshold: f64,
/// Up to `max_examples` of the pairs that were found.
examples: Vec<NearDuplicatePair>,
},
}
/// Encoding-related findings, nested inside `ChunkLintWarning::Encoding`.
///
/// Serialized as an internally tagged object whose `kind` field holds the
/// snake_case variant name.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum EncodingLintWarning {
/// Control characters other than `\n`, `\r` and `\t` were found.
ControlCharacters {
/// Total number of such characters across all chunks.
count: u64,
},
/// U+FEFF was found inside chunk text.
ByteOrderMarks {
/// Number of chunks containing U+FEFF.
count: u64,
},
}
/// Result of a lint run: aggregate statistics plus the warnings derived from them.
#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct ChunkLintReport {
/// Counters gathered over all chunks.
pub stats: ChunkStats,
/// Findings in the order `lint_chunks` emitted them.
pub warnings: Vec<ChunkLintWarning>,
}
impl ChunkLintReport {
    /// Returns `true` when the report holds at least one warning.
    #[must_use]
    pub fn has_warnings(&self) -> bool {
        let clean = self.warnings.is_empty();
        !clean
    }

    /// Renders the report as pretty-printed JSON.
    ///
    /// # Errors
    ///
    /// Returns the serializer's failure, converted into the crate error type by `?`.
    pub fn to_json(&self) -> Result<String> {
        let rendered = serde_json::to_string_pretty(self)?;
        Ok(rendered)
    }
}
#[must_use]
pub fn lint_chunks(chunks: &[Chunk], config: &ChunkLintConfig) -> ChunkLintReport {
let count = chunks.len() as u64;
if count == 0 {
return ChunkLintReport::default();
}
let mut empty = 0u64;
let mut near_empty = 0u64;
let mut tiny = 0u64;
let mut giant = 0u64;
let mut missing_ids = 0u64;
let mut control_chars = 0u64;
let mut bom_chunks = 0u64;
let mut unknown_language = 0u64;
let mut min_chars = u64::MAX;
let mut max_chars = 0u64;
let mut total_chars = 0u128;
let mut text_counts: HashMap<&str, u64> = HashMap::with_capacity(chunks.len());
let mut language_counts: BTreeMap<String, u64> = BTreeMap::new();
let allowed_languages: Vec<String> = config
.language
.allowed_languages
.iter()
.map(|lang| lang.to_ascii_lowercase())
.collect();
for chunk in chunks {
if chunk.id.is_none() {
missing_ids = missing_ids.saturating_add(1);
}
let len = chunk.text.chars().count() as u64;
if len == 0 {
empty = empty.saturating_add(1);
} else if len <= config.near_empty_chars as u64 {
near_empty = near_empty.saturating_add(1);
}
if len > 0 && len < config.tiny_chars as u64 {
tiny = tiny.saturating_add(1);
}
if len > config.giant_chars as u64 {
giant = giant.saturating_add(1);
}
let bad_controls = chunk
.text
.chars()
.filter(|ch| ch.is_control() && !matches!(ch, '\n' | '\r' | '\t'))
.count() as u64;
control_chars = control_chars.saturating_add(bad_controls);
if chunk.text.contains('\u{feff}') {
bom_chunks = bom_chunks.saturating_add(1);
}
if config.language.enabled && !chunk.text.trim().is_empty() {
match whatlang::detect(&chunk.text) {
Some(info) => {
let code = info.lang().code().to_ascii_lowercase();
*language_counts.entry(code).or_insert(0) += 1;
}
None => {
unknown_language = unknown_language.saturating_add(1);
}
}
}
min_chars = min_chars.min(len);
max_chars = max_chars.max(len);
total_chars = total_chars.saturating_add(u128::from(len));
*text_counts.entry(chunk.text.as_str()).or_insert(0) += 1;
}
let duplicate_groups = text_counts.values().filter(|n| **n > 1).count() as u64;
let duplicate_text: u64 = text_counts.values().filter(|n| **n > 1).sum();
let near_duplicates = if config.near_duplicates.enabled {
detect_near_duplicates(chunks, &config.near_duplicates)
} else {
NearDuplicateSummary::default()
};
let mean_chars = (total_chars / u128::from(count)) as u64;
let min_chars = if min_chars == u64::MAX { 0 } else { min_chars };
let stats = ChunkStats {
count,
empty,
near_empty,
tiny,
giant,
missing_ids,
duplicate_text,
near_duplicate_pairs: near_duplicates.pairs,
control_chars,
bom_chunks,
min_chars,
max_chars,
mean_chars,
};
let mut warnings = Vec::new();
if empty > 0 {
warnings.push(ChunkLintWarning::EmptyChunks { count: empty });
}
if near_empty > 0 {
warnings.push(ChunkLintWarning::NearEmptyChunks {
count: near_empty,
threshold: config.near_empty_chars,
});
}
let tiny_fraction = tiny as f64 / count as f64;
if tiny_fraction > config.max_tiny_fraction {
warnings.push(ChunkLintWarning::TooManyTinyChunks {
count: tiny,
total: count,
tiny_chars: config.tiny_chars,
max_fraction: config.max_tiny_fraction,
});
}
if giant > 0 {
warnings.push(ChunkLintWarning::GiantChunks {
count: giant,
threshold: config.giant_chars,
});
}
if duplicate_groups > 0 {
warnings.push(ChunkLintWarning::DuplicateChunks {
count: duplicate_text,
groups: duplicate_groups,
});
}
if near_duplicates.pairs > 0 {
warnings.push(ChunkLintWarning::NearDuplicateChunks {
pairs: near_duplicates.pairs,
threshold: config.near_duplicates.threshold,
examples: near_duplicates.examples,
});
}
if missing_ids > 0 {
warnings.push(ChunkLintWarning::MissingIds { count: missing_ids });
}
if control_chars > 0 {
warnings.push(ChunkLintWarning::Encoding {
warning: EncodingLintWarning::ControlCharacters {
count: control_chars,
},
});
}
if bom_chunks > 0 {
warnings.push(ChunkLintWarning::Encoding {
warning: EncodingLintWarning::ByteOrderMarks { count: bom_chunks },
});
}
if config.language.enabled {
let unknown_fraction = unknown_language as f64 / count as f64;
if unknown_fraction > config.language.max_unknown_fraction {
warnings.push(ChunkLintWarning::UnknownLanguage {
count: unknown_language,
total: count,
max_fraction: config.language.max_unknown_fraction,
});
}
if !allowed_languages.is_empty() {
let disallowed = language_counts
.iter()
.filter(|(language, _count)| !allowed_languages.contains(language))
.map(|(language, count)| LanguageCount {
language: language.clone(),
count: *count,
})
.collect::<Vec<_>>();
if !disallowed.is_empty() {
warnings.push(ChunkLintWarning::DisallowedLanguages {
languages: disallowed,
allowed: allowed_languages,
});
}
}
}
ChunkLintReport { stats, warnings }
}
/// Outcome of the near-duplicate pass: how many pairs matched plus a capped sample of them.
#[derive(Default)]
struct NearDuplicateSummary {
/// Number of chunk pairs whose similarity met the threshold.
pairs: u64,
/// Example pairs, limited to the configured `max_examples`.
examples: Vec<NearDuplicatePair>,
}
/// Per-chunk data used when comparing chunks for near-duplication.
#[derive(Clone)]
struct NearDuplicateSignature {
/// Identifier copied from the source chunk.
id: Option<String>,
/// Normalized text; pairs with equal normalized text are skipped as exact duplicates.
normalized: String,
/// One minimum hash per seed, taken over all shingles of the normalized text.
values: Vec<u64>,
}
/// Compares every pair of chunk signatures and reports the pairs whose similarity
/// reaches `config.threshold`.
///
/// Returns an empty summary when `signature_len` is 0 or the threshold exceeds 1.0.
/// Chunks whose normalized text is empty produce no signature, and pairs with
/// identical normalized text are skipped. All matching pairs are counted; only the
/// first `max_examples` are recorded as examples.
fn detect_near_duplicates(
    chunks: &[Chunk],
    config: &NearDuplicateLintConfig,
) -> NearDuplicateSummary {
    let mut found = NearDuplicateSummary::default();
    if config.signature_len == 0 || config.threshold > 1.0 {
        return found;
    }

    let signatures: Vec<_> = chunks
        .iter()
        .filter_map(|chunk| build_signature(chunk, config))
        .collect();

    // Peel off the head signature and compare it with everything after it, so each
    // unordered pair is visited exactly once in index order.
    let mut remaining = signatures.as_slice();
    while let Some((left, later)) = remaining.split_first() {
        remaining = later;
        let candidates = later
            .iter()
            .filter(|right| right.normalized != left.normalized);
        for right in candidates {
            let similarity = signature_similarity(&left.values, &right.values);
            if similarity >= config.threshold {
                found.pairs = found.pairs.saturating_add(1);
                if found.examples.len() < config.max_examples {
                    found.examples.push(NearDuplicatePair {
                        left_id: left.id.clone(),
                        right_id: right.id.clone(),
                        similarity,
                    });
                }
            }
        }
    }
    found
}
/// Builds the near-duplicate signature for one chunk, or `None` when the chunk's
/// normalized text is empty.
///
/// The normalized text is split on whitespace and turned into shingles of
/// `config.shingle_tokens` consecutive tokens (at least 1). Text with fewer tokens
/// than one shingle is used whole as a single shingle. For each seed in
/// `0..config.signature_len` the signature keeps the smallest `stable_hash` over
/// all shingles.
fn build_signature(
    chunk: &Chunk,
    config: &NearDuplicateLintConfig,
) -> Option<NearDuplicateSignature> {
    let normalized = normalize_for_near_duplicate(&chunk.text);
    if normalized.is_empty() {
        return None;
    }

    let width = config.shingle_tokens.max(1);
    let words: Vec<&str> = normalized.split_whitespace().collect();
    let shingles: Vec<String> = if words.len() >= width {
        words.windows(width).map(|window| window.join(" ")).collect()
    } else {
        vec![normalized.clone()]
    };

    let values: Vec<u64> = (0..config.signature_len)
        .map(|seed| {
            shingles
                .iter()
                .map(|shingle| stable_hash(seed as u64, shingle))
                .min()
                .unwrap_or(u64::MAX)
        })
        .collect();

    Some(NearDuplicateSignature {
        id: chunk.id.clone(),
        normalized,
        values,
    })
}
/// Canonicalizes text for near-duplicate comparison.
///
/// Alphanumeric characters are kept in lowercase; every run of other characters
/// between two kept characters collapses to a single space. Leading and trailing
/// separators are dropped, so the result never starts or ends with a space.
fn normalize_for_near_duplicate(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    // A separator is only written once the next alphanumeric character arrives, so
    // trailing separators never reach the output and no final trim is needed.
    let mut pending_space = false;
    for ch in text.chars() {
        // Classify the source character, not its lowercase expansion: 'İ' (U+0130)
        // lowercases to 'i' followed by U+0307, and that combining mark is not
        // alphanumeric, so classifying after lowercasing would split the word.
        if ch.is_alphanumeric() {
            if pending_space {
                normalized.push(' ');
                pending_space = false;
            }
            normalized.extend(ch.to_lowercase());
        } else if !normalized.is_empty() {
            pending_space = true;
        }
    }
    normalized
}
/// Hashes `text` to a `u64` that depends only on `seed` and the text's bytes.
///
/// The state starts from a fixed basis XORed with a multiplied seed; each byte is
/// XORed in, the state is multiplied by a fixed odd constant (wrapping), and the
/// high half is folded into the low half. No randomized hasher is involved, so the
/// same inputs always give the same value.
fn stable_hash(seed: u64, text: &str) -> u64 {
    const BASIS: u64 = 0xcbf29ce484222325;
    const MULTIPLIER: u64 = 0x100000001b3;
    const SEED_SPREAD: u64 = 0x9e3779b97f4a7c15;

    let start = BASIS ^ seed.wrapping_mul(SEED_SPREAD);
    text.bytes().fold(start, |state, byte| {
        let mixed = (state ^ u64::from(byte)).wrapping_mul(MULTIPLIER);
        mixed ^ (mixed >> 32)
    })
}
/// Returns the fraction of positions at which two signatures hold the same value.
///
/// Only the first `min(left.len(), right.len())` positions are compared; when that
/// is zero the similarity is `0.0`.
fn signature_similarity(left: &[u64], right: &[u64]) -> f64 {
    let compared = usize::min(left.len(), right.len());
    match compared {
        0 => 0.0,
        _ => {
            let agreeing: usize = left
                .iter()
                .zip(right.iter())
                .map(|(a, b)| usize::from(a == b))
                .sum();
            agreeing as f64 / compared as f64
        }
    }
}
pub fn lint_chunks_strict(chunks: &[Chunk], config: &ChunkLintConfig) -> Result<ChunkLintReport> {
let report = lint_chunks(chunks, config);
if config.fatal && report.has_warnings() {
return Err(Error::Ingestion(format!(
"chunk lint failed: {} warning(s)",
report.warnings.len()
)));
}
Ok(report)
}
// Unit tests for the chunk linter. The clippy allowances below are scoped to this
// module because the tests unwrap results and index into vectors directly.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::panic, clippy::indexing_slicing)]
mod tests {
use super::*;
// Shared fixture: two identical sentences, a 4-char chunk, an empty chunk, a 1-char
// chunk, a 5000-char chunk and one chunk without an id.
fn corpus() -> Vec<Chunk> {
vec![
Chunk::new("a", "The quick brown fox jumps over the lazy dog."),
Chunk::new("b", "The quick brown fox jumps over the lazy dog."), Chunk::new("c", "tiny"),
Chunk::new("d", ""), Chunk::new("e", "x"), Chunk::new("f", "z".repeat(5000)), Chunk::unidentified("a healthy-sized chunk of prose."), ]
}
// No chunks means default stats and no warnings.
#[test]
fn empty_corpus_produces_empty_report() {
let report = lint_chunks(&[], &ChunkLintConfig::default());
assert_eq!(report.stats, ChunkStats::default());
assert!(report.warnings.is_empty());
}
// The shared fixture trips the empty, giant, duplicate and missing-id checks.
#[test]
fn lint_flags_every_pathological_shape() {
let report = lint_chunks(&corpus(), &ChunkLintConfig::default());
assert_eq!(report.stats.count, 7);
assert_eq!(report.stats.empty, 1);
// "tiny" (4 chars) and "x" (1 char) are both at or below the default near-empty length of 4.
assert_eq!(report.stats.near_empty, 2);
assert_eq!(report.stats.giant, 1);
assert_eq!(report.stats.missing_ids, 1);
// Both copies of the repeated sentence are counted.
assert_eq!(report.stats.duplicate_text, 2);
// Map each warning to a short label so presence can be checked by name.
let kinds: Vec<&'static str> = report
.warnings
.iter()
.map(|w| match w {
ChunkLintWarning::EmptyChunks { .. } => "empty",
ChunkLintWarning::NearEmptyChunks { .. } => "near_empty",
ChunkLintWarning::TooManyTinyChunks { .. } => "tiny",
ChunkLintWarning::GiantChunks { .. } => "giant",
ChunkLintWarning::DuplicateChunks { .. } => "dup",
ChunkLintWarning::MissingIds { .. } => "missing",
ChunkLintWarning::Encoding { .. } => "encoding",
ChunkLintWarning::UnknownLanguage { .. } => "unknown_language",
ChunkLintWarning::DisallowedLanguages { .. } => "language",
ChunkLintWarning::NearDuplicateChunks { .. } => "near_duplicate",
})
.collect();
assert!(kinds.contains(&"empty"));
assert!(kinds.contains(&"giant"));
assert!(kinds.contains(&"dup"));
assert!(kinds.contains(&"missing"));
}
// With `fatal` set, a corpus that produces warnings makes strict mode return an error.
#[test]
fn fatal_config_promotes_warnings_to_error() {
let config = ChunkLintConfig {
fatal: true,
..ChunkLintConfig::default()
};
let err = lint_chunks_strict(&corpus(), &config).unwrap_err();
let msg = format!("{err}");
assert!(msg.contains("chunk lint failed"), "got: {msg}");
}
// With `fatal` set, a corpus without warnings still succeeds.
#[test]
fn clean_corpus_passes_strict_mode() {
let chunks = vec![
Chunk::new("a", "healthy chunk number one. ".repeat(4)),
Chunk::new("b", "healthy chunk number two. ".repeat(4)),
Chunk::new("c", "healthy chunk number three. ".repeat(4)),
];
let config = ChunkLintConfig {
fatal: true,
..ChunkLintConfig::default()
};
let report = lint_chunks_strict(&chunks, &config).unwrap();
assert!(!report.has_warnings(), "got: {:?}", report.warnings);
assert_eq!(report.stats.count, 3);
}
// Serializing a report to JSON and parsing it back yields an equal report.
#[test]
fn report_json_round_trips() {
let report = lint_chunks(&corpus(), &ChunkLintConfig::default());
let json = report.to_json().unwrap();
let parsed: ChunkLintReport = serde_json::from_str(&json).unwrap();
assert_eq!(parsed, report);
}
// A U+FEFF and a bell character are each reported as an encoding warning.
#[test]
fn lint_flags_encoding_artifacts() {
let chunks = vec![
Chunk::new("bom", "\u{feff}starts with a byte order mark"),
Chunk::new("ctrl", "contains \u{0007} bell"),
];
let report = lint_chunks(&chunks, &ChunkLintConfig::default());
assert_eq!(report.stats.bom_chunks, 1);
assert_eq!(report.stats.control_chars, 1);
assert!(report.warnings.iter().any(|warning| matches!(
warning,
ChunkLintWarning::Encoding {
warning: EncodingLintWarning::ByteOrderMarks { count: 1 }
}
)));
assert!(report.warnings.iter().any(|warning| matches!(
warning,
ChunkLintWarning::Encoding {
warning: EncodingLintWarning::ControlCharacters { count: 1 }
}
)));
}
// With only "eng" allowed, the French chunk triggers a disallowed-language warning.
#[test]
fn language_lint_flags_disallowed_languages() {
let chunks = vec![
Chunk::new("en", "This is an English technical report about retrieval."),
Chunk::new(
"fr",
"Bonjour, ceci est un rapport technique sur la recherche.",
),
];
let config = ChunkLintConfig {
language: LanguageLintConfig {
enabled: true,
allowed_languages: vec!["eng".into()],
// 1.0 keeps the unknown-language warning from firing in this test.
max_unknown_fraction: 1.0,
},
..ChunkLintConfig::default()
};
let report = lint_chunks(&chunks, &config);
assert!(
report
.warnings
.iter()
.any(|warning| matches!(warning, ChunkLintWarning::DisallowedLanguages { .. }))
);
}
// Chunks "a" and "b" differ only in their last word; "c" shares no words with them.
#[test]
fn near_duplicate_lint_flags_similar_chunks_when_enabled() {
let chunks = vec![
Chunk::new(
"a",
"alpha beta gamma delta epsilon zeta eta theta iota kappa lambda mu",
),
Chunk::new(
"b",
"alpha beta gamma delta epsilon zeta eta theta iota kappa lambda nu",
),
Chunk::new(
"c",
"orchid payroll cabinet runway marble visitor lantern summit anchor velvet puzzle",
),
];
let config = ChunkLintConfig {
near_duplicates: NearDuplicateLintConfig {
enabled: true,
shingle_tokens: 1,
signature_len: 96,
threshold: 0.75,
max_examples: 4,
},
..ChunkLintConfig::default()
};
let report = lint_chunks(&chunks, &config);
assert_eq!(report.stats.near_duplicate_pairs, 1);
assert!(report.warnings.iter().any(|warning| matches!(
warning,
ChunkLintWarning::NearDuplicateChunks {
pairs: 1,
examples,
..
} if examples.len() == 1
&& examples[0].left_id.as_deref() == Some("a")
&& examples[0].right_id.as_deref() == Some("b")
)));
}
// Exact duplicates are reported by the duplicate check only: even with a threshold
// of 0.0 the near-duplicate pass skips pairs with identical normalized text.
#[test]
fn near_duplicate_lint_skips_exact_duplicates() {
let chunks = vec![
Chunk::new("a", "same text repeated for exact duplicate coverage"),
Chunk::new("b", "same text repeated for exact duplicate coverage"),
];
let config = ChunkLintConfig {
near_duplicates: NearDuplicateLintConfig {
enabled: true,
threshold: 0.0,
..NearDuplicateLintConfig::default()
},
..ChunkLintConfig::default()
};
let report = lint_chunks(&chunks, &config);
assert_eq!(report.stats.duplicate_text, 2);
assert_eq!(report.stats.near_duplicate_pairs, 0);
assert!(
!report
.warnings
.iter()
.any(|warning| matches!(warning, ChunkLintWarning::NearDuplicateChunks { .. }))
);
}
}