use crate::chat::header::{Headers, split_header_line};
use crate::chat::utterance::Utterance;
use regex::Regex;
use std::collections::{HashMap, HashSet};
use std::sync::LazyLock;
/// A single problem discovered while validating a CHAT transcript file.
/// The message is pre-formatted and already includes the file path.
#[derive(Debug)]
pub struct ValidationError {
    pub message: String,
}

impl ValidationError {
    /// Builds an error from any string-like message.
    fn new(msg: impl Into<String>) -> Self {
        let message = msg.into();
        ValidationError { message }
    }
}
/// Speaker roles accepted in `@Participants` / `@ID` headers.
/// This mirrors the CHAT/TalkBank role inventory; any role not in this set
/// (and not ASCII) is reported by `validate_headers`.
static VALID_ROLES: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    [
        "Target_Child",
        "Child",
        "Mother",
        "Father",
        "Brother",
        "Sister",
        "Sibling",
        "Grandmother",
        "Grandfather",
        "Relative",
        "Participant",
        "Investigator",
        "Partner",
        "Boy",
        "Girl",
        "Adult",
        "Teenager",
        "Male",
        "Female",
        "Visitor",
        "Friend",
        "Playmate",
        "Caretaker",
        "Environment",
        "Group",
        "Unidentified",
        "Uncertain",
        "Other",
        "Text",
        "Media",
        "PlayRole",
        "LENA",
        "Target_Adult",
        "Non_Human",
        "Nurse",
        "Doctor",
        "Babysitter",
        "Housekeeper",
        "Student",
        "Teacher",
        "Camera_Operator",
        "Observer",
        "Speaker",
        "Narrator",
        "Justice",
        "Attorney",
        "Offender",
        "Victim",
        "Witness",
        "Uncle",
        "Aunt",
        "Clinician",
        "Therapist",
        "Teacher's_Aide",
        "Audience",
        "Guest",
        "Host",
        "Informant",
        "Leader",
        "Member",
        "Subject",
    ]
    .into_iter()
    .collect()
});
/// Accepted values for the `@Options` header.
/// NOTE(review): "dummy" looks like a placeholder/testing value — confirm.
static VALID_OPTIONS: LazyLock<HashSet<&'static str>> =
    LazyLock::new(|| ["CA", "CA-Unicode", "multi", "dummy"].into_iter().collect());
/// File-level `@Header` names known to this validator.
/// `validate_raw_structure` skips names containing " of " (personalized
/// headers such as "Birth of CHI") before consulting this set.
static VALID_FILE_HEADERS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    [
        "UTF8",
        "Begin",
        "End",
        "Window",
        "Font",
        "Color Words",
        "ColorWords",
        "PID",
        "Languages",
        "Participants",
        "Options",
        "ID",
        "Media",
        "Location",
        "Number",
        "Recording Quality",
        "Room Layout",
        "Tape Location",
        "Time Duration",
        "Time Start",
        "Transcriber",
        "Transcription",
        "Warning",
        "Activities",
        "Bck",
        "Bg",
        "Blank",
        "Comment",
        "Date",
        "Eg",
        "G",
        "New Episode",
        "Page",
        "Situation",
    ]
    .into_iter()
    .collect()
});
/// Base codes allowed after `@` in a word (CHAT form markers, e.g. `word@s`
/// for second-language material). `validate_form_markers` compares against
/// the code's prefix before any `:`/`$`/`-` qualifier.
static VALID_FORM_MARKERS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    [
        "s", "c", "d", "e", "f", "g", "i", "k", "l", "n", "o", "p", "q", "t", "u", "b", "wp", "si",
        "sl", "m", "fp", "fs", "ls", "sas", "z", "x", "w", "r", "h", "j", "v", "a",
    ]
    .into_iter()
    .collect()
});
// Media bullet time marker: a \x15-delimited "start_end" pair of integers
// (milliseconds), tolerating a stray '-' just inside either delimiter.
static BULLET_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\x15-?(\d+)_(\d+)-?\x15").unwrap());
// Same "start_end" pair left bare at the end of a line (missing \x15s).
static RAW_BULLET_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(\d+)_(\d+)\s*$").unwrap());
// "[: replacement]" annotation; capture group 1 is the replacement text.
static REPLACEMENT_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[:\s+([^\]]+)\]").unwrap());
// A "&~…" token immediately followed by a "[: …]" replacement annotation.
static FRAG_REPLACEMENT_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"&~\S+\s+\[:\s+[^\]]+\]").unwrap());
// Two double quotes separated only by whitespace (adjacent quoted sections).
static ADJACENT_QUOTES_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#""\s+""#).unwrap());
/// Runs every validation pass over one parsed CHAT file and collects the
/// problems found.
///
/// `headers` and `events` are the parsed representation; `raw_lines` is the
/// unparsed file content, needed by checks that depend on exact source
/// formatting. Errors are returned in a fixed pass order (raw structure,
/// then headers, then utterances) so output is stable.
pub fn validate_chat_file(
    file_path: &str,
    headers: &Headers,
    events: &[Utterance],
    raw_lines: &[String],
) -> Vec<ValidationError> {
    let mut problems: Vec<ValidationError> = Vec::new();
    validate_raw_structure(file_path, raw_lines, &mut problems);
    validate_headers(file_path, headers, raw_lines, &mut problems);
    validate_utterances(file_path, headers, events, raw_lines, &mut problems);
    problems
}
/// Structural checks on the raw (unparsed) lines of the file: content after
/// `@End`, duplicated dependent tiers, blacklisted headers, `@Page` values,
/// and the mandatory `@UTF8` first line.
fn validate_raw_structure(
    file_path: &str,
    raw_lines: &[String],
    errors: &mut Vec<ValidationError>,
) {
    // Pass 1: nothing transcript-like may follow @End, and a second @End
    // itself counts as content after the first one.
    let mut seen_end = false;
    for line in raw_lines {
        if line == "@End" {
            if seen_end {
                errors.push(ValidationError::new(format!(
                    "{file_path}: content after @End"
                )));
                break;
            }
            seen_end = true;
            continue;
        }
        if seen_end && (line.starts_with('*') || line.starts_with('%')) {
            errors.push(ValidationError::new(format!(
                "{file_path}: content after @End"
            )));
            break;
        }
    }
    // Pass 2: within one utterance (a '*' line plus the '%' tiers under it),
    // each dependent tier name may appear at most once. The tier name keeps
    // its leading '%' (e.g. "%mor"), which the error message relies on.
    let mut current_dep_tiers: HashSet<String> = HashSet::new();
    for line in raw_lines {
        if line.starts_with('*') {
            current_dep_tiers.clear();
        } else if line.starts_with('%') {
            let tier_name = line.split(':').next().unwrap_or("");
            if !current_dep_tiers.insert(tier_name.to_string()) {
                errors.push(ValidationError::new(format!(
                    "{file_path}: duplicate dependent tier {tier_name}"
                )));
            }
        }
    }
    // Pass 3: header-name screening. Names containing " of " (personalized
    // headers such as "Birth of CHI") are skipped wholesale — this already
    // covers the "Birth of"/"Birthplace of"/"L1 of" families, so the old
    // explicit starts_with checks for those prefixes were dead code and have
    // been removed. Of the remaining unknown names, only a small blacklist
    // is reported; other unknown headers are deliberately tolerated.
    for line in raw_lines {
        if !line.starts_with('@') {
            continue;
        }
        if line == "@End" || line == "@Begin" || line == "@UTF8" {
            continue;
        }
        let (name, _value) = split_header_line(line);
        if name.contains(" of ") {
            continue;
        }
        if !VALID_FILE_HEADERS.contains(name)
            && matches!(name, "Exceptions" | "New Language" | "Code")
        {
            errors.push(ValidationError::new(format!(
                "{file_path}: invalid header @{name}"
            )));
        }
    }
    // Pass 4: @Page headers must carry a numeric page value.
    for line in raw_lines {
        if !line.starts_with("@Page") {
            continue;
        }
        let (name, value) = split_header_line(line);
        if name == "Page" {
            match value {
                None => {
                    errors.push(ValidationError::new(format!(
                        "{file_path}: @Page header missing page number"
                    )));
                }
                Some(v) => {
                    if v.parse::<u64>().is_err() {
                        errors.push(ValidationError::new(format!(
                            "{file_path}: @Page header has non-numeric value: {v}"
                        )));
                    }
                }
            }
        }
    }
    // Pass 5: the file must start with @UTF8. The previous condition
    // (`!= "@UTF8" && == "@Begin"`) had an always-true first conjunct and
    // only fired when the first line was exactly "@Begin", silently
    // accepting any other non-@UTF8 opening line — contrary to its own
    // error message.
    if !raw_lines.is_empty() && raw_lines[0] != "@UTF8" {
        errors.push(ValidationError::new(format!(
            "{file_path}: file does not start with @UTF8"
        )));
    }
}
/// Validates the parsed headers (`@Options`, `@Participants`, `@ID`,
/// `@Media`, `@Languages`) and their consistency with the raw header lines.
fn validate_headers(
    file_path: &str,
    headers: &Headers,
    raw_lines: &[String],
    errors: &mut Vec<ValidationError>,
) {
    // @Options must be one of the known option keywords.
    if let Some(ref opts) = headers.options
        && !VALID_OPTIONS.contains(opts.as_str())
    {
        errors.push(ValidationError::new(format!(
            "{file_path}: invalid @Options value: {opts}"
        )));
    }
    // Participant codes must be plain ASCII and may not contain '+'.
    for p in &headers.participants {
        if !p.code.is_ascii() {
            errors.push(ValidationError::new(format!(
                "{file_path}: non-ASCII participant code: {}",
                p.code
            )));
        }
        if p.code.contains('+') {
            errors.push(ValidationError::new(format!(
                "{file_path}: participant code contains '+': {}",
                p.code
            )));
        }
    }
    // Collect the participant codes declared by @ID lines (field 3 of the
    // '|'-separated value). NOTE: the "@ID:\t" prefix test is already
    // covered by "@ID:"; it is kept for clarity but is redundant.
    let id_codes: HashSet<String> = raw_lines
        .iter()
        .filter(|l| l.starts_with("@ID:") || l.starts_with("@ID\t") || l.starts_with("@ID:\t"))
        .filter_map(|l| {
            let value = l.split_once('\t')?.1;
            let fields: Vec<&str> = value.split('|').collect();
            if fields.len() >= 3 {
                Some(fields[2].trim().to_string())
            } else {
                None
            }
        })
        .collect();
    // Only when at least one @ID line exists do we require every
    // @Participants entry to have a matching @ID.
    if !id_codes.is_empty() {
        for p in &headers.participants {
            if !p.code.is_empty() && !id_codes.contains(&p.code) {
                errors.push(ValidationError::new(format!(
                    "{file_path}: participant {} is missing a required @ID header",
                    p.code
                )));
            }
        }
    }
    // Roles must be ASCII and drawn from the known role inventory. The two
    // checks are reported separately so a non-ASCII role is not double-flagged.
    for p in &headers.participants {
        if !p.role.is_empty() {
            if !p.role.is_ascii() {
                errors.push(ValidationError::new(format!(
                    "{file_path}: invalid role (non-ASCII): {}",
                    p.role
                )));
            }
            if p.role.is_ascii() && !VALID_ROLES.contains(p.role.as_str()) {
                errors.push(ValidationError::new(format!(
                    "{file_path}: invalid role: {}",
                    p.role
                )));
            }
        }
    }
    // Cross-check: the role in each @ID line (field 8) must agree with the
    // role declared for that code in @Participants.
    let participants_roles: HashMap<String, String> = headers
        .participants
        .iter()
        .filter(|p| !p.role.is_empty())
        .map(|p| (p.code.clone(), p.role.clone()))
        .collect();
    for line in raw_lines {
        if !(line.starts_with("@ID:") || line.starts_with("@ID\t") || line.starts_with("@ID:\t")) {
            continue;
        }
        if let Some(value) = line.split_once('\t').map(|x| x.1) {
            let fields: Vec<&str> = value.split('|').collect();
            if fields.len() >= 8 {
                let code = fields[2].trim();
                let id_role = fields[7].trim();
                if !id_role.is_empty()
                    && let Some(part_role) = participants_roles.get(code)
                    && !part_role.is_empty()
                    && part_role != id_role
                {
                    errors.push(ValidationError::new(format!(
                        "{file_path}: {code} has @ID role {id_role} but @Participants role {part_role}"
                    )));
                }
            }
        }
    }
    // The age field of each @ID line (field 4) must be well-formed.
    for line in raw_lines {
        if !(line.starts_with("@ID:") || line.starts_with("@ID\t") || line.starts_with("@ID:\t")) {
            continue;
        }
        if let Some(value) = line.split_once('\t').map(|x| x.1) {
            let fields: Vec<&str> = value.split('|').collect();
            if fields.len() >= 4 {
                let age_str = fields[3].trim();
                if !age_str.is_empty() {
                    validate_age_format(file_path, age_str, errors);
                }
            }
        }
    }
    // @Media checks: the media name must match the .cha file's stem (URLs,
    // marked by "://", are exempt), and a URL value must be quoted.
    if let Some(ref media) = headers.media_data {
        let file_basename = std::path::Path::new(file_path)
            .file_stem()
            .and_then(|s| s.to_str())
            .unwrap_or("");
        if !media.filename.is_empty()
            && !file_basename.is_empty()
            && !media.filename.contains("://")
            && file_path.ends_with(".cha")
            && media.filename != file_basename
        {
            errors.push(ValidationError::new(format!(
                "{file_path}: @Media name '{}' does not match file name",
                media.filename
            )));
        }
        // Only the first @Media line is inspected for the quoting rule.
        for line in raw_lines {
            if line.starts_with("@Media") {
                let value = line.split_once('\t').map(|x| x.1).unwrap_or("");
                if value.contains("://") && !value.starts_with('"') {
                    errors.push(ValidationError::new(format!(
                        "{file_path}: @Media URL must be quoted"
                    )));
                }
                break;
            }
        }
    }
    // SES field of each @ID line (field 7): accepts known class codes,
    // pure numbers, and hyphenated alphanumeric descriptors; anything else
    // flags the whole field (one error per @ID line, hence the `break`).
    for line in raw_lines {
        if !(line.starts_with("@ID:") || line.starts_with("@ID\t") || line.starts_with("@ID:\t")) {
            continue;
        }
        if let Some(value) = line.split_once('\t').map(|x| x.1) {
            let fields: Vec<&str> = value.split('|').collect();
            if fields.len() >= 7 {
                let ses_field = fields[6].trim();
                if !ses_field.is_empty() {
                    let valid_ses_patterns = [
                        "UC",
                        "MC",
                        "WC",
                        "LWC",
                        "LMC",
                        "UMC",
                        "upper-working-class",
                        "working-class",
                        "middle-class",
                    ];
                    let parts: Vec<&str> = ses_field.split(',').collect();
                    for part in &parts {
                        let part = part.trim();
                        if part.is_empty() {
                            continue;
                        }
                        if part.chars().all(|c| c.is_ascii_digit()) {
                            continue;
                        }
                        if valid_ses_patterns.contains(&part) {
                            continue;
                        }
                        if part.contains('-')
                            && part.chars().all(|c| c.is_ascii_alphanumeric() || c == '-')
                        {
                            continue;
                        }
                        errors.push(ValidationError::new(format!(
                            "{file_path}: invalid SES value in @ID: {ses_field}"
                        )));
                        break;
                    }
                }
            }
        }
    }
    // Language codes: only the placeholder "aaa" is rejected here.
    // NOTE(review): presumably a stand-in for full ISO-639 validation — confirm.
    for lang in &headers.languages {
        if lang.len() == 3 && lang == "aaa" {
            errors.push(ValidationError::new(format!(
                "{file_path}: invalid language code: {lang}"
            )));
        }
    }
    // The old %lan dependent tier is obsolete.
    for line in raw_lines {
        if !line.starts_with('%') {
            continue;
        }
        if let Some(tier_name) = line.strip_prefix('%') {
            let tier_name = tier_name.split(':').next().unwrap_or("").trim();
            if tier_name == "lan" {
                errors.push(ValidationError::new(format!(
                    "{file_path}: obsolete dependent tier %{tier_name}"
                )));
            }
        }
    }
}
/// Checks a CHILDES age field of the form `Y;MM.DD`.
///
/// Only the two-digit width of the month and day components is enforced;
/// values without a `;` (or with nothing after it) are accepted as-is.
fn validate_age_format(file_path: &str, age_str: &str, errors: &mut Vec<ValidationError>) {
    let trimmed = age_str.trim();
    let Some((_years, tail)) = trimmed.split_once(';') else {
        return;
    };
    if tail.is_empty() {
        return;
    }
    // "MM.DD" splits into both parts; a bare "MM" leaves the day part empty.
    let (months, days) = tail.split_once('.').unwrap_or((tail, ""));
    if !months.is_empty() && months.len() != 2 {
        errors.push(ValidationError::new(format!(
            "{file_path}: age months must be two digits, got: {trimmed}"
        )));
    }
    if !days.is_empty() && days.len() != 2 {
        errors.push(ValidationError::new(format!(
            "{file_path}: age days must be two digits, got: {trimmed}"
        )));
    }
}
fn validate_utterances(
file_path: &str,
headers: &Headers,
events: &[Utterance],
raw_lines: &[String],
errors: &mut Vec<ValidationError>,
) {
let is_ca = headers
.options
.as_deref()
.map(|o| o.starts_with("CA"))
.unwrap_or(false);
let mut utterance_raw_lines: Vec<&str> = Vec::new();
for line in raw_lines {
if line.starts_with('*') {
utterance_raw_lines.push(line);
}
}
for (idx, utt) in events.iter().enumerate() {
if utt.changeable_header.is_some() {
continue;
}
let raw_main = if let Some(ref tiers) = utt.tiers {
if let Some(ref participant) = utt.participant {
tiers.get(participant).map(|s| s.as_str()).unwrap_or("")
} else {
""
}
} else {
""
};
let raw_line = utterance_raw_lines.get(idx).copied().unwrap_or("");
let raw_text = raw_line
.find(":\t")
.map(|i| &raw_line[i + 2..])
.unwrap_or(raw_main);
validate_single_utterance(file_path, raw_text, is_ca, headers, errors);
}
for line in raw_lines {
if !line.starts_with('*') {
continue;
}
for caps in BULLET_REGEX.captures_iter(line) {
if let (Some(start_m), Some(end_m)) = (caps.get(1), caps.get(2))
&& let (Ok(start), Ok(end)) = (
start_m.as_str().parse::<i64>(),
end_m.as_str().parse::<i64>(),
)
&& start >= end
{
errors.push(ValidationError::new(format!(
"{file_path}: bullet start {start} must be earlier than end {end}"
)));
}
}
}
}
/// Strips trailing bracketed groups (`[...]`, nesting honored) and the
/// whitespace between them from the end of `text`, returning the prefix.
///
/// Scans backwards byte-by-byte: trims trailing spaces/tabs, then, if the
/// text ends with `]`, walks left to the matching `[` and cuts the whole
/// group off; repeats until no trailing group remains. An unbalanced `]`
/// with no matching `[` is kept in place. Byte indexing is safe because
/// only ASCII bytes are inspected, so slices end on char boundaries.
///
/// Fix over the previous version: a `]` at position 0 (e.g. input `"]"`)
/// no longer underflows `end - 2` and panics; it is treated like any other
/// unmatched bracket and left in place.
fn strip_trailing_brackets(text: &str) -> &str {
    let mut end = text.len();
    let bytes = text.as_bytes();
    loop {
        // Drop trailing spaces/tabs before examining the last character.
        while end > 0 && (bytes[end - 1] == b' ' || bytes[end - 1] == b'\t') {
            end -= 1;
        }
        if end == 0 {
            break;
        }
        if bytes[end - 1] == b']' {
            if end < 2 {
                // Lone ']' at the very start: nothing to match against.
                break;
            }
            // Walk backwards for the matching '[' of this closing bracket.
            let mut depth = 1;
            let mut i = end - 2;
            loop {
                if bytes[i] == b']' {
                    depth += 1;
                } else if bytes[i] == b'[' {
                    depth -= 1;
                    if depth == 0 {
                        // Cut the group off and re-check for another one.
                        end = i;
                        break;
                    }
                }
                if i == 0 {
                    // Unmatched ']': keep everything up to `end`.
                    return &text[..end];
                }
                i -= 1;
            }
        } else {
            break;
        }
    }
    &text[..end]
}
/// Validates the text of one main-tier utterance: terminator presence and
/// form, emptiness, word-level CHAT notation, inline dependent tiers,
/// quotations, and `[: ...]` replacements.
fn validate_single_utterance(
    file_path: &str,
    raw_text: &str,
    is_ca: bool,
    headers: &Headers,
    errors: &mut Vec<ValidationError>,
) {
    // Remove bullet time markers (both \x15-delimited and bare trailing
    // "start_end" forms) before inspecting the text.
    let text = BULLET_REGEX.replace_all(raw_text, "");
    let text = RAW_BULLET_REGEX.replace_all(&text, "");
    let text = text.trim();
    if text.is_empty() {
        return;
    }
    // Terminator checks apply to the text with trailing [...] groups removed.
    let core_text = strip_trailing_brackets(text);
    let core_text = core_text.trim();
    if core_text.is_empty() {
        return;
    }
    let terminators = ['.', '?', '!'];
    let last_char = core_text.chars().last().unwrap_or(' ');
    if !is_ca {
        // Non-CA transcripts require a plain terminator or one of the
        // special CHAT sequences (trailing off, interruption, quotation).
        if !terminators.contains(&last_char)
            && !core_text.ends_with("+...")
            && !core_text.ends_with("+/.")
            && !core_text.ends_with("+//.")
            && !core_text.ends_with("+/?")
            && !core_text.ends_with("+//?")
            && !core_text.ends_with("+\"/.")
        {
            errors.push(ValidationError::new(format!(
                "{file_path}: utterance missing terminator"
            )));
            return;
        }
        // Two adjacent terminator characters are an error — except inside a
        // "..." run (the third-last '.' check) or the "+..."/"+..?" endings.
        if core_text.len() >= 2 {
            let chars: Vec<char> = core_text.chars().collect();
            let len = chars.len();
            let second_last = chars[len - 2];
            if terminators.contains(&last_char)
                && terminators.contains(&second_last)
                && !(second_last == '.' && last_char == '.' && len >= 3 && chars[len - 3] == '.')
                && !core_text.ends_with("+...")
                && !core_text.ends_with("+..?")
            {
                errors.push(ValidationError::new(format!(
                    "{file_path}: multiple terminators"
                )));
            }
        }
        // "+=." style endings belong to CA notation only.
        if core_text.ends_with("+=.") || core_text.ends_with("+=?") || core_text.ends_with("+=!") {
            errors.push(ValidationError::new(format!(
                "{file_path}: CA-style terminator in non-CA transcript"
            )));
        }
        // Tone terminators: a standalone "-." / "-!" / "-?" / "-'" token, or
        // a "-,." ending, are rejected. Byte indexing is safe here: only
        // ASCII bytes are compared and no slicing is done.
        if core_text.len() >= 2 {
            let bytes = core_text.as_bytes();
            let len = bytes.len();
            if bytes[len - 2] == b'-'
                && (last_char == '.' || last_char == '!' || last_char == '?' || last_char == '\'')
                && (len == 2 || bytes[len - 3] == b' ')
            {
                errors.push(ValidationError::new(format!(
                    "{file_path}: tone terminator not allowed"
                )));
            }
            if len >= 3
                && core_text.ends_with(",.")
                && bytes[len - 3] == b'-'
                && (len == 3 || bytes[len - 4] == b' ')
            {
                errors.push(ValidationError::new(format!(
                    "{file_path}: tone terminator not allowed"
                )));
            }
        }
    }
    // Strip terminator characters and (if still present) one special
    // terminator sequence, to see whether any spoken content remains.
    let mut content_text = core_text;
    content_text = content_text.trim_end_matches(|c: char| terminators.contains(&c) || c == ' ');
    for suffix in &["+...", "+/.", "+//.", "+/?", "+//?", "+\"/.", "+..?"] {
        if let Some(stripped) = content_text.strip_suffix(suffix) {
            content_text = stripped;
            break;
        }
    }
    content_text = content_text.trim();
    if content_text.is_empty() {
        errors.push(ValidationError::new(format!(
            "{file_path}: empty utterance (no content before terminator)"
        )));
        return;
    }
    // Word-level notation checks (brackets skipped, angle groups expanded).
    let words = extract_words(text);
    for word in &words {
        // Untranscribed-material codes must be lowercase.
        if *word == "XXX" || *word == "YYY" || *word == "WWW" {
            errors.push(ValidationError::new(format!(
                "{file_path}: \"{word}\" must be lowercase"
            )));
        }
        // Old two-letter codes were replaced by the three-letter forms.
        if *word == "xx" || *word == "yy" {
            errors.push(ValidationError::new(format!(
                "{file_path}: \"{word}\" is obsolete; use \"{}\"",
                if *word == "xx" { "xxx" } else { "yyy" }
            )));
        }
        // '^' may appear inside a word but not start one.
        if word.starts_with('^') {
            errors.push(ValidationError::new(format!(
                "{file_path}: ^ not allowed at beginning of word: {word}"
            )));
        }
        if word.starts_with("&=0") {
            errors.push(ValidationError::new(format!(
                "{file_path}: &=0 notation is illegal: {word}"
            )));
        }
        // "&=<digit>...<.>..." looks like a fractional number, not an event.
        if let Some(rest) = word.strip_prefix("&=")
            && rest.contains('.')
            && rest
                .chars()
                .next()
                .map(|c| c.is_ascii_digit())
                .unwrap_or(false)
        {
            errors.push(ValidationError::new(format!(
                "{file_path}: invalid &= notation (fraction): {word}"
            )));
        }
        // Bare "&word" is the obsolete pre-&-prefix notation; the modern
        // prefixes (&=, &~, &-, &+, &*) are allowed.
        // NOTE(review): "&l..." is also tolerated here — confirm why.
        if word.starts_with('&')
            && !word.starts_with("&=")
            && !word.starts_with("&~")
            && !word.starts_with("&-")
            && !word.starts_with("&+")
            && !word.starts_with("&*")
            && word.len() > 1
        {
            let rest = &word[1..];
            if rest
                .chars()
                .next()
                .map(|c| c.is_alphabetic())
                .unwrap_or(false)
                && !rest.starts_with("l")
            {
                errors.push(ValidationError::new(format!(
                    "{file_path}: old & notation not allowed: {word}"
                )));
            }
        }
        // "-0" omission affixes are only legal on 0-words and &-forms.
        if word.contains("-0") && !word.starts_with("0") && !word.starts_with("&") {
            errors.push(ValidationError::new(format!(
                "{file_path}: illegal 0-affix in word: {word}"
            )));
        }
        // Pause markers must stand alone, never embedded in a word.
        if word.contains("(.)") && *word != "(.)" {
            errors.push(ValidationError::new(format!(
                "{file_path}: pause marker embedded in word: {word}"
            )));
        }
        if word.contains("(..)") && *word != "(..)" {
            errors.push(ValidationError::new(format!(
                "{file_path}: pause marker embedded in word: {word}"
            )));
        }
        if word.contains("(...)") && *word != "(...)" {
            errors.push(ValidationError::new(format!(
                "{file_path}: pause marker embedded in word: {word}"
            )));
        }
        // A fully-parenthesized word has no spoken content — unless it is a
        // timed pause like "(2.5)" or "(1:30)".
        if !is_ca
            && word.starts_with('(')
            && word.ends_with(')')
            && !word.starts_with("(.)")
            && !word.starts_with("(..)")
            && !word.starts_with("(...)")
            && word.len() > 2
        {
            let inner = &word[1..word.len() - 1];
            let is_single_group = !inner.contains('(') && !inner.contains(')');
            if is_single_group {
                let is_timed_pause = !inner.is_empty()
                    && inner
                        .chars()
                        .all(|c| c.is_ascii_digit() || c == '.' || c == ':');
                if !is_timed_pause {
                    errors.push(ValidationError::new(format!(
                        "{file_path}: word has no spoken content (entirely parenthesized): {word}"
                    )));
                }
            }
        }
        validate_form_markers(file_path, word, headers, errors);
        // Hyphenated suffixes after the @q/@ap form markers are extraneous.
        if word.contains("@q-") || word.contains("@ap-") {
            errors.push(ValidationError::new(format!(
                "{file_path}: extraneous suffix after form marker: {word}"
            )));
        }
    }
    // Dependent-tier material must live on its own %-line, not inline.
    let inline_dep_tiers: &[&str] = &["[%act:", "[%sch:", "[%sdi:", "[%ssx:"];
    for pattern in inline_dep_tiers {
        if text.contains(pattern) {
            errors.push(ValidationError::new(format!(
                "{file_path}: inline dependent tier in utterance: {pattern}"
            )));
        }
    }
    validate_quotations(file_path, text, errors);
    validate_replacements(file_path, text, errors);
}
/// Splits utterance text into word tokens for validation.
///
/// Tokenization rules:
/// - space/tab separates tokens;
/// - `[...]` bracket groups (nesting honored) are annotations and skipped;
/// - `<...>` angle groups are recursed into, yielding their inner words;
/// - a `[` inside a token ends it (the bracket group is then skipped);
/// - tokens containing the bullet delimiter `\x15` are dropped.
///
/// Fix over the previous version: text ending in an unterminated `<` no
/// longer panics on an inverted slice range, and an unterminated angle
/// group no longer silently loses its final character.
fn extract_words(text: &str) -> Vec<&str> {
    let mut words = Vec::new();
    let mut i = 0;
    let bytes = text.as_bytes();
    let len = bytes.len();
    while i < len {
        if bytes[i] == b' ' || bytes[i] == b'\t' {
            i += 1;
            continue;
        }
        if bytes[i] == b'[' {
            // Skip a (possibly nested) bracket annotation entirely.
            let mut depth = 1;
            i += 1;
            while i < len && depth > 0 {
                if bytes[i] == b'[' {
                    depth += 1;
                } else if bytes[i] == b']' {
                    depth -= 1;
                }
                i += 1;
            }
            continue;
        }
        if bytes[i] == b'<' {
            // Recurse into an angle group; its contents are ordinary words.
            let start = i;
            let mut depth = 1;
            i += 1;
            while i < len && depth > 0 {
                if bytes[i] == b'<' {
                    depth += 1;
                } else if bytes[i] == b'>' {
                    depth -= 1;
                }
                i += 1;
            }
            // When the group closed, exclude its trailing '>'; when it ran
            // off the end of the text, keep everything after the '<'.
            let inner_end = if depth == 0 { i - 1 } else { i };
            if inner_end > start + 1 {
                words.extend(extract_words(&text[start + 1..inner_end]));
            }
            continue;
        }
        let start = i;
        while i < len && bytes[i] != b' ' && bytes[i] != b'\t' {
            if bytes[i] == b'[' {
                break;
            }
            i += 1;
        }
        if i > start {
            let word = &text[start..i];
            // Tokens carrying a bullet time marker are not words.
            if !word.contains('\x15') {
                words.push(word);
            }
        }
    }
    words
}
/// Validates the `@marker` form-marker suffix of a single word: the base
/// code must be known, compounds may not carry @s/@o (outside sign mode),
/// and @l demands a single-letter word.
fn validate_form_markers(
    file_path: &str,
    word: &str,
    headers: &Headers,
    errors: &mut Vec<ValidationError>,
) {
    let Some(at_pos) = word.find('@') else {
        return;
    };
    // A leading '@' is not a form marker on a word.
    if at_pos == 0 {
        return;
    }
    // Everything after '@', with trailing punctuation/quote marks removed.
    let marker = word[at_pos + 1..]
        .trim_end_matches(|c: char| ".?!\"'\u{201C}\u{201D}\u{2018}\u{2019}".contains(c));
    // The base code is the prefix before the first ':', '$' or '-' qualifier
    // (splitting on the three delimiters at once is equivalent to the
    // successive prefix splits).
    let marker_base = marker.split([':', '$', '-']).next().unwrap_or(marker);
    if marker_base.is_empty() {
        return;
    }
    if !VALID_FORM_MARKERS.contains(marker_base) {
        errors.push(ValidationError::new(format!(
            "{file_path}: invalid form marker code: @{marker_base} in word: {word}"
        )));
    }
    let word_before_at = &word[..at_pos];
    let is_compound = word_before_at.contains('+');
    let is_sign_mode = headers.options.as_deref() == Some("sign");
    if is_compound && !is_sign_mode && matches!(marker_base, "s" | "o") {
        errors.push(ValidationError::new(format!(
            "{file_path}: compound word may not have form marker @{marker_base}: {word}"
        )));
    }
    // Leading quote characters do not count toward the single-letter rule.
    let bare_word = word_before_at
        .trim_start_matches(|c: char| "\"\u{201C}\u{201D}\u{2018}\u{2019}".contains(c));
    if marker_base == "l" && bare_word.chars().count() != 1 {
        errors.push(ValidationError::new(format!(
            "{file_path}: @l form marker requires a single letter: {word}"
        )));
    }
}
/// Checks quotation usage in an utterance: curly Unicode single quotes are
/// banned outright, and more than one quoted section is flagged.
fn validate_quotations(file_path: &str, text: &str, errors: &mut Vec<ValidationError>) {
    if text.contains('\u{2018}') || text.contains('\u{2019}') {
        errors.push(ValidationError::new(format!(
            "{file_path}: Unicode curly quotes not allowed in utterance"
        )));
        return;
    }
    // Strip the CHAT quotation marker so its '"' is not mistaken for a
    // quote delimiter. Removing "+\"" also consumes the quote inside the
    // longer "+\"/." terminator; the previous extra `.replace("+\"/.", "")`
    // could never match after this one (its "+\"" prefix was already gone)
    // and has been removed as dead code.
    let cleaned = text.replace("+\"", "");
    let quote_count = cleaned.chars().filter(|&c| c == '"').count();
    // More than one quoted section implies >2 quote chars plus two quotes
    // separated only by whitespace (end of one section, start of the next).
    if quote_count > 2 && ADJACENT_QUOTES_REGEX.is_match(&cleaned) {
        errors.push(ValidationError::new(format!(
            "{file_path}: multiple quotation sections in utterance"
        )));
    }
}
/// Checks `[: replacement]` annotations: the replacement text may not be
/// untranscribed material or an omission, and fragment tokens (`&~...`)
/// may not carry a replacement at all.
fn validate_replacements(file_path: &str, text: &str, errors: &mut Vec<ValidationError>) {
    for caps in REPLACEMENT_REGEX.captures_iter(text) {
        // Group 1 always exists when the pattern matches.
        let replacement = caps.get(1).unwrap().as_str().trim();
        if matches!(replacement, "xxx" | "yyy" | "www") {
            errors.push(ValidationError::new(format!(
                "{file_path}: replacement word cannot be untranscribed: [: {replacement}]"
            )));
        }
        // "0word" marks an omission; a bare "0" is allowed.
        if replacement.len() > 1 && replacement.starts_with('0') {
            errors.push(ValidationError::new(format!(
                "{file_path}: replacement word cannot be an omission: [: {replacement}]"
            )));
        }
    }
    if FRAG_REPLACEMENT_REGEX.is_match(text) {
        errors.push(ValidationError::new(format!(
            "{file_path}: replacement not allowed for a fragment"
        )));
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // Plain tokens split on whitespace; the terminator is its own token.
    #[test]
    fn test_extract_words_basic() {
        assert_eq!(extract_words("hello world ."), vec!["hello", "world", "."]);
    }
    // [...] annotation groups are skipped entirely.
    #[test]
    fn test_extract_words_skips_brackets() {
        assert_eq!(
            extract_words("aam [: m_d@s] [*] player@s ."),
            vec!["aam", "player@s", "."]
        );
    }
    // <...> groups contribute their inner words.
    #[test]
    fn test_extract_words_angle_group() {
        assert_eq!(
            extract_words("<word1 word2> [/] word3 ."),
            vec!["word1", "word2", "word3", "."]
        );
    }
    // Bracket groups nested inside an angle group are skipped too.
    #[test]
    fn test_extract_words_angle_group_with_brackets() {
        assert_eq!(
            extract_words("<aam [: m_d@s] [*]> [/] aam [: m_d@s] [*] player@s ."),
            vec!["aam", "aam", "player@s", "."]
        );
    }
}