#[cfg(feature = "pyo3")]
use super::utterance_py::{PyToken, PyUtterance};
use crate::chat::clean_utterance::clean_utterance;
use crate::chat::header::{
Age, ChangeableHeader, Headers, Participant, parse_changeable, parse_file_headers,
split_header_line,
};
use crate::chat::utterance::{BaseUtterance, Gra, Token, Utterance, Utterances};
#[cfg(feature = "pyo3")]
use pyo3::prelude::*;
use fancy_regex::Regex as FancyRegex;
#[cfg(feature = "parallel")]
use rayon::prelude::*;
use regex::Regex;
use std::collections::{HashMap, HashSet, VecDeque};
use std::sync::LazyLock;
#[cfg(feature = "pyo3")]
use std::sync::{Arc, OnceLock};
// Matches CHAT time-alignment marks: a pair of millisecond offsets
// ("start_end") wrapped in \x15 (NAK) control characters, e.g. "\x151500_2700\x15".
// Optional '-' before the digits / before the closing \x15 is tolerated.
static TIME_MARKS_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\x15-?(\d+)_(\d+)-?\x15").unwrap());
/// One parsed CHAT transcript file: its headers, its event stream, and the
/// normalized raw lines it was parsed from.
#[derive(Debug)]
pub struct ChatFile {
    /// Identifier for this file: a filesystem path, a zip entry name, or a
    /// generated UUID for in-memory content.
    pub file_path: String,
    /// File-level headers (`@Participants`, `@Languages`, ...).
    pub headers: Headers,
    /// Utterances and changeable-header pseudo-utterances, in document order.
    pub events: Vec<Utterance>,
    /// The joined logical lines of the original file (see `get_lines`); used
    /// to re-serialize the file verbatim.
    pub raw_lines: Vec<String>,
    // Lazily-built Python wrapper caches. These are per-instance and are
    // intentionally NOT cloned (see the manual `Clone` impl below).
    #[cfg(feature = "pyo3")]
    pub(crate) py_utterances: Arc<OnceLock<Vec<Py<PyUtterance>>>>,
    #[cfg(feature = "pyo3")]
    pub(crate) py_tokens: Arc<OnceLock<Vec<Vec<Py<PyToken>>>>>,
}
// Manual `Clone` rather than `#[derive(Clone)]`: the PyO3 caches must start
// empty on a clone so the copy does not share (or serve stale) Python wrapper
// objects with the original.
impl Clone for ChatFile {
    fn clone(&self) -> Self {
        Self {
            file_path: self.file_path.clone(),
            headers: self.headers.clone(),
            events: self.events.clone(),
            raw_lines: self.raw_lines.clone(),
            // Fresh, unfilled caches for the clone.
            #[cfg(feature = "pyo3")]
            py_utterances: Arc::new(OnceLock::new()),
            #[cfg(feature = "pyo3")]
            py_tokens: Arc::new(OnceLock::new()),
        }
    }
}
impl ChatFile {
    /// Assemble a `ChatFile` from already-parsed parts; the PyO3 caches start
    /// empty and are filled lazily by the Python bindings.
    pub fn new(
        file_path: String,
        headers: Headers,
        events: Vec<Utterance>,
        raw_lines: Vec<String>,
    ) -> Self {
        Self {
            file_path,
            headers,
            events,
            raw_lines,
            #[cfg(feature = "pyo3")]
            py_utterances: Arc::new(OnceLock::new()),
            #[cfg(feature = "pyo3")]
            py_tokens: Arc::new(OnceLock::new()),
        }
    }
    /// All events in document order, including changeable-header
    /// pseudo-utterances.
    pub fn utterances(&self) -> impl Iterator<Item = &Utterance> {
        self.events.iter()
    }
    /// Only genuine spoken utterances (events that are not changeable headers).
    pub fn real_utterances(&self) -> impl Iterator<Item = &Utterance> {
        self.events.iter().filter(|u| u.changeable_header.is_none())
    }
    /// Data-only equality: compares everything except the PyO3 caches,
    /// which are per-instance and not comparable.
    pub(crate) fn eq_data(&self, other: &ChatFile) -> bool {
        self.file_path == other.file_path
            && self.headers == other.headers
            && self.events == other.events
            && self.raw_lines == other.raw_lines
    }
    /// True when the file holds no events at all.
    pub fn is_empty(&self) -> bool {
        self.events.is_empty()
    }
    /// Drop any lazily-built Python wrapper objects. Call after mutating
    /// `events` (see `filter_chat_file_by_participants`) so stale wrappers
    /// are never served.
    pub fn reset_caches(&mut self) {
        #[cfg(feature = "pyo3")]
        {
            self.py_utterances = Arc::new(OnceLock::new());
            self.py_tokens = Arc::new(OnceLock::new());
        }
    }
}
/// One main-tier line (`*SPK:`) together with the dependent tiers (`%xxx:`)
/// that follow it, as grouped by `get_all_events`.
struct TierGroup {
    // Speaker code taken from between '*' and ':' (e.g. "CHI").
    participant: String,
    // The main tier's content, with the "*SPK:" prefix stripped.
    main_tier: String,
    // Dependent tiers keyed by their full name including the leading '%'
    // (e.g. "%mor" -> content).
    dependent_tiers: HashMap<String, String>,
}
/// One item parsed from a %mor tier: a `pos|mor` pair plus whether it is a
/// clitic (joined to its host word by '~' or '$' and thus not consuming a
/// word of its own during alignment).
struct MorItem {
    pos: String,
    mor: String,
    is_clitic: bool,
}
/// Raw counts gathered when a %mor tier does not line up with the cleaned
/// word list; turned into a `MisalignmentInfo` by `build_utterance`.
struct MisalignmentCounts {
    word_count: usize,
    // Number of non-clitic %mor items (the count that must equal word_count).
    mor_count: usize,
    words: Vec<String>,
    mor_labels: Vec<String>,
}
/// Diagnostic record for an utterance whose %mor tier could not be aligned
/// with its words. `file_path` is filled in by the caller that knows the
/// file identity (`parse_chat_strs` / `load_chat_files`).
pub struct MisalignmentInfo {
    pub file_path: String,
    pub participant: String,
    pub main_tier: String,
    pub mor_tier_name: String,
    pub mor_tier_content: String,
    pub word_count: usize,
    pub mor_count: usize,
    pub words: Vec<String>,
    pub mor_labels: Vec<String>,
}
/// Errors produced while locating, reading, or filtering CHAT files.
#[derive(Debug)]
pub enum ChatError {
    /// Filesystem failure.
    Io(std::io::Error),
    /// A user-supplied match regex failed to compile; the message already
    /// carries context from `filter_file_paths`.
    InvalidPattern(String),
    /// Zip archive could not be opened or read.
    Zip(String),
    /// Failure resolving a remote source (git / URL download).
    Source(crate::sources::SourceError),
}
impl std::fmt::Display for ChatError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ChatError::Io(e) => write!(f, "{e}"),
            // Every visible construction of `InvalidPattern` (`read_dir`,
            // `read_zip`) stringifies a `ModelError` built by
            // `filter_file_paths`, whose message already contains the
            // "Invalid match regex: ..." prefix. Prepending it again here
            // printed the prefix twice, so emit the message verbatim.
            ChatError::InvalidPattern(e) => write!(f, "{e}"),
            ChatError::Zip(e) => write!(f, "{e}"),
            ChatError::Source(e) => write!(f, "{e}"),
        }
    }
}
impl std::error::Error for ChatError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
ChatError::Io(e) => Some(e),
ChatError::Source(e) => Some(e),
_ => None,
}
}
}
impl From<crate::sources::SourceError> for ChatError {
    /// Wrap a remote-source resolution failure in [`ChatError::Source`].
    fn from(e: crate::sources::SourceError) -> Self {
        Self::Source(e)
    }
}
impl From<std::io::Error> for ChatError {
fn from(e: std::io::Error) -> Self {
ChatError::Io(e)
}
}
/// Split a raw CHAT transcript into logical lines.
///
/// CHAT lets a tier wrap onto following physical lines; a line that does not
/// open with a tier/header marker (`*`, `%`, `@`) is a continuation and is
/// joined to the previous logical line with a single space. Blank lines are
/// skipped, and continuation text appearing before any marker line is dropped.
fn get_lines(chat_str: &str) -> Vec<String> {
    let mut lines: Vec<String> = Vec::new();
    for raw_line in chat_str.lines() {
        let line = raw_line.trim();
        if line.is_empty() {
            continue;
        }
        if line.starts_with(['*', '%', '@']) {
            // A marker opens a new logical line.
            lines.push(line.to_string());
        } else if let Some(last) = lines.last_mut() {
            // Continuation: `line` is already trimmed above, so no re-trim
            // is needed before appending.
            last.push(' ');
            last.push_str(line);
        }
    }
    lines
}
/// Sequencing unit produced by `get_all_events`: either a complete utterance
/// group or a changeable header, preserved in document order so headers can
/// be interleaved with utterances in the final event stream.
enum EventOrTierGroup {
    TierGroup(TierGroup),
    Header(ChangeableHeader),
}
/// Walk the logical lines from `start_idx` onward, grouping each `*SPK:`
/// main tier with the `%xxx:` dependent tiers that follow it, and emitting
/// changeable `@` headers in sequence. `@End` is ignored.
fn get_all_events(lines: &[String], start_idx: usize) -> Vec<EventOrTierGroup> {
    let mut results = Vec::new();
    // The utterance group currently being accumulated, flushed when the next
    // '*' line or a changeable header begins (or at end of input).
    let mut current: Option<TierGroup> = None;
    for line in &lines[start_idx..] {
        if line.starts_with('@') {
            let (name, value) = split_header_line(line);
            if name == "End" {
                continue;
            }
            if let Some(ch) = parse_changeable(name, value) {
                // A changeable header interrupts the current group.
                if let Some(group) = current.take() {
                    results.push(EventOrTierGroup::TierGroup(group));
                }
                results.push(EventOrTierGroup::Header(ch));
            }
            // Non-changeable '@' lines here are silently skipped.
            continue;
        }
        if line.starts_with('*') {
            // New utterance: flush the previous one first.
            if let Some(group) = current.take() {
                results.push(EventOrTierGroup::TierGroup(group));
            }
            if let Some(colon_pos) = line.find(':') {
                // Speaker code sits between '*' and ':'.
                let participant = line[1..colon_pos].to_string();
                let content = line[colon_pos + 1..]
                    .trim_start_matches('\t')
                    .trim()
                    .to_string();
                current = Some(TierGroup {
                    participant,
                    main_tier: content,
                    dependent_tiers: HashMap::new(),
                });
            }
        } else if line.starts_with('%')
            && let Some(ref mut group) = current
            && let Some(colon_pos) = line.find(':')
        {
            // Dependent tier; the key keeps its leading '%' (e.g. "%mor"),
            // matching the tier-name arguments used elsewhere. A '%' line
            // with no preceding '*' line is dropped.
            let tier_name = line[..colon_pos].to_string();
            let content = line[colon_pos + 1..]
                .trim_start_matches('\t')
                .trim()
                .to_string();
            group.dependent_tiers.insert(tier_name, content);
        }
    }
    // Flush the trailing group, if any.
    if let Some(group) = current {
        results.push(EventOrTierGroup::TierGroup(group));
    }
    results
}
/// Split a %mor item at its first '|' into (part of speech, morphology).
/// An item without '|' yields an empty POS with the whole item as morphology.
fn split_pos_mor(item: &str) -> (String, String) {
    match item.split_once('|') {
        Some((pos, mor)) => (pos.to_string(), mor.to_string()),
        None => (String::new(), item.to_string()),
    }
}
/// Parse a %mor tier into a flat item list.
///
/// Each whitespace token may contain '~' (post-clitic) and '$' (pre-clitic)
/// joins; every segment after a '~', and every '$' segment except the last,
/// is marked as a clitic. If the final item ends in sentence punctuation
/// fused onto its morphology (e.g. "cookie."), the punctuation is split off
/// into its own POS-less item.
fn parse_mor_tier(mor_str: &str) -> Vec<MorItem> {
    let mut items: Vec<MorItem> = Vec::new();
    for token in mor_str.split_whitespace() {
        for (tilde_idx, segment) in token.split('~').enumerate() {
            let pieces: Vec<&str> = segment.split('$').collect();
            let last_piece = pieces.len() - 1;
            for (dollar_idx, piece) in pieces.into_iter().enumerate() {
                let (pos, mor) = split_pos_mor(piece);
                items.push(MorItem {
                    pos,
                    mor,
                    is_clitic: tilde_idx > 0 || dollar_idx < last_piece,
                });
            }
        }
    }
    // Detach trailing sentence punctuation fused onto the last real item.
    if let Some(last) = items.last_mut()
        && !last.pos.is_empty()
        && last.mor.len() > 1
    {
        let tail = *last.mor.as_bytes().last().unwrap();
        if matches!(tail, b'.' | b'?' | b'!') {
            // `tail` is ASCII, so popping one char removes exactly one byte.
            last.mor.pop();
            items.push(MorItem {
                pos: String::new(),
                mor: char::from(tail).to_string(),
                is_clitic: false,
            });
        }
    }
    items
}
/// Parse a %gra tier of `dep|head|rel` triples into `Gra` records.
/// Items with fewer than three '|'-separated fields are skipped; numeric
/// fields that fail to parse default to 0.
fn parse_gra_tier(gra_str: &str) -> Vec<Gra> {
    let mut relations = Vec::new();
    for item in gra_str.split_whitespace() {
        let mut fields = item.split('|');
        let (Some(dep), Some(head), Some(rel)) =
            (fields.next(), fields.next(), fields.next())
        else {
            continue;
        };
        relations.push(Gra {
            dep: dep.parse().unwrap_or(0),
            head: head.parse().unwrap_or(0),
            rel: rel.to_string(),
        });
    }
    relations
}
/// Combine cleaned words with optional %mor / %gra analyses into tokens.
///
/// Returns `(tokens, None)` on success. If the number of non-clitic %mor
/// items does not equal the word count, returns an empty token list plus the
/// counts needed to report the misalignment. Without %mor data, tokens are
/// bare words.
fn build_tokens(
    words: &[&str],
    mor_items: Option<&[MorItem]>,
    gra_items: Option<&[Gra]>,
) -> (Vec<Token>, Option<MisalignmentCounts>) {
    if words.is_empty() {
        return (Vec::new(), None);
    }
    // No %mor tier: produce word-only tokens.
    let Some(mor_items) = mor_items else {
        return (
            words
                .iter()
                .map(|w| Token {
                    word: w.to_string(),
                    pos: None,
                    mor: None,
                    gra: None,
                })
                .collect(),
            None,
        );
    };
    // Clitics ride along with a host word, so only non-clitic items must
    // match the word list one-to-one.
    let non_clitic_count = mor_items.iter().filter(|m| !m.is_clitic).count();
    if non_clitic_count != words.len() {
        let word_list = words.iter().map(|w| w.to_string()).collect();
        let mor_list = mor_items
            .iter()
            .filter(|m| !m.is_clitic)
            .map(|m| format!("{}|{}", m.pos, m.mor))
            .collect();
        return (
            Vec::new(),
            Some(MisalignmentCounts {
                word_count: words.len(),
                mor_count: non_clitic_count,
                words: word_list,
                mor_labels: mor_list,
            }),
        );
    }
    let mut tokens = Vec::new();
    let mut mor_idx = 0;
    let mut word_idx = 0;
    // Walk the %mor items; clitics get an empty word, non-clitics consume
    // the next word. %gra entries are taken positionally by mor index —
    // assumes %gra is index-aligned with %mor (TODO confirm against corpus).
    while mor_idx < mor_items.len() {
        let item = &mor_items[mor_idx];
        if item.is_clitic {
            let gra = gra_items.and_then(|g| g.get(mor_idx)).cloned();
            tokens.push(Token {
                word: String::new(),
                pos: Some(item.pos.clone()),
                mor: Some(item.mor.clone()),
                gra,
            });
        } else {
            // The count check above guarantees word_idx stays in bounds;
            // the fallback "" is purely defensive.
            let word = if word_idx < words.len() {
                words[word_idx]
            } else {
                ""
            };
            let gra = gra_items.and_then(|g| g.get(mor_idx)).cloned();
            tokens.push(Token {
                word: word.to_string(),
                pos: Some(item.pos.clone()),
                mor: Some(item.mor.clone()),
                gra,
            });
            word_idx += 1;
        }
        mor_idx += 1;
    }
    (tokens, None)
}
/// Parse one CHAT transcript string into headers, an ordered event stream,
/// the normalized raw lines, and any %mor misalignment diagnostics.
///
/// `parallel` selects rayon for per-utterance work; it is unused when the
/// "parallel" feature is off (hence `allow(unused_variables)`).
/// `MisalignmentInfo.file_path` is left empty here — callers stamp it.
#[allow(unused_variables)]
fn parse_chat_str(
    chat_str: &str,
    parallel: bool,
    mor_tier: Option<&str>,
    gra_tier: Option<&str>,
) -> (Headers, Vec<Utterance>, Vec<String>, Vec<MisalignmentInfo>) {
    let lines = get_lines(chat_str);
    let (headers, start_idx, _initial_events) = parse_file_headers(&lines);
    let event_or_groups = get_all_events(&lines, start_idx);
    // Only the utterance groups need per-item work; headers pass through.
    let tier_groups: Vec<&TierGroup> = event_or_groups
        .iter()
        .filter_map(|e| match e {
            EventOrTierGroup::TierGroup(tg) => Some(tg),
            EventOrTierGroup::Header(_) => None,
        })
        .collect();
    #[cfg(feature = "parallel")]
    let results: Vec<(Utterance, Option<MisalignmentInfo>)> = if parallel {
        tier_groups
            .par_iter()
            .with_min_len(16)
            .map(|tg| build_utterance(tg, mor_tier, gra_tier))
            .collect()
    } else {
        tier_groups
            .iter()
            .map(|tg| build_utterance(tg, mor_tier, gra_tier))
            .collect()
    };
    #[cfg(not(feature = "parallel"))]
    let results: Vec<(Utterance, Option<MisalignmentInfo>)> = tier_groups
        .iter()
        .map(|tg| build_utterance(tg, mor_tier, gra_tier))
        .collect();
    let mut utterances = Vec::with_capacity(results.len());
    let mut misalignments = Vec::new();
    for (utt, mis) in results {
        utterances.push(utt);
        if let Some(m) = mis {
            misalignments.push(m);
        }
    }
    // Re-interleave built utterances with headers in original document order.
    let mut events: Vec<Utterance> = Vec::new();
    let mut utt_iter = utterances.into_iter();
    for eg in event_or_groups {
        match eg {
            EventOrTierGroup::TierGroup(_) => {
                // One built utterance exists per tier group, so next() is Some.
                events.push(utt_iter.next().unwrap());
            }
            EventOrTierGroup::Header(h) => {
                // Headers become pseudo-utterances carrying only the header.
                events.push(Utterance {
                    participant: None,
                    tokens: None,
                    time_marks: None,
                    tiers: None,
                    changeable_header: Some(h),
                    mor_tier_name: None,
                    gra_tier_name: None,
                });
            }
        }
    }
    (headers, events, lines, misalignments)
}
/// Build one `Utterance` from a grouped main tier plus dependent tiers,
/// returning a misalignment diagnostic when the %mor tier cannot be aligned.
///
/// Both tier names must be provided for morphology to be used at all; with
/// either missing, tokens are built from words only.
fn build_utterance(
    group: &TierGroup,
    mor_tier: Option<&str>,
    gra_tier: Option<&str>,
) -> (Utterance, Option<MisalignmentInfo>) {
    // Millisecond (start, end) offsets from the \x15-delimited media marks.
    let time_marks = TIME_MARKS_REGEX
        .captures(&group.main_tier)
        .and_then(|caps| {
            let start: i64 = caps.get(1)?.as_str().parse().ok()?;
            let end: i64 = caps.get(2)?.as_str().parse().ok()?;
            Some((start, end))
        });
    let cleaned = clean_utterance(&group.main_tier);
    let words: Vec<&str> = cleaned.split_whitespace().collect();
    let (mor_items, gra_items) = if let (Some(mt), Some(gt)) = (mor_tier, gra_tier) {
        (
            group.dependent_tiers.get(mt).map(|s| parse_mor_tier(s)),
            group.dependent_tiers.get(gt).map(|s| parse_gra_tier(s)),
        )
    } else {
        (None, None)
    };
    let (tokens, misalignment_counts) =
        build_tokens(&words, mor_items.as_deref(), gra_items.as_deref());
    // file_path is stamped later by the caller that knows the file identity.
    let misalignment = misalignment_counts.map(|counts| MisalignmentInfo {
        file_path: String::new(),
        participant: group.participant.clone(),
        main_tier: group.main_tier.clone(),
        mor_tier_name: mor_tier.unwrap_or("%mor").to_string(),
        mor_tier_content: mor_tier
            .and_then(|mt| group.dependent_tiers.get(mt))
            .cloned()
            .unwrap_or_default(),
        word_count: counts.word_count,
        mor_count: counts.mor_count,
        words: counts.words,
        mor_labels: counts.mor_labels,
    });
    // The tiers map also carries the main tier keyed by participant code.
    let mut tiers = group.dependent_tiers.clone();
    tiers.insert(group.participant.clone(), group.main_tier.clone());
    (
        Utterance {
            participant: Some(group.participant.clone()),
            tokens: Some(tokens),
            time_marks,
            tiers: Some(tiers),
            changeable_header: None,
            mor_tier_name: mor_tier.map(|s| s.to_string()),
            gra_tier_name: gra_tier.map(|s| s.to_string()),
        },
        misalignment,
    )
}
pub fn filter_file_paths(
paths: &[String],
match_pattern: Option<&str>,
) -> Result<Vec<String>, ModelError> {
let match_re = match_pattern
.map(FancyRegex::new)
.transpose()
.map_err(|e| ModelError::ValidationError(format!("Invalid match regex: {e}")))?;
Ok(paths
.iter()
.filter(|p| {
if let Some(ref re) = match_re
&& !re.is_match(p).unwrap_or(false)
{
return false;
}
true
})
.cloned()
.collect())
}
/// Keep only utterances (and header participants) whose participant code
/// matches at least one pattern. Changeable-header pseudo-utterances are
/// dropped entirely, and the PyO3 caches are reset since events changed.
pub(crate) fn filter_chat_file_by_participants(
    mut file: ChatFile,
    patterns: &[FancyRegex],
) -> ChatFile {
    file.events.retain(|u| {
        if u.changeable_header.is_some() {
            false
        } else {
            // Regex evaluation errors count as non-matches.
            patterns.iter().any(|re| {
                re.is_match(u.participant.as_deref().unwrap_or(""))
                    .unwrap_or(false)
            })
        }
    });
    file.headers.participants.retain(|p| {
        patterns
            .iter()
            .any(|re| re.is_match(&p.code).unwrap_or(false))
    });
    file.reset_caches();
    file
}
/// Parse many `(content, id)` pairs into `ChatFile`s, stamping each
/// misalignment diagnostic with its file id. With the "parallel" feature and
/// `parallel == true`, files are parsed with rayon.
pub(crate) fn parse_chat_strs(
    pairs: Vec<(String, String)>,
    parallel: bool,
    mor_tier: Option<&str>,
    gra_tier: Option<&str>,
) -> (Vec<ChatFile>, Vec<MisalignmentInfo>) {
    // Shared per-file work for both the sequential and parallel paths.
    let build = |content: &str, id: &str| {
        let (headers, events, raw_lines, mut mis) =
            parse_chat_str(content, parallel, mor_tier, gra_tier);
        for m in &mut mis {
            m.file_path = id.to_string();
        }
        (
            ChatFile::new(id.to_string(), headers, events, raw_lines),
            mis,
        )
    };
    #[cfg(feature = "parallel")]
    if parallel {
        let results: Vec<(ChatFile, Vec<MisalignmentInfo>)> = pairs
            .par_iter()
            .with_min_len(16)
            .map(|(content, id)| build(content, id))
            .collect();
        let (files, nested): (Vec<_>, Vec<_>) = results.into_iter().unzip();
        return (files, nested.into_iter().flatten().collect());
    }
    let results: Vec<(ChatFile, Vec<MisalignmentInfo>)> = pairs
        .iter()
        .map(|(content, id)| build(content, id))
        .collect();
    let (files, nested): (Vec<_>, Vec<_>) = results.into_iter().unzip();
    (files, nested.into_iter().flatten().collect())
}
/// Read and parse CHAT files from disk, stamping misalignment diagnostics
/// with each file's path.
///
/// # Errors
/// Propagates the first I/O error from reading any file (in either the
/// sequential or parallel path).
pub(crate) fn load_chat_files(
    paths: &[String],
    parallel: bool,
    mor_tier: Option<&str>,
    gra_tier: Option<&str>,
) -> Result<(Vec<ChatFile>, Vec<MisalignmentInfo>), std::io::Error> {
    // Shared per-file work for both execution paths.
    let build = |path: &str| -> Result<(ChatFile, Vec<MisalignmentInfo>), std::io::Error> {
        let content = std::fs::read_to_string(path)?;
        let (headers, events, raw_lines, mut mis) =
            parse_chat_str(&content, parallel, mor_tier, gra_tier);
        for m in &mut mis {
            m.file_path = path.to_string();
        }
        Ok((
            ChatFile::new(path.to_string(), headers, events, raw_lines),
            mis,
        ))
    };
    #[cfg(feature = "parallel")]
    if parallel {
        let results: Vec<(ChatFile, Vec<MisalignmentInfo>)> = paths
            .par_iter()
            .with_min_len(16)
            .map(|path| build(path))
            .collect::<Result<Vec<_>, _>>()?;
        let (files, nested): (Vec<_>, Vec<_>) = results.into_iter().unzip();
        return Ok((files, nested.into_iter().flatten().collect()));
    }
    let results: Vec<(ChatFile, Vec<MisalignmentInfo>)> = paths
        .iter()
        .map(|path| build(path))
        .collect::<Result<Vec<_>, _>>()?;
    let (files, nested): (Vec<_>, Vec<_>) = results.into_iter().unzip();
    Ok((files, nested.into_iter().flatten().collect()))
}
/// Re-serialize a `ChatFile` from its stored raw lines, guaranteeing exactly
/// one trailing "@End" line regardless of whether the original had one.
pub fn serialize_chat_file(file: &ChatFile) -> String {
    let mut output: String = file
        .raw_lines
        .iter()
        .filter(|line| line.as_str() != "@End")
        .map(|line| format!("{line}\n"))
        .collect();
    output.push_str("@End\n");
    output
}
use crate::persistence::ModelError;
/// Errors from the `write_*_files` family of methods.
///
/// Previously this public error type implemented neither `Debug`, `Display`,
/// nor `std::error::Error`, making it awkward to log, `?`-convert, or unwrap
/// in callers; these impls are additive and backward-compatible.
#[derive(Debug)]
pub enum WriteError {
    /// Input validation failed (e.g. filename count does not match file count).
    Validation(String),
    /// Underlying filesystem failure.
    Io(std::io::Error),
}

impl std::fmt::Display for WriteError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // Validation messages are already complete sentences.
            WriteError::Validation(msg) => write!(f, "{msg}"),
            WriteError::Io(e) => write!(f, "{e}"),
        }
    }
}

impl std::error::Error for WriteError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            WriteError::Io(e) => Some(e),
            WriteError::Validation(_) => None,
        }
    }
}
/// Shared operations over an ordered collection of parsed CHAT files.
///
/// Implementors supply only the three accessors at the top; everything else
/// is a provided default built on them (serialization to other formats,
/// filtering, and corpus statistics such as MLU and TTR).
pub trait BaseChat: Sized {
    /// Borrow the underlying ordered file collection.
    fn files(&self) -> &VecDeque<ChatFile>;
    /// Mutably borrow the underlying ordered file collection.
    fn files_mut(&mut self) -> &mut VecDeque<ChatFile>;
    /// Construct an instance owning `files`.
    fn from_files(files: VecDeque<ChatFile>) -> Self;
    /// Build a single synthetic file (UUID path, default headers) from
    /// standalone utterances.
    fn from_utterances<U: BaseUtterance>(utterances: Vec<U>) -> Self {
        let mut raw_lines = Vec::new();
        let mut events = Vec::new();
        for utt in &utterances {
            raw_lines.extend(utt.to_chat_lines());
            events.push(utt.to_utterance());
        }
        let file = ChatFile::new(
            uuid::Uuid::new_v4().to_string(),
            Headers::default(),
            events,
            raw_lines,
        );
        Self::from_files(VecDeque::from(vec![file]))
    }
    /// Number of files in the collection.
    fn num_files(&self) -> usize {
        self.files().len()
    }
    /// True when the collection holds no files.
    fn is_empty(&self) -> bool {
        self.files().is_empty()
    }
    /// The identifier (path / zip entry / UUID) of every file, in order.
    fn file_paths(&self) -> Vec<String> {
        self.files().iter().map(|f| f.file_path.clone()).collect()
    }
    /// Cloned headers of every file, in order.
    fn headers(&self) -> Vec<Headers> {
        self.files().iter().map(|f| f.headers.clone()).collect()
    }
    /// Per file: the age of the "CHI" participant, or `None` when there is
    /// no CHI participant or no recorded age.
    fn ages(&self) -> Vec<Option<Age>> {
        self.files()
            .iter()
            .map(|f| {
                f.headers
                    .participants
                    .iter()
                    .find(|p| p.code == "CHI")
                    .and_then(|p| p.age.clone())
            })
            .collect()
    }
    /// Per-file participant lists, cloned.
    fn participants(&self) -> Vec<Vec<Participant>> {
        self.files()
            .iter()
            .map(|f| f.headers.participants.clone())
            .collect()
    }
    /// Deduplicated participants across all files, first-seen order.
    fn unique_participants(&self) -> Vec<Participant> {
        let mut seen = HashSet::new();
        self.files()
            .iter()
            .flat_map(|f| f.headers.participants.clone())
            .filter(|p| seen.insert(p.clone()))
            .collect()
    }
    /// Per-file language lists, cloned.
    fn languages(&self) -> Vec<Vec<String>> {
        self.files()
            .iter()
            .map(|f| f.headers.languages.clone())
            .collect()
    }
    /// Deduplicated languages across all files, first-seen order.
    fn unique_languages(&self) -> Vec<String> {
        let mut seen = HashSet::new();
        self.files()
            .iter()
            .flat_map(|f| f.headers.languages.clone())
            .filter(|lang| seen.insert(lang.clone()))
            .collect()
    }
    /// Output filenames for the writers: derived from each file's stem plus
    /// `target_ext` when every stem is usable (non-UUID, i.e. not synthetic)
    /// and the derived names are unique; otherwise zero-padded indices
    /// ("0001.ext", ...).
    fn default_output_filenames(&self, target_ext: &str) -> Vec<String> {
        let derived: Vec<Option<String>> = self
            .files()
            .iter()
            .map(|f| {
                let path = std::path::Path::new(&f.file_path);
                let stem = path.file_stem()?.to_str()?;
                // A UUID stem means the file was created in memory, so its
                // name carries no meaning for output.
                if uuid::Uuid::try_parse(stem).is_ok() {
                    return None;
                }
                Some(format!("{stem}{target_ext}"))
            })
            .collect();
        if derived.iter().all(|d| d.is_some()) {
            let names: Vec<String> = derived.into_iter().map(|d| d.unwrap()).collect();
            let unique: HashSet<&String> = names.iter().collect();
            if unique.len() == names.len() {
                return names;
            }
        }
        // Fallback: stable positional names.
        (0..self.files().len())
            .map(|i| format!("{:04}{target_ext}", i + 1))
            .collect()
    }
    /// Serialize every file back to CHAT text.
    fn to_strings(&self) -> Vec<String> {
        self.files().iter().map(serialize_chat_file).collect()
    }
    /// Serialize every file to ELAN (.eaf) XML.
    fn to_elan_strings(&self) -> Vec<String> {
        self.files()
            .iter()
            .map(super::elan_writer::chat_file_to_eaf_xml)
            .collect()
    }
    /// Convert to an `Elan` collection, reusing file stems as ids (UUID-named
    /// files keep their full path as the id).
    fn to_elan(&self) -> crate::elan::Elan {
        let strs = self.to_elan_strings();
        let ids: Vec<String> = self
            .files()
            .iter()
            .map(|f| {
                let path = std::path::Path::new(&f.file_path);
                let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
                if uuid::Uuid::try_parse(stem).is_ok() {
                    f.file_path.clone()
                } else {
                    format!("{stem}.eaf")
                }
            })
            .collect();
        // unwrap: assumes our own generated EAF always re-parses — TODO confirm.
        crate::elan::Elan::from_strs(strs, Some(ids), false).unwrap()
    }
    /// Write one .eaf file per ChatFile into `dir_path` (created if needed).
    ///
    /// # Errors
    /// `Validation` when `filenames` count mismatches; `Io` on filesystem failure.
    fn write_elan_files(
        &self,
        dir_path: &str,
        filenames: Option<Vec<String>>,
    ) -> Result<(), WriteError> {
        let strs = self.to_elan_strings();
        let dir = std::path::Path::new(dir_path);
        std::fs::create_dir_all(dir).map_err(WriteError::Io)?;
        let names: Vec<String> = match filenames {
            Some(names) => {
                if names.len() != self.files().len() {
                    return Err(WriteError::Validation(format!(
                        "There are {} ELAN files to create, \
                         but {} filenames were provided.",
                        self.files().len(),
                        names.len()
                    )));
                }
                names
            }
            None => self.default_output_filenames(".eaf"),
        };
        for (name, content) in names.iter().zip(strs.iter()) {
            let file_path = dir.join(name);
            std::fs::write(&file_path, content).map_err(WriteError::Io)?;
        }
        Ok(())
    }
    /// Write one .cha file per ChatFile into `dir_path` (created if needed).
    ///
    /// # Errors
    /// `Validation` when `filenames` count mismatches; `Io` on filesystem failure.
    fn write_chat_files(
        &self,
        dir_path: &str,
        filenames: Option<Vec<String>>,
    ) -> Result<(), WriteError> {
        let strs = self.to_strings();
        let dir = std::path::Path::new(dir_path);
        std::fs::create_dir_all(dir).map_err(WriteError::Io)?;
        let names: Vec<String> = match filenames {
            Some(names) => {
                if names.len() != self.files().len() {
                    return Err(WriteError::Validation(format!(
                        "There are {} CHAT files to create, \
                         but {} filenames were provided.",
                        self.files().len(),
                        names.len()
                    )));
                }
                names
            }
            None => self.default_output_filenames(".cha"),
        };
        for (name, content) in names.iter().zip(strs.iter()) {
            let file_path = dir.join(name);
            std::fs::write(&file_path, content).map_err(WriteError::Io)?;
        }
        Ok(())
    }
    /// Serialize every file to SRT subtitles, optionally limited to the given
    /// participant codes.
    fn to_srt_strings(&self, participants: Option<&[String]>) -> Vec<String> {
        self.files()
            .iter()
            .map(|f| super::srt_writer::chat_file_to_srt_str(f, participants))
            .collect()
    }
    /// Convert to an `Srt` collection; id derivation mirrors `to_elan`.
    fn to_srt(&self, participants: Option<&[String]>) -> crate::srt::Srt {
        let strs = self.to_srt_strings(participants);
        let ids: Vec<String> = self
            .files()
            .iter()
            .map(|f| {
                let path = std::path::Path::new(&f.file_path);
                let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
                if uuid::Uuid::try_parse(stem).is_ok() {
                    f.file_path.clone()
                } else {
                    format!("{stem}.srt")
                }
            })
            .collect();
        // unwrap: assumes our own generated SRT always re-parses — TODO confirm.
        crate::srt::Srt::from_strs(strs, Some(ids), false).unwrap()
    }
    /// Write one .srt file per ChatFile into `dir_path` (created if needed).
    ///
    /// # Errors
    /// `Validation` when `filenames` count mismatches; `Io` on filesystem failure.
    fn write_srt_files(
        &self,
        dir_path: &str,
        participants: Option<&[String]>,
        filenames: Option<Vec<String>>,
    ) -> Result<(), WriteError> {
        let strs = self.to_srt_strings(participants);
        let dir = std::path::Path::new(dir_path);
        std::fs::create_dir_all(dir).map_err(WriteError::Io)?;
        let names: Vec<String> = match filenames {
            Some(names) => {
                if names.len() != self.files().len() {
                    return Err(WriteError::Validation(format!(
                        "There are {} SRT files to create, \
                         but {} filenames were provided.",
                        self.files().len(),
                        names.len()
                    )));
                }
                names
            }
            None => self.default_output_filenames(".srt"),
        };
        for (name, content) in names.iter().zip(strs.iter()) {
            let file_path = dir.join(name);
            std::fs::write(&file_path, content).map_err(WriteError::Io)?;
        }
        Ok(())
    }
    /// Serialize every file to CoNLL-U.
    fn to_conllu_strings(&self) -> Vec<String> {
        self.files()
            .iter()
            .map(super::conllu_writer::chat_file_to_conllu_str)
            .collect()
    }
    /// Convert to a `Conllu` collection; id derivation mirrors `to_elan`.
    fn to_conllu(&self) -> crate::conllu::Conllu {
        let strs = self.to_conllu_strings();
        let ids: Vec<String> = self
            .files()
            .iter()
            .map(|f| {
                let path = std::path::Path::new(&f.file_path);
                let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
                if uuid::Uuid::try_parse(stem).is_ok() {
                    f.file_path.clone()
                } else {
                    format!("{stem}.conllu")
                }
            })
            .collect();
        // unwrap: assumes our own generated CoNLL-U always re-parses — TODO confirm.
        crate::conllu::Conllu::from_strs(strs, Some(ids), false).unwrap()
    }
    /// Write one .conllu file per ChatFile into `dir_path` (created if needed).
    ///
    /// # Errors
    /// `Validation` when `filenames` count mismatches; `Io` on filesystem failure.
    fn write_conllu_files(
        &self,
        dir_path: &str,
        filenames: Option<Vec<String>>,
    ) -> Result<(), WriteError> {
        let strs = self.to_conllu_strings();
        let dir = std::path::Path::new(dir_path);
        std::fs::create_dir_all(dir).map_err(WriteError::Io)?;
        let names: Vec<String> = match filenames {
            Some(names) => {
                if names.len() != self.files().len() {
                    return Err(WriteError::Validation(format!(
                        "There are {} CoNLL-U files to create, \
                         but {} filenames were provided.",
                        self.files().len(),
                        names.len()
                    )));
                }
                names
            }
            None => self.default_output_filenames(".conllu"),
        };
        for (name, content) in names.iter().zip(strs.iter()) {
            let file_path = dir.join(name);
            std::fs::write(&file_path, content).map_err(WriteError::Io)?;
        }
        Ok(())
    }
    /// Serialize every file to Praat TextGrid, optionally limited to the
    /// given participant codes.
    fn to_textgrid_strings(&self, participants: Option<&[String]>) -> Vec<String> {
        self.files()
            .iter()
            .map(|f| super::textgrid_writer::chat_file_to_textgrid_str(f, participants))
            .collect()
    }
    /// Convert to a `TextGrid` collection; id derivation mirrors `to_elan`.
    fn to_textgrid(&self, participants: Option<&[String]>) -> crate::textgrid::TextGrid {
        let strs = self.to_textgrid_strings(participants);
        let ids: Vec<String> = self
            .files()
            .iter()
            .map(|f| {
                let path = std::path::Path::new(&f.file_path);
                let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
                if uuid::Uuid::try_parse(stem).is_ok() {
                    f.file_path.clone()
                } else {
                    format!("{stem}.TextGrid")
                }
            })
            .collect();
        // unwrap: assumes our own generated TextGrid always re-parses — TODO confirm.
        crate::textgrid::TextGrid::from_strs(strs, Some(ids), false).unwrap()
    }
    /// Write one .TextGrid file per ChatFile into `dir_path` (created if needed).
    ///
    /// # Errors
    /// `Validation` when `filenames` count mismatches; `Io` on filesystem failure.
    fn write_textgrid_files(
        &self,
        dir_path: &str,
        participants: Option<&[String]>,
        filenames: Option<Vec<String>>,
    ) -> Result<(), WriteError> {
        let strs = self.to_textgrid_strings(participants);
        let dir = std::path::Path::new(dir_path);
        std::fs::create_dir_all(dir).map_err(WriteError::Io)?;
        let names: Vec<String> = match filenames {
            Some(names) => {
                if names.len() != self.files().len() {
                    return Err(WriteError::Validation(format!(
                        "There are {} TextGrid files to create, \
                         but {} filenames were provided.",
                        self.files().len(),
                        names.len()
                    )));
                }
                names
            }
            None => self.default_output_filenames(".TextGrid"),
        };
        for (name, content) in names.iter().zip(strs.iter()) {
            let file_path = dir.join(name);
            std::fs::write(&file_path, content).map_err(WriteError::Io)?;
        }
        Ok(())
    }
    /// Produce a filtered copy: keep files whose path matches `files`, then
    /// within them keep utterances/participants matching `participants`.
    /// The participant pattern is fully anchored unless it already starts
    /// with '^' or ends with '$'.
    ///
    /// # Errors
    /// Validation error when either regex fails to compile.
    fn filter_by(
        &self,
        files: Option<&str>,
        participants: Option<&str>,
    ) -> Result<Self, ModelError> {
        let mut filtered: VecDeque<ChatFile> = if let Some(pattern) = files {
            let re = FancyRegex::new(pattern)
                .map_err(|e| ModelError::ValidationError(format!("Invalid file regex: {e}")))?;
            self.files()
                .iter()
                .filter(|f| re.is_match(&f.file_path).unwrap_or(false))
                .cloned()
                .collect()
        } else {
            self.files().clone()
        };
        if let Some(pattern) = participants {
            let anchored = if pattern.starts_with('^') || pattern.ends_with('$') {
                pattern.to_string()
            } else {
                // Anchor so "CHI" does not accidentally match "CHI2" etc.
                format!("^(?:{pattern})$")
            };
            let re = FancyRegex::new(&anchored).map_err(|e| {
                ModelError::ValidationError(format!("Invalid participant regex: {e}"))
            })?;
            filtered = filtered
                .into_iter()
                .map(|f| filter_chat_file_by_participants(f, std::slice::from_ref(&re)))
                .collect();
        }
        Ok(Self::from_files(filtered))
    }
    /// Human-readable summary: totals plus a per-file table (first 5 files
    /// unless `verbose`). Word counts exclude clitic tokens (empty `word`).
    fn info_string(&self, verbose: bool) -> String {
        let n_files = self.files().len();
        let total_utterances: usize = self
            .files()
            .iter()
            .map(|f| f.real_utterances().count())
            .sum();
        let total_words: usize = self
            .files()
            .iter()
            .map(|f| {
                f.real_utterances()
                    .map(|u| {
                        u.tokens
                            .as_deref()
                            .unwrap_or(&[])
                            .iter()
                            .filter(|t| !t.word.is_empty())
                            .count()
                    })
                    .sum::<usize>()
            })
            .sum();
        let mut output =
            format!("{n_files} files\n{total_utterances} utterances\n{total_words} words\n");
        if n_files >= 2 {
            let stats: Vec<(usize, usize, &str)> = self
                .files()
                .iter()
                .map(|f| {
                    let utt_count = f.real_utterances().count();
                    let word_count: usize = f
                        .real_utterances()
                        .map(|u| {
                            u.tokens
                                .as_deref()
                                .unwrap_or(&[])
                                .iter()
                                .filter(|t| !t.word.is_empty())
                                .count()
                        })
                        .sum();
                    (utt_count, word_count, f.file_path.as_str())
                })
                .collect();
            let max_rows = if verbose { n_files } else { 5.min(n_files) };
            for (i, (utts, words, path)) in stats[..max_rows].iter().enumerate() {
                output.push_str(&format!(
                    " #{}: {} utterances, {} words — {}\n",
                    i + 1,
                    utts,
                    words,
                    path
                ));
            }
            if !verbose && max_rows < n_files {
                // Wording targets the Python bindings ("True").
                output.push_str("...\n(set `verbose` to True for all the files)\n");
            }
        }
        output
    }
    /// Per-file mean length of utterance in morphemes for `participant`:
    /// tokens carrying a non-empty POS are counted, over at most the first
    /// `n` utterances. Files with no matching utterances yield 0.0.
    fn mlum(&self, participant: &str, n: Option<usize>) -> Vec<f64> {
        self.files()
            .iter()
            .map(|f| {
                let utterances: Vec<_> = f
                    .real_utterances()
                    .filter(|u| u.participant.as_deref() == Some(participant))
                    .collect();
                let utterances = if let Some(n) = n {
                    &utterances[..utterances.len().min(n)]
                } else {
                    &utterances[..]
                };
                if utterances.is_empty() {
                    return 0.0;
                }
                let total: usize = utterances
                    .iter()
                    .map(|u| {
                        u.tokens
                            .as_deref()
                            .unwrap_or(&[])
                            .iter()
                            .filter(|t| t.pos.as_ref().is_some_and(|p| !p.is_empty()))
                            .count()
                    })
                    .sum();
                total as f64 / utterances.len() as f64
            })
            .collect()
    }
    /// Per-file mean length of utterance in words for `participant`:
    /// counts tokens with a non-empty word whose POS (when present) is
    /// non-empty — i.e. punctuation-only items are excluded. Files with no
    /// matching utterances yield 0.0.
    fn mluw(&self, participant: &str, n: Option<usize>) -> Vec<f64> {
        self.files()
            .iter()
            .map(|f| {
                let utterances: Vec<_> = f
                    .real_utterances()
                    .filter(|u| u.participant.as_deref() == Some(participant))
                    .collect();
                let utterances = if let Some(n) = n {
                    &utterances[..utterances.len().min(n)]
                } else {
                    &utterances[..]
                };
                if utterances.is_empty() {
                    return 0.0;
                }
                let total: usize = utterances
                    .iter()
                    .map(|u| {
                        u.tokens
                            .as_deref()
                            .unwrap_or(&[])
                            .iter()
                            .filter(|t| !t.word.is_empty() && t.pos.as_deref() != Some(""))
                            .count()
                    })
                    .sum();
                total as f64 / utterances.len() as f64
            })
            .collect()
    }
    /// Per-file type-token ratio for `participant`, over at most the first
    /// `n` word tokens. Files with no matching words yield 0.0.
    fn ttr(&self, participant: &str, n: Option<usize>) -> Vec<f64> {
        self.files()
            .iter()
            .map(|f| {
                let words: Vec<&str> = f
                    .real_utterances()
                    .filter(|u| u.participant.as_deref() == Some(participant))
                    .flat_map(|u| u.tokens.as_deref().unwrap_or(&[]))
                    .filter(|t| !t.word.is_empty() && t.pos.as_deref() != Some(""))
                    .map(|t| t.word.as_str())
                    .collect();
                let words = if let Some(n) = n {
                    &words[..words.len().min(n)]
                } else {
                    &words[..]
                };
                if words.is_empty() {
                    0.0
                } else {
                    let types: HashSet<&str> = words.iter().copied().collect();
                    types.len() as f64 / words.len() as f64
                }
            })
            .collect()
    }
    /// Remove every file from the collection.
    fn clear(&mut self) {
        self.files_mut().clear();
    }
    /// The first `n` events across all files, in order.
    fn head(&self, n: usize) -> Utterances {
        let utterances: Vec<Utterance> = self
            .files()
            .iter()
            .flat_map(|f| f.utterances())
            .take(n)
            .cloned()
            .collect();
        Utterances::new(utterances)
    }
    /// The last `n` events across all files, in order.
    fn tail(&self, n: usize) -> Utterances {
        let all: Vec<&Utterance> = self.files().iter().flat_map(|f| f.utterances()).collect();
        let start = all.len().saturating_sub(n);
        let utterances: Vec<Utterance> = all[start..].iter().map(|u| (*u).clone()).collect();
        Utterances::new(utterances)
    }
}
/// An ordered collection of CHAT transcript files.
#[derive(Clone, Debug)]
pub struct Chat {
    // A deque so whole files can be pushed/popped at either end
    // (see `push_front` / `pop_back`).
    pub(crate) files: VecDeque<ChatFile>,
}
// `Chat` stores its files directly, so the trait accessors are trivial.
impl BaseChat for Chat {
    fn files(&self) -> &VecDeque<ChatFile> {
        &self.files
    }
    fn files_mut(&mut self) -> &mut VecDeque<ChatFile> {
        &mut self.files
    }
    fn from_files(files: VecDeque<ChatFile>) -> Self {
        Self { files }
    }
}
impl Chat {
    /// Wrap already-parsed files into a collection.
    pub fn from_chat_files(files: Vec<ChatFile>) -> Self {
        Self {
            files: VecDeque::from(files),
        }
    }
    /// Append clones of `other`'s files after this collection's files.
    pub fn push_back(&mut self, other: &Chat) {
        self.files.extend(other.files.iter().cloned());
    }
    /// Prepend clones of `other`'s files before this collection's files.
    pub fn push_front(&mut self, other: &Chat) {
        let mut new_files = other.files.clone();
        new_files.extend(std::mem::take(&mut self.files));
        self.files = new_files;
    }
    /// Remove the last file, returning it as a single-file `Chat`.
    pub fn pop_back(&mut self) -> Option<Chat> {
        self.files
            .pop_back()
            .map(|f| Chat::from_files(VecDeque::from(vec![f])))
    }
    /// Remove the first file, returning it as a single-file `Chat`.
    pub fn pop_front(&mut self) -> Option<Chat> {
        self.files
            .pop_front()
            .map(|f| Chat::from_files(VecDeque::from(vec![f])))
    }
    /// Parse in-memory CHAT strings. Missing `ids` are filled with UUIDs.
    ///
    /// # Panics
    /// Panics when `ids` is provided with a different length than `strs`.
    pub fn from_strs(
        strs: Vec<String>,
        ids: Option<Vec<String>>,
        parallel: bool,
        mor_tier: Option<&str>,
        gra_tier: Option<&str>,
    ) -> (Self, Vec<MisalignmentInfo>) {
        let ids = ids.unwrap_or_else(|| {
            strs.iter()
                .map(|_| uuid::Uuid::new_v4().to_string())
                .collect()
        });
        assert_eq!(
            strs.len(),
            ids.len(),
            "strs and ids must have the same length: {} vs {}",
            strs.len(),
            ids.len()
        );
        let pairs: Vec<(String, String)> = strs.into_iter().zip(ids).collect();
        let (files, misalignments) = parse_chat_strs(pairs, parallel, mor_tier, gra_tier);
        (Self::from_chat_files(files), misalignments)
    }
    /// Read and parse the given paths from disk.
    ///
    /// # Errors
    /// Propagates the first I/O error encountered.
    pub fn read_files(
        paths: &[String],
        parallel: bool,
        mor_tier: Option<&str>,
        gra_tier: Option<&str>,
    ) -> Result<(Self, Vec<MisalignmentInfo>), std::io::Error> {
        let (files, misalignments) = load_chat_files(paths, parallel, mor_tier, gra_tier)?;
        Ok((Self::from_chat_files(files), misalignments))
    }
    /// Recursively collect files under `path` ending in `extension`, filter
    /// them by `match_pattern`, and parse them. Paths are sorted first so
    /// results are deterministic across platforms.
    ///
    /// # Errors
    /// `InvalidPattern` for a bad regex; `Io` for read failures.
    pub fn read_dir(
        path: &str,
        match_pattern: Option<&str>,
        extension: &str,
        parallel: bool,
        mor_tier: Option<&str>,
        gra_tier: Option<&str>,
    ) -> Result<(Self, Vec<MisalignmentInfo>), ChatError> {
        let mut paths: Vec<String> = Vec::new();
        // Unreadable directory entries are skipped rather than failing the walk.
        for entry in walkdir::WalkDir::new(path)
            .into_iter()
            .filter_map(|e| e.ok())
        {
            if entry.file_type().is_file() {
                let file_path = entry.path().to_string_lossy().to_string();
                if file_path.ends_with(extension) {
                    paths.push(file_path);
                }
            }
        }
        paths.sort();
        let filtered = filter_file_paths(&paths, match_pattern)
            .map_err(|e| ChatError::InvalidPattern(e.to_string()))?;
        let (files, misalignments) = load_chat_files(&filtered, parallel, mor_tier, gra_tier)?;
        Ok((Self::from_chat_files(files), misalignments))
    }
    /// Parse matching entries of a zip archive. The archive is scanned twice:
    /// once to list entry names (sorted for determinism), then again to read
    /// the filtered entries' contents.
    ///
    /// # Errors
    /// `Io` if the archive can't be opened; `Zip` for archive/entry problems;
    /// `InvalidPattern` for a bad regex.
    pub fn read_zip(
        path: &str,
        match_pattern: Option<&str>,
        extension: &str,
        parallel: bool,
        mor_tier: Option<&str>,
        gra_tier: Option<&str>,
    ) -> Result<(Self, Vec<MisalignmentInfo>), ChatError> {
        let file = std::fs::File::open(path)?;
        let mut archive = zip::ZipArchive::new(file)
            .map_err(|e| ChatError::Zip(format!("Invalid zip file: {e}")))?;
        let mut entry_names: Vec<String> = (0..archive.len())
            .filter_map(|i| {
                let entry = archive.by_index(i).ok()?;
                let name = entry.name().to_string();
                if name.ends_with(extension) && !entry.is_dir() {
                    Some(name)
                } else {
                    None
                }
            })
            .collect();
        entry_names.sort();
        let filtered = filter_file_paths(&entry_names, match_pattern)
            .map_err(|e| ChatError::InvalidPattern(e.to_string()))?;
        let mut pairs: Vec<(String, String)> = Vec::new();
        for name in &filtered {
            let mut entry = archive
                .by_name(name)
                .map_err(|e| ChatError::Zip(format!("Zip entry error: {e}")))?;
            let mut content = String::new();
            std::io::Read::read_to_string(&mut entry, &mut content)
                .map_err(|e| ChatError::Zip(format!("Read error: {e}")))?;
            pairs.push((content, name.clone()));
        }
        let (files, misalignments) = parse_chat_strs(pairs, parallel, mor_tier, gra_tier);
        Ok((Self::from_chat_files(files), misalignments))
    }
    /// Clone (or reuse a cached checkout of) a git repository, then parse
    /// matching files from the working tree via `read_dir`.
    #[allow(clippy::too_many_arguments)]
    pub fn from_git(
        url: &str,
        rev: Option<&str>,
        depth: Option<u32>,
        match_pattern: Option<&str>,
        extension: &str,
        cache_dir: Option<std::path::PathBuf>,
        force_download: bool,
        parallel: bool,
        mor_tier: Option<&str>,
        gra_tier: Option<&str>,
    ) -> Result<(Self, Vec<MisalignmentInfo>), ChatError> {
        let local_path = crate::sources::resolve_git(url, rev, depth, cache_dir, force_download)?;
        let path = local_path.to_string_lossy();
        Self::read_dir(
            &path,
            match_pattern,
            extension,
            parallel,
            mor_tier,
            gra_tier,
        )
    }
    /// Download (or reuse a cached copy of) a URL: zip archives are parsed
    /// with `read_zip`; anything else is treated as one CHAT file.
    #[allow(clippy::too_many_arguments)]
    pub fn from_url(
        url: &str,
        match_pattern: Option<&str>,
        extension: &str,
        cache_dir: Option<std::path::PathBuf>,
        force_download: bool,
        parallel: bool,
        mor_tier: Option<&str>,
        gra_tier: Option<&str>,
    ) -> Result<(Self, Vec<MisalignmentInfo>), ChatError> {
        let (local_path, is_zip) = crate::sources::resolve_url(url, cache_dir, force_download)?;
        let path = local_path.to_string_lossy();
        if is_zip {
            Self::read_zip(
                &path,
                match_pattern,
                extension,
                parallel,
                mor_tier,
                gra_tier,
            )
        } else {
            // Single-file download: match_pattern/extension do not apply.
            let content = std::fs::read_to_string(local_path)?;
            Ok(Self::from_strs(
                vec![content],
                None,
                parallel,
                mor_tier,
                gra_tier,
            ))
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::chat::utterance::Utterances;
// Minimal well-formed CHAT fixture: two speakers (CHI, MOT), each utterance
// paired with aligned %mor and %gra tiers.
fn make_basic_chat() -> &'static str {
"@UTF8\n@Begin\n@Participants:\tCHI Child, MOT Mother\n*CHI:\tI want cookie .\n%mor:\tpro|I v|want n|cookie .\n%gra:\t1|2|SUBJ 2|0|ROOT 3|2|OBJ 4|2|PUNCT\n*MOT:\tno .\n%mor:\tco|no .\n%gra:\t1|0|ROOT 2|1|PUNCT\n@End\n"
}
#[test]
fn test_chat_file_is_empty() {
let empty_file = ChatFile::new(String::new(), Headers::default(), vec![], vec![]);
assert!(empty_file.is_empty());
let (headers, events, raw_lines, _) =
parse_chat_str(make_basic_chat(), true, DEFAULT_MOR, DEFAULT_GRA);
let non_empty_file = ChatFile::new("test".to_string(), headers, events, raw_lines);
assert!(!non_empty_file.is_empty());
}
#[test]
fn test_get_lines_joins_continuations() {
let input = "@Begin\n*CHI:\tI want\n\tcookie .\n@End\n";
let lines = get_lines(input);
assert!(lines.iter().any(|l| l.contains("I want cookie .")));
}
#[test]
fn test_get_lines_trims_leading_whitespace() {
let input = " @Begin\n *CHI:\tI want cookie .\n *MOT:\tno .\n @End\n";
let lines = get_lines(input);
assert_eq!(lines.len(), 4);
assert!(lines[0].starts_with("@Begin"));
assert!(lines[1].starts_with("*CHI:"));
assert!(lines[2].starts_with("*MOT:"));
assert!(lines[3].starts_with("@End"));
}
#[test]
fn test_parse_chat_str_leading_whitespace() {
let input = " @UTF8\n @Begin\n @Participants:\tCHI Child, MOT Mother\n *CHI:\tI want cookie .\n %mor:\tpro|I v|want n|cookie .\n %gra:\t1|2|SUBJ 2|0|ROOT 3|2|OBJ 4|2|PUNCT\n @End\n";
let (_, events, _, _) = parse_chat_str(input, true, DEFAULT_MOR, DEFAULT_GRA);
let utterances: Vec<&Utterance> = events
.iter()
.filter(|u| u.changeable_header.is_none())
.collect();
assert_eq!(utterances.len(), 1);
assert_eq!(utterances[0].participant.as_deref(), Some("CHI"));
let tokens = utterances[0].tokens.as_ref().unwrap();
assert_eq!(tokens.len(), 4);
assert_eq!(tokens[0].word, "I");
assert_eq!(tokens[2].word, "cookie");
}
#[test]
fn test_get_all_events_extracts_tiers() {
let lines = get_lines(make_basic_chat());
let (_, start_idx, _) = parse_file_headers(&lines);
let events = get_all_events(&lines, start_idx);
let tier_groups: Vec<&TierGroup> = events
.iter()
.filter_map(|e| match e {
EventOrTierGroup::TierGroup(tg) => Some(tg),
_ => None,
})
.collect();
assert_eq!(tier_groups.len(), 2);
assert_eq!(tier_groups[0].participant, "CHI");
assert_eq!(tier_groups[1].participant, "MOT");
}
#[test]
fn test_parse_mor_tier_basic() {
let items = parse_mor_tier("pro|I v|want n|cookie .");
assert_eq!(items.len(), 4);
assert_eq!(items[0].pos, "pro");
assert_eq!(items[0].mor, "I");
assert_eq!(items[1].pos, "v");
assert_eq!(items[1].mor, "want");
assert_eq!(items[3].pos, "");
assert_eq!(items[3].mor, ".");
}
#[test]
// `~` splits a word into host + postclitic: only the item after `~` is
// flagged as a clitic.
fn test_parse_mor_tier_postclitic() {
    let parsed = parse_mor_tier("pro:dem|that~cop|be&3S adj|good .");
    assert_eq!(parsed.len(), 4);
    assert_eq!(parsed[0].pos, "pro:dem");
    assert!(!parsed[0].is_clitic);
    assert_eq!(parsed[1].pos, "cop");
    assert!(parsed[1].is_clitic);
    assert_eq!(parsed[2].pos, "adj");
    assert!(!parsed[2].is_clitic);
}
#[test]
// `$` marks the item preceding it as a (pre)clitic of the following word.
fn test_parse_mor_tier_preclitic() {
    let parsed = parse_mor_tier("aux|will$neg|not");
    assert_eq!(parsed.len(), 2);
    assert!(parsed[0].is_clitic);
    assert!(!parsed[1].is_clitic);
}
#[test]
// Both clitic markers on one word: preclitic and postclitic flagged, host not.
fn test_parse_mor_tier_preclitic_and_postclitic() {
    let parsed = parse_mor_tier("v|da-give$pro|me&dat-me~pro|lo&acc-it");
    assert_eq!(parsed.len(), 3);
    assert!(parsed[0].is_clitic);
    assert!(!parsed[1].is_clitic);
    assert!(parsed[2].is_clitic);
}
#[test]
// A period fused onto the final %mor item is split off as its own item.
fn test_parse_mor_tier_attached_period() {
let items = parse_mor_tier("pro:sub|she v|say&PAST pro:sub|I v|want n|cookie-PL.");
assert_eq!(items.len(), 6);
assert_eq!(items[4].pos, "n");
assert_eq!(items[4].mor, "cookie-PL");
assert!(!items[4].is_clitic);
assert_eq!(items[5].pos, "");
assert_eq!(items[5].mor, ".");
assert!(!items[5].is_clitic);
}
#[test]
// Same splitting behavior for a fused question mark.
fn test_parse_mor_tier_attached_question_mark() {
let items = parse_mor_tier("pro|what v|be&3S n|that?");
assert_eq!(items.len(), 4);
assert_eq!(items[2].pos, "n");
assert_eq!(items[2].mor, "that");
assert_eq!(items[3].pos, "");
assert_eq!(items[3].mor, "?");
}
#[test]
// Same splitting behavior for a fused exclamation mark.
fn test_parse_mor_tier_attached_exclamation() {
let items = parse_mor_tier("co|yes!");
assert_eq!(items.len(), 2);
assert_eq!(items[0].pos, "co");
assert_eq!(items[0].mor, "yes");
assert_eq!(items[1].pos, "");
assert_eq!(items[1].mor, "!");
}
#[test]
// Space-separated punctuation is already a standalone item and stays untouched.
fn test_parse_mor_tier_standalone_punct_unchanged() {
let items = parse_mor_tier("pro|I v|want n|cookie .");
assert_eq!(items.len(), 4);
assert_eq!(items[2].pos, "n");
assert_eq!(items[2].mor, "cookie");
assert_eq!(items[3].pos, "");
assert_eq!(items[3].mor, ".");
}
#[test]
// Fused period after a postclitic: clitic keeps its form, period split off.
fn test_parse_mor_tier_postclitic_attached_period() {
let items = parse_mor_tier("pro:dem|that~cop|be&3S.");
assert_eq!(items.len(), 3);
assert_eq!(items[0].pos, "pro:dem");
assert!(!items[0].is_clitic);
assert_eq!(items[1].pos, "cop");
assert_eq!(items[1].mor, "be&3S");
assert!(items[1].is_clitic);
assert_eq!(items[2].pos, "");
assert_eq!(items[2].mor, ".");
assert!(!items[2].is_clitic);
}
#[test]
// %gra items are dep|head|rel triples.
fn test_parse_gra_tier() {
let items = parse_gra_tier("1|2|SUBJ 2|0|ROOT 3|2|OBJ");
assert_eq!(items.len(), 3);
assert_eq!(items[0].dep, 1);
assert_eq!(items[0].head, 2);
assert_eq!(items[0].rel, "SUBJ");
}
#[test]
// End-to-end parse of the basic fixture: two spoken utterances with
// %mor/%gra information attached to each token.
fn test_parse_chat_str_basic() {
    let (_, events, _, _) = parse_chat_str(make_basic_chat(), true, DEFAULT_MOR, DEFAULT_GRA);
    // Keep only real speech events; changeable headers are interleaved.
    let spoken: Vec<&Utterance> = events
        .iter()
        .filter(|u| u.changeable_header.is_none())
        .collect();
    assert_eq!(spoken.len(), 2);
    assert_eq!(spoken[0].participant.as_deref(), Some("CHI"));
    let first_tokens = spoken[0].tokens.as_ref().unwrap();
    assert_eq!(first_tokens.len(), 4);
    assert_eq!(first_tokens[0].word, "I");
    assert_eq!(first_tokens[0].pos.as_deref(), Some("pro"));
    assert_eq!(first_tokens[0].mor.as_deref(), Some("I"));
    let gra = first_tokens[0].gra.as_ref().expect("token should carry %gra data");
    assert_eq!(gra.rel, "SUBJ");
}
#[test]
// The %mor tier ends in "cookie-PL." — the fused period must be detached
// into its own punctuation token and not counted as a misalignment.
fn test_parse_chat_str_attached_mor_period() {
    let input = "@UTF8\n@Begin\n@Participants:\tCHI Child\n\
                 *CHI:\tshe said \u{201c}I want cookies\u{201d} .\n\
                 %mor:\tpro:sub|she v|say&PAST pro:sub|I v|want n|cookie-PL.\n\
                 @End\n";
    let (_, events, _, misalignments) = parse_chat_str(input, false, DEFAULT_MOR, DEFAULT_GRA);
    assert!(misalignments.is_empty());
    let spoken: Vec<&Utterance> = events
        .iter()
        .filter(|u| u.changeable_header.is_none())
        .collect();
    assert_eq!(spoken.len(), 1);
    let tokens = spoken[0].tokens.as_ref().unwrap();
    assert_eq!(tokens.len(), 6);
    assert_eq!(tokens[4].word, "cookies");
    assert_eq!(tokens[4].pos.as_deref(), Some("n"));
    assert_eq!(tokens[4].mor.as_deref(), Some("cookie-PL"));
    // The detached period becomes a bare punctuation token.
    assert_eq!(tokens[5].word, ".");
    assert_eq!(tokens[5].pos.as_deref(), Some(""));
    assert_eq!(tokens[5].mor.as_deref(), Some("."));
}
#[test]
// \x15-delimited media time marks are parsed into (start, end) milliseconds.
fn test_parse_chat_str_time_marks() {
let input = "@UTF8\n@Begin\n*CHI:\thello . \x15123_456\x15\n@End\n";
let (_, events, _, _) = parse_chat_str(input, true, DEFAULT_MOR, DEFAULT_GRA);
let utterances: Vec<&Utterance> = events
.iter()
.filter(|u| u.changeable_header.is_none())
.collect();
assert_eq!(utterances.len(), 1);
assert_eq!(utterances[0].time_marks, Some((123, 456)));
}
#[test]
// Without a %mor tier, tokens carry the surface word only (no pos/mor).
fn test_parse_chat_str_no_mor() {
let input = "@UTF8\n@Begin\n*CHI:\thello world .\n@End\n";
let (_, events, _, _) = parse_chat_str(input, true, DEFAULT_MOR, DEFAULT_GRA);
let utterances: Vec<&Utterance> = events
.iter()
.filter(|u| u.changeable_header.is_none())
.collect();
assert_eq!(utterances.len(), 1);
let tokens0 = utterances[0].tokens.as_ref().unwrap();
assert_eq!(tokens0.len(), 3);
assert_eq!(tokens0[0].word, "hello");
assert!(tokens0[0].pos.is_none());
}
#[test]
// "that's" expands to two %mor items; the clitic is emitted as an extra
// token with an empty surface-word slot, so 3 words align with 4 items.
fn test_build_tokens_alignment_with_clitics() {
    let mor_items = parse_mor_tier("pro:dem|that~cop|be&3S adj|good .");
    let surface = vec!["that's", "good", "."];
    let (tokens, misalignment) = build_tokens(&surface, Some(&mor_items), None);
    assert!(misalignment.is_none());
    assert_eq!(tokens.len(), 4);
    assert_eq!(tokens[0].word, "that's");
    assert_eq!(tokens[0].pos.as_deref(), Some("pro:dem"));
    assert_eq!(tokens[1].word, "");
    assert_eq!(tokens[1].pos.as_deref(), Some("cop"));
    assert_eq!(tokens[2].word, "good");
}
#[test]
// On a word/%mor count mismatch no tokens are produced; the two counts
// are reported in the misalignment record instead.
fn test_build_tokens_misalignment_returns_empty() {
    let mor_items = parse_mor_tier("pro|I v|want .");
    let surface = vec!["I", "want", "cookie", "."];
    let (tokens, misalignment) = build_tokens(&surface, Some(&mor_items), None);
    assert!(tokens.is_empty());
    let counts = misalignment.expect("mismatch should be reported");
    assert_eq!(counts.word_count, 4);
    assert_eq!(counts.mor_count, 3);
}
#[test]
// With no %mor input, tokens carry only the surface words.
fn test_build_tokens_no_mor() {
    let surface = vec!["hello", "world", "."];
    let (tokens, misalignment) = build_tokens(&surface, None, None);
    assert!(misalignment.is_none());
    assert_eq!(tokens.len(), 3);
    assert_eq!(tokens[0].word, "hello");
    assert!(tokens[0].pos.is_none());
}
#[test]
// A word/%mor mismatch in the transcript surfaces in the misalignment list
// together with the offending participant.
fn test_parse_chat_str_collects_misalignments() {
let input = "@UTF8\n@Begin\n@Participants:\tCHI Child\n\
*CHI:\tI want cookie .\n\
%mor:\tpro|I v|want .\n\
@End\n";
let (_, _, _, misalignments) = parse_chat_str(input, false, DEFAULT_MOR, DEFAULT_GRA);
assert!(!misalignments.is_empty());
assert_eq!(misalignments[0].participant, "CHI");
}
#[test]
// The well-aligned fixture produces no misalignment records.
fn test_parse_chat_str_no_misalignment() {
let (_, _, _, misalignments) =
parse_chat_str(make_basic_chat(), true, DEFAULT_MOR, DEFAULT_GRA);
assert!(misalignments.is_empty());
}
#[test]
// A Some pattern keeps only matching paths; None keeps everything.
fn test_filter_file_paths() {
let paths = vec![
"a/action.cha".to_string(),
"a/codes.cha".to_string(),
"a/phono.cha".to_string(),
];
let filtered = filter_file_paths(&paths, Some("action")).unwrap();
assert_eq!(filtered, vec!["a/action.cha"]);
let filtered = filter_file_paths(&paths, None).unwrap();
assert_eq!(filtered.len(), 3);
}
#[test]
// Negative-lookahead participant filter removes CHI utterances and the
// changeable header events, and prunes CHI from the header participants.
fn test_filter_negative_lookahead_drops_chi_and_headers() {
let chat_str = "@UTF8\n@Begin\n@Participants:\tCHI Child, MOT Mother\n\
*CHI:\thello .\n\
@New Episode\n\
*MOT:\thi .\n\
@End\n";
let file = make_chat_file("test", chat_str);
assert_eq!(file.events.len(), 3);
let pattern = FancyRegex::new("^(?!CHI$)").unwrap();
let filtered = filter_chat_file_by_participants(file, &[pattern]);
assert_eq!(filtered.events.len(), 1);
assert_eq!(filtered.events[0].participant.as_deref(), Some("MOT"));
assert!(filtered.events[0].changeable_header.is_none());
assert_eq!(filtered.headers.participants.len(), 1);
assert_eq!(filtered.headers.participants[0].code, "MOT");
}
#[test]
// Raw tier text is preserved per-utterance, keyed by speaker code and tier name.
fn test_tiers_in_utterance() {
let (_, events, _, _) = parse_chat_str(make_basic_chat(), true, DEFAULT_MOR, DEFAULT_GRA);
let utterances: Vec<&Utterance> = events
.iter()
.filter(|u| u.changeable_header.is_none())
.collect();
let tiers = utterances[0].tiers.as_ref().unwrap();
assert!(tiers.contains_key("CHI"));
assert!(tiers.contains_key("%mor"));
assert!(tiers.contains_key("%gra"));
}
#[test]
// Every header, speaker and tier line of the input is captured verbatim.
fn test_raw_lines_captured() {
let (_, _, raw_lines, _) =
parse_chat_str(make_basic_chat(), true, DEFAULT_MOR, DEFAULT_GRA);
assert!(raw_lines.iter().any(|l| l == "@UTF8"));
assert!(raw_lines.iter().any(|l| l == "@Begin"));
assert!(raw_lines.iter().any(|l| l.starts_with("@Participants:")));
assert!(raw_lines.iter().any(|l| l.starts_with("*CHI:")));
assert!(raw_lines.iter().any(|l| l.starts_with("%mor:")));
assert!(raw_lines.iter().any(|l| l == "@End"));
}
#[test]
// serialize -> reparse yields the same raw lines as parsing the original.
fn test_serialize_round_trip() {
let input = make_basic_chat();
let (_, _, raw_lines, _) = parse_chat_str(input, true, DEFAULT_MOR, DEFAULT_GRA);
let file = ChatFile::new("test".to_string(), Headers::default(), vec![], raw_lines);
let output = serialize_chat_file(&file);
let (_, _, raw_lines2, _) = parse_chat_str(&output, true, DEFAULT_MOR, DEFAULT_GRA);
let (_, _, raw_lines_orig, _) = parse_chat_str(input, true, DEFAULT_MOR, DEFAULT_GRA);
assert_eq!(raw_lines2, raw_lines_orig);
}
// Default tier names used by most tests in this module.
const DEFAULT_MOR: Option<&str> = Some("%mor");
const DEFAULT_GRA: Option<&str> = Some("%gra");
// Parse `chat_str` sequentially and wrap it in a ChatFile with path `id`.
fn make_chat_file(id: &str, chat_str: &str) -> ChatFile {
let (headers, events, raw_lines, _) =
parse_chat_str(chat_str, false, DEFAULT_MOR, DEFAULT_GRA);
ChatFile::new(id.to_string(), headers, events, raw_lines)
}
// Build a Chat directly from pre-made files, bypassing the readers.
fn make_chat(files: Vec<ChatFile>) -> Chat {
Chat {
files: VecDeque::from(files),
}
}
#[test]
// push_back appends the other chat's files after the existing ones.
fn test_push_back() {
let mut chat = make_chat(vec![make_chat_file("a", make_basic_chat())]);
let other = make_chat(vec![make_chat_file("b", make_basic_chat())]);
chat.push_back(&other);
assert_eq!(chat.files.len(), 2);
assert_eq!(chat.files[0].file_path, "a");
assert_eq!(chat.files[1].file_path, "b");
}
#[test]
// push_front prepends the other chat's files, preserving their order.
fn test_push_front() {
let mut chat = make_chat(vec![make_chat_file("a", make_basic_chat())]);
let other = make_chat(vec![
make_chat_file("b", make_basic_chat()),
make_chat_file("c", make_basic_chat()),
]);
chat.push_front(&other);
assert_eq!(chat.files.len(), 3);
assert_eq!(chat.files[0].file_path, "b");
assert_eq!(chat.files[1].file_path, "c");
assert_eq!(chat.files[2].file_path, "a");
}
#[test]
// pop_back removes the last file and returns it wrapped in a new Chat.
fn test_pop_back() {
let mut chat = make_chat(vec![
make_chat_file("a", make_basic_chat()),
make_chat_file("b", make_basic_chat()),
]);
let popped = chat.pop_back().unwrap();
assert_eq!(chat.files.len(), 1);
assert_eq!(chat.files[0].file_path, "a");
assert_eq!(popped.files.len(), 1);
assert_eq!(popped.files[0].file_path, "b");
}
#[test]
// pop_front removes the first file and returns it wrapped in a new Chat.
fn test_pop_front() {
let mut chat = make_chat(vec![
make_chat_file("a", make_basic_chat()),
make_chat_file("b", make_basic_chat()),
]);
let popped = chat.pop_front().unwrap();
assert_eq!(chat.files.len(), 1);
assert_eq!(chat.files[0].file_path, "b");
assert_eq!(popped.files.len(), 1);
assert_eq!(popped.files[0].file_path, "a");
}
#[test]
// Popping from an empty chat returns None, not a panic.
fn test_pop_empty() {
let mut chat = make_chat(vec![]);
assert!(chat.pop_back().is_none());
assert!(chat.pop_front().is_none());
}
#[test]
// from_utterances wraps a flat utterance list into a single default-headed file.
fn test_from_utterances() {
let utts = vec![
Utterance {
participant: Some("CHI".to_string()),
tokens: Some(vec![Token {
word: "hello".to_string(),
pos: None,
mor: None,
gra: None,
}]),
time_marks: None,
tiers: None,
changeable_header: None,
mor_tier_name: Some("%mor".to_string()),
gra_tier_name: Some("%gra".to_string()),
},
Utterance {
participant: Some("MOT".to_string()),
tokens: Some(vec![Token {
word: "hi".to_string(),
pos: None,
mor: None,
gra: None,
}]),
time_marks: None,
tiers: None,
changeable_header: None,
mor_tier_name: Some("%mor".to_string()),
gra_tier_name: Some("%gra".to_string()),
},
];
let chat = Chat::from_utterances(utts.clone());
assert_eq!(chat.files.len(), 1);
assert_eq!(chat.files[0].events.len(), 2);
assert_eq!(chat.files[0].events, utts);
assert_eq!(chat.files[0].headers, Headers::default());
assert!(chat.files[0].raw_lines.is_empty());
}
#[test]
// An empty utterance list still yields one (empty) file.
fn test_from_utterances_empty() {
let chat = Chat::from_utterances(Vec::<Utterance>::new());
assert_eq!(chat.files.len(), 1);
assert!(chat.files[0].events.is_empty());
}
#[test]
// When utterances carry raw tier text, from_utterances reconstructs raw lines.
fn test_from_utterances_with_tiers() {
let mut tiers = HashMap::new();
tiers.insert("CHI".to_string(), "hello .".to_string());
tiers.insert("%mor".to_string(), "co|hello .".to_string());
let utts = vec![Utterance {
participant: Some("CHI".to_string()),
tokens: Some(vec![Token {
word: "hello".to_string(),
pos: Some("co".to_string()),
mor: Some("hello".to_string()),
gra: None,
}]),
time_marks: None,
tiers: Some(tiers),
changeable_header: None,
mor_tier_name: Some("%mor".to_string()),
gra_tier_name: Some("%gra".to_string()),
}];
let chat = Chat::from_utterances(utts);
assert_eq!(chat.files[0].raw_lines.len(), 2);
assert_eq!(chat.files[0].raw_lines[0], "*CHI:\thello .");
assert_eq!(chat.files[0].raw_lines[1], "%mor:\tco|hello .");
}
#[test]
// parse -> extract utterances -> rebuild -> serialize keeps speaker and
// %mor lines and a terminating @End.
fn test_from_utterances_serialization_round_trip() {
let (original, _) = Chat::from_strs(
vec![make_basic_chat().to_string()],
None,
false,
DEFAULT_MOR,
DEFAULT_GRA,
);
let utts: Vec<Utterance> = original
.files
.iter()
.flat_map(|f| f.utterances().cloned())
.collect();
let rebuilt = Chat::from_utterances(utts);
let serialized = rebuilt.to_strings();
assert_eq!(serialized.len(), 1);
let output = &serialized[0];
assert!(output.contains("*CHI:"));
assert!(output.contains("%mor:"));
assert!(output.ends_with("@End\n"));
}
#[test]
// clear drops all files.
fn test_clear() {
let mut chat = make_chat(vec![
make_chat_file("a", make_basic_chat()),
make_chat_file("b", make_basic_chat()),
]);
chat.clear();
assert_eq!(chat.files.len(), 0);
}
#[test]
// Serialization emits the headers, tiers and exactly one @End marker.
fn test_serialize_chat_file() {
let file = make_chat_file("test", make_basic_chat());
let output = serialize_chat_file(&file);
assert!(output.starts_with("@UTF8\n"));
assert!(output.contains("*CHI:"));
assert!(output.contains("%mor:"));
assert!(output.ends_with("@End\n"));
assert_eq!(output.matches("@End").count(), 1);
}
#[test]
// A transcript missing @End gains exactly one on serialization.
fn test_serialize_ensures_at_end() {
let input = "@UTF8\n@Begin\n*CHI:\thello .\n";
let file = make_chat_file("test", input);
let output = serialize_chat_file(&file);
assert!(output.ends_with("@End\n"));
assert_eq!(output.matches("@End").count(), 1);
}
#[test]
// to_strings serializes each file independently.
fn test_to_strings() {
let chat = make_chat(vec![
make_chat_file("a", make_basic_chat()),
make_chat_file("b", make_basic_chat()),
]);
let strs = chat.to_strings();
assert_eq!(strs.len(), 2);
assert!(strs[0].contains("@UTF8"));
assert!(strs[0].contains("@End"));
assert!(strs[1].contains("*CHI:"));
}
#[test]
// Fixture CHI utterance has 3 morphemes (punctuation excluded) in 1 utterance.
fn test_mlum_basic() {
let chat = make_chat(vec![make_chat_file("a", make_basic_chat())]);
let result = chat.mlum("CHI", Some(100));
assert_eq!(result.len(), 1);
assert!((result[0] - 3.0).abs() < f64::EPSILON);
}
#[test]
// `mlu` is the documented alias of `mlum`. The original assertion compared
// `chat.mlum(...)` with an identical `chat.mlum(...)` call — vacuously true
// and never exercising the alias. Compare the alias against the canonical
// method so a divergence between the two is actually caught.
fn test_mlu_aliases_mlum() {
    let chat = make_chat(vec![make_chat_file("a", make_basic_chat())]);
    assert_eq!(chat.mlu("CHI", Some(100)), chat.mlum("CHI", Some(100)));
}
#[test]
// MLU in words for the fixture CHI utterance: 3 words.
fn test_mluw_basic() {
let chat = make_chat(vec![make_chat_file("a", make_basic_chat())]);
let result = chat.mluw("CHI", Some(100));
assert_eq!(result.len(), 1);
assert!((result[0] - 3.0).abs() < f64::EPSILON);
}
#[test]
// All fixture words are distinct, so type-token ratio is 1.
fn test_ttr_basic() {
let chat = make_chat(vec![make_chat_file("a", make_basic_chat())]);
let result = chat.ttr("CHI", Some(350));
assert_eq!(result.len(), 1);
assert!((result[0] - 1.0).abs() < f64::EPSILON);
}
#[test]
// No files -> no per-file measures.
fn test_mlum_empty() {
let chat = make_chat(vec![]);
assert!(chat.mlum("CHI", Some(100)).is_empty());
}
#[test]
fn test_mluw_empty() {
let chat = make_chat(vec![]);
assert!(chat.mluw("CHI", Some(100)).is_empty());
}
#[test]
fn test_ttr_empty() {
let chat = make_chat(vec![]);
assert!(chat.ttr("CHI", Some(350)).is_empty());
}
#[test]
// Measures are computed per file, one result per file.
fn test_measures_multiple_files() {
let chat = make_chat(vec![
make_chat_file("a", make_basic_chat()),
make_chat_file("b", make_basic_chat()),
]);
let mlum = chat.mlum("CHI", Some(100));
let mluw = chat.mluw("CHI", Some(100));
let ttr = chat.ttr("CHI", Some(350));
assert_eq!(mlum.len(), 2);
assert_eq!(mluw.len(), 2);
assert_eq!(ttr.len(), 2);
assert!((mlum[0] - 3.0).abs() < f64::EPSILON);
assert!((mlum[1] - 3.0).abs() < f64::EPSILON);
}
#[test]
// "that's good ." is 2 surface words but 3 morphemes (clitic counted).
fn test_mlum_with_clitics() {
let input =
"@UTF8\n@Begin\n*CHI:\tthat's good .\n%mor:\tpro:dem|that~cop|be&3S adj|good .\n@End\n";
let chat = make_chat(vec![make_chat_file("a", input)]);
let result = chat.mlum("CHI", Some(100));
assert!((result[0] - 3.0).abs() < f64::EPSILON);
}
#[test]
// MLU-in-words ignores the clitic expansion: 2 words.
fn test_mluw_with_clitics() {
let input =
"@UTF8\n@Begin\n*CHI:\tthat's good .\n%mor:\tpro:dem|that~cop|be&3S adj|good .\n@End\n";
let chat = make_chat(vec![make_chat_file("a", input)]);
let result = chat.mluw("CHI", Some(100));
assert!((result[0] - 2.0).abs() < f64::EPSILON);
}
#[test]
// "no no no" -> 1 type over 3 tokens.
fn test_ttr_with_repeated_words() {
let input = "@UTF8\n@Begin\n*CHI:\tno no no .\n%mor:\tco|no co|no co|no .\n@End\n";
let chat = make_chat(vec![make_chat_file("a", input)]);
let result = chat.ttr("CHI", Some(350));
assert!((result[0] - 1.0 / 3.0).abs() < 1e-10);
}
#[test]
// Without %mor: morpheme-based MLU is 0, word-based measures still work.
fn test_measures_no_mor_tier() {
let input = "@UTF8\n@Begin\n*CHI:\thello world .\n@End\n";
let chat = make_chat(vec![make_chat_file("a", input)]);
let mlum = chat.mlum("CHI", Some(100));
assert!((mlum[0] - 0.0).abs() < f64::EPSILON);
let mluw = chat.mluw("CHI", Some(100));
assert!((mluw[0] - 3.0).abs() < f64::EPSILON);
let ttr = chat.ttr("CHI", Some(350));
assert!((ttr[0] - 1.0).abs() < f64::EPSILON);
}
// Render every utterance in `us` as CHAT text, blank-line separated.
fn utterances_text(us: &Utterances) -> String {
    let mut rendered = Vec::with_capacity(us.utterances.len());
    for u in &us.utterances {
        rendered.push(u.to_str());
    }
    rendered.join("\n\n")
}
#[test]
// head(1) keeps only the first utterance (CHI).
fn test_head_first_utterance() {
let chat = make_chat(vec![make_chat_file("a", make_basic_chat())]);
let result = chat.head(1);
assert_eq!(result.utterances.len(), 1);
let text = utterances_text(&result);
assert!(text.contains("*CHI:"));
assert!(!text.contains("*MOT:"));
}
#[test]
// head(n) with n larger than the total returns everything.
fn test_head_all_utterances() {
let chat = make_chat(vec![make_chat_file("a", make_basic_chat())]);
let result = chat.head(5);
assert_eq!(result.utterances.len(), 2);
let text = utterances_text(&result);
assert!(text.contains("*CHI:"));
assert!(text.contains("*MOT:"));
assert!(text.contains("\n\n"));
}
#[test]
// tail(1) keeps only the last utterance (MOT).
fn test_tail_last_utterance() {
let chat = make_chat(vec![make_chat_file("a", make_basic_chat())]);
let result = chat.tail(1);
assert_eq!(result.utterances.len(), 1);
let text = utterances_text(&result);
assert!(text.contains("*MOT:"));
assert!(!text.contains("*CHI:"));
}
#[test]
// head on an empty chat yields nothing.
fn test_head_empty() {
let chat = make_chat(vec![]);
let result = chat.head(5);
assert_eq!(result.utterances.len(), 0);
assert_eq!(utterances_text(&result), "");
}
#[test]
fn test_tail_empty() {
let chat = make_chat(vec![]);
let result = chat.tail(5);
assert_eq!(result.utterances.len(), 0);
assert_eq!(utterances_text(&result), "");
}
#[test]
// head counts utterances across file boundaries (2 per fixture file).
fn test_head_across_files() {
let chat = make_chat(vec![
make_chat_file("a", make_basic_chat()),
make_chat_file("b", make_basic_chat()),
]);
let result = chat.head(3);
assert_eq!(result.utterances.len(), 3);
let text = utterances_text(&result);
assert_eq!(text.matches("*CHI:").count(), 2);
assert_eq!(text.matches("*MOT:").count(), 1);
}
#[test]
// tail likewise spans files, from the end backwards.
fn test_tail_across_files() {
let chat = make_chat(vec![
make_chat_file("a", make_basic_chat()),
make_chat_file("b", make_basic_chat()),
]);
let result = chat.tail(3);
assert_eq!(result.utterances.len(), 3);
let text = utterances_text(&result);
assert_eq!(text.matches("*CHI:").count(), 1);
assert_eq!(text.matches("*MOT:").count(), 2);
}
#[test]
// head output preserves the dependent %mor/%gra tiers.
fn test_head_contains_mor_and_gra() {
let chat = make_chat(vec![make_chat_file("a", make_basic_chat())]);
let text = utterances_text(&chat.head(1));
assert!(text.contains("%mor:"));
assert!(text.contains("%gra:"));
assert!(text.contains("pro|I"));
assert!(text.contains("1|2|SUBJ"));
}
#[test]
// from_strs with explicit ids: one file per string, id used as file_path.
fn test_chat_from_strs() {
let (chat, misalignments) = Chat::from_strs(
vec![make_basic_chat().to_string()],
Some(vec!["test-id".to_string()]),
false,
DEFAULT_MOR,
DEFAULT_GRA,
);
assert!(misalignments.is_empty());
assert_eq!(chat.num_files(), 1);
assert_eq!(chat.file_paths(), vec!["test-id"]);
let utts: Vec<&Utterance> = chat
.files()
.iter()
.flat_map(|f| f.real_utterances())
.collect();
assert_eq!(utts.len(), 2);
assert_eq!(utts[0].participant.as_deref(), Some("CHI"));
assert_eq!(utts[1].participant.as_deref(), Some("MOT"));
}
#[test]
// Without ids, distinct identifiers are generated automatically.
fn test_chat_from_strs_auto_ids() {
let (chat, _) = Chat::from_strs(
vec![make_basic_chat().to_string(), make_basic_chat().to_string()],
None,
false,
DEFAULT_MOR,
DEFAULT_GRA,
);
assert_eq!(chat.num_files(), 2);
let paths = chat.file_paths();
assert_ne!(paths[0], paths[1]);
}
#[test]
// Mismatched strs/ids lengths are a programmer error and must panic.
#[should_panic(expected = "strs and ids must have the same length")]
fn test_chat_from_strs_length_mismatch() {
Chat::from_strs(
vec![make_basic_chat().to_string()],
Some(vec!["a".to_string(), "b".to_string()]),
false,
DEFAULT_MOR,
DEFAULT_GRA,
);
}
#[test]
// read_files loads explicit paths from disk.
fn test_chat_read_files() {
let dir = tempfile::tempdir().unwrap();
let file_path = dir.path().join("test.cha");
std::fs::write(&file_path, make_basic_chat()).unwrap();
let (chat, misalignments) = Chat::read_files(
&[file_path.to_string_lossy().to_string()],
false,
DEFAULT_MOR,
DEFAULT_GRA,
)
.unwrap();
assert!(misalignments.is_empty());
assert_eq!(chat.num_files(), 1);
let utts: Vec<&Utterance> = chat
.files()
.iter()
.flat_map(|f| f.real_utterances())
.collect();
assert_eq!(utts.len(), 2);
}
#[test]
// read_dir picks up only files with the requested extension.
fn test_chat_read_dir() {
let dir = tempfile::tempdir().unwrap();
std::fs::write(dir.path().join("a.cha"), make_basic_chat()).unwrap();
std::fs::write(dir.path().join("b.cha"), make_basic_chat()).unwrap();
std::fs::write(dir.path().join("c.txt"), "not a chat file").unwrap();
let (chat, _) = Chat::read_dir(
&dir.path().to_string_lossy(),
None,
".cha",
false,
DEFAULT_MOR,
DEFAULT_GRA,
)
.unwrap();
assert_eq!(chat.num_files(), 2);
}
#[test]
// A match pattern further restricts the directory listing.
fn test_chat_read_dir_with_match() {
let dir = tempfile::tempdir().unwrap();
std::fs::write(dir.path().join("alpha.cha"), make_basic_chat()).unwrap();
std::fs::write(dir.path().join("beta.cha"), make_basic_chat()).unwrap();
let (chat, _) = Chat::read_dir(
&dir.path().to_string_lossy(),
Some("alpha"),
".cha",
false,
DEFAULT_MOR,
DEFAULT_GRA,
)
.unwrap();
assert_eq!(chat.num_files(), 1);
}
#[test]
// read_zip behaves like read_dir, filtering zip entries by extension.
fn test_chat_read_zip() {
let dir = tempfile::tempdir().unwrap();
let zip_path = dir.path().join("test.zip");
let file = std::fs::File::create(&zip_path).unwrap();
let mut zip = zip::ZipWriter::new(file);
let options = zip::write::SimpleFileOptions::default();
zip.start_file("a.cha", options).unwrap();
std::io::Write::write_all(&mut zip, make_basic_chat().as_bytes()).unwrap();
zip.start_file("b.cha", options).unwrap();
std::io::Write::write_all(&mut zip, make_basic_chat().as_bytes()).unwrap();
zip.start_file("c.txt", options).unwrap();
std::io::Write::write_all(&mut zip, b"not a chat file").unwrap();
zip.finish().unwrap();
let (chat, _) = Chat::read_zip(
&zip_path.to_string_lossy(),
None,
".cha",
false,
DEFAULT_MOR,
DEFAULT_GRA,
)
.unwrap();
assert_eq!(chat.num_files(), 2);
}
#[test]
// Match patterns restrict zip entries too.
fn test_chat_read_zip_with_match() {
let dir = tempfile::tempdir().unwrap();
let zip_path = dir.path().join("test.zip");
let file = std::fs::File::create(&zip_path).unwrap();
let mut zip = zip::ZipWriter::new(file);
let options = zip::write::SimpleFileOptions::default();
zip.start_file("alpha.cha", options).unwrap();
std::io::Write::write_all(&mut zip, make_basic_chat().as_bytes()).unwrap();
zip.start_file("beta.cha", options).unwrap();
std::io::Write::write_all(&mut zip, make_basic_chat().as_bytes()).unwrap();
zip.finish().unwrap();
let (chat, _) = Chat::read_zip(
&zip_path.to_string_lossy(),
Some("alpha"),
".cha",
false,
DEFAULT_MOR,
DEFAULT_GRA,
)
.unwrap();
assert_eq!(chat.num_files(), 1);
}
// Fixture with non-default morphology/grammar tier names (%xmor, %xgra).
fn make_chat_with_custom_tiers() -> &'static str {
"@UTF8\n@Begin\n@Participants:\tCHI Child\n*CHI:\tI want cookie .\n%xmor:\tpro|I v|want n|cookie .\n%xgra:\t1|2|SUBJ 2|0|ROOT 3|2|OBJ 4|2|PUNCT\n@End\n"
}
#[test]
// Custom tier names are honored for token alignment and recorded on the utterance.
fn test_custom_tier_names_parsed() {
let (_, events, _, misalignments) = parse_chat_str(
make_chat_with_custom_tiers(),
true,
Some("%xmor"),
Some("%xgra"),
);
assert!(misalignments.is_empty());
assert_eq!(events.len(), 1);
let utt = &events[0];
let tokens = utt.tokens.as_ref().unwrap();
assert_eq!(tokens[0].pos.as_deref(), Some("pro"));
assert_eq!(tokens[0].mor.as_deref(), Some("I"));
assert_eq!(tokens[0].gra.as_ref().unwrap().rel, "SUBJ");
assert_eq!(utt.mor_tier_name.as_deref(), Some("%xmor"));
assert_eq!(utt.gra_tier_name.as_deref(), Some("%xgra"));
}
#[test]
// With default tier names, %xmor/%xgra data is not used for tokens.
fn test_default_tiers_ignore_custom_tier_data() {
let (_, events, _, _) = parse_chat_str(
make_chat_with_custom_tiers(),
true,
DEFAULT_MOR,
DEFAULT_GRA,
);
let tokens = events[0].tokens.as_ref().unwrap();
assert!(tokens[0].mor.is_none());
assert!(tokens[0].gra.is_none());
}
#[test]
// Passing None for both tier names disables mor/gra extraction entirely.
fn test_none_tiers_disable_mor_gra() {
let (_, events, _, misalignments) = parse_chat_str(make_basic_chat(), true, None, None);
assert!(misalignments.is_empty());
let tokens = events[0].tokens.as_ref().unwrap();
assert!(tokens[0].mor.is_none());
assert!(tokens[0].gra.is_none());
assert!(events[0].mor_tier_name.is_none());
assert!(events[0].gra_tier_name.is_none());
}
#[test]
// %gra depends on %mor alignment, so disabling %mor disables both.
fn test_none_mor_disables_both() {
let (_, events, _, _) = parse_chat_str(make_basic_chat(), true, None, DEFAULT_GRA);
let tokens = events[0].tokens.as_ref().unwrap();
assert!(tokens[0].mor.is_none());
assert!(tokens[0].gra.is_none());
}
#[test]
// Symmetrically, disabling %gra also disables %mor extraction.
fn test_none_gra_disables_both() {
let (_, events, _, _) = parse_chat_str(make_basic_chat(), true, DEFAULT_MOR, None);
let tokens = events[0].tokens.as_ref().unwrap();
assert!(tokens[0].mor.is_none());
assert!(tokens[0].gra.is_none());
}
#[test]
// Custom tier names flow through the high-level Chat::from_strs entry point.
fn test_custom_tiers_from_strs() {
let (chat, misalignments) = Chat::from_strs(
vec![make_chat_with_custom_tiers().to_string()],
None,
true,
Some("%xmor"),
Some("%xgra"),
);
assert!(misalignments.is_empty());
let files = chat.files();
let utt = files[0].utterances().next().unwrap();
let tokens = utt.tokens.as_ref().unwrap();
assert_eq!(tokens[0].pos.as_deref(), Some("pro"));
assert_eq!(tokens[0].mor.as_deref(), Some("I"));
}
#[test]
// Disabled tiers flow through Chat::from_strs too.
fn test_disabled_tiers_from_strs() {
let (chat, _) =
Chat::from_strs(vec![make_basic_chat().to_string()], None, true, None, None);
let files = chat.files();
let utt = files[0].utterances().next().unwrap();
let tokens = utt.tokens.as_ref().unwrap();
assert!(tokens[0].mor.is_none());
}
#[test]
// Serialization back to CHAT lines uses the custom tier names, not the defaults.
fn test_custom_tiers_to_chat_lines() {
let (_, events, _, _) = parse_chat_str(
make_chat_with_custom_tiers(),
true,
Some("%xmor"),
Some("%xgra"),
);
let lines = events[0].to_chat_lines();
let joined = lines.join("\n");
assert!(joined.contains("%xmor:"));
assert!(joined.contains("%xgra:"));
assert!(!joined.contains("%mor:"));
assert!(!joined.contains("%gra:"));
}
}