import datetime
import warnings
import pytest
from rustling.chat import (
CHAT,
Age,
ChangeableHeader,
Gra,
Headers,
Participant,
Token,
Utterance,
)
# Minimal two-utterance CHAT transcript (child + mother) carrying both
# %mor and %gra dependent tiers; shared by most tests in this module.
BASIC_CHAT = (
    "@UTF8\n"
    "@Begin\n"
    "@Participants:\tCHI Child, MOT Mother\n"
    # First utterance: three words plus the terminator.
    "*CHI:\tI want cookie .\n"
    "%mor:\tpro|I v|want n|cookie .\n"
    "%gra:\t1|2|SUBJ 2|0|ROOT 3|2|OBJ 4|2|PUNCT\n"
    # Second utterance: a single word plus the terminator.
    "*MOT:\tno .\n"
    "%mor:\tco|no .\n"
    "%gra:\t1|0|ROOT 2|1|PUNCT\n"
    "@End\n"
)

# Two tiny single-utterance transcripts, used for multi-file scenarios.
TWO_FILES = [
    "@UTF8\n@Begin\n*CHI:\thi .\n@End\n",
    "@UTF8\n@Begin\n*MOT:\tbye .\n@End\n",
]
class TestCHATFromStrs:
    """Tests for building a ``CHAT`` reader from in-memory strings."""

    def test_basic_parsing(self):
        """One input string yields a reader holding one file."""
        chat = CHAT.from_strs([BASIC_CHAT])
        assert chat.n_files == 1

    def test_empty_strs(self):
        """An empty input list yields a reader with zero files."""
        chat = CHAT.from_strs([])
        assert chat.n_files == 0

    def test_ids_provided(self):
        """Caller-supplied ``ids`` are reported back as ``file_paths``."""
        chat = CHAT.from_strs([BASIC_CHAT], ids=["my_file"])
        assert chat.file_paths == ["my_file"]

    def test_ids_auto_generated(self):
        """Omitting ``ids`` produces one generated 36-character id."""
        generated = CHAT.from_strs([BASIC_CHAT]).file_paths
        assert len(generated) == 1
        # 36 characters matches a canonical UUID string — presumably that is
        # what the library generates; confirm against its documentation.
        assert len(generated[0]) == 36

    def test_ids_length_mismatch(self):
        """Passing fewer ids than strings raises ``ValueError``."""
        with pytest.raises(ValueError):
            CHAT.from_strs(["content1", "content2"], ids=["only_one"])

    def test_multiple_files(self):
        """Two strings with two ids give two files, in the given order."""
        chat = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        assert chat.n_files == 2
        assert chat.file_paths == ["a", "b"]
class TestFromUtterances:
    """Tests for rebuilding a ``CHAT`` reader from ``Utterance`` objects."""

    def test_round_trip(self):
        """Utterances survive ``from_utterances`` unchanged."""
        source_utts = CHAT.from_strs([BASIC_CHAT]).utterances()
        rebuilt_reader = CHAT.from_utterances(source_utts)
        assert rebuilt_reader.n_files == 1
        rebuilt_utts = rebuilt_reader.utterances()
        assert len(rebuilt_utts) == len(source_utts)
        for before, after in zip(source_utts, rebuilt_utts):
            assert before == after

    def test_subset(self):
        """A reader can be built from a slice of the utterances."""
        source_utts = CHAT.from_strs([BASIC_CHAT]).utterances()
        rebuilt_utts = CHAT.from_utterances(source_utts[:1]).utterances()
        assert len(rebuilt_utts) == 1
        assert rebuilt_utts[0].participant == "CHI"

    def test_empty(self):
        """An empty utterance list still gives one (empty) file."""
        rebuilt_reader = CHAT.from_utterances([])
        assert rebuilt_reader.n_files == 1
        assert len(rebuilt_reader.utterances()) == 0

    def test_words(self):
        """``words()`` is identical before and after rebuilding."""
        original = CHAT.from_strs([BASIC_CHAT])
        rebuilt_reader = CHAT.from_utterances(original.utterances())
        assert rebuilt_reader.words() == original.words()

    def test_to_strs(self):
        """A rebuilt reader serializes main tier, %mor tier and @End."""
        source_utts = CHAT.from_strs([BASIC_CHAT]).utterances()
        serialized = CHAT.from_utterances(source_utts).to_strs()
        assert len(serialized) == 1
        text = serialized[0]
        assert "*CHI:" in text
        assert "%mor:" in text
        assert "@End" in text

    def test_serialization_round_trip(self):
        """Rebuild, serialize and re-parse; participants and tokens match."""
        source_utts = CHAT.from_strs([BASIC_CHAT]).utterances()
        serialized = CHAT.from_utterances(source_utts).to_strs()
        reparsed_utts = CHAT.from_strs(serialized).utterances()
        assert len(reparsed_utts) == len(source_utts)
        # Only participant and tokens are compared here, not full equality.
        for before, after in zip(source_utts, reparsed_utts):
            assert before.participant == after.participant
            assert before.tokens == after.tokens
class TestUtterances:
    """Tests for ``CHAT.utterances()`` and basic ``Utterance`` attributes."""

    def test_utterances_flat(self):
        """The default call returns a flat list in transcript order."""
        reader = CHAT.from_strs([BASIC_CHAT])
        utts = reader.utterances()
        assert len(utts) == 2
        assert utts[0].participant == "CHI"
        assert utts[1].participant == "MOT"

    def test_utterances_by_file(self):
        """``by_file=True`` returns one inner list per file."""
        reader = CHAT.from_strs([BASIC_CHAT])
        utts = reader.utterances(by_file=True)
        # These were two statements fused on one line (a SyntaxError).
        assert len(utts) == 1
        assert len(utts[0]) == 2

    def test_utterances_multiple_files_by_file(self):
        """With two files, ``by_file=True`` gives two one-item lists."""
        reader = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        utts = reader.utterances(by_file=True)
        assert len(utts) == 2
        assert len(utts[0]) == 1
        assert len(utts[1]) == 1

    def test_utterance_type(self):
        """Returned items are ``Utterance`` instances."""
        reader = CHAT.from_strs([BASIC_CHAT])
        utts = reader.utterances()
        assert isinstance(utts[0], Utterance)

    def test_utterance_tokens(self):
        """The first utterance has four tokens (three words + terminator)."""
        reader = CHAT.from_strs([BASIC_CHAT])
        utts = reader.utterances()
        tokens = utts[0].tokens
        assert tokens is not None
        # These were two statements fused on one line (a SyntaxError).
        assert len(tokens) == 4
        assert tokens[0].word == "I"
        assert tokens[1].word == "want"
        assert tokens[2].word == "cookie"

    def test_utterance_tiers(self):
        """``tiers`` is keyed by the speaker code and dependent tier names."""
        reader = CHAT.from_strs([BASIC_CHAT])
        utts = reader.utterances()
        tiers = utts[0].tiers
        assert tiers is not None
        assert "CHI" in tiers
        assert "%mor" in tiers
        assert "%gra" in tiers

    def test_utterance_audible(self):
        """``audible`` of a parsed utterance is the spoken text."""
        reader = CHAT.from_strs([BASIC_CHAT])
        utts = reader.utterances()
        assert utts[0].audible == "I want cookie ."

    def test_utterance_audible_constructed(self):
        """``audible`` of a hand-built utterance joins the token words."""
        utt = Utterance(tokens=[Token("hello"), Token("world")])
        assert utt.audible == "hello world"

    def test_utterance_audible_none_tokens(self):
        """``audible`` is ``None`` for an utterance built with no arguments."""
        utt = Utterance()
        assert utt.audible is None
class TestTokens:
    """Tests for per-token annotations taken from the %mor and %gra tiers."""

    def test_token_pos(self):
        """Each token carries the part-of-speech tag from %mor."""
        first_utt = CHAT.from_strs([BASIC_CHAT]).utterances()[0]
        toks = first_utt.tokens
        assert toks is not None
        assert toks[0].pos == "pro"
        assert toks[1].pos == "v"
        assert toks[2].pos == "n"

    def test_token_mor(self):
        """Each token carries the %mor form that follows the POS tag."""
        first_utt = CHAT.from_strs([BASIC_CHAT]).utterances()[0]
        toks = first_utt.tokens
        assert toks is not None
        assert toks[0].mor == "I"
        assert toks[1].mor == "want"
        assert toks[2].mor == "cookie"

    def test_token_gra(self):
        """The first token's ``gra`` mirrors the ``1|2|SUBJ`` entry."""
        first_utt = CHAT.from_strs([BASIC_CHAT]).utterances()[0]
        toks = first_utt.tokens
        assert toks is not None
        relation = toks[0].gra
        assert isinstance(relation, Gra)
        assert relation.dep == 1
        assert relation.head == 2
        assert relation.rel == "SUBJ"

    def test_no_mor_tier(self):
        """Without dependent tiers, pos/mor/gra are all ``None``."""
        bare_transcript = "@UTF8\n@Begin\n*CHI:\thello world .\n@End\n"
        first_utt = CHAT.from_strs([bare_transcript]).utterances()[0]
        toks = first_utt.tokens
        assert toks is not None
        assert toks[0].pos is None
        assert toks[0].mor is None
        assert toks[0].gra is None
class TestWords:
    """Tests for ``CHAT.words()`` and its grouping options."""

    def test_words_flat(self):
        """The default call returns a flat collection of all words."""
        reader = CHAT.from_strs([BASIC_CHAT])
        words = reader.words()
        assert "I" in words
        assert "want" in words
        assert "cookie" in words
        assert "no" in words

    def test_words_by_utterance(self):
        """``by_utterance=True`` groups words into one list per utterance."""
        reader = CHAT.from_strs([BASIC_CHAT])
        words = reader.words(by_utterance=True)
        assert isinstance(words, list)
        assert isinstance(words[0], list)
        assert words[0] == ["I", "want", "cookie", "."]

    def test_words_by_file(self):
        """``by_file=True`` returns one entry per file."""
        reader = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        words = reader.words(by_file=True)
        assert len(words) == 2

    def test_words_by_utterance_and_files(self):
        """Both flags together nest as file -> utterance -> words."""
        reader = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        words = reader.words(by_utterance=True, by_file=True)
        # These were two statements fused on one line (a SyntaxError).
        assert len(words) == 2
        assert isinstance(words[0], list)
        assert isinstance(words[0][0], list)
class TestChatTokens:
    """Tests for the reader-level ``CHAT.tokens()`` accessor."""

    def test_tokens_flat(self):
        """The default call returns a flat sequence of ``Token`` objects."""
        first = CHAT.from_strs([BASIC_CHAT]).tokens()[0]
        assert isinstance(first, Token)
        assert first.word == "I"
        assert first.pos == "pro"

    def test_tokens_by_utterance(self):
        """``by_utterance=True`` groups tokens into one list per utterance."""
        grouped = CHAT.from_strs([BASIC_CHAT]).tokens(by_utterance=True)
        first_group = grouped[0]
        assert isinstance(first_group, list)
        assert isinstance(first_group[0], Token)
class TestTimeMarks:
    """Tests for the ``time_marks`` attribute of utterances."""

    def test_time_marks_present(self):
        """A ``\\x15start_end\\x15`` bullet is exposed as an int pair."""
        bulleted = "@UTF8\n@Begin\n*CHI:\thello . \x15123_456\x15\n@End\n"
        first_utt = CHAT.from_strs([bulleted]).utterances()[0]
        assert first_utt.time_marks == (123, 456)

    def test_time_marks_absent(self):
        """Without a bullet, ``time_marks`` is ``None``."""
        first_utt = CHAT.from_strs([BASIC_CHAT]).utterances()[0]
        assert first_utt.time_marks is None
class TestFilter:
    """Tests for ``CHAT.filter()`` by file pattern and participant code."""

    def test_filter_files(self):
        """The pattern ``action`` keeps only ``action.cha``."""
        source = CHAT.from_strs(TWO_FILES, ids=["action.cha", "codes.cha"])
        kept = source.filter(files="action")
        assert kept.n_files == 1
        assert kept.file_paths == ["action.cha"]

    def test_filter_single_participant(self):
        """A single code given as a string keeps that speaker only."""
        kept = CHAT.from_strs([BASIC_CHAT]).filter(participants="CHI")
        remaining = kept.utterances()
        assert len(remaining) == 1
        assert remaining[0].participant == "CHI"

    def test_filter_multiple_participants_list(self):
        """A list of codes keeps every listed speaker."""
        kept = CHAT.from_strs([BASIC_CHAT]).filter(participants=["CHI", "MOT"])
        remaining = kept.utterances()
        assert len(remaining) == 2

    def test_filter_single_participant_as_list(self):
        """A one-element list behaves like the single code."""
        kept = CHAT.from_strs([BASIC_CHAT]).filter(participants=["MOT"])
        remaining = kept.utterances()
        assert len(remaining) == 1
        assert remaining[0].participant == "MOT"

    def test_filter_regex_alternation(self):
        """A string with ``|`` alternation matches both speakers."""
        kept = CHAT.from_strs([BASIC_CHAT]).filter(participants="CHI|MOT")
        remaining = kept.utterances()
        assert len(remaining) == 2

    def test_filter_no_match(self):
        """A code absent from the transcript leaves no utterances."""
        kept = CHAT.from_strs([BASIC_CHAT]).filter(participants="INV")
        remaining = kept.utterances()
        assert len(remaining) == 0

    def test_filter_auto_anchored(self):
        """A prefix such as ``CH`` does not match the code ``CHI``."""
        kept = CHAT.from_strs([BASIC_CHAT]).filter(participants="CH")
        remaining = kept.utterances()
        assert len(remaining) == 0

    def test_filter_words(self):
        """``words()`` on the filtered reader excludes other speakers."""
        kept = CHAT.from_strs([BASIC_CHAT]).filter(participants="CHI")
        vocabulary = kept.words()
        assert "I" in vocabulary
        assert "want" in vocabulary
        assert "no" not in vocabulary

    def test_filter_tokens(self):
        """``tokens()`` on the filtered reader excludes other speakers."""
        kept = CHAT.from_strs([BASIC_CHAT]).filter(participants="CHI")
        toks = kept.tokens()
        assert all(tok.word != "no" for tok in toks)

    def test_filter_participants_header(self):
        """``participants()`` on the filtered reader lists only kept codes."""
        kept = CHAT.from_strs([BASIC_CHAT]).filter(participants="CHI")
        kept_codes = [entry.code for entry in kept.participants()]
        assert "CHI" in kept_codes
        assert "MOT" not in kept_codes

    def test_filter_files_and_participants(self):
        """File and participant criteria can be combined in one call."""
        source = CHAT.from_strs(TWO_FILES, ids=["action.cha", "codes.cha"])
        kept = source.filter(files="action", participants="CHI")
        assert kept.n_files == 1
        remaining = kept.utterances()
        assert all(utt.participant == "CHI" for utt in remaining)

    def test_filter_does_not_mutate_original(self):
        """Filtering leaves the original reader's utterances intact."""
        source = CHAT.from_strs([BASIC_CHAT])
        _ = source.filter(participants="CHI")
        assert len(source.utterances()) == 2

    def test_filter_invalid_regex(self):
        """A malformed pattern raises ``ValueError`` with a clear message."""
        source = CHAT.from_strs([BASIC_CHAT])
        with pytest.raises(ValueError, match="Invalid participant regex"):
            source.filter(participants="[invalid")

    def test_filter_wrong_type(self):
        """A non-string, non-list ``participants`` raises ``TypeError``."""
        source = CHAT.from_strs([BASIC_CHAT])
        with pytest.raises(TypeError):
            source.filter(participants=123)

    def test_filter_empty_list(self):
        """An empty participant list raises ``ValueError``."""
        source = CHAT.from_strs([BASIC_CHAT])
        with pytest.raises(ValueError, match="must not be empty"):
            source.filter(participants=[])

    def test_filter_participant_lookahead(self):
        """A positive lookahead is accepted in the participant pattern."""
        kept = CHAT.from_strs([BASIC_CHAT]).filter(participants="(?=.*H).+")
        remaining = kept.utterances()
        assert len(remaining) == 1
        assert remaining[0].participant == "CHI"

    def test_filter_participant_negative_lookahead(self):
        """A negative lookahead is accepted in the participant pattern."""
        kept = CHAT.from_strs([BASIC_CHAT]).filter(participants="(?!CHI).+")
        remaining = kept.utterances()
        assert len(remaining) == 1
        assert remaining[0].participant == "MOT"

    def test_filter_files_lookahead(self):
        """A positive lookahead is accepted in the file pattern."""
        source = CHAT.from_strs(TWO_FILES, ids=["action.cha", "codes.cha"])
        kept = source.filter(files="(?=.*action).+")
        assert kept.n_files == 1
        assert kept.file_paths == ["action.cha"]

    def test_filter_files_negative_lookahead(self):
        """An anchored negative lookahead excludes the matching file."""
        source = CHAT.from_strs(TWO_FILES, ids=["action.cha", "codes.cha"])
        kept = source.filter(files="^(?!.*action).+")
        assert kept.n_files == 1
        assert kept.file_paths == ["codes.cha"]
class TestLen:
    """Tests that ``len()`` is deliberately unsupported on a reader."""

    def test_len_raises(self):
        """Calling ``len()`` on a reader raises ``TypeError``."""
        two_file_reader = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        with pytest.raises(TypeError):
            len(two_file_reader)
class TestRepr:
    """Tests for the reader's ``repr()`` output."""

    def test_repr(self):
        """The repr mentions how many files the reader holds."""
        text = repr(CHAT.from_strs([BASIC_CHAT]))
        assert "1 file(s)" in text
class TestReprHtml:
    """Tests for ``Utterance._repr_html_()`` (rich notebook display)."""

    def test_basic_html(self):
        """The table shows the main tier plus %mor and %gra contents."""
        reader = CHAT.from_strs([BASIC_CHAT])
        utts = reader.utterances()
        html = utts[0]._repr_html_()
        assert isinstance(html, str)
        assert "<table" in html
        assert "*CHI:" in html
        assert "want" in html
        assert "%mor:" in html
        assert "%gra:" in html
        assert "pro|I" in html
        assert "v|want" in html
        assert "n|cookie" in html
        assert "1|2|SUBJ" in html
        assert "2|0|ROOT" in html

    def test_html_no_annotations(self):
        """Dependent-tier rows are omitted when no such tiers exist."""
        chat_str = "@UTF8\n@Begin\n*CHI:\thello world .\n@End\n"
        reader = CHAT.from_strs([chat_str])
        utts = reader.utterances()
        html = utts[0]._repr_html_()
        assert "*CHI:" in html
        assert "hello" in html
        assert "world" in html
        assert "%mor:" not in html
        assert "%gra:" not in html

    def test_html_time_marks(self):
        """Time marks are rendered with a unit and a wrapper element."""
        chat_str = (
            "@UTF8\n@Begin\n@Participants:\tCHI Child\n"
            "*CHI:\thi . \x150_1500\x15\n@End\n"
        )
        reader = CHAT.from_strs([chat_str])
        utts = reader.utterances()
        html = utts[0]._repr_html_()
        assert "0" in html
        assert "1500" in html
        assert "ms" in html
        assert "rustling-utterance-wrapper" in html

    def test_html_no_time_marks(self):
        """The wrapper element is absent when there are no time marks."""
        reader = CHAT.from_strs([BASIC_CHAT])
        utts = reader.utterances()
        html = utts[0]._repr_html_()
        assert "rustling-utterance-wrapper" not in html

    def test_html_escaping(self):
        """Markup-significant characters in the text are entity-escaped."""
        # Local import keeps this class self-contained; ``escape`` yields the
        # expected entity form without hard-coding it. The previous
        # assertions compared the raw text with itself, so the second one
        # could never fail.
        from html import escape

        raw = "a<b"
        escaped = escape(raw)
        utt = Utterance(
            participant="CHI",
            tokens=[Token(word=raw)],
            time_marks=None,
            tiers={"CHI": raw},
        )
        html = utt._repr_html_()
        assert escaped in html
        # After removing every escaped occurrence, no raw form may remain.
        assert raw not in html.replace(escaped, "")

    def test_html_empty_tokens(self):
        """An utterance with no tokens still renders a table row."""
        utt = Utterance(
            participant="CHI",
            tokens=[],
            time_marks=None,
            tiers={"CHI": ""},
        )
        html = utt._repr_html_()
        assert "<table" in html
        assert "*CHI:" in html
class TestCleanUtterance:
    """Tests that CHAT annotation markers are stripped from word lists."""

    def test_removes_overlap_markers(self):
        """The overlap marker ``[<]`` does not appear among the words."""
        transcript = "@UTF8\n@Begin\n*CHI:\tI [<] want cookie .\n@End\n"
        vocabulary = CHAT.from_strs([transcript]).words()
        assert "[<]" not in vocabulary

    def test_removes_error_markers(self):
        """The error marker ``[*]`` is dropped but the marked word stays."""
        transcript = "@UTF8\n@Begin\n*CHI:\tgoed [*] .\n@End\n"
        vocabulary = CHAT.from_strs([transcript]).words()
        assert "[*]" not in vocabulary
        assert "goed" in vocabulary

    def test_removes_explanations(self):
        """An ``[= ...]`` explanation and its text are both removed."""
        transcript = "@UTF8\n@Begin\n*CHI:\tI want [= desire] cookie .\n@End\n"
        vocabulary = CHAT.from_strs([transcript]).words()
        assert "[=" not in " ".join(vocabulary)
        assert "desire" not in " ".join(vocabulary)

    def test_removes_xxx(self):
        """The ``xxx`` placeholder is dropped; real words are kept."""
        transcript = "@UTF8\n@Begin\n*CHI:\txxx hello .\n@End\n"
        vocabulary = CHAT.from_strs([transcript]).words()
        assert "xxx" not in vocabulary
        assert "hello" in vocabulary
class TestMorAlignment:
    """Tests for aligning %mor items with main-tier words."""

    def test_postclitic(self):
        """A ``~`` postclitic becomes an extra token with an empty word."""
        chat_str = (
            "@UTF8\n@Begin\n"
            "*CHI:\tthat's good .\n"
            "%mor:\tpro:dem|that~cop|be&3S adj|good .\n"
            "@End\n"
        )
        reader = CHAT.from_strs([chat_str])
        utts = reader.utterances()
        # The duplicated ``tokens = utts[0].tokens`` assignment was removed.
        tokens = utts[0].tokens
        assert tokens is not None
        # The host word keeps the surface form and the first %mor item.
        assert tokens[0].word == "that's"
        assert tokens[0].pos == "pro:dem"
        # The clitic has no surface form of its own, only a POS tag.
        assert tokens[1].word == ""
        assert tokens[1].pos == "cop"
        assert tokens[2].word == "good"
        assert tokens[2].pos == "adj"
class TestContinuationLines:
    """Tests for tab-indented continuation lines in a transcript."""

    def test_continuation_joined(self):
        """Words on a continuation line belong to the same utterance text."""
        transcript = (
            "@UTF8\n@Begin\n"
            "*CHI:\tI want\n"
            "\tcookie .\n"
            "@End\n"
        )
        vocabulary = CHAT.from_strs([transcript]).words()
        assert "I" in vocabulary
        assert "want" in vocabulary
        assert "cookie" in vocabulary
class TestLeadingWhitespace:
    """Tests that leading whitespace on transcript lines is tolerated."""

    def test_leading_whitespace_stripped(self):
        """Lines starting with a space parse the same as unindented lines."""
        indented_transcript = (
            " @UTF8\n"
            " @Begin\n"
            " @Participants:\tCHI Child, MOT Mother\n"
            " *CHI:\tI want cookie .\n"
            " %mor:\tpro|I v|want n|cookie .\n"
            " *MOT:\tno .\n"
            " %mor:\tco|no .\n"
            " @End\n"
        )
        parsed = CHAT.from_strs([indented_transcript])
        assert len(parsed.utterances()) == 2
        vocabulary = parsed.words()
        assert vocabulary == ["I", "want", "cookie", ".", "no", "."]
class TestFromDir:
    """Tests for ``CHAT.from_dir`` against on-disk fixture directories.

    The ``testchat_good_dir``, ``testchat_bad_dir`` and ``private_data_dir``
    arguments are pytest fixtures defined outside this module.
    """

    def test_from_dir(self, testchat_good_dir):
        """A directory given as ``str`` loads at least one file."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            loaded = CHAT.from_dir(str(testchat_good_dir), strict=False)
        assert loaded.n_files > 0

    def test_from_dir_with_path(self, testchat_good_dir):
        """The fixture object itself (not converted to ``str``) is accepted."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            loaded = CHAT.from_dir(testchat_good_dir, strict=False)
        assert loaded.n_files > 0

    def test_from_dir_with_match(self, testchat_good_dir):
        """``match`` restricts loading to paths containing the pattern."""
        loaded = CHAT.from_dir(str(testchat_good_dir), match="action")
        matched = loaded.file_paths
        assert len(matched) > 0
        assert all("action" in candidate for candidate in matched)

    def test_utterances_from_real_files(self, testchat_good_dir):
        """Real fixture files produce a non-empty utterance list."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            loaded = CHAT.from_dir(str(testchat_good_dir), strict=False)
        assert len(loaded.utterances()) > 0

    def test_words_from_real_files(self, testchat_good_dir):
        """Real fixture files produce a non-empty word list."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            loaded = CHAT.from_dir(str(testchat_good_dir), strict=False)
        assert len(loaded.words()) > 0

    def test_testchat_strict_compliance(self, testchat_good_dir):
        """Report, as a warning only, good files that fail strict parsing."""
        # A strict failure is downgraded to a warning, so this test never
        # fails on ``ValueError``; it only surfaces the offending files.
        try:
            CHAT.from_dir(str(testchat_good_dir), strict=True)
        except ValueError as e:
            warnings.warn(
                f"testchat/good files failing strict=True:\n\n{e}",
                stacklevel=1,
            )

    def test_testchat_bad_files_catch_errors(self, testchat_bad_dir):
        """Every bad file outside the exemption set must raise when strict."""
        # Files listed here are skipped by the check below.
        known_exceptions = {
            "mor-commas.cha",
            "mor-empty.cha",
            "latetalkers.cha",
            "mornumber-spanish.cha",
            "mornumber-spanish-2.cha",
            "pho-group-compound.cha",
            "pho-repetition.cha",
            "pho-repetition-bad.cha",
            "media-needs-bullets.cha",
            "media-notrans-bullets.cha",
            "media-unlinked-bullets.cha",
            "retrace-in-group-bad.cha",
            "retrace-multiple-no-following.cha",
            "retrace-no-following-content.cha",
            "quotation-nested.cha",
            "language-different-speakers.cha",
            "zho-f.cha",
            "ca-segment-repetition.cha",
            "ca-segment-repetition-bad-content.cha",
            "repetition.cha",
            "grouprepetition.cha",
            "x-repetition.cha",
            "sign.cha",
            "words-sign.cha",
            "heritage.cha",
            "heritage-lsfal14a.cha",
            "bs5.cha",
            "who.cha",
            "zero-others.cha",
            "space-bracket.cha",
        }
        no_error_files = []
        candidates = (
            bad_file
            for bad_file in sorted(testchat_bad_dir.glob("*.cha"))
            if bad_file.name not in known_exceptions
        )
        for bad_file in candidates:
            try:
                CHAT.from_files([str(bad_file)], strict=True)
            except Exception:
                # Any exception type counts as the parser rejecting the file.
                continue
            no_error_files.append(bad_file.name)
        assert not no_error_files, (
            f"{len(no_error_files)} testchat/bad files that raised no parsing error:\n"
            + "\n".join(no_error_files)
        )

    def test_private_data_strict_compliance(self, private_data_dir):
        """Report, as a warning only, private files that fail strict parsing."""
        try:
            CHAT.from_dir(str(private_data_dir), strict=True)
        except ValueError as e:
            warnings.warn(
                f"Private test data files failing strict=True:\n\n{e}",
                stacklevel=1,
            )
class TestFromFiles:
    """Tests for ``CHAT.from_files`` with string and path-object inputs."""

    def test_from_files(self, testchat_good_dir):
        """Up to three files given as ``str`` paths are all loaded."""
        import glob

        selected = sorted(glob.glob(str(testchat_good_dir / "*.cha")))[:3]
        if not selected:
            # Nothing to check when the fixture directory has no .cha files.
            return
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            loaded = CHAT.from_files(selected, strict=False)
        assert loaded.n_files == len(selected)

    def test_from_files_with_path(self, testchat_good_dir):
        """Up to three files given as path objects are all loaded."""
        selected = sorted(testchat_good_dir.glob("*.cha"))[:3]
        if not selected:
            return
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            loaded = CHAT.from_files(selected, strict=False)
        assert loaded.n_files == len(selected)

    def test_from_files_mixed_str_and_path(self, testchat_good_dir):
        """A list mixing ``str`` and path objects is accepted."""
        candidates = sorted(testchat_good_dir.glob("*.cha"))[:2]
        if len(candidates) < 2:
            # The mixed-type check needs two files.
            return
        first, second = candidates
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            loaded = CHAT.from_files([str(first), second], strict=False)
        assert loaded.n_files == 2
class TestAppend:
    """Tests for the in-place ``append`` / ``append_left`` operations."""

    def test_append(self):
        """``append`` adds the other reader's files at the end."""
        target = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\thi .\n@End\n"], ids=["a"])
        extra = CHAT.from_strs(["@UTF8\n@Begin\n*MOT:\tbye .\n@End\n"], ids=["b"])
        target.append(extra)
        assert target.n_files == 2
        assert target.file_paths == ["a", "b"]

    def test_append_does_not_modify_other(self):
        """The appended reader keeps its own file count."""
        target = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        extra = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\tyes .\n@End\n"], ids=["c"])
        target.append(extra)
        assert target.n_files == 3
        assert extra.n_files == 1

    def test_append_left(self):
        """``append_left`` adds the other reader's files at the front."""
        target = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\thi .\n@End\n"], ids=["a"])
        extra = CHAT.from_strs(["@UTF8\n@Begin\n*MOT:\tbye .\n@End\n"], ids=["b"])
        target.append_left(extra)
        assert target.file_paths == ["b", "a"]

    def test_append_left_preserves_order(self):
        """Prepended files keep their relative order."""
        target = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\thi .\n@End\n"], ids=["a"])
        extra = CHAT.from_strs(TWO_FILES, ids=["b", "c"])
        target.append_left(extra)
        assert target.file_paths == ["b", "c", "a"]
class TestExtend:
    """Tests for the in-place ``extend`` / ``extend_left`` operations."""

    def test_extend(self):
        """``extend`` appends every reader in the list, in order."""
        target = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\thi .\n@End\n"], ids=["a"])
        second = CHAT.from_strs(["@UTF8\n@Begin\n*MOT:\tbye .\n@End\n"], ids=["b"])
        third = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\tyes .\n@End\n"], ids=["c"])
        target.extend([second, third])
        assert target.n_files == 3
        assert target.file_paths == ["a", "b", "c"]

    def test_extend_left(self):
        """``extend_left`` prepends the readers, keeping their list order."""
        target = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\thi .\n@End\n"], ids=["a"])
        second = CHAT.from_strs(["@UTF8\n@Begin\n*MOT:\tbye .\n@End\n"], ids=["b"])
        third = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\tyes .\n@End\n"], ids=["c"])
        target.extend_left([second, third])
        assert target.file_paths == ["b", "c", "a"]

    def test_extend_empty_list(self):
        """Extending with an empty list leaves the reader unchanged."""
        target = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        target.extend([])
        assert target.n_files == 2
class TestPop:
    """Tests for removing files with ``pop`` / ``pop_left``."""

    def test_pop(self):
        """``pop`` removes the last file and returns it as a reader."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        removed = source.pop()
        assert source.n_files == 1
        assert source.file_paths == ["a"]
        assert removed.n_files == 1
        assert removed.file_paths == ["b"]

    def test_pop_left(self):
        """``pop_left`` removes the first file and returns it as a reader."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        removed = source.pop_left()
        assert source.n_files == 1
        assert source.file_paths == ["b"]
        assert removed.n_files == 1
        assert removed.file_paths == ["a"]

    def test_pop_empty_raises(self):
        """``pop`` on an empty reader raises ``IndexError``."""
        empty = CHAT.from_strs([])
        with pytest.raises(IndexError):
            empty.pop()

    def test_pop_left_empty_raises(self):
        """``pop_left`` on an empty reader raises ``IndexError``."""
        empty = CHAT.from_strs([])
        with pytest.raises(IndexError):
            empty.pop_left()

    def test_pop_preserves_data(self):
        """The popped reader still exposes the file's words."""
        removed = CHAT.from_strs(TWO_FILES, ids=["a", "b"]).pop()
        assert removed.words() == ["bye", "."]
class TestClear:
    """Tests for emptying a reader with ``clear``."""

    def test_clear(self):
        """``clear`` drops every file from a populated reader."""
        populated = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        populated.clear()
        assert populated.n_files == 0

    def test_clear_empty(self):
        """``clear`` on an already-empty reader leaves it at zero files."""
        empty = CHAT.from_strs([])
        empty.clear()
        assert empty.n_files == 0
class TestAdd:
    """Tests for the ``+`` and ``+=`` operators on readers."""

    def test_add_creates_new_chat(self):
        """``a + b`` yields a reader holding both operands' files."""
        left = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\thi .\n@End\n"], ids=["a"])
        right = CHAT.from_strs(["@UTF8\n@Begin\n*MOT:\tbye .\n@End\n"], ids=["b"])
        combined = left + right
        assert combined.n_files == 2
        assert combined.file_paths == ["a", "b"]

    def test_add_does_not_mutate_operands(self):
        """Neither operand of ``+`` changes."""
        left = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\thi .\n@End\n"], ids=["a"])
        right = CHAT.from_strs(["@UTF8\n@Begin\n*MOT:\tbye .\n@End\n"], ids=["b"])
        _ = left + right
        assert left.n_files == 1
        assert left.file_paths == ["a"]
        assert right.n_files == 1
        assert right.file_paths == ["b"]

    def test_add_multi_file(self):
        """``+`` works when the left operand already has several files."""
        left = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        right = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\tyes .\n@End\n"], ids=["c"])
        combined = left + right
        assert combined.n_files == 3
        assert combined.file_paths == ["a", "b", "c"]

    def test_iadd_mutates_in_place(self):
        """``a += b`` leaves ``a`` holding both readers' files."""
        left = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\thi .\n@End\n"], ids=["a"])
        right = CHAT.from_strs(["@UTF8\n@Begin\n*MOT:\tbye .\n@End\n"], ids=["b"])
        left += right
        assert left.n_files == 2
        assert left.file_paths == ["a", "b"]

    def test_iadd_does_not_mutate_other(self):
        """The right-hand operand of ``+=`` is unchanged."""
        left = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\thi .\n@End\n"], ids=["a"])
        right = CHAT.from_strs(["@UTF8\n@Begin\n*MOT:\tbye .\n@End\n"], ids=["b"])
        left += right
        assert right.n_files == 1
        assert right.file_paths == ["b"]
class TestBool:
    """Tests for the truthiness of a reader."""

    def test_empty_default_is_falsy(self):
        """A default-constructed reader is falsy."""
        default_reader = CHAT()
        assert not default_reader

    def test_empty_from_strs_is_falsy(self):
        """A reader built from an empty list is falsy."""
        empty_reader = CHAT.from_strs([])
        assert not empty_reader

    def test_with_data_is_truthy(self):
        """A reader holding a file is truthy."""
        populated = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\thi .\n@End\n"])
        assert populated

    def test_after_clear_is_falsy(self):
        """A reader becomes falsy once it has been cleared."""
        populated = CHAT.from_strs(["@UTF8\n@Begin\n*CHI:\thi .\n@End\n"])
        populated.clear()
        assert not populated
# Three single-utterance transcripts, used by the slicing tests.
THREE_FILES = [
    "@UTF8\n@Begin\n*CHI:\thi .\n@End\n",
    "@UTF8\n@Begin\n*MOT:\tbye .\n@End\n",
    "@UTF8\n@Begin\n*CHI:\tyes .\n@End\n",
]

# The same three transcripts plus a fourth; built as a separate list object
# so changing one fixture never affects the other.
FOUR_FILES = [
    *THREE_FILES,
    "@UTF8\n@Begin\n*MOT:\tok .\n@End\n",
]
class TestIterAndGetitem:
    """Tests for iterating over a reader and indexing/slicing it."""

    def test_iter_yields_chat_objects(self):
        """Iteration yields one ``CHAT`` object per file."""
        pieces = list(CHAT.from_strs(TWO_FILES, ids=["a", "b"]))
        assert len(pieces) == 2
        assert all(isinstance(piece, CHAT) for piece in pieces)

    def test_iter_each_has_one_file(self):
        """Each yielded reader holds exactly one file."""
        first, second = list(CHAT.from_strs(TWO_FILES, ids=["a", "b"]))[:2]
        assert first.n_files == 1
        assert second.n_files == 1

    def test_iter_preserves_order(self):
        """Files are yielded in their stored order."""
        pieces = list(CHAT.from_strs(TWO_FILES, ids=["a", "b"]))
        assert pieces[0].file_paths == ["a"]
        assert pieces[1].file_paths == ["b"]

    def test_iter_preserves_data(self):
        """Yielded readers expose the words of their own file."""
        pieces = list(CHAT.from_strs(TWO_FILES, ids=["a", "b"]))
        assert pieces[0].words() == ["hi", "."]
        assert pieces[1].words() == ["bye", "."]

    def test_iter_empty_reader(self):
        """Iterating an empty reader yields nothing."""
        empty = CHAT.from_strs([])
        assert list(empty) == []

    def test_iter_does_not_mutate(self):
        """Iterating leaves the reader's file count unchanged."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        _ = list(source)
        assert source.n_files == 2

    def test_iter_for_loop(self):
        """A plain ``for`` loop visits every file once."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        visited = 0
        for piece in source:
            assert isinstance(piece, CHAT)
            visited += 1
        assert visited == 2

    def test_getitem_positive_index(self):
        """An integer index returns a one-file ``CHAT``."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        picked = source[0]
        assert isinstance(picked, CHAT)
        assert picked.n_files == 1
        assert picked.file_paths == ["a"]

    def test_getitem_second_index(self):
        """Index 1 selects the second file."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        assert source[1].file_paths == ["b"]

    def test_getitem_negative_index(self):
        """Negative indices count from the end."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        assert source[-1].file_paths == ["b"]
        assert source[-2].file_paths == ["a"]

    def test_getitem_out_of_range(self):
        """An index past the end raises ``IndexError``."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        with pytest.raises(IndexError):
            source[5]

    def test_getitem_negative_out_of_range(self):
        """A negative index before the start raises ``IndexError``."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        with pytest.raises(IndexError):
            source[-3]

    def test_getitem_preserves_data(self):
        """Indexed readers expose the words of the selected file."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        assert source[0].words() == ["hi", "."]
        assert source[1].words() == ["bye", "."]

    def test_getitem_does_not_mutate(self):
        """Indexing leaves the reader's file count unchanged."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        _ = source[0]
        assert source.n_files == 2

    def test_getitem_empty_reader(self):
        """Indexing an empty reader raises ``IndexError``."""
        empty = CHAT.from_strs([])
        with pytest.raises(IndexError):
            empty[0]

    def test_getitem_slice_basic(self):
        """``[:2]`` returns a ``CHAT`` with the first two files."""
        source = CHAT.from_strs(THREE_FILES, ids=["a", "b", "c"])
        sliced = source[:2]
        assert isinstance(sliced, CHAT)
        assert sliced.n_files == 2
        assert sliced.file_paths == ["a", "b"]

    def test_getitem_slice_with_start(self):
        """``[1:]`` drops the first file."""
        source = CHAT.from_strs(THREE_FILES, ids=["a", "b", "c"])
        sliced = source[1:]
        assert sliced.n_files == 2
        assert sliced.file_paths == ["b", "c"]

    def test_getitem_slice_with_start_and_stop(self):
        """``[1:3]`` selects the middle two of four files."""
        source = CHAT.from_strs(FOUR_FILES, ids=["a", "b", "c", "d"])
        sliced = source[1:3]
        assert sliced.n_files == 2
        assert sliced.file_paths == ["b", "c"]

    def test_getitem_slice_with_step(self):
        """``[::2]`` selects every other file."""
        source = CHAT.from_strs(FOUR_FILES, ids=["a", "b", "c", "d"])
        sliced = source[::2]
        assert sliced.n_files == 2
        assert sliced.file_paths == ["a", "c"]

    def test_getitem_slice_negative(self):
        """``[-2:]`` selects the last two files."""
        source = CHAT.from_strs(THREE_FILES, ids=["a", "b", "c"])
        sliced = source[-2:]
        assert sliced.n_files == 2
        assert sliced.file_paths == ["b", "c"]

    def test_getitem_slice_negative_step(self):
        """``[::-1]`` returns the files in reverse order."""
        source = CHAT.from_strs(THREE_FILES, ids=["a", "b", "c"])
        sliced = source[::-1]
        assert sliced.n_files == 3
        assert sliced.file_paths == ["c", "b", "a"]

    def test_getitem_slice_empty_result(self):
        """An out-of-range slice returns an empty ``CHAT``, not an error."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        sliced = source[5:10]
        assert isinstance(sliced, CHAT)
        assert sliced.n_files == 0

    def test_getitem_slice_full(self):
        """``[:]`` returns all files in order."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        sliced = source[:]
        assert sliced.n_files == 2
        assert sliced.file_paths == ["a", "b"]

    def test_getitem_slice_does_not_mutate(self):
        """Slicing leaves the reader's file count unchanged."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        _ = source[:1]
        assert source.n_files == 2

    def test_getitem_slice_preserves_data(self):
        """A sliced reader exposes the words of the selected files."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        sliced = source[:1]
        assert sliced.words() == ["hi", "."]

    def test_getitem_invalid_type(self):
        """A string key raises ``TypeError``."""
        source = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        with pytest.raises(TypeError):
            source["key"]
class TestToStrs:
    """Tests for serializing a reader back to CHAT strings."""

    def test_to_strs_basic(self):
        """Serialized output contains the headers and the tier prefixes."""
        serialized = CHAT.from_strs([BASIC_CHAT]).to_strs()
        assert len(serialized) == 1
        text = serialized[0]
        assert "@UTF8" in text
        assert "@Begin" in text
        assert "@End" in text
        assert "*CHI:" in text
        assert "%mor:" in text

    def test_to_strs_multiple_files(self):
        """One string is produced per file."""
        serialized = CHAT.from_strs(TWO_FILES, ids=["a", "b"]).to_strs()
        assert len(serialized) == 2

    def test_to_strs_round_trip(self):
        """Re-parsing the output gives the same words and utterance count."""
        original = CHAT.from_strs([BASIC_CHAT])
        reparsed = CHAT.from_strs(original.to_strs())
        assert original.words() == reparsed.words()
        assert len(original.utterances()) == len(reparsed.utterances())

    def test_to_strs_empty(self):
        """An empty reader serializes to an empty list."""
        empty = CHAT.from_strs([])
        assert empty.to_strs() == []

    def test_to_strs_round_trip_real_files(self, testchat_good_dir):
        """The word list of real fixture files survives a round trip."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            original = CHAT.from_dir(str(testchat_good_dir), strict=False)
            serialized = original.to_strs()
            reparsed = CHAT.from_strs(serialized, strict=False)
        assert original.words() == reparsed.words()
class TestToFiles:
    """Tests for writing a reader to ``.cha`` files with ``to_files``."""

    def test_to_files_single_file(self, tmp_path):
        """Writing to a ``str`` directory and reading back keeps the words."""
        original = CHAT.from_strs([BASIC_CHAT])
        target_dir = str(tmp_path / "output")
        original.to_files(target_dir)
        reloaded = CHAT.from_dir(target_dir)
        assert reloaded.words() == original.words()

    def test_to_files_with_path(self, tmp_path):
        """The output directory may be a path object instead of ``str``."""
        original = CHAT.from_strs([BASIC_CHAT])
        target_dir = tmp_path / "output"
        original.to_files(target_dir)
        reloaded = CHAT.from_dir(target_dir)
        assert reloaded.words() == original.words()

    def test_to_files_directory(self, tmp_path):
        """Two in-memory files become two files on disk."""
        original = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        target_dir = str(tmp_path / "output")
        original.to_files(target_dir)
        reloaded = CHAT.from_dir(target_dir)
        assert reloaded.n_files == 2

    def test_to_files_custom_filenames(self, tmp_path):
        """Explicit ``filenames`` are used as the output file names."""
        import os

        original = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        target_dir = str(tmp_path / "output")
        original.to_files(target_dir, filenames=["file1.cha", "file2.cha"])
        for written in ("file1.cha", "file2.cha"):
            assert os.path.exists(os.path.join(target_dir, written))

    def test_to_files_default_filenames_from_ids(self, tmp_path):
        """Without ``filenames``, names are ``<id>.cha``."""
        import os

        original = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        target_dir = str(tmp_path / "output")
        original.to_files(target_dir)
        for written in ("a.cha", "b.cha"):
            assert os.path.exists(os.path.join(target_dir, written))

    def test_to_files_default_filenames_numbered(self, tmp_path):
        """With auto-generated ids, names are zero-padded sequence numbers."""
        import os

        original = CHAT.from_strs(TWO_FILES)
        target_dir = str(tmp_path / "output")
        original.to_files(target_dir)
        for written in ("0001.cha", "0002.cha"):
            assert os.path.exists(os.path.join(target_dir, written))

    def test_to_files_filename_count_mismatch_raises(self, tmp_path):
        """Fewer ``filenames`` than files raises ``ValueError``."""
        original = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        target_dir = str(tmp_path / "output")
        with pytest.raises(ValueError):
            original.to_files(target_dir, filenames=["only_one.cha"])

    def test_to_files_preserves_filenames(self, testchat_good_dir, tmp_path):
        """Files loaded from disk are written under their original stem."""
        import os

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            original = CHAT.from_dir(str(testchat_good_dir), strict=False)
        target_dir = str(tmp_path / "output")
        original.to_files(target_dir)
        for source_path in original.file_paths:
            stem, _ext = os.path.splitext(os.path.basename(source_path))
            expected = stem + ".cha"
            # The file name doubles as the assertion message on failure.
            assert os.path.exists(os.path.join(target_dir, expected)), expected

    def test_to_files_round_trip_real_files(self, testchat_good_dir, tmp_path):
        """The word list of real fixture files survives write + reload."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            original = CHAT.from_dir(str(testchat_good_dir), strict=False)
            target_dir = str(tmp_path / "output")
            original.to_files(target_dir)
            reloaded = CHAT.from_dir(target_dir, strict=False)
        assert original.words() == reloaded.words()
class TestToElan:
    """Exporting CHAT data to ELAN objects, ELAN strings, and ``.eaf`` files."""

    def test_to_elan_returns_elan_object(self):
        """``to_elan()`` hands back an ``ELAN`` object holding one file."""
        from rustling.elan import ELAN

        converted = CHAT.from_strs([BASIC_CHAT]).to_elan()
        assert isinstance(converted, ELAN)
        assert converted.n_files == 1

    def test_to_elan_tiers(self):
        """Both speakers and their mor/gra dependents show up as tier IDs."""
        converted = CHAT.from_strs([BASIC_CHAT]).to_elan()
        tier_ids = list(converted.tiers()[0].keys())
        for wanted in ("CHI", "MOT", "mor@CHI", "gra@CHI", "mor@MOT", "gra@MOT"):
            assert wanted in tier_ids

    def test_to_elan_strs(self):
        """``to_elan_strs()`` returns one ELAN document string per file."""
        documents = CHAT.from_strs([BASIC_CHAT]).to_elan_strs()
        assert len(documents) == 1
        document = documents[0]
        assert "ANNOTATION_DOCUMENT" in document
        assert "TIER_ID" in document

    def test_to_elan_dep_tier_structure(self):
        """The mor tier hangs off the speaker tier and carries the %mor text."""
        tiers = CHAT.from_strs([BASIC_CHAT]).to_elan().tiers()[0]
        speaker_tier = tiers["CHI"]
        mor_tier = tiers["mor@CHI"]

        # Parent/child linkage between the two tiers.
        assert speaker_tier.parent_id is None
        assert mor_tier.parent_id == "CHI"

        # One annotation each, holding the raw tier text.
        assert len(speaker_tier.annotations) == 1
        assert speaker_tier.annotations[0].value == "I want cookie ."
        assert len(mor_tier.annotations) == 1
        assert mor_tier.annotations[0].value == "pro|I v|want n|cookie ."

    def test_to_elan_multiple_files(self):
        """Two CHAT files convert to two ELAN files."""
        converted = CHAT.from_strs(TWO_FILES, ids=["a", "b"]).to_elan()
        assert converted.n_files == 2

    def test_to_elan_files_directory(self, tmp_path):
        """Files written by ``to_elan_files`` can be loaded with ``ELAN.from_dir``."""
        from rustling.elan import ELAN

        target_dir = str(tmp_path / "output")
        CHAT.from_strs(TWO_FILES, ids=["a", "b"]).to_elan_files(target_dir)
        assert ELAN.from_dir(target_dir).n_files == 2

    def test_to_elan_files_custom_filenames(self, tmp_path):
        """Explicit ``filenames`` are used verbatim for the output files."""
        target_dir = tmp_path / "output"
        corpus = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        corpus.to_elan_files(str(target_dir), filenames=["a.eaf", "b.eaf"])
        assert (target_dir / "a.eaf").exists()
        assert (target_dir / "b.eaf").exists()

    def test_to_elan_files_default_filenames_from_ids(self, tmp_path):
        """Without ``filenames``, the ids given at load time name the files."""
        target_dir = tmp_path / "output"
        corpus = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        corpus.to_elan_files(str(target_dir))
        assert (target_dir / "a.eaf").exists()
        assert (target_dir / "b.eaf").exists()

    def test_to_elan_files_default_filenames_numbered(self, tmp_path):
        """Without ids or ``filenames``, outputs are numbered 0001, 0002, ..."""
        target_dir = tmp_path / "output"
        corpus = CHAT.from_strs(TWO_FILES)
        corpus.to_elan_files(str(target_dir))
        assert (target_dir / "0001.eaf").exists()
        assert (target_dir / "0002.eaf").exists()

    def test_to_elan_files_preserves_filenames(self, testchat_good_dir, tmp_path):
        """Files read from disk are exported under the same stem + ``.eaf``."""
        import os

        with warnings.catch_warnings():
            # Keep warnings raised while loading out of the test output.
            warnings.simplefilter("ignore")
            corpus = CHAT.from_dir(str(testchat_good_dir), strict=False)

        target_dir = str(tmp_path / "output")
        corpus.to_elan_files(target_dir)

        for source_path in corpus.file_paths:
            stem, _extension = os.path.splitext(os.path.basename(source_path))
            eaf_name = stem + ".eaf"
            assert os.path.exists(os.path.join(target_dir, eaf_name)), eaf_name

    def test_to_elan_files_round_trip_real_files(self, testchat_good_dir, tmp_path):
        """Exporting the real fixtures yields as many ELAN files as CHAT files."""
        from rustling.elan import ELAN

        with warnings.catch_warnings():
            # Keep warnings raised while loading out of the test output.
            warnings.simplefilter("ignore")
            corpus = CHAT.from_dir(str(testchat_good_dir), strict=False)

        target_dir = str(tmp_path / "output")
        corpus.to_elan_files(target_dir)

        exported = ELAN.from_dir(target_dir)
        assert exported.n_files == corpus.n_files
class TestPopAndStitch:
    """Removing a file with ``pop()`` and putting it back with ``append()``."""

    def test_pop_and_append(self):
        """``pop()`` drops the file count by one; ``append()`` restores it."""
        corpus = CHAT.from_strs(TWO_FILES, ids=["a", "b"])
        starting_count = corpus.n_files

        removed = corpus.pop()
        assert corpus.n_files == starting_count - 1

        corpus.append(removed)
        assert corpus.n_files == starting_count

    def test_pop_and_append_real_files(self, testchat_good_dir):
        """Pop followed by append leaves file count and words unchanged."""
        with warnings.catch_warnings():
            # Keep warnings raised while loading out of the test output.
            warnings.simplefilter("ignore")
            corpus = CHAT.from_dir(str(testchat_good_dir), strict=False)

        starting_count = corpus.n_files
        words_before = corpus.words()

        corpus.append(corpus.pop())

        assert corpus.n_files == starting_count
        assert corpus.words() == words_before
# CHAT transcript carrying a range of file-level headers, followed by three
# utterances with an "@New Episode" and an "@Comment" header between them.
HEADER_CHAT = "".join(
    (
        "@UTF8\n",
        "@PID:\t11312/c-00044068-1\n",
        "@Begin\n",
        "@Languages:\teng, zho\n",
        "@Participants:\tCHI Child Target_Child, MOT Mary Mother\n",
        "@ID:\teng|brown|CHI|2;10.05|male|||Target_Child|||\n",
        "@ID:\teng|brown|MOT||female|||Mother|||\n",
        "@Date:\t25-JAN-1983\n",
        "@Location:\tBoston, MA, USA\n",
        "@Media:\tabe88, audio, missing\n",
        "@Situation:\tPlaying with toys\n",
        "@Comment:\tFirst recording\n",
        "@Comment:\tfor Yiddish/Hebrew glossary see file 4504\n",
        "*CHI:\thello .\n",
        "@New Episode\n",
        "*MOT:\thi .\n",
        "@Comment:\tChild laughs\n",
        "*CHI:\tcookie .\n",
        "@End\n",
    )
)
class TestHeaders:
    """File-level header fields exposed on ``Headers`` objects."""

    @staticmethod
    def _headers_for(chat_str):
        """Parse ``chat_str`` as a single file and return that file's headers."""
        return CHAT.from_strs([chat_str]).headers()[0]

    def test_headers_returns_list(self):
        """``headers()`` is a list with one ``Headers`` entry per file."""
        all_headers = CHAT.from_strs([HEADER_CHAT]).headers()
        assert isinstance(all_headers, list)
        assert len(all_headers) == 1
        assert isinstance(all_headers[0], Headers)

    def test_pid(self):
        """The @PID value is exposed as ``pid``."""
        assert self._headers_for(HEADER_CHAT).pid == "11312/c-00044068-1"

    def test_languages(self):
        """@Languages is split into a list of codes."""
        assert self._headers_for(HEADER_CHAT).languages == ["eng", "zho"]

    def test_languages_method(self):
        """``CHAT.languages()`` returns the codes as a flat list."""
        corpus = CHAT.from_strs([HEADER_CHAT])
        assert corpus.languages() == ["eng", "zho"]

    def test_languages_by_file(self):
        """``by_file=True`` nests the language codes per file."""
        corpus = CHAT.from_strs([HEADER_CHAT])
        per_file = corpus.languages(by_file=True)
        assert per_file == [["eng", "zho"]]

    def test_date(self):
        """A ``DD-MON-YYYY`` @Date becomes a ``datetime.date``."""
        parsed = self._headers_for(HEADER_CHAT)
        assert parsed.date == datetime.date(1983, 1, 25)

    def test_date_iso(self):
        """An ISO ``YYYY-MM-DD`` @Date is accepted as well."""
        iso_chat = HEADER_CHAT.replace("25-JAN-1983", "1983-01-25")
        parsed = self._headers_for(iso_chat)
        assert parsed.date == datetime.date(1983, 1, 25)

    def test_date_unparseable(self):
        """A @Date that cannot be parsed yields ``None``."""
        bad_date_chat = HEADER_CHAT.replace("25-JAN-1983", "not-a-date")
        parsed = self._headers_for(bad_date_chat)
        assert parsed.date is None

    def test_location(self):
        """The @Location text is kept verbatim."""
        assert self._headers_for(HEADER_CHAT).location == "Boston, MA, USA"

    def test_situation(self):
        """The @Situation text is kept verbatim."""
        assert self._headers_for(HEADER_CHAT).situation == "Playing with toys"

    def test_media(self):
        """@Media is a dict with filename, format, and status entries."""
        parsed = self._headers_for(HEADER_CHAT)
        assert isinstance(parsed.media, dict)
        expected_entries = (
            ("filename", "abe88"),
            ("format", "audio"),
            ("status", "missing"),
        )
        for key, value in expected_entries:
            assert parsed.media[key] == value

    def test_comments(self):
        """Only the two @Comment lines before the first utterance are listed."""
        parsed = self._headers_for(HEADER_CHAT)
        assert isinstance(parsed.comments, list)
        assert len(parsed.comments) == 2
        assert parsed.comments[0] == "First recording"
        assert parsed.comments[1] == "for Yiddish/Hebrew glossary see file 4504"

    def test_comments_empty(self):
        """With no @Comment header at all, ``comments`` is ``None``."""
        no_comment_chat = (
            "@UTF8\n@Begin\n@Participants:\tCHI Child Target_Child\n"
            "*CHI:\thello .\n@End\n"
        )
        parsed = self._headers_for(no_comment_chat)
        assert parsed.comments is None

    def test_other_empty_by_default(self):
        """``other`` is a dict (only the type is checked here, not emptiness)."""
        parsed = self._headers_for(HEADER_CHAT)
        assert isinstance(parsed.other, dict)
class TestParticipants:
    """Participant records assembled from @Participants, @ID, and related headers."""

    @staticmethod
    def _participant(index):
        """Return the ``index``-th participant of ``HEADER_CHAT``."""
        return CHAT.from_strs([HEADER_CHAT]).headers()[0].participants[index]

    def test_participants_count(self):
        """Both participants declared in the header are present."""
        file_headers = CHAT.from_strs([HEADER_CHAT]).headers()[0]
        assert len(file_headers.participants) == 2

    def test_participant_fields(self):
        """Code, name, and role come from the @Participants line."""
        child = self._participant(0)
        assert isinstance(child, Participant)
        assert child.code == "CHI"
        assert child.name == "Child"
        assert child.role == "Target_Child"

    def test_participant_id_fields(self):
        """Language, corpus, and sex come from the matching @ID line."""
        child = self._participant(0)
        assert child.language == "eng"
        assert child.corpus == "brown"
        assert child.sex == "male"

    def test_participant_age(self):
        """The age ``2;10.05`` is parsed into years, months, and days."""
        child = self._participant(0)
        assert isinstance(child.age, Age)
        assert child.age.years == 2
        assert child.age.months == 10
        assert child.age.days == 5

    def test_age_in_months(self):
        """``in_months()`` expresses the age as a fractional month count."""
        child = self._participant(0)
        assert child.age is not None
        total_months = child.age.in_months()
        assert total_months == pytest.approx(34.1667, abs=0.01)

    def test_participant_no_age(self):
        """An empty age field in @ID gives ``age is None``."""
        mother = self._participant(1)
        assert mother.code == "MOT"
        assert mother.age is None
        assert mother.sex == "female"

    def test_participants_method(self):
        """``CHAT.participants()`` returns the participants as a flat list."""
        everyone = CHAT.from_strs([HEADER_CHAT]).participants()
        assert len(everyone) == 2
        assert [person.code for person in everyone] == ["CHI", "MOT"]

    def test_participants_by_file(self):
        """``by_file=True`` nests the participants per file."""
        per_file = CHAT.from_strs([HEADER_CHAT]).participants(by_file=True)
        assert len(per_file) == 1
        assert isinstance(per_file[0], list)
        assert len(per_file[0]) == 2

    def test_participant_specific_headers(self):
        """"@Birth of", "@Birthplace of", and "@L1 of" attach to the participant."""
        chat_with_birth_info = (
            "@UTF8\n"
            "@Begin\n"
            "@Languages:\teng\n"
            "@Participants:\tCHI Ross Target_Child\n"
            "@ID:\teng|macwhinney|CHI|2;06.||||Target_Child|||\n"
            "@Birth of CHI:\t28-JUN-2001\n"
            "@Birthplace of CHI:\tPittsburgh, PA\n"
            "@L1 of CHI:\teng\n"
            "*CHI:\thello .\n"
            "@End\n"
        )
        corpus = CHAT.from_strs([chat_with_birth_info])
        child = corpus.headers()[0].participants[0]
        assert child.birth == "28-JUN-2001"
        assert child.birthplace == "Pittsburgh, PA"
        assert child.l1 == "eng"
class TestHeadersMultipleFiles:
    """Header accessors when the reader holds more than one file."""

    # An English file with speaker CHI, then a Chinese file with speaker MOT.
    _FILES = (
        "@UTF8\n@Begin\n@Languages:\teng\n"
        "@Participants:\tCHI Child Target_Child\n"
        "*CHI:\thi .\n@End\n",
        "@UTF8\n@Begin\n@Languages:\tzho\n"
        "@Participants:\tMOT Mother Mother\n"
        "*MOT:\tbye .\n@End\n",
    )

    def _reader(self):
        """Build a fresh reader over the two fixture files."""
        return CHAT.from_strs(list(self._FILES))

    def test_headers_multiple_files(self):
        """``headers()`` has one entry per file, in input order."""
        per_file = self._reader().headers()
        assert len(per_file) == 2
        assert per_file[0].languages == ["eng"]
        assert per_file[1].languages == ["zho"]

    def test_languages_flat_multiple_files(self):
        """``languages()`` flattens the languages across files."""
        assert self._reader().languages() == ["eng", "zho"]

    def test_participants_flat_multiple_files(self):
        """``participants()`` flattens the participants across files."""
        everyone = self._reader().participants()
        assert len(everyone) == 2
        assert [person.code for person in everyone] == ["CHI", "MOT"]
class TestHeadersRealFiles:
    """Header accessors exercised on the on-disk fixture directory."""

    @staticmethod
    def _load_quietly(directory):
        """Read ``directory`` non-strictly with warnings suppressed during the load."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return CHAT.from_dir(str(directory), strict=False)

    def test_headers_from_dir(self, testchat_good_dir):
        """There is one ``Headers`` object per file read."""
        corpus = self._load_quietly(testchat_good_dir)
        per_file = corpus.headers()
        assert len(per_file) == corpus.n_files
        for file_headers in per_file:
            assert isinstance(file_headers, Headers)

    def test_languages_from_dir(self, testchat_good_dir):
        """``languages(by_file=True)`` gives one list per file."""
        corpus = self._load_quietly(testchat_good_dir)
        per_file = corpus.languages(by_file=True)
        assert len(per_file) == corpus.n_files
        for codes in per_file:
            assert isinstance(codes, list)

    def test_participants_from_dir(self, testchat_good_dir):
        """Each file yields a list of ``Participant`` objects with a code set."""
        corpus = self._load_quietly(testchat_good_dir)
        per_file = corpus.participants(by_file=True)
        assert len(per_file) == corpus.n_files
        for people in per_file:
            assert isinstance(people, list)
            for person in people:
                assert isinstance(person, Participant)
                assert person.code
# The main tier has three words but the %mor tier only two items, so the two
# tiers cannot be aligned.
MISALIGNED_CHAT = "".join(
    (
        "@UTF8\n",
        "@Begin\n",
        "@Participants:\tCHI Child\n",
        "*CHI:\tI want cookie .\n",
        "%mor:\tpro|I v|want .\n",
        "@End\n",
    )
)
# Same utterance as above in this file's misaligned fixture, but with one %mor
# item for every word on the main tier.
WELL_FORMED_CHAT = "".join(
    (
        "@UTF8\n",
        "@Begin\n",
        "@Participants:\tCHI Child\n",
        "*CHI:\tI want cookie .\n",
        "%mor:\tpro|I v|want n|cookie .\n",
        "@End\n",
    )
)
class TestStrictMode:
    """How ``strict`` controls the reaction to main-tier/%mor misalignment."""

    def test_strict_true_raises(self):
        """``strict=True`` turns a misalignment into ``ValueError``."""
        with pytest.raises(ValueError, match="misalignment"):
            CHAT.from_strs([MISALIGNED_CHAT], strict=True)

    def test_strict_default_raises(self):
        """Omitting ``strict`` behaves like ``strict=True``."""
        with pytest.raises(ValueError, match="misalignment"):
            CHAT.from_strs([MISALIGNED_CHAT])

    def test_strict_true_error_message_details(self):
        """The error names the speaker, the tier, and the ``strict=False`` escape."""
        with pytest.raises(ValueError, match="CHI") as raised:
            CHAT.from_strs([MISALIGNED_CHAT], strict=True)
        message = str(raised.value)
        # NOTE: "mor" alone already covers the "%mor" case; the first operand
        # is kept as written.
        assert "%mor" in message or "mor" in message
        assert "strict=False" in message

    def test_strict_false_warns(self):
        """``strict=False`` emits exactly one ``UserWarning`` instead of raising."""
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            CHAT.from_strs([MISALIGNED_CHAT], strict=False)
            assert len(caught) == 1
            only_warning = caught[0]
            assert issubclass(only_warning.category, UserWarning)
            assert "misalignment" in str(only_warning.message)

    def test_strict_false_empty_tokens(self):
        """A misaligned utterance is kept, with an empty token list."""
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("always")
            corpus = CHAT.from_strs([MISALIGNED_CHAT], strict=False)
        parsed = corpus.utterances()
        assert len(parsed) == 1
        assert parsed[0].tokens == []

    def test_strict_false_preserves_tiers(self):
        """The raw tier text is still available for a misaligned utterance."""
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("always")
            corpus = CHAT.from_strs([MISALIGNED_CHAT], strict=False)
        raw_tiers = corpus.utterances()[0].tiers
        assert raw_tiers is not None
        assert "CHI" in raw_tiers
        assert "%mor" in raw_tiers
        assert "cookie" in raw_tiers["CHI"]

    def test_no_misalignment_strict_true_ok(self):
        """Well-aligned input parses under ``strict=True`` with four tokens."""
        corpus = CHAT.from_strs([WELL_FORMED_CHAT], strict=True)
        parsed = corpus.utterances()
        assert len(parsed) == 1
        first = parsed[0]
        assert first.tokens is not None
        assert len(first.tokens) == 4

    def test_strict_parallel_false(self):
        """Both strict behaviours also hold when ``parallel=False``."""
        with pytest.raises(ValueError, match="misalignment"):
            CHAT.from_strs([MISALIGNED_CHAT], parallel=False, strict=True)

        with warnings.catch_warnings(record=True):
            warnings.simplefilter("always")
            corpus = CHAT.from_strs([MISALIGNED_CHAT], parallel=False, strict=False)
        first = corpus.utterances()[0]
        assert first.tokens == []
        assert first.tiers is not None
        assert "%mor" in first.tiers
class TestDevelopmentalMeasures:
    """Per-file results of ``mlum()``, ``mlu()``, ``mluw()``, and ``ttr()``."""

    # Two CHI utterances: a three-word one followed by a one-word one.
    _LONG_THEN_SHORT = (
        "@UTF8\n@Begin\n"
        "@Participants:\tCHI Child\n"
        "*CHI:\tI want cookie .\n"
        "%mor:\tpro|I v|want n|cookie .\n"
        "*CHI:\tno .\n"
        "%mor:\tco|no .\n"
        "@End\n"
    )

    @staticmethod
    def _basic():
        """Reader over the single ``BASIC_CHAT`` file."""
        return CHAT.from_strs([BASIC_CHAT])

    @staticmethod
    def _basic_twice():
        """Reader over two copies of ``BASIC_CHAT``."""
        return CHAT.from_strs([BASIC_CHAT, BASIC_CHAT], ids=["a", "b"])

    @staticmethod
    def _no_files():
        """Reader holding no files at all."""
        return CHAT.from_strs([])

    def test_mlum_basic(self):
        """Default ``mlum()`` on the basic file is 3.0."""
        assert self._basic().mlum() == [3.0]

    def test_mlu_aliases_mlum(self):
        """``mlu()`` gives the same result as ``mlum()``."""
        corpus = self._basic()
        assert corpus.mlu() == corpus.mlum()

    def test_mluw_basic(self):
        """Default ``mluw()`` on the basic file is 3.0."""
        assert self._basic().mluw() == [3.0]

    def test_ttr_basic(self):
        """Default ``ttr()`` on the basic file is 1.0."""
        assert self._basic().ttr() == [1.0]

    def test_mlum_multiple_files(self):
        """``mlum()`` returns one value per file."""
        values = self._basic_twice().mlum()
        assert len(values) == 2
        assert values == [3.0, 3.0]

    def test_mluw_multiple_files(self):
        """``mluw()`` returns one value per file."""
        values = self._basic_twice().mluw()
        assert len(values) == 2
        assert values == [3.0, 3.0]

    def test_ttr_multiple_files(self):
        """``ttr()`` returns one value per file."""
        values = self._basic_twice().ttr()
        assert len(values) == 2
        assert values == [1.0, 1.0]

    def test_mlum_empty(self):
        """No files means an empty ``mlum()`` result."""
        assert self._no_files().mlum() == []

    def test_mluw_empty(self):
        """No files means an empty ``mluw()`` result."""
        assert self._no_files().mluw() == []

    def test_ttr_empty(self):
        """No files means an empty ``ttr()`` result."""
        assert self._no_files().ttr() == []

    def test_ttr_repeated_words(self):
        """Three tokens of a single word type give a TTR of 1/3."""
        repeated_word_chat = (
            "@UTF8\n@Begin\n"
            "@Participants:\tCHI Child\n"
            "*CHI:\tno no no .\n"
            "%mor:\tco|no co|no co|no .\n"
            "@End\n"
        )
        values = CHAT.from_strs([repeated_word_chat]).ttr()
        assert values == pytest.approx([1.0 / 3.0])

    def test_mlum_participant(self):
        """``participant="MOT"`` restricts ``mlum()`` to the mother's utterance."""
        assert self._basic().mlum(participant="MOT") == [1.0]

    def test_mluw_participant(self):
        """``participant="MOT"`` restricts ``mluw()`` to the mother's utterance."""
        assert self._basic().mluw(participant="MOT") == [1.0]

    def test_mlum_n_none(self):
        """``n=None`` is accepted by ``mlum()``."""
        assert self._basic().mlum(n=None) == [3.0]

    def test_mluw_n_none(self):
        """``n=None`` is accepted by ``mluw()``."""
        assert self._basic().mluw(n=None) == [3.0]

    def test_mlum_n_truncation(self):
        """``n=1`` limits ``mlum()`` to the first utterance."""
        corpus = CHAT.from_strs([self._LONG_THEN_SHORT])
        assert corpus.mlum(n=None) == [2.0]
        assert corpus.mlum(n=1) == [3.0]

    def test_mluw_n_truncation(self):
        """``n=1`` limits ``mluw()`` to the first utterance."""
        corpus = CHAT.from_strs([self._LONG_THEN_SHORT])
        assert corpus.mluw(n=None) == [2.0]
        assert corpus.mluw(n=1) == [3.0]

    def test_ttr_participant(self):
        """``participant="MOT"`` restricts ``ttr()`` to the mother's words."""
        assert self._basic().ttr(participant="MOT") == [1.0]

    def test_ttr_n_none(self):
        """``n=None`` is accepted by ``ttr()``."""
        assert self._basic().ttr(n=None) == [1.0]

    def test_ttr_n_truncation(self):
        """``n=3`` limits ``ttr()`` to the first three words."""
        two_types_chat = (
            "@UTF8\n@Begin\n"
            "@Participants:\tCHI Child\n"
            "*CHI:\tno no no yes yes .\n"
            "%mor:\tco|no co|no co|no co|yes co|yes .\n"
            "@End\n"
        )
        corpus = CHAT.from_strs([two_types_chat])
        assert corpus.ttr(n=None) == pytest.approx([2.0 / 5.0])
        assert corpus.ttr(n=3) == pytest.approx([1.0 / 3.0])

    def test_measures_with_filter(self):
        """The measures can be computed on a participant-filtered reader."""
        child_only = self._basic().filter(participants="CHI")
        assert child_only.mlum() == [3.0]
        assert child_only.mluw() == [3.0]
        assert child_only.ttr() == [1.0]

    def test_measures_return_type(self):
        """Each measure returns a list; ``mlum()`` values are floats."""
        corpus = self._basic()
        assert isinstance(corpus.mlum(), list)
        assert isinstance(corpus.mluw(), list)
        assert isinstance(corpus.ttr(), list)
        assert all(isinstance(value, float) for value in corpus.mlum())
class TestAges:
    """``CHAT.ages()``: one entry per file."""

    def test_ages_basic(self):
        """The age from the header fixture comes back as ``Age`` 2;10.05."""
        per_file = CHAT.from_strs([HEADER_CHAT]).ages()
        assert len(per_file) == 1
        child_age = per_file[0]
        assert isinstance(child_age, Age)
        assert child_age.years == 2
        assert child_age.months == 10
        assert child_age.days == 5

    def test_ages_no_chi(self):
        """A file whose only participant is MOT yields ``None``."""
        mother_only_chat = (
            "@UTF8\n@Begin\n" "@Participants:\tMOT Mother\n" "*MOT:\thello .\n" "@End\n"
        )
        corpus = CHAT.from_strs([mother_only_chat])
        assert corpus.ages() == [None]

    def test_ages_empty(self):
        """No files means an empty result."""
        assert CHAT.from_strs([]).ages() == []

    def test_ages_multiple_files(self):
        """Two files give two ages."""
        corpus = CHAT.from_strs([HEADER_CHAT, HEADER_CHAT], ids=["a", "b"])
        per_file = corpus.ages()
        assert len(per_file) == 2
        for file_age in per_file:
            assert file_age is not None and file_age.years == 2
class TestWordNgrams:
    """``CHAT.word_ngrams(n)`` counting."""

    def test_word_ngrams_unigrams(self):
        """Every word of the basic file is counted once as a unigram."""
        unigrams = CHAT.from_strs([BASIC_CHAT]).word_ngrams(1)
        for word in ("I", "want", "cookie", "no"):
            assert unigrams.get([word]) == 1

    def test_word_ngrams_bigrams(self):
        """Adjacent word pairs within an utterance are counted."""
        bigrams = CHAT.from_strs([BASIC_CHAT]).word_ngrams(2)
        assert bigrams.get(["I", "want"]) == 1
        assert bigrams.get(["want", "cookie"]) == 1

    def test_word_ngrams_returns_ngrams(self):
        """The result is an ``Ngrams`` object that remembers its order."""
        from rustling.ngram import Ngrams

        unigrams = CHAT.from_strs([BASIC_CHAT]).word_ngrams(1)
        assert isinstance(unigrams, Ngrams)
        assert unigrams.n == 1

    def test_word_ngrams_no_cross_utterance(self):
        """No bigram spans the end of one utterance and the start of the next."""
        bigrams = CHAT.from_strs([BASIC_CHAT]).word_ngrams(2)
        assert bigrams.get([".", "no"]) == 0

    def test_word_ngrams_empty_reader(self):
        """A reader with no files produces an empty counter."""
        unigrams = CHAT.from_strs([]).word_ngrams(1)
        assert len(unigrams) == 0
        assert unigrams.total() == 0

    def test_word_ngrams_multiple_files(self):
        """Counts are pooled across files."""
        chi_file = "@UTF8\n@Begin\n*CHI:\thi .\n@End\n"
        mot_file = "@UTF8\n@Begin\n*MOT:\thi .\n@End\n"
        corpus = CHAT.from_strs([chi_file, mot_file], ids=["a", "b"])
        unigrams = corpus.word_ngrams(1)
        assert unigrams.get(["hi"]) == 2
class TestHeadTail:
    """``head()`` / ``tail()`` and the container of utterances they return."""

    @staticmethod
    def _basic():
        """Reader over the single ``BASIC_CHAT`` file."""
        return CHAT.from_strs([BASIC_CHAT])

    def test_head_returns_utterances(self):
        """``head()`` returns an object whose class is named ``Utterances``."""
        preview = self._basic().head()
        assert type(preview).__name__ == "Utterances"

    def test_head_repr_displays_formatted(self):
        """The repr is multi-line text showing both speakers."""
        rendered = repr(self._basic().head())
        assert "*CHI:" in rendered
        assert "*MOT:" in rendered
        assert "\n" in rendered

    def test_head_str_matches_repr(self):
        """``str()`` and ``repr()`` give the same text."""
        preview = self._basic().head()
        assert str(preview) == repr(preview)

    def test_head_len(self):
        """``len()`` reflects how many utterances were taken."""
        corpus = self._basic()
        assert len(corpus.head()) == 2
        assert len(corpus.head(1)) == 1

    def test_head_getitem(self):
        """Positive and negative indexing return ``Utterance`` objects."""
        preview = self._basic().head()
        assert isinstance(preview[0], Utterance)
        assert preview[0].participant == "CHI"
        assert preview[-1].participant == "MOT"

    def test_head_getitem_out_of_range(self):
        """Indexing past the end raises ``IndexError``."""
        preview = self._basic().head(1)
        with pytest.raises(IndexError):
            preview[5]

    def test_head_iter(self):
        """Iteration yields the utterances in order."""
        preview = self._basic().head()
        speakers = [utterance.participant for utterance in preview]
        assert speakers == ["CHI", "MOT"]

    def test_head_n_1(self):
        """``head(1)`` shows only the first utterance."""
        rendered = repr(self._basic().head(1))
        assert "*CHI:" in rendered
        assert "*MOT:" not in rendered

    def test_tail_n_1(self):
        """``tail(1)`` shows only the last utterance."""
        rendered = repr(self._basic().tail(1))
        assert "*MOT:" in rendered
        assert "*CHI:" not in rendered

    def test_head_with_mor_and_gra(self):
        """The %mor and %gra tiers are included in the repr."""
        rendered = repr(self._basic().head(1))
        for fragment in ("%mor:", "%gra:", "pro|I", "1|2|SUBJ"):
            assert fragment in rendered

    def test_head_no_mor(self):
        """No %mor/%gra lines appear when the input has none."""
        plain_chat = "@UTF8\n@Begin\n*CHI:\thello world .\n@End\n"
        rendered = repr(CHAT.from_strs([plain_chat]).head())
        assert "*CHI:" in rendered
        assert "%mor:" not in rendered
        assert "%gra:" not in rendered

    def test_head_empty_reader(self):
        """``head()`` on an empty reader is empty and renders as ``""``."""
        preview = CHAT.from_strs([]).head()
        assert len(preview) == 0
        assert repr(preview) == ""

    def test_tail_empty_reader(self):
        """``tail()`` on an empty reader is empty and renders as ``""``."""
        preview = CHAT.from_strs([]).tail()
        assert len(preview) == 0
        assert repr(preview) == ""

    def test_head_separation(self):
        """The repr of two utterances contains a blank line."""
        rendered = repr(self._basic().head())
        assert "\n\n" in rendered

    def test_utterance_to_str(self):
        """``Utterance.to_str()`` returns a string that shows the speaker tier."""
        first = self._basic().utterances()[0]
        rendered = first.to_str()
        assert isinstance(rendered, str)
        assert "*CHI:" in rendered

    def test_head_time_marks(self):
        """A time-mark bullet on the utterance is rendered with an ``ms`` unit."""
        timed_chat = (
            "@UTF8\n@Begin\n@Participants:\tCHI Child\n"
            "*CHI:\thi . \x150_1500\x15\n@End\n"
        )
        rendered = repr(CHAT.from_strs([timed_chat]).head())
        assert "0" in rendered
        assert "1500" in rendered
        assert "ms" in rendered

    def test_head_column_alignment(self):
        """The first space sits at the same offset on every line that has one."""
        rendered = repr(self._basic().head(1))
        rows = rendered.strip().split("\n")
        assert len(rows) >= 3
        # ``find`` returns -1 for rows without a space; those are ignored.
        first_data_offsets = [
            offset for offset in (row.find(" ") for row in rows) if offset >= 0
        ]
        assert len(set(first_data_offsets)) == 1

    def test_head_repr_html(self):
        """``_repr_html_()`` renders a table that includes the speaker tier."""
        markup = self._basic().head()._repr_html_()
        assert "<table" in markup
        assert "*CHI:" in markup
class TestChangeableHeaders:
    """Headers that occur between utterances and surface in ``utterances()``."""

    @staticmethod
    def _utterances():
        """Parse ``HEADER_CHAT`` and return its utterance list."""
        return CHAT.from_strs([HEADER_CHAT]).utterances()

    def test_changeable_headers_in_utterances(self):
        """Three utterances plus two mid-file headers make five entries."""
        assert len(self._utterances()) == 5

    def test_ordering_preserved(self):
        """Utterances and headers come back in their original file order."""
        entries = self._utterances()
        assert entries[0].participant == "CHI"
        assert entries[0].changeable_header is None
        assert isinstance(entries[1].changeable_header, ChangeableHeader.NewEpisode)
        assert entries[2].participant == "MOT"
        assert isinstance(entries[3].changeable_header, ChangeableHeader.Comment)
        assert entries[3].changeable_header.value == "Child laughs"
        assert entries[4].participant == "CHI"

    def test_changeable_header_fields_none(self):
        """Header entries carry no participant, tokens, or tiers."""
        header_entries = [
            entry
            for entry in self._utterances()
            if entry.changeable_header is not None
        ]
        assert len(header_entries) == 2
        for entry in header_entries:
            assert entry.participant is None
            assert entry.tokens is None
            assert entry.tiers is None

    def test_real_utterance_has_no_changeable_header(self):
        """Real utterances have participant, tokens, and tiers set."""
        spoken_entries = [
            entry for entry in self._utterances() if entry.changeable_header is None
        ]
        assert len(spoken_entries) == 3
        for entry in spoken_entries:
            assert entry.participant is not None
            assert entry.tokens is not None
            assert entry.tiers is not None

    def test_file_level_headers_excluded(self):
        """Only the mid-file @Comment appears among the utterances."""
        comment_entries = []
        for entry in self._utterances():
            header = entry.changeable_header
            if header is not None and isinstance(header, ChangeableHeader.Comment):
                comment_entries.append(entry)
        assert len(comment_entries) == 1
        assert comment_entries[0].changeable_header.value == "Child laughs"

    def test_words_skips_headers(self):
        """``words()`` contains only the six utterance tokens."""
        all_words = CHAT.from_strs([HEADER_CHAT]).words()
        for expected_word in ("hello", "hi", "cookie"):
            assert expected_word in all_words
        assert len(all_words) == 6

    def test_filter_drops_changeable_headers(self):
        """Filtering by participant leaves only that speaker's utterances."""
        child_only = CHAT.from_strs([HEADER_CHAT]).filter(participants="CHI")
        kept = child_only.utterances()
        assert len(kept) == 2
        assert all(entry.participant == "CHI" for entry in kept)
        assert all(entry.changeable_header is None for entry in kept)
# A one-utterance transcript spoken by MOT, with %mor and %gra tiers.
BASIC_CHAT_2 = "".join(
    (
        "@UTF8\n",
        "@Begin\n",
        "@Participants:\tMOT Mother\n",
        "*MOT:\tgood morning .\n",
        "%mor:\tadj|good n|morning .\n",
        "%gra:\t1|2|MOD 2|0|ROOT 3|2|PUNCT\n",
        "@End\n",
    )
)
class TestInfo:
    """Text printed by ``CHAT.info()``, plus three changeable-header checks."""

    @staticmethod
    def _printed_lines(capsys):
        """Return what was captured on stdout, stripped and split into lines."""
        return capsys.readouterr().out.strip().split("\n")

    @staticmethod
    def _seven_file_reader():
        """Reader over seven copies of ``BASIC_CHAT`` named file0..file6."""
        contents = [BASIC_CHAT] * 7
        names = [f"file{i}.cha" for i in range(7)]
        return CHAT.from_strs(contents, ids=names)

    def test_info_single_file(self, capsys):
        """A single file prints just the three summary lines."""
        CHAT.from_strs([BASIC_CHAT]).info()
        printed = self._printed_lines(capsys)
        assert printed[0] == "1 files"
        assert printed[1] == "2 utterances"
        assert printed[2] == "6 words"
        assert len(printed) == 3

    def test_info_multiple_files(self, capsys):
        """Several files add a per-file table below the summary."""
        corpus = CHAT.from_strs([BASIC_CHAT, BASIC_CHAT_2], ids=["a.cha", "b.cha"])
        corpus.info()
        printed = self._printed_lines(capsys)

        # Summary.
        assert printed[0] == "2 files"
        assert printed[1] == "3 utterances"
        assert printed[2] == "9 words"

        # Table header followed by a rule made only of dashes and spaces.
        column_titles = printed[3]
        assert "Utterance Count" in column_titles
        assert "Word Count" in column_titles
        assert "File Path" in column_titles
        assert set(printed[4].strip()) <= {"-", " "}

        # One numbered row per file.
        assert "#1" in printed[5]
        assert "a.cha" in printed[5]
        assert "#2" in printed[6]
        assert "b.cha" in printed[6]

    def test_info_verbose(self, capsys):
        """``verbose=True`` lists every file and prints no truncation hint."""
        self._seven_file_reader().info(verbose=True)
        captured_text = capsys.readouterr().out
        assert "7 files" in captured_text
        assert "#7" in captured_text
        assert "verbose" not in captured_text

    def test_info_not_verbose_truncates(self, capsys):
        """By default only five rows are shown, followed by a hint."""
        self._seven_file_reader().info()
        captured_text = capsys.readouterr().out
        assert "#5" in captured_text
        assert "#6" not in captured_text
        assert "set `verbose` to True for all the files" in captured_text

    def test_info_empty(self, capsys):
        """An empty reader reports zero files, utterances, and words."""
        CHAT.from_strs([]).info()
        printed = self._printed_lines(capsys)
        assert printed[0] == "0 files"
        assert printed[1] == "0 utterances"
        assert printed[2] == "0 words"

    # NOTE(review): the remaining tests exercise changeable headers rather than
    # ``info()``; they may belong with the changeable-header tests -- confirm.

    def test_no_changeable_headers_in_basic(self):
        """The basic file yields two utterances and no changeable headers."""
        entries = CHAT.from_strs([BASIC_CHAT]).utterances()
        assert len(entries) == 2
        assert all(entry.changeable_header is None for entry in entries)

    def test_to_str_changeable_header(self):
        """``to_str()`` on a header entry reproduces the header line."""
        entries = CHAT.from_strs([HEADER_CHAT]).utterances()
        assert entries[1].to_str() == "@New Episode"
        assert entries[3].to_str() == "@Comment:\tChild laughs"

    def test_head_includes_headers(self):
        """``head()`` includes changeable-header entries."""
        preview = CHAT.from_strs([HEADER_CHAT]).head(10)
        found_header = any(entry.changeable_header is not None for entry in preview)
        assert found_header
class TestIPSyn:
    """``CHAT.ipsyn()``: one integer score per file."""

    @staticmethod
    def _basic():
        """Reader over the single ``BASIC_CHAT`` file."""
        return CHAT.from_strs([BASIC_CHAT])

    @staticmethod
    def _score(chat_str):
        """Parse ``chat_str`` as one file and return its default IPSyn score."""
        return CHAT.from_strs([chat_str]).ipsyn()[0]

    def test_ipsyn_basic(self):
        """The basic file scores above zero."""
        scores = self._basic().ipsyn()
        assert len(scores) == 1
        assert scores[0] > 0

    def test_ipsyn_empty(self):
        """No files means an empty result."""
        assert CHAT.from_strs([]).ipsyn() == []

    def test_ipsyn_no_matching_participant(self):
        """A participant absent from the file scores zero."""
        assert self._basic().ipsyn(participant="INV") == [0]

    def test_ipsyn_participant(self):
        """MOT's score is lower than the default score on the basic file."""
        corpus = self._basic()
        mother_scores = corpus.ipsyn(participant="MOT")
        assert len(mother_scores) == 1
        assert mother_scores[0] < corpus.ipsyn()[0]

    def test_ipsyn_n_truncation(self):
        """Scoring all utterances never scores lower than scoring only one."""
        two_utterance_chat = (
            "@UTF8\n@Begin\n"
            "@Participants:\tCHI Child\n"
            "*CHI:\tI want cookie .\n"
            "%mor:\tpro|I v|want n|cookie .\n"
            "%gra:\t1|2|SUBJ 2|0|ROOT 3|2|OBJ 4|2|PUNCT\n"
            "*CHI:\tthe big dog is running .\n"
            "%mor:\tdet|the adj|big n|dog cop|be&3S v|run-PRESP .\n"
            "%gra:\t1|3|DET 2|3|MOD 3|4|SUBJ 4|0|ROOT 5|4|PRED 6|4|PUNCT\n"
            "@End\n"
        )
        corpus = CHAT.from_strs([two_utterance_chat])
        full_score = corpus.ipsyn(n=None)[0]
        first_only_score = corpus.ipsyn(n=1)[0]
        assert full_score >= first_only_score

    def test_ipsyn_n_none(self):
        """``n=None`` is accepted and still gives one score per file."""
        scores = self._basic().ipsyn(n=None)
        assert len(scores) == 1

    def test_ipsyn_multiple_files(self):
        """Identical files receive identical scores."""
        corpus = CHAT.from_strs([BASIC_CHAT, BASIC_CHAT], ids=["a", "b"])
        scores = corpus.ipsyn()
        assert len(scores) == 2
        assert scores[0] == scores[1]

    def test_ipsyn_return_type(self):
        """The result is a list of ints."""
        scores = self._basic().ipsyn()
        assert isinstance(scores, list)
        assert all(isinstance(score, int) for score in scores)

    def test_ipsyn_max_score_bounded(self):
        """The score lies between 0 and 112 inclusive."""
        scores = self._basic().ipsyn()
        assert 0 <= scores[0] <= 112

    def test_ipsyn_noun_phrase_items(self):
        """Two determiner + noun utterances score at least 8."""
        noun_phrase_chat = (
            "@UTF8\n@Begin\n"
            "@Participants:\tCHI Child\n"
            "*CHI:\tthe cookie .\n"
            "%mor:\tdet|the n|cookie .\n"
            "%gra:\t1|2|DET 2|0|ROOT 3|2|PUNCT\n"
            "*CHI:\tthe cookie .\n"
            "%mor:\tdet|the n|cookie .\n"
            "%gra:\t1|2|DET 2|0|ROOT 3|2|PUNCT\n"
            "@End\n"
        )
        assert self._score(noun_phrase_chat) >= 8

    def test_ipsyn_verb_items(self):
        """A progressive and a past-tense utterance score at least 3."""
        verb_chat = (
            "@UTF8\n@Begin\n"
            "@Participants:\tCHI Child\n"
            "*CHI:\the is running .\n"
            "%mor:\tpro|he cop|be&3S v|run-PRESP .\n"
            "%gra:\t1|2|SUBJ 2|0|ROOT 3|2|PRED 4|2|PUNCT\n"
            "*CHI:\the walked .\n"
            "%mor:\tpro|he v|walk-PAST .\n"
            "%gra:\t1|2|SUBJ 2|0|ROOT 3|2|PUNCT\n"
            "@End\n"
        )
        assert self._score(verb_chat) >= 3

    def test_ipsyn_question_items(self):
        """Two question utterances score at least 2."""
        question_chat = (
            "@UTF8\n@Begin\n"
            "@Participants:\tCHI Child\n"
            "*CHI:\tis it good ?\n"
            "%mor:\tcop|be&3S pro|it adj|good ?\n"
            "%gra:\t1|0|ROOT 2|1|SUBJ 3|1|PRED 4|1|PUNCT\n"
            "*CHI:\twhy ?\n"
            "%mor:\tadv:wh|why ?\n"
            "%gra:\t1|0|ROOT 2|1|PUNCT\n"
            "@End\n"
        )
        assert self._score(question_chat) >= 2

    def test_ipsyn_no_mor_no_gra(self):
        """Without %mor and %gra tiers the score is zero."""
        untagged_chat = (
            "@UTF8\n@Begin\n"
            "@Participants:\tCHI Child\n"
            "*CHI:\thello world .\n"
            "@End\n"
        )
        assert CHAT.from_strs([untagged_chat]).ipsyn() == [0]

    def test_ipsyn_svo_scores_s4_and_credits(self):
        """Two subject-verb-object utterances score at least 14."""
        svo_chat = (
            "@UTF8\n@Begin\n"
            "@Participants:\tCHI Child\n"
            "*CHI:\tI want cookie .\n"
            "%mor:\tpro|I v|want n|cookie .\n"
            "%gra:\t1|2|SUBJ 2|0|ROOT 3|2|OBJ 4|2|PUNCT\n"
            "*CHI:\tI eat food .\n"
            "%mor:\tpro|I v|eat n|food .\n"
            "%gra:\t1|2|SUBJ 2|0|ROOT 3|2|OBJ 4|2|PUNCT\n"
            "@End\n"
        )
        assert self._score(svo_chat) >= 14
# One utterance whose dependent tiers are named %xmor / %xgra instead of
# %mor / %gra.
CHAT_WITH_XMOR = "".join(
    (
        "@UTF8\n",
        "@Begin\n",
        "@Participants:\tCHI Child\n",
        "*CHI:\tI want cookie .\n",
        "%xmor:\tpro|I v|want n|cookie .\n",
        "%xgra:\t1|2|SUBJ 2|0|ROOT 3|2|OBJ 4|2|PUNCT\n",
        "@End\n",
    )
)
class TestMorGraTierKwargs:
def test_custom_tier_names(self):
    """Naming ``xmor``/``xgra`` makes those tiers feed pos, mor, and gra."""
    corpus = CHAT.from_strs([CHAT_WITH_XMOR], mor_tier="xmor", gra_tier="xgra")
    first_token = corpus.tokens()[0]
    assert first_token.pos == "pro"
    assert first_token.mor == "I"
    assert first_token.gra == Gra(dep=1, head=2, rel="SUBJ")
def test_custom_tiers_default_ignores(self):
    """With default tier names, %xmor/%xgra do not populate mor or gra."""
    first_token = CHAT.from_strs([CHAT_WITH_XMOR]).tokens()[0]
    assert first_token.mor is None
    assert first_token.gra is None
def test_none_mor_disables_both(self):
    """``mor_tier=None`` leaves both mor and gra unset on tokens."""
    corpus = CHAT.from_strs([BASIC_CHAT], mor_tier=None)
    first_token = corpus.tokens()[0]
    assert first_token.mor is None
    assert first_token.gra is None
def test_none_gra_disables_both(self):
    """``gra_tier=None`` leaves both mor and gra unset on tokens."""
    corpus = CHAT.from_strs([BASIC_CHAT], gra_tier=None)
    first_token = corpus.tokens()[0]
    assert first_token.mor is None
    assert first_token.gra is None
def test_both_none_disables(self):
    """Passing ``None`` for both tiers leaves mor and gra unset on tokens."""
    corpus = CHAT.from_strs([BASIC_CHAT], mor_tier=None, gra_tier=None)
    first_token = corpus.tokens()[0]
    assert first_token.mor is None
    assert first_token.gra is None
def test_utterance_tier_names_custom(self):
    """Utterances report the custom tier names with a leading ``%``."""
    corpus = CHAT.from_strs([CHAT_WITH_XMOR], mor_tier="xmor", gra_tier="xgra")
    first_utterance = corpus.utterances()[0]
    assert first_utterance.mor_tier_name == "%xmor"
    assert first_utterance.gra_tier_name == "%xgra"
def test_utterance_tier_names_default(self):
    """By default utterances report ``%mor`` and ``%gra`` as tier names."""
    first_utterance = CHAT.from_strs([BASIC_CHAT]).utterances()[0]
    assert first_utterance.mor_tier_name == "%mor"
    assert first_utterance.gra_tier_name == "%gra"
def test_utterance_tier_names_disabled(self):
    """With both tiers disabled, the reported tier names are ``None``."""
    corpus = CHAT.from_strs([BASIC_CHAT], mor_tier=None, gra_tier=None)
    first_utterance = corpus.utterances()[0]
    assert first_utterance.mor_tier_name is None
    assert first_utterance.gra_tier_name is None
def test_to_strs_roundtrip_custom_tiers(self):
    """``to_strs()`` writes the custom tier names, not ``%mor``/``%gra``."""
    corpus = CHAT.from_strs([CHAT_WITH_XMOR], mor_tier="xmor", gra_tier="xgra")
    serialized = corpus.to_strs()[0]
    assert "%xmor:" in serialized
    assert "%xgra:" in serialized
    assert "%mor:" not in serialized
    assert "%gra:" not in serialized