from __future__ import annotations
def load_hkcancor(min_sent_length: int = 3) -> list[list[tuple[str, str]]]:
    """Load the HKCanCor corpus as POS-tagged sentences.

    Each sentence is a list of ``(word, pos_tag)`` pairs. Tokens without a
    POS tag are dropped, and sentences shorter than *min_sent_length*
    (after dropping untagged tokens) are discarded.

    Args:
        min_sent_length: Minimum number of tagged tokens a sentence must
            have to be kept. Defaults to 3.

    Returns:
        A list of sentences, each a list of ``(word, tag)`` tuples.

    Raises:
        ImportError: If the optional ``pycantonese`` dependency is missing.
    """
    try:
        import pycantonese
    except ImportError as exc:
        # Chain the original exception so the underlying import failure
        # (e.g. a broken install vs. a missing package) stays visible.
        raise ImportError(
            "pycantonese is required for loading HKCanCor data. "
            "Install with: pip install pycantonese"
        ) from exc

    corpus = pycantonese.hkcancor()
    tagged_sents = []
    for sent_tokens in corpus.tokens(by_utterances=True):
        # Keep only tokens that carry a POS tag.
        sent = [(token.word, token.pos) for token in sent_tokens if token.pos]
        if len(sent) >= min_sent_length:
            tagged_sents.append(sent)
    return tagged_sents
def tagging_data(
    tagged_sents: list[list[tuple[str, str]]],
    train_ratio: float = 0.8,
) -> tuple[list[list[tuple[str, str]]], list[list[str]]]:
    """Split tagged sentences into POS-tagging train/test sets.

    The first ``train_ratio`` fraction of sentences is returned unchanged
    (with tags) for training; the remainder is stripped down to bare word
    sequences for evaluation.
    """
    cutoff = int(len(tagged_sents) * train_ratio)
    train_part, eval_part = tagged_sents[:cutoff], tagged_sents[cutoff:]
    untagged = [[pair[0] for pair in sentence] for sentence in eval_part]
    return train_part, untagged
def wordseg_data(
    tagged_sents: list[list[tuple[str, str]]],
    train_ratio: float = 0.8,
) -> tuple[list[tuple[str, ...]], list[str]]:
    """Split tagged sentences into word-segmentation train/test sets.

    Training examples are segmented sentences as word tuples; test
    examples are the remaining sentences joined into unsegmented strings.
    """
    cutoff = int(len(tagged_sents) * train_ratio)
    training_data = []
    for sentence in tagged_sents[:cutoff]:
        training_data.append(tuple(pair[0] for pair in sentence))
    test_sentences = []
    for sentence in tagged_sents[cutoff:]:
        test_sentences.append("".join(pair[0] for pair in sentence))
    return training_data, test_sentences
def lm_data(
    tagged_sents: list[list[tuple[str, str]]],
    train_ratio: float = 0.8,
) -> list[list[str]]:
    """Return the training portion of the corpus as plain word sequences.

    Only the first ``train_ratio`` fraction of sentences is used; tags are
    discarded, leaving word lists suitable for language-model training.
    """
    cutoff = int(len(tagged_sents) * train_ratio)
    corpus_slice = tagged_sents[:cutoff]
    return [[token for token, _ in sentence] for sentence in corpus_slice]
def hmm_data(
    tagged_sents: list[list[tuple[str, str]]],
    train_ratio: float = 0.8,
) -> tuple[list[list[str]], list[list[str]]]:
    """Split the corpus into HMM train/test word sequences.

    Both halves are stripped of POS tags: the first ``train_ratio``
    fraction of sentences becomes training word lists, the rest becomes
    test word lists.
    """
    def strip_tags(sentences):
        # Drop the tag from each (word, tag) pair, keeping word order.
        return [[pair[0] for pair in sent] for sent in sentences]

    cutoff = int(len(tagged_sents) * train_ratio)
    return strip_tags(tagged_sents[:cutoff]), strip_tags(tagged_sents[cutoff:])