use std::collections::{BTreeMap, BTreeSet};
use std::sync::OnceLock;
use bimap::BiHashMap;
use jieba_rs::Jieba;
/// Returns the shared [`Jieba`] instance, constructing it on the first call.
///
/// The instance is stored in a function-local `OnceLock`, so `Jieba::new`
/// runs at most once and every caller receives the same `'static` reference.
fn jieba() -> &'static Jieba {
    static TOKENIZER: OnceLock<Jieba> = OnceLock::new();
    TOKENIZER.get_or_init(Jieba::new)
}
use super::model::{
AaakDocument, AaakHeader, AaakLine, AaakMeta, EncodeOutput, EncodeReport, RoundtripReport,
Zettel,
};
/// Number of topics a codec keeps per zettel unless configured otherwise;
/// `AaakCodec::encode` takes at most this many from the extracted topics.
const DEFAULT_MAX_TOPICS: usize = 3;
/// Emotion code reported by `detect_emotions` when no emotion signal matches.
const DEFAULT_EMOTION: &str = "determ";
/// Entity code used by `AaakCodec::encode` when no entity is found in the text.
const DEFAULT_ENTITY_CODE: &str = "UNK";
/// Lookup table of `(needle, emotion_code)` pairs used by `detect_emotions`.
///
/// Each needle is searched as a substring of the lower-cased input, so
/// word stems such as `"satisf"` or `"disappoint"` also match inflected
/// forms. Several needles may map to the same emotion code; table order
/// determines the order in which codes are reported.
pub(crate) const EMOTION_SIGNALS: &[(&str, &str)] = &[
("decided", "determ"),
("determined", "determ"),
("prefer", "convict"),
("confident", "convict"),
("worried", "anx"),
("anxious", "anx"),
("concern", "anx"),
("excited", "excite"),
("frustrated", "frust"),
("confused", "confuse"),
("love", "love"),
("hate", "rage"),
("hope", "hope"),
("fear", "fear"),
("trust", "trust"),
("happy", "joy"),
("joy", "joy"),
("sad", "grief"),
("grief", "grief"),
("surprised", "surpr"),
("grateful", "grat"),
("curious", "curious"),
("wonder", "wonder"),
("relieved", "relief"),
("satisf", "satis"),
("disappoint", "grief"),
("vulnerable", "vul"),
("tender", "tender"),
("honest", "raw"),
("doubt", "doubt"),
("exhaust", "exhaust"),
("warm", "warmth"),
("humor", "humor"),
("funny", "humor"),
("peace", "peace"),
("despair", "despair"),
("passion", "passion"),
// CJK needles, written as Unicode escapes, mapped onto the same emotion codes as above.
("\u{51b3}\u{5b9a}", "determ"), ("\u{786e}\u{5b9a}", "determ"), ("\u{62c5}\u{5fc3}", "anx"), ("\u{7126}\u{8651}", "anx"), ("\u{5174}\u{594b}", "excite"), ("\u{6cae}\u{4e27}", "frust"), ("\u{56f0}\u{60d1}", "confuse"), ("\u{5f00}\u{5fc3}", "joy"), ("\u{9ad8}\u{5174}", "joy"), ("\u{60b2}\u{4f24}", "grief"), ("\u{60ca}\u{8bb6}", "surpr"), ("\u{611f}\u{6069}", "grat"), ("\u{611f}\u{8c22}", "grat"), ("\u{597d}\u{5947}", "curious"), ("\u{4fe1}\u{4efb}", "trust"), ("\u{5e0c}\u{671b}", "hope"), ("\u{6050}\u{60e7}", "fear"), ("\u{5bb3}\u{6015}", "fear"), ("\u{6ee1}\u{610f}", "satis"), ("\u{5931}\u{671b}", "grief"), ("\u{8f7b}\u{677e}", "relief"), ("\u{653e}\u{5fc3}", "relief"), ("\u{7231}", "love"), ("\u{6068}", "rage"), ("\u{6016}\u{60e7}", "fear"), ("\u{5e73}\u{9759}", "peace"), ("\u{7edd}\u{671b}", "despair"), ("\u{70ed}\u{60c5}", "passion"), ("\u{6000}\u{7591}", "doubt"), ("\u{75b2}\u{60eb}", "exhaust"), ];
/// Lookup table of `(needle, flag)` pairs used by `detect_flags`.
///
/// Each needle is searched as a substring of the lower-cased input, which
/// is why many entries are stems (`"decid"`, `"migrat"`). Table order
/// determines the order in which flags are reported.
pub(crate) const FLAG_SIGNALS: &[(&str, &str)] = &[
("decid", "DECISION"),
("chose", "DECISION"),
("switch", "DECISION"),
("migrat", "DECISION"),
("replace", "DECISION"),
("recommend", "DECISION"),
("because", "DECISION"),
("found", "ORIGIN"),
("create", "ORIGIN"),
("start", "ORIGIN"),
("born", "ORIGIN"),
("launch", "ORIGIN"),
("first time", "ORIGIN"),
("core", "CORE"),
("fundamental", "CORE"),
("essential", "CORE"),
("principle", "CORE"),
("belief", "CORE"),
("always", "CORE"),
("turning point", "PIVOT"),
("changed everything", "PIVOT"),
("realized", "PIVOT"),
("breakthrough", "PIVOT"),
("epiphany", "PIVOT"),
("api", "TECHNICAL"),
("database", "TECHNICAL"),
("architecture", "TECHNICAL"),
("deploy", "TECHNICAL"),
("infrastructure", "TECHNICAL"),
("framework", "TECHNICAL"),
("server", "TECHNICAL"),
("config", "TECHNICAL"),
("auth", "TECHNICAL"),
("token", "SENSITIVE"),
("password", "SENSITIVE"),
("secret", "SENSITIVE"),
("credential", "SENSITIVE"),
("private", "SENSITIVE"),
("sensitive", "SENSITIVE"),
("pii", "SENSITIVE"),
// CJK needles, written as Unicode escapes, mapped onto the same flags as above.
("\u{51b3}\u{5b9a}", "DECISION"), ("\u{9009}\u{62e9}", "DECISION"), ("\u{5207}\u{6362}", "DECISION"), ("\u{8fc1}\u{79fb}", "DECISION"), ("\u{66ff}\u{6362}", "DECISION"), ("\u{63a8}\u{8350}", "DECISION"), ("\u{56e0}\u{4e3a}", "DECISION"), ("\u{521b}\u{5efa}", "ORIGIN"), ("\u{521b}\u{7acb}", "ORIGIN"), ("\u{5f00}\u{59cb}", "ORIGIN"), ("\u{7b2c}\u{4e00}\u{6b21}", "ORIGIN"), ("\u{6838}\u{5fc3}", "CORE"), ("\u{57fa}\u{672c}", "CORE"), ("\u{539f}\u{5219}", "CORE"), ("\u{4fe1}\u{5ff5}", "CORE"), ("\u{8f6c}\u{6298}", "PIVOT"), ("\u{7a81}\u{7834}", "PIVOT"), ("\u{987f}\u{609f}", "PIVOT"), ("\u{63a5}\u{53e3}", "TECHNICAL"), ("\u{6570}\u{636e}\u{5e93}", "TECHNICAL"), ("\u{67b6}\u{6784}", "TECHNICAL"), ("\u{90e8}\u{7f72}", "TECHNICAL"), ("\u{6846}\u{67b6}", "TECHNICAL"), ("\u{670d}\u{52a1}\u{5668}", "TECHNICAL"), ("\u{914d}\u{7f6e}", "TECHNICAL"), ("\u{8ba4}\u{8bc1}", "TECHNICAL"), ("\u{5bc6}\u{7801}", "SENSITIVE"), ("\u{5bc6}\u{94a5}", "SENSITIVE"), ("\u{51ed}\u{8bc1}", "SENSITIVE"), ("\u{9690}\u{79c1}", "SENSITIVE"), ];
/// Classification of a contiguous run of characters produced by `text_segments`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TextSegmentKind {
/// A run of ASCII alphanumeric characters.
AsciiWord,
/// A run of characters accepted by `is_cjk_ideograph`.
Cjk,
}
/// A maximal run of same-kind characters cut out of the input by `text_segments`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct TextSegment {
/// Which character class every character in `text` belongs to.
kind: TextSegmentKind,
/// The characters of the run, in their original order and case.
text: String,
}
/// Encoder/decoder that turns free text into an `AaakDocument` and back.
#[derive(Debug, Clone)]
pub struct AaakCodec {
/// Bidirectional alias table: the left side is the entity name as it
/// appears in text, the right side is the entity code it is mapped to.
entity_map: BiHashMap<String, String>,
/// Upper bound on the number of topics stored per encoded zettel.
max_topics: usize,
}
impl Default for AaakCodec {
    /// Creates a codec with no entity aliases and the default topic limit
    /// (`DEFAULT_MAX_TOPICS`).
    fn default() -> Self {
        let entity_map = BiHashMap::new();
        Self {
            entity_map,
            max_topics: DEFAULT_MAX_TOPICS,
        }
    }
}
impl AaakCodec {
pub fn with_entity_aliases(aliases: BTreeMap<String, String>) -> Self {
let entity_map = aliases.into_iter().collect::<BiHashMap<_, _>>();
Self {
entity_map,
max_topics: DEFAULT_MAX_TOPICS,
}
}
pub fn encode(&self, text: &str, meta: &AaakMeta) -> EncodeOutput {
let normalized = normalize_whitespace(text);
let all_topics = extract_topics(&normalized);
let topics = all_topics
.iter()
.take(self.max_topics)
.cloned()
.collect::<Vec<_>>();
let topics_truncated = all_topics.len().saturating_sub(self.max_topics);
let mut entity_codes = Vec::new();
let mut seen_codes = BTreeSet::new();
for (name, code) in self.entity_map.iter() {
if normalized.contains(name.as_str()) && seen_codes.insert(code.clone()) {
entity_codes.push(code.clone());
}
}
for entity in extract_entities(&normalized) {
let code = self.entity_code(&entity);
if seen_codes.insert(code.clone()) {
entity_codes.push(code);
}
}
let entities = if entity_codes.is_empty() {
vec![DEFAULT_ENTITY_CODE.to_string()]
} else {
entity_codes
};
let flags = detect_flags(&normalized);
let emotions = detect_emotions(&normalized);
let zettel = Zettel {
id: 0,
entities,
topics: if topics.is_empty() {
vec!["note".to_string()]
} else {
topics
},
quote: normalized,
weight: infer_weight(&flags),
emotions,
flags,
};
let document = AaakDocument {
header: AaakHeader {
version: 1,
wing: meta.wing.clone(),
room: meta.room.clone(),
date: meta.date.clone(),
source: meta.source.clone(),
},
body: vec![AaakLine::Zettel(zettel.clone())],
zettels: vec![zettel],
};
let roundtrip = self.verify_roundtrip(text, &document);
EncodeOutput {
document,
report: EncodeReport {
topics_truncated,
key_sentence_truncated: false,
coverage: roundtrip.coverage,
lost_assertions: roundtrip.lost,
},
}
}
pub fn decode(&self, document: &AaakDocument) -> String {
document
.zettel_lines()
.iter()
.map(|zettel| self.decode_zettel(zettel))
.collect::<Vec<_>>()
.join("\n")
}
pub fn verify_roundtrip(&self, original: &str, document: &AaakDocument) -> RoundtripReport {
let decoded = normalize_whitespace(&self.decode(document)).to_lowercase();
let assertions = split_assertions(original);
if assertions.is_empty() {
return RoundtripReport {
preserved: Vec::new(),
lost: Vec::new(),
coverage: 1.0,
};
}
let (preserved, lost): (Vec<_>, Vec<_>) = assertions
.into_iter()
.partition(|assertion| decoded.contains(&assertion.to_lowercase()));
let coverage = preserved.len() as f32 / (preserved.len() + lost.len()) as f32;
RoundtripReport {
preserved,
lost,
coverage,
}
}
fn decode_zettel(&self, zettel: &Zettel) -> String {
let mut quote = zettel.quote.clone();
for entity in &zettel.entities {
if let Some(name) = self.entity_map.get_by_right(entity) {
quote = replace_code("e, entity, name);
}
}
if quote.is_empty() {
return zettel
.entities
.iter()
.map(|entity| {
self.entity_map
.get_by_right(entity)
.cloned()
.unwrap_or_else(|| entity.clone())
})
.collect::<Vec<_>>()
.join(" ");
}
quote
}
fn entity_code(&self, entity: &str) -> String {
self.entity_map
.get_by_left(entity)
.cloned()
.unwrap_or_else(|| default_entity_code(entity))
}
}
/// Collapses every run of whitespace in `text` to a single space, drops
/// leading and trailing whitespace, and rewrites each double quote (`"`)
/// as a single quote (`'`).
pub(crate) fn normalize_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so a non-empty buffer means a word precedes this one.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        for ch in word.chars() {
            collapsed.push(if ch == '"' { '\'' } else { ch });
        }
    }
    collapsed
}
pub(crate) fn extract_entities(text: &str) -> Vec<String> {
let mut seen = BTreeSet::new();
let mut entities = Vec::new();
for segment in text_segments(text) {
match segment.kind {
TextSegmentKind::AsciiWord => {
if looks_like_ascii_entity(&segment.text) && seen.insert(segment.text.clone()) {
entities.push(segment.text);
}
}
TextSegmentKind::Cjk => {
for candidate in extract_cjk_entities_from_segment(&segment.text) {
if seen.insert(candidate.clone()) {
entities.push(candidate);
}
}
}
}
}
entities
}
pub(crate) fn extract_topics(text: &str) -> Vec<String> {
const STOP_WORDS: &[&str] = &[
"the", "and", "for", "with", "over", "based", "this", "that", "was", "use", "why",
];
let mut seen = BTreeSet::new();
let mut topics = Vec::new();
for segment in text_segments(text) {
match segment.kind {
TextSegmentKind::AsciiWord => {
let token = segment.text.to_lowercase();
if token.is_empty() || STOP_WORDS.contains(&token.as_str()) {
continue;
}
if seen.insert(token.clone()) {
topics.push(token);
}
}
TextSegmentKind::Cjk => {
for topic in extract_cjk_topics_from_segment(&segment.text) {
if seen.insert(topic.clone()) {
topics.push(topic);
}
}
}
}
}
topics
}
/// Returns the flags whose needles from `FLAG_SIGNALS` occur as substrings
/// of the lower-cased `text`.
///
/// Each flag appears once, in table order. When nothing matches, the result
/// is the single flag `"CORE"`.
pub(crate) fn detect_flags(text: &str) -> Vec<String> {
    let haystack = text.to_lowercase();
    let mut found: Vec<String> = Vec::new();
    for &(needle, flag) in FLAG_SIGNALS {
        let already_reported = found.iter().any(|reported| reported == flag);
        if already_reported || !haystack.contains(needle) {
            continue;
        }
        found.push(flag.to_string());
    }
    if found.is_empty() {
        found.push("CORE".to_string());
    }
    found
}
/// Returns the emotion codes whose needles from `EMOTION_SIGNALS` occur as
/// substrings of the lower-cased `text`.
///
/// Each code appears once, in table order. When nothing matches, the result
/// is the single code `DEFAULT_EMOTION`.
pub(crate) fn detect_emotions(text: &str) -> Vec<String> {
    let haystack = text.to_lowercase();
    let mut found: Vec<String> = Vec::new();
    for &(needle, emotion) in EMOTION_SIGNALS {
        let already_reported = found.iter().any(|reported| reported == emotion);
        if already_reported || !haystack.contains(needle) {
            continue;
        }
        found.push(emotion.to_string());
    }
    if found.is_empty() {
        found.push(DEFAULT_EMOTION.to_string());
    }
    found
}
/// Derives a zettel weight from its flags.
///
/// Returns `4` when `"DECISION"` or `"PIVOT"` is present, otherwise `3` when
/// `"TECHNICAL"` is present, otherwise `2`.
pub(crate) fn infer_weight(flags: &[String]) -> u8 {
    let has = |wanted: &str| flags.iter().any(|flag| flag == wanted);
    if has("DECISION") || has("PIVOT") {
        return 4;
    }
    if has("TECHNICAL") {
        return 3;
    }
    2
}
fn split_assertions(text: &str) -> Vec<String> {
text.split([
'.', '!', '?', ';', '\u{3002}', '\u{FF01}', '\u{FF1F}', '\u{FF1B}', '\u{FF0C}', ])
.map(normalize_whitespace)
.filter(|item| !item.is_empty())
.collect()
}
/// Returns `quote` with every whole token equal to `code` replaced by `name`.
///
/// A token is a maximal run of ASCII alphanumeric characters; every other
/// character is copied through unchanged and ends the current token.
fn replace_code(quote: &str, code: &str, name: &str) -> String {
    let mut output = String::with_capacity(quote.len());
    let mut pending = String::new();
    for ch in quote.chars() {
        if !ch.is_ascii_alphanumeric() {
            // Token boundary: emit whatever was buffered, then the separator itself.
            push_replacement(&mut output, &mut pending, code, name);
            output.push(ch);
        } else {
            pending.push(ch);
        }
    }
    // Flush a token that runs to the end of the quote.
    push_replacement(&mut output, &mut pending, code, name);
    output
}
/// Generates a three-letter upper-case code for an entity without an alias.
///
/// The first three ASCII letters of `entity`, upper-cased, are used when
/// there are at least three. Otherwise the code is derived from
/// `stable_hash(entity)`: three successive 5-bit groups of the hash, each
/// reduced modulo 26 and mapped onto `A`..=`Z`.
pub(crate) fn default_entity_code(entity: &str) -> String {
    let letters: String = entity
        .chars()
        .filter(char::is_ascii_alphabetic)
        .take(3)
        .map(|ch| ch.to_ascii_uppercase())
        .collect();
    if letters.len() >= 3 {
        return letters;
    }
    let hash = stable_hash(entity);
    (0..3u64)
        .map(|group| {
            let bits = ((hash >> (group * 5)) & 0x1F) as u8;
            char::from(b'A' + bits % 26)
        })
        .collect()
}
/// Hashes the UTF-8 bytes of `s` with a wrapping multiply-by-31 polynomial,
/// starting from zero.
fn stable_hash(s: &str) -> u64 {
    s.bytes().fold(0u64, |acc, byte| {
        acc.wrapping_mul(31).wrapping_add(u64::from(byte))
    })
}
/// Reports whether `ch` lies in U+4E00..=U+9FFF, U+3400..=U+4DBF or
/// U+F900..=U+FAFF.
fn is_cjk_ideograph(ch: char) -> bool {
    let code_point = u32::from(ch);
    (0x4E00..=0x9FFF).contains(&code_point)
        || (0x3400..=0x4DBF).contains(&code_point)
        || (0xF900..=0xFAFF).contains(&code_point)
}
fn text_segments(text: &str) -> Vec<TextSegment> {
let mut segments = Vec::new();
let mut current = String::new();
let mut current_kind = None;
for ch in text.chars() {
let next_kind = if ch.is_ascii_alphanumeric() {
Some(TextSegmentKind::AsciiWord)
} else if is_cjk_ideograph(ch) {
Some(TextSegmentKind::Cjk)
} else {
None
};
match (current_kind, next_kind) {
(Some(kind), Some(next)) if kind == next => current.push(ch),
(Some(kind), Some(next)) => {
segments.push(TextSegment {
kind,
text: std::mem::take(&mut current),
});
current.push(ch);
current_kind = Some(next);
}
(Some(kind), None) => {
if !current.is_empty() {
segments.push(TextSegment {
kind,
text: std::mem::take(&mut current),
});
}
current_kind = None;
}
(None, Some(next)) => {
current.push(ch);
current_kind = Some(next);
}
(None, None) => {}
}
}
if let Some(kind) = current_kind
&& !current.is_empty()
{
segments.push(TextSegment {
kind,
text: current,
});
}
segments
}
/// Reports whether `token` is at least two bytes long and begins with an
/// ASCII upper-case letter.
fn looks_like_ascii_entity(token: &str) -> bool {
    if token.len() < 2 {
        return false;
    }
    token.starts_with(|first: char| first.is_ascii_uppercase())
}
/// Returns the distinct words in `segment` that the tagger labels as
/// entities, in order of first appearance.
///
/// A word qualifies when its tag starts with `nr` or equals `ns`, `nt` or
/// `nz`, and the word is at least two characters long.
fn extract_cjk_entities_from_segment(segment: &str) -> Vec<String> {
    let mut known = BTreeSet::new();
    jieba()
        .tag(segment, true)
        .into_iter()
        .filter(|tagged| {
            tagged.tag.starts_with("nr")
                || tagged.tag == "ns"
                || tagged.tag == "nt"
                || tagged.tag == "nz"
        })
        .filter(|tagged| tagged.word.chars().count() >= 2)
        .map(|tagged| tagged.word.to_string())
        // `insert` returns false for repeats, which drops them here.
        .filter(|word| known.insert(word.clone()))
        .collect()
}
/// Returns the distinct topic words in `segment`, in order of first
/// appearance.
///
/// A word qualifies when its tag begins with `n`, `v` or `a` but is not an
/// entity tag (`nr…`, `ns`, `nt`, `nz`), the word is at least two characters
/// long, and `is_cjk_function_word` does not reject it.
fn extract_cjk_topics_from_segment(segment: &str) -> Vec<String> {
    let mut known = BTreeSet::new();
    let mut collected = Vec::new();
    for tagged in jieba().tag(segment, true) {
        let is_entity_tag = tagged.tag.starts_with("nr")
            || tagged.tag == "ns"
            || tagged.tag == "nt"
            || tagged.tag == "nz";
        let is_content_tag = tagged.tag.starts_with(['n', 'v', 'a']);
        let keep = !is_entity_tag
            && is_content_tag
            && tagged.word.chars().count() >= 2
            && !is_cjk_function_word(tagged.word);
        if keep && known.insert(tagged.word.to_string()) {
            collected.push(tagged.word.to_string());
        }
    }
    collected
}
/// Reports whether `word` is exactly one of the built-in CJK words that
/// are excluded from topics.
fn is_cjk_function_word(word: &str) -> bool {
    const FUNCTION_WORDS: [&str; 15] = [
        "\u{6211}\u{4eec}",
        "\u{4f60}\u{4eec}",
        "\u{4ed6}\u{4eec}",
        "\u{5979}\u{4eec}",
        "\u{5b83}\u{4eec}",
        "\u{8fd9}\u{4e2a}",
        "\u{90a3}\u{4e2a}",
        "\u{8fd9}\u{4e9b}",
        "\u{90a3}\u{4e9b}",
        "\u{4e3a}\u{4ec0}\u{4e48}",
        "\u{600e}\u{4e48}",
        "\u{4ec0}\u{4e48}",
        "\u{56e0}\u{4e3a}",
        "\u{6240}\u{4ee5}",
        "\u{4f46}\u{662f}",
    ];
    FUNCTION_WORDS.contains(&word)
}
/// Flushes the buffered `token` into `output` and clears it.
///
/// Writes `name` when the token equals `code`, otherwise the token itself.
/// An empty token leaves `output` untouched.
fn push_replacement(output: &mut String, token: &mut String, code: &str, name: &str) {
    if !token.is_empty() {
        let emitted = if token.as_str() == code {
            name
        } else {
            token.as_str()
        };
        output.push_str(emitted);
        token.clear();
    }
}