Skip to main content

scitadel_core/models/
annotation.rs

1//! Annotations (highlights + threaded notes) anchored to paper text.
2//!
3//! Follows the W3C Web Annotation selector pattern: a single annotation
4//! may carry multiple selectors (position, quote + context, sentence id),
5//! and the resolver tries them in order on open. Threading is self-
6//! referential via `parent_id`; replies inherit the root's anchor.
7
8use chrono::{DateTime, Utc};
9use serde::{Deserialize, Serialize};
10
11use super::{AnnotationId, PaperId, QuestionId};
12
/// Status of the anchor after last resolve attempt.
///
/// Serialized by serde with lowercase variant names (`ok`, `drifted`,
/// `orphan`); [`AnchorStatus::as_str`] and [`AnchorStatus::parse`] use the
/// same spellings.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum AnchorStatus {
    // `Ok` is the `Default` variant, so `Anchor::default()` starts out as
    // resolved rather than as an orphan.
    #[default]
    /// Character-range match; the quote still lives at the same offset.
    Ok,
    /// The exact offsets moved but the quote (or sentence id) still matches.
    Drifted,
    /// None of the selectors matched — needs user re-anchoring.
    Orphan,
}
25
26impl AnchorStatus {
27    #[must_use]
28    pub fn as_str(self) -> &'static str {
29        match self {
30            Self::Ok => "ok",
31            Self::Drifted => "drifted",
32            Self::Orphan => "orphan",
33        }
34    }
35
36    #[must_use]
37    pub fn parse(s: &str) -> Option<Self> {
38        match s {
39            "ok" => Some(Self::Ok),
40            "drifted" => Some(Self::Drifted),
41            "orphan" => Some(Self::Orphan),
42            _ => None,
43        }
44    }
45}
46
/// Multi-selector anchor. Any field may be `None`; the resolver falls
/// through: position → quote + context → sentence id → orphan.
///
/// `Anchor::default()` leaves every selector unset and `status` at
/// `AnchorStatus::Ok`; replies are constructed with exactly that empty
/// anchor.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct Anchor {
    /// TextPositionSelector: fast, fragile. `(start, end)` in chars.
    // NOTE(review): whether `end` is inclusive or exclusive is not visible
    // in this file — confirm against the resolver.
    pub char_range: Option<(usize, usize)>,
    /// TextQuoteSelector body.
    pub quote: Option<String>,
    /// Context before the quote — used for disambiguation.
    pub prefix: Option<String>,
    /// Context after the quote.
    pub suffix: Option<String>,
    /// SHA1 of the normalized sentence containing the quote.
    ///
    /// May instead carry a synthetic sentinel beginning with
    /// `IMPORTED_SENTENCE_ID_PREFIX` or `PAPER_NOTE_SENTENCE_ID_PREFIX`;
    /// see `is_imported_synthetic` and `is_paper_note`.
    pub sentence_id: Option<String>,
    /// Which paper-text extraction version this was anchored against.
    pub source_version: Option<String>,
    /// Last-known resolution status; updated on open.
    pub status: AnchorStatus,
}
66
67impl Anchor {
68    /// Is this an orphan that requires user re-anchoring?
69    #[must_use]
70    pub fn is_orphan(&self) -> bool {
71        matches!(self.status, AnchorStatus::Orphan)
72    }
73
74    /// True when the anchor's only "selector" is a synthetic
75    /// import-marker `sentence_id` (no quote, no char_range, no
76    /// real sentence hash) — i.e. a `note=` from a `.bib` import
77    /// that has nothing to anchor against in the paper text yet.
78    /// The resolver short-circuits these so they don't trip the
79    /// orphan-warning UI flow (#158).
80    #[must_use]
81    pub fn is_imported_synthetic(&self) -> bool {
82        self.char_range.is_none()
83            && self.quote.is_none()
84            && self
85                .sentence_id
86                .as_deref()
87                .is_some_and(|s| s.starts_with(IMPORTED_SENTENCE_ID_PREFIX))
88    }
89
90    /// True when this anchor represents a paper-level note: a
91    /// commentary on the publication as a whole rather than a
92    /// passage in it. Built by `paper_note_sentence_id(paper_id)`
93    /// and recognised by the resolver as `AnchorStatus::Ok` without
94    /// needing a quote / char_range / fuzzy match. The TUI renders
95    /// these in a separate "paper-level notes" section above the
96    /// thread list. (#185)
97    #[must_use]
98    pub fn is_paper_note(&self) -> bool {
99        self.char_range.is_none()
100            && self.quote.is_none()
101            && self
102                .sentence_id
103                .as_deref()
104                .is_some_and(|s| s.starts_with(PAPER_NOTE_SENTENCE_ID_PREFIX))
105    }
106}
107
/// Marker prefix on a synthetic `sentence_id` produced by the
/// `.bib` import path for unanchored `note={...}` entries (#158).
/// Picked to be unambiguous: SHA1 hex (the real `sentence_id`
/// output) cannot start with `bibtex-import:`, because it consists
/// solely of the characters `0-9a-f`.
///
/// Written by `imported_sentence_id` and recognised by
/// `Anchor::is_imported_synthetic`.
pub const IMPORTED_SENTENCE_ID_PREFIX: &str = "bibtex-import:";
113
/// Marker prefix on a synthetic `sentence_id` for paper-level
/// commentary (no quote, no anchor — the user is commenting on the
/// publication as a whole). Lives in a different namespace from
/// [`IMPORTED_SENTENCE_ID_PREFIX`] so a single resolver pass can
/// route each kind to its own short-circuit path. SHA1 hex output
/// cannot start with `paper-note:`, and `bibtex-import:` /
/// `paper-note:` are disjoint by construction. (#185)
///
/// Written by `paper_note_sentence_id` and recognised by
/// `Anchor::is_paper_note`.
pub const PAPER_NOTE_SENTENCE_ID_PREFIX: &str = "paper-note:";
122
123/// Build the synthetic `sentence_id` that identifies a paper-level
124/// note. Stable per `paper_id` so future calls (e.g. for de-dup)
125/// can re-derive the same handle. (#185)
126#[must_use]
127pub fn paper_note_sentence_id(paper_id: &str) -> String {
128    format!("{PAPER_NOTE_SENTENCE_ID_PREFIX}{paper_id}")
129}
130
131/// Build the synthetic `Anchor` that flags an annotation as a
132/// paper-level note. No quote, no char_range — only the
133/// `paper-note:<paper_id>` sentinel sentence_id and `AnchorStatus::Ok`
134/// so the resolver short-circuits without trying to match anything in
135/// the body text. Shared by every paper-note write path (MCP tool +
136/// TUI DataStore wrapper) so the two surfaces produce byte-identical
137/// anchors. (#185)
138#[must_use]
139pub fn paper_note_anchor(paper_id: &str) -> Anchor {
140    Anchor {
141        sentence_id: Some(paper_note_sentence_id(paper_id)),
142        status: AnchorStatus::Ok,
143        ..Anchor::default()
144    }
145}
146
147/// Build a synthetic `sentence_id` for an unanchored imported
148/// `note=`. Combines the source citekey with the SHA1 of the
149/// normalized note content so the same `(citekey, note)` pair
150/// always hashes to the same id. The result is a stable handle
151/// that the resolver recognizes as "imported, not yet anchored
152/// to paper text" rather than as a broken anchor.
153#[must_use]
154pub fn imported_sentence_id(citekey: &str, note: &str) -> String {
155    let content_hash = sentence_id(note);
156    format!("{IMPORTED_SENTENCE_ID_PREFIX}{citekey}:{content_hash}")
157}
158
/// One annotation. May be a root (with an anchor) or a reply (parent_id set,
/// anchor empty; the root's anchor is the canonical one for rendering).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Annotation {
    /// Identifier of this annotation; the constructors obtain it from
    /// `AnnotationId::new()`.
    pub id: AnnotationId,
    /// `None` = root (carries the anchor). `Some` = reply to that ID.
    pub parent_id: Option<AnnotationId>,
    /// Paper the annotation belongs to; a reply copies its parent's value.
    pub paper_id: PaperId,
    /// Optional question association. `new_root` leaves it `None`; a reply
    /// copies its parent's value.
    // NOTE(review): the exact meaning of the linked question is defined
    // outside this file — confirm before relying on it.
    pub question_id: Option<QuestionId>,
    /// Where the annotation attaches. Replies hold `Anchor::default()`.
    pub anchor: Anchor,
    /// Body text of the note.
    pub note: String,
    /// Optional colour; unset on newly constructed annotations.
    // NOTE(review): the expected colour format is not visible here.
    pub color: Option<String>,
    /// Tags attached to the annotation; empty on construction.
    pub tags: Vec<String>,
    /// Identity string — `$USER` for TUI writes, required for MCP writes.
    pub author: String,
    /// Creation time (UTC); the constructors set it to the current instant.
    pub created_at: DateTime<Utc>,
    /// Last-modified time (UTC); equals `created_at` on construction.
    pub updated_at: DateTime<Utc>,
    /// Soft-delete tombstone. None = live.
    pub deleted_at: Option<DateTime<Utc>>,
}
179
180impl Annotation {
181    /// Build a new root-level annotation with the given anchor.
182    #[must_use]
183    pub fn new_root(paper_id: PaperId, author: String, note: String, anchor: Anchor) -> Self {
184        let now = Utc::now();
185        Self {
186            id: AnnotationId::new(),
187            parent_id: None,
188            paper_id,
189            question_id: None,
190            anchor,
191            note,
192            color: None,
193            tags: Vec::new(),
194            author,
195            created_at: now,
196            updated_at: now,
197            deleted_at: None,
198        }
199    }
200
201    /// Build a new reply whose anchor is empty (inherits from root).
202    #[must_use]
203    pub fn new_reply(parent: &Annotation, author: String, note: String) -> Self {
204        let now = Utc::now();
205        Self {
206            id: AnnotationId::new(),
207            parent_id: Some(parent.id.clone()),
208            paper_id: parent.paper_id.clone(),
209            question_id: parent.question_id.clone(),
210            anchor: Anchor::default(),
211            note,
212            color: None,
213            tags: Vec::new(),
214            author,
215            created_at: now,
216            updated_at: now,
217            deleted_at: None,
218        }
219    }
220
221    /// True if this is a reply to another annotation.
222    #[must_use]
223    pub fn is_reply(&self) -> bool {
224        self.parent_id.is_some()
225    }
226
227    /// True if this annotation has been soft-deleted.
228    #[must_use]
229    pub fn is_deleted(&self) -> bool {
230        self.deleted_at.is_some()
231    }
232}
233
/// Per-reader read receipt; composite key (annotation_id, reader).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AnnotationRead {
    /// Annotation that was seen.
    pub annotation_id: AnnotationId,
    /// Identity of the reader.
    // NOTE(review): presumably the same identity scheme as
    // `Annotation::author` — confirm against the write paths.
    pub reader: String,
    /// When the reader saw the annotation (UTC).
    pub seen_at: DateTime<Utc>,
}
241
242/// Normalize a sentence for sentence-id hashing.
243///
244/// Per ADR-004: NFKC compose (folds ligatures: fi → fi, fl → fl), Unicode
245/// lowercase, then collapse all Unicode whitespace runs to a single
246/// ASCII space and trim. Two sentences that differ only in case,
247/// whitespace, or ligature presentation hash to the same value.
248#[must_use]
249pub fn normalize_sentence(s: &str) -> String {
250    use unicode_normalization::UnicodeNormalization;
251    // NFKC: compatibility decomposition + canonical composition.
252    let composed: String = s.nfkc().collect();
253    let lowered: String = composed.chars().flat_map(char::to_lowercase).collect();
254    let mut out = String::with_capacity(lowered.len());
255    let mut prev_was_space = true; // collapses leading whitespace
256    for ch in lowered.chars() {
257        if ch.is_whitespace() {
258            if !prev_was_space {
259                out.push(' ');
260                prev_was_space = true;
261            }
262        } else {
263            out.push(ch);
264            prev_was_space = false;
265        }
266    }
267    if out.ends_with(' ') {
268        out.pop();
269    }
270    out
271}
272
273/// SHA1 hex of the normalized sentence — stable identifier the
274/// resolver can compare against sentences extracted from current
275/// paper text. See `normalize_sentence` and ADR-004 for the
276/// normalization spec.
277#[must_use]
278pub fn sentence_id(s: &str) -> String {
279    use sha1::{Digest, Sha1};
280    let normalized = normalize_sentence(s);
281    let mut hasher = Sha1::new();
282    hasher.update(normalized.as_bytes());
283    let digest = hasher.finalize();
284    let mut hex = String::with_capacity(40);
285    for byte in digest {
286        use std::fmt::Write as _;
287        let _ = write!(hex, "{byte:02x}");
288    }
289    hex
290}
291
#[cfg(test)]
mod tests {
    //! Unit tests for the annotation model: constructors, anchor
    //! predicates, the synthetic sentinel ids, and sentence normalization.

    use super::*;

    #[test]
    fn reply_inherits_paper_and_question() {
        let paper_id: PaperId = "p1".into();
        let root = Annotation::new_root(
            paper_id.clone(),
            "lars".into(),
            "interesting passage".into(),
            Anchor {
                quote: Some("neutron energy".into()),
                ..Anchor::default()
            },
        );
        let reply = Annotation::new_reply(&root, "claude".into(), "agreed; see 4.2".into());
        assert_eq!(reply.paper_id, paper_id);
        // The test name promises the question is inherited as well, so pin
        // that too rather than only checking the paper.
        assert_eq!(reply.question_id, root.question_id);
        assert_eq!(reply.parent_id.as_ref(), Some(&root.id));
        assert!(reply.anchor.quote.is_none(), "replies inherit anchor");
        assert!(reply.is_reply());
        assert!(!root.is_reply());
        assert!(!reply.is_deleted());
    }

    #[test]
    fn anchor_status_round_trip() {
        for s in [
            AnchorStatus::Ok,
            AnchorStatus::Drifted,
            AnchorStatus::Orphan,
        ] {
            assert_eq!(AnchorStatus::parse(s.as_str()), Some(s));
        }
    }

    #[test]
    fn anchor_status_parse_rejects_unknown_spellings() {
        // Parsing is exact and case-sensitive; anything else is `None`.
        assert_eq!(AnchorStatus::parse(""), None);
        assert_eq!(AnchorStatus::parse("OK"), None);
        assert_eq!(AnchorStatus::parse("orphaned"), None);
    }

    #[test]
    fn orphan_flag() {
        let mut a = Anchor::default();
        assert!(!a.is_orphan());
        a.status = AnchorStatus::Orphan;
        assert!(a.is_orphan());
    }

    #[test]
    fn imported_synthetic_id_has_marker_prefix() {
        let id = imported_sentence_id("smith2024", "some note");
        assert!(id.starts_with(IMPORTED_SENTENCE_ID_PREFIX));
        assert!(id.contains("smith2024"));
    }

    #[test]
    fn imported_sentence_id_is_stable_for_equivalent_notes() {
        // The note is normalized before hashing, so case / whitespace
        // differences must not change the id; the citekey must.
        assert_eq!(
            imported_sentence_id("k", "Some  Note"),
            imported_sentence_id("k", "some note")
        );
        assert_ne!(imported_sentence_id("k1", "n"), imported_sentence_id("k2", "n"));
    }

    #[test]
    fn is_imported_synthetic_recognises_marker_anchor() {
        let a = Anchor {
            sentence_id: Some(imported_sentence_id("k", "n")),
            ..Anchor::default()
        };
        assert!(a.is_imported_synthetic());
    }

    #[test]
    fn is_imported_synthetic_rejects_real_sentence_id() {
        let a = Anchor {
            sentence_id: Some(sentence_id("a real sentence.")),
            ..Anchor::default()
        };
        assert!(!a.is_imported_synthetic());
    }

    #[test]
    fn paper_note_sentinel_is_disjoint_from_imported() {
        // The two sentinel namespaces must not collide — a single
        // resolver pass needs to route each kind to its own
        // short-circuit. Pin the disjointness here so a future
        // rename can't quietly break it. (#185)
        assert_ne!(IMPORTED_SENTENCE_ID_PREFIX, PAPER_NOTE_SENTENCE_ID_PREFIX);
        let imp = imported_sentence_id("k", "n");
        let pn = paper_note_sentence_id("p-attn");
        assert!(imp.starts_with(IMPORTED_SENTENCE_ID_PREFIX));
        assert!(pn.starts_with(PAPER_NOTE_SENTENCE_ID_PREFIX));
        assert!(!imp.starts_with(PAPER_NOTE_SENTENCE_ID_PREFIX));
        assert!(!pn.starts_with(IMPORTED_SENTENCE_ID_PREFIX));
    }

    #[test]
    fn paper_note_anchor_is_recognised_by_predicate() {
        // The shared helper must produce an anchor that
        // `is_paper_note()` accepts; otherwise the two write paths
        // (MCP tool + DataStore) drift from the resolver and the TUI
        // rendering filter, and the round-trip silently breaks.
        let a = paper_note_anchor("p-attn");
        assert!(a.is_paper_note());
        assert_eq!(a.status, AnchorStatus::Ok);
        assert!(a.quote.is_none());
        assert!(a.char_range.is_none());
        assert_eq!(
            a.sentence_id.as_deref(),
            Some(&*paper_note_sentence_id("p-attn"))
        );
    }

    #[test]
    fn paper_note_id_is_stable_per_paper() {
        // Same paper_id ⇒ same id; different paper_id ⇒ different id.
        // Used by future de-dup logic if "comment on the paper as a
        // whole" ever needs uniqueness per paper+author.
        assert_eq!(paper_note_sentence_id("p-1"), paper_note_sentence_id("p-1"));
        assert_ne!(paper_note_sentence_id("p-1"), paper_note_sentence_id("p-2"));
    }

    #[test]
    fn is_paper_note_recognises_marker_anchor() {
        let a = Anchor {
            sentence_id: Some(paper_note_sentence_id("p-1")),
            ..Anchor::default()
        };
        assert!(a.is_paper_note());
        // …and is_imported_synthetic must NOT also be true; the two
        // predicates are mutually exclusive on a well-formed anchor.
        assert!(!a.is_imported_synthetic());
    }

    #[test]
    fn is_paper_note_rejects_quote_or_range() {
        let with_quote = Anchor {
            sentence_id: Some(paper_note_sentence_id("p-1")),
            quote: Some("hi".into()),
            ..Anchor::default()
        };
        assert!(!with_quote.is_paper_note());

        let with_range = Anchor {
            sentence_id: Some(paper_note_sentence_id("p-1")),
            char_range: Some((0, 2)),
            ..Anchor::default()
        };
        assert!(!with_range.is_paper_note());
    }

    #[test]
    fn is_imported_synthetic_rejects_anchor_with_quote_or_range() {
        let with_quote = Anchor {
            sentence_id: Some(imported_sentence_id("k", "n")),
            quote: Some("hi".into()),
            ..Anchor::default()
        };
        assert!(!with_quote.is_imported_synthetic());

        let with_range = Anchor {
            sentence_id: Some(imported_sentence_id("k", "n")),
            char_range: Some((0, 2)),
            ..Anchor::default()
        };
        assert!(!with_range.is_imported_synthetic());
    }

    #[test]
    fn normalize_collapses_whitespace_and_lowercases() {
        assert_eq!(normalize_sentence("  Hello   WORLD\n"), "hello world");
    }

    #[test]
    fn normalize_of_blank_input_is_empty() {
        // Nothing but whitespace must collapse to the empty string, not to
        // a lone space.
        assert_eq!(normalize_sentence(""), "");
        assert_eq!(normalize_sentence(" \t\n "), "");
    }

    #[test]
    fn normalize_folds_ligatures_via_nfkc() {
        // U+FB01 (the single-glyph "fi" ligature) becomes the two letters
        // "fi" under NFKC, so "ef" + U+FB01 + "cient" → "efficient".
        assert_eq!(normalize_sentence("ef\u{FB01}cient"), "efficient");
    }

    #[test]
    fn sentence_id_is_stable_under_whitespace_and_case() {
        let a = sentence_id("Hello   World");
        let b = sentence_id("hello world");
        let c = sentence_id("HELLO\tWORLD");
        assert_eq!(a, b);
        assert_eq!(b, c);
        // Length of SHA1 hex.
        assert_eq!(a.len(), 40);
    }

    #[test]
    fn sentence_id_changes_when_content_does() {
        assert_ne!(sentence_id("hello world"), sentence_id("hello mars"));
    }
}