Skip to main content

scitadel_db/sqlite/
paper_aliases.rs

1//! Paper citation aliases (#134 — bib import iter 2).
2//!
3//! Stores alternative citekeys a paper is known by (typically the
4//! citekey from an imported `.bib` file). See migration 011 for the
5//! schema rationale. Authoritative key stays on `papers.bibtex_key`;
6//! aliases are additive and never rename the paper.
7
8use rusqlite::{OptionalExtension, params};
9
10use crate::error::DbError;
11use crate::sqlite::Database;
12
/// Value stored in the free-form `source` column for aliases that
/// come from an imported `.bib` file.
pub const SOURCE_BIBTEX_IMPORT: &str = "bibtex-import";

/// Value stored in the `source` column for aliases written by
/// `scitadel bib rekey`. The citation key a paper had before the rekey
/// is kept as an alias, so manuscripts still citing the old key keep
/// resolving through alias lookup.
pub const SOURCE_REKEY: &str = "rekey";
20
21#[derive(Clone)]
22pub struct SqlitePaperAliasRepository {
23    db: Database,
24}
25
26impl SqlitePaperAliasRepository {
27    pub fn new(db: Database) -> Self {
28        Self { db }
29    }
30
31    /// Record an alias. Idempotent on `(paper_id, alias)` — re-running
32    /// an import that already added the alias is a no-op. Returns
33    /// `true` if a new row was inserted, `false` if it already existed.
34    pub fn record(&self, paper_id: &str, alias: &str, source: &str) -> Result<bool, DbError> {
35        let conn = self.db.conn()?;
36        let rows = conn.execute(
37            "INSERT OR IGNORE INTO paper_aliases (paper_id, alias, source, created_at)
38             VALUES (?1, ?2, ?3, ?4)",
39            params![paper_id, alias, source, chrono::Utc::now().to_rfc3339()],
40        )?;
41        Ok(rows > 0)
42    }
43
44    /// Transactional sibling of [`Self::record`] (#157). Used by the
45    /// bib-import orchestrator so paper-save + alias-record + annotation-
46    /// create commit (or roll back) as a single unit per row.
47    pub fn record_in_tx(
48        tx: &rusqlite::Transaction<'_>,
49        paper_id: &str,
50        alias: &str,
51        source: &str,
52    ) -> Result<bool, DbError> {
53        let rows = tx.execute(
54            "INSERT OR IGNORE INTO paper_aliases (paper_id, alias, source, created_at)
55             VALUES (?1, ?2, ?3, ?4)",
56            params![paper_id, alias, source, chrono::Utc::now().to_rfc3339()],
57        )?;
58        Ok(rows > 0)
59    }
60
61    /// Look up a paper by alias. When two papers share an alias (legal —
62    /// two different imports may collide on citekey) this returns the
63    /// earliest-created row so the lookup is deterministic. Secondary
64    /// `paper_id ASC` tiebreak covers same-microsecond inserts during
65    /// batch imports where `created_at` ties are realistic.
66    ///
67    /// For the match pipeline, prefer [`Self::lookup_all`] so you can
68    /// explicitly detect ambiguity and fall through to other match
69    /// strategies rather than silently picking the first row.
70    pub fn lookup(&self, alias: &str) -> Result<Option<String>, DbError> {
71        let conn = self.db.conn()?;
72        let paper_id: Option<String> = conn
73            .query_row(
74                "SELECT paper_id FROM paper_aliases
75                 WHERE alias = ?1
76                 ORDER BY created_at ASC, paper_id ASC
77                 LIMIT 1",
78                params![alias],
79                |r| r.get(0),
80            )
81            .optional()?;
82        Ok(paper_id)
83    }
84
85    /// All paper IDs sharing an alias, in deterministic order. The
86    /// match pipeline uses this to detect ambiguity (`len() > 1` →
87    /// skip the alias strategy, fall through to DOI/arxiv/etc).
88    pub fn lookup_all(&self, alias: &str) -> Result<Vec<String>, DbError> {
89        let conn = self.db.conn()?;
90        let mut stmt = conn.prepare(
91            "SELECT paper_id FROM paper_aliases
92             WHERE alias = ?1
93             ORDER BY created_at ASC, paper_id ASC",
94        )?;
95        let rows = stmt.query_map(params![alias], |r| r.get::<_, String>(0))?;
96        Ok(rows.filter_map(Result::ok).collect())
97    }
98
99    /// All aliases for a paper, in insertion order.
100    pub fn list_for(&self, paper_id: &str) -> Result<Vec<(String, String)>, DbError> {
101        let conn = self.db.conn()?;
102        let mut stmt = conn.prepare(
103            "SELECT alias, source FROM paper_aliases
104             WHERE paper_id = ?1
105             ORDER BY created_at ASC",
106        )?;
107        let rows = stmt.query_map(params![paper_id], |r| {
108            Ok((r.get::<_, String>(0)?, r.get::<_, String>(1)?))
109        })?;
110        Ok(rows.filter_map(Result::ok).collect())
111    }
112}
113
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a repository over a migrated in-memory database that
    /// already contains two placeholder papers, `p1` and `p2`, for
    /// alias rows to reference.
    fn fresh() -> SqlitePaperAliasRepository {
        let db = Database::open_in_memory().unwrap();
        db.migrate().unwrap();
        {
            // Scoped so the connection is released before `db` moves
            // into the repository.
            let conn = db.conn().unwrap();
            for paper_id in ["p1", "p2"] {
                conn.execute(
                    "INSERT INTO papers (id, title, created_at, updated_at)
                 VALUES (?1, 'T', datetime('now'), datetime('now'))",
                    params![paper_id],
                )
                .unwrap();
            }
        }
        SqlitePaperAliasRepository::new(db)
    }

    /// A recorded alias resolves to its paper; an unknown alias
    /// resolves to `None`.
    #[test]
    fn record_and_lookup_round_trip() {
        let repo = fresh();
        let inserted = repo
            .record("p1", "smith2024old", SOURCE_BIBTEX_IMPORT)
            .unwrap();
        assert!(inserted);

        let hit = repo.lookup("smith2024old").unwrap();
        assert_eq!(hit.as_deref(), Some("p1"));

        let miss = repo.lookup("nonexistent").unwrap();
        assert_eq!(miss, None);
    }

    /// Recording the same `(paper_id, alias)` twice inserts one row and
    /// reports `false` the second time.
    #[test]
    fn record_is_idempotent_on_paper_id_alias() {
        let repo = fresh();
        let first = repo.record("p1", "k", SOURCE_BIBTEX_IMPORT).unwrap();
        assert!(first);

        let second = repo.record("p1", "k", SOURCE_BIBTEX_IMPORT).unwrap();
        assert!(!second, "second insert returns false");

        let stored = repo.list_for("p1").unwrap();
        assert_eq!(stored.len(), 1);
    }

    /// When two papers share an alias, the row recorded first wins
    /// `lookup` and comes first in `lookup_all`.
    #[test]
    fn alias_collision_earliest_created_wins() {
        let repo = fresh();
        repo.record("p2", "shared2024", SOURCE_BIBTEX_IMPORT)
            .unwrap();
        // Separate the two `created_at` stamps so ordering is by time,
        // not by the paper_id tiebreak.
        std::thread::sleep(std::time::Duration::from_millis(5));
        repo.record("p1", "shared2024", SOURCE_BIBTEX_IMPORT)
            .unwrap();

        let winner = repo.lookup("shared2024").unwrap();
        assert_eq!(winner.as_deref(), Some("p2"), "earliest-created row wins");

        let all = repo.lookup_all("shared2024").unwrap();
        assert_eq!(all, ["p2", "p1"]);
    }

    /// `paper_id ASC` is the secondary tiebreak when `created_at` ties.
    /// Both rows are inserted with an identical timestamp via direct
    /// SQL so the tie is forced rather than left to timing.
    #[test]
    fn alias_collision_same_timestamp_tiebreaks_on_paper_id_asc() {
        let repo = fresh();
        let same_ts = "2026-04-24T12:00:00Z";
        {
            let conn = repo.db.conn().unwrap();
            conn.execute(
                "INSERT INTO paper_aliases (paper_id, alias, source, created_at)
             VALUES ('p2', 'shared2024', 'bibtex-import', ?1),
                    ('p1', 'shared2024', 'bibtex-import', ?1)",
                params![same_ts],
            )
            .unwrap();
        }

        let winner = repo.lookup("shared2024").unwrap();
        assert_eq!(
            winner.as_deref(),
            Some("p1"),
            "paper_id ASC breaks the tie when timestamps are identical"
        );

        let all = repo.lookup_all("shared2024").unwrap();
        assert_eq!(all, ["p1", "p2"]);
    }

    /// Deleting a paper leaves none of its aliases resolvable.
    #[test]
    fn cascade_delete_removes_orphan_aliases() {
        let repo = fresh();
        for alias in ["k1", "k2"] {
            repo.record("p1", alias, SOURCE_BIBTEX_IMPORT).unwrap();
        }

        let conn = repo.db.conn().unwrap();
        conn.execute("DELETE FROM papers WHERE id = 'p1'", [])
            .unwrap();
        drop(conn);

        assert_eq!(repo.lookup("k1").unwrap(), None);
        assert_eq!(repo.lookup("k2").unwrap(), None);
    }

    /// `list_for` yields every `(alias, source)` pair for the paper,
    /// oldest first.
    #[test]
    fn list_for_returns_all_aliases_in_order() {
        let repo = fresh();
        repo.record("p1", "first", SOURCE_BIBTEX_IMPORT).unwrap();
        // Keep the two `created_at` stamps distinct.
        std::thread::sleep(std::time::Duration::from_millis(10));
        repo.record("p1", "second", "manual").unwrap();

        let aliases = repo.list_for("p1").unwrap();
        let expected = vec![
            ("first".to_string(), SOURCE_BIBTEX_IMPORT.to_string()),
            ("second".to_string(), "manual".to_string()),
        ];
        assert_eq!(aliases, expected);
    }
}