Skip to main content

scitadel_db/sqlite/
papers.rs

1use std::collections::HashMap;
2
3use rusqlite::params;
4use scitadel_core::error::CoreError;
5use scitadel_core::models::{DownloadStatus, Paper, PaperId};
6use scitadel_core::ports::PaperRepository;
7
8use super::Database;
9use crate::error::DbError;
10
11const UPSERT_SQL: &str = "\
12    INSERT INTO papers
13        (id, title, authors, abstract, full_text, summary, doi, arxiv_id,
14         pubmed_id, inspire_id, openalex_id, year, journal, url,
15         source_urls, created_at, updated_at)
16    VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16, ?17)
17    ON CONFLICT(id) DO UPDATE SET
18        title      = excluded.title,
19        authors    = excluded.authors,
20        abstract   = CASE WHEN excluded.abstract != '' THEN excluded.abstract
21                          ELSE papers.abstract END,
22        full_text  = COALESCE(excluded.full_text, papers.full_text),
23        summary    = COALESCE(excluded.summary, papers.summary),
24        doi        = COALESCE(excluded.doi, papers.doi),
25        arxiv_id   = COALESCE(excluded.arxiv_id, papers.arxiv_id),
26        pubmed_id  = COALESCE(excluded.pubmed_id, papers.pubmed_id),
27        inspire_id = COALESCE(excluded.inspire_id, papers.inspire_id),
28        openalex_id= COALESCE(excluded.openalex_id, papers.openalex_id),
29        year       = COALESCE(excluded.year, papers.year),
30        journal    = COALESCE(excluded.journal, papers.journal),
31        url        = COALESCE(excluded.url, papers.url),
32        source_urls= excluded.source_urls,
33        updated_at = excluded.updated_at";
34
35pub struct SqlitePaperRepository {
36    db: Database,
37}
38
39impl SqlitePaperRepository {
40    pub fn new(db: Database) -> Self {
41        Self { db }
42    }
43
44    /// Borrow the underlying `Database` so callers (e.g. the bib-import
45    /// orchestrator in #157) can drive a multi-repo transaction over a
46    /// single pooled connection. Read-only access — repos remain the
47    /// canonical write surface.
48    pub fn db(&self) -> &Database {
49        &self.db
50    }
51
52    /// If a paper with the same DOI already exists, return a clone with the existing ID
53    /// so the upsert merges into the existing row instead of violating the DOI unique index.
54    fn resolve_doi_conflict(
55        conn: &rusqlite::Connection,
56        paper: &Paper,
57    ) -> Result<Paper, CoreError> {
58        if let Some(doi) = &paper.doi {
59            let existing_id: Option<String> = conn
60                .query_row(
61                    "SELECT id FROM papers WHERE doi = ?1 AND id != ?2",
62                    params![doi, paper.id.as_str()],
63                    |row| row.get(0),
64                )
65                .optional()
66                .map_err(DbError::Sqlite)?;
67            if let Some(id) = existing_id {
68                let mut merged = paper.clone();
69                merged.id = PaperId::from(id);
70                return Ok(merged);
71            }
72        }
73        Ok(paper.clone())
74    }
75
76    fn resolve_doi_conflict_tx(
77        tx: &rusqlite::Transaction<'_>,
78        paper: &Paper,
79    ) -> Result<Paper, CoreError> {
80        if let Some(doi) = &paper.doi {
81            let existing_id: Option<String> = tx
82                .query_row(
83                    "SELECT id FROM papers WHERE doi = ?1 AND id != ?2",
84                    params![doi, paper.id.as_str()],
85                    |row| row.get(0),
86                )
87                .optional()
88                .map_err(DbError::Sqlite)?;
89            if let Some(id) = existing_id {
90                let mut merged = paper.clone();
91                merged.id = PaperId::from(id);
92                return Ok(merged);
93            }
94        }
95        Ok(paper.clone())
96    }
97
98    fn paper_params(paper: &Paper) -> [Box<dyn rusqlite::types::ToSql>; 17] {
99        [
100            Box::new(paper.id.as_str().to_string()),
101            Box::new(paper.title.clone()),
102            Box::new(serde_json::to_string(&paper.authors).unwrap_or_default()),
103            Box::new(paper.r#abstract.clone()),
104            Box::new(paper.full_text.clone()),
105            Box::new(paper.summary.clone()),
106            Box::new(paper.doi.clone()),
107            Box::new(paper.arxiv_id.clone()),
108            Box::new(paper.pubmed_id.clone()),
109            Box::new(paper.inspire_id.clone()),
110            Box::new(paper.openalex_id.clone()),
111            Box::new(paper.year),
112            Box::new(paper.journal.clone()),
113            Box::new(paper.url.clone()),
114            Box::new(serde_json::to_string(&paper.source_urls).unwrap_or_default()),
115            Box::new(paper.created_at.to_rfc3339()),
116            Box::new(paper.updated_at.to_rfc3339()),
117        ]
118    }
119
120    /// Snapshot of every `bibtex_key` currently assigned in the DB —
121    /// used as the `taken` set when assigning keys to newly-imported
122    /// papers via `scitadel_core::bibtex_key::assign_keys`. Mirrors
123    /// the seed in `Database::backfill_bibtex_keys`.
124    pub fn taken_bibtex_keys(&self) -> Result<std::collections::HashSet<String>, CoreError> {
125        let conn = self.db.conn()?;
126        Self::taken_bibtex_keys_inner(&conn)
127    }
128
129    /// Transactional sibling of [`Self::taken_bibtex_keys`] (#157). Runs
130    /// the snapshot inside an in-flight transaction so the bib-import
131    /// orchestrator's per-row tx sees its own pending writes.
132    pub fn taken_bibtex_keys_in_tx(
133        tx: &rusqlite::Transaction<'_>,
134    ) -> Result<std::collections::HashSet<String>, CoreError> {
135        Self::taken_bibtex_keys_inner(tx)
136    }
137
138    fn taken_bibtex_keys_inner(
139        conn: &rusqlite::Connection,
140    ) -> Result<std::collections::HashSet<String>, CoreError> {
141        let keys = conn
142            .prepare("SELECT bibtex_key FROM papers WHERE bibtex_key IS NOT NULL")
143            .map_err(DbError::Sqlite)?
144            .query_map([], |row| row.get::<_, String>(0))
145            .map_err(DbError::Sqlite)?
146            .filter_map(Result::ok)
147            .collect();
148        Ok(keys)
149    }
150
151    /// Transactional sibling of [`PaperRepository::save`] (#157). Same
152    /// upsert + DOI-conflict-retry logic, but runs against the caller's
153    /// transaction so a multi-step import operation can roll back as a
154    /// unit on partial failure.
155    pub fn save_in_tx(tx: &rusqlite::Transaction<'_>, paper: &Paper) -> Result<(), CoreError> {
156        let paper = Self::resolve_doi_conflict_tx(tx, paper)?;
157        let p = Self::paper_params(&paper);
158        let params: Vec<&dyn rusqlite::types::ToSql> = p.iter().map(|b| b.as_ref()).collect();
159        match tx.execute(UPSERT_SQL, params.as_slice()) {
160            Ok(_) => Ok(()),
161            Err(rusqlite::Error::SqliteFailure(err, _))
162                if err.code == rusqlite::ErrorCode::ConstraintViolation =>
163            {
164                if let Some(doi) = &paper.doi {
165                    let existing_id: Option<String> = tx
166                        .query_row(
167                            "SELECT id FROM papers WHERE doi = ?1",
168                            params![doi],
169                            |row| row.get(0),
170                        )
171                        .optional()
172                        .map_err(DbError::Sqlite)?;
173                    if let Some(eid) = existing_id {
174                        let mut retry = paper.clone();
175                        retry.id = PaperId::from(eid);
176                        let p2 = Self::paper_params(&retry);
177                        let params2: Vec<&dyn rusqlite::types::ToSql> =
178                            p2.iter().map(|b| b.as_ref()).collect();
179                        tx.execute(UPSERT_SQL, params2.as_slice())
180                            .map_err(DbError::Sqlite)?;
181                    }
182                }
183                Ok(())
184            }
185            Err(e) => Err(DbError::Sqlite(e).into()),
186        }
187    }
188
189    /// Transactional sibling of
190    /// [`PaperRepository::update_bibtex_key`] (#157).
191    pub fn update_bibtex_key_in_tx(
192        tx: &rusqlite::Transaction<'_>,
193        paper_id: &str,
194        key: &str,
195    ) -> Result<(), CoreError> {
196        tx.execute(
197            "UPDATE papers SET bibtex_key = ?1 WHERE id = ?2",
198            params![key, paper_id],
199        )
200        .map_err(DbError::Sqlite)?;
201        Ok(())
202    }
203
204    /// Lookup-by-id helpers used by the bib-import matcher (#134).
205    /// Inherent (not on `PaperRepository`) so the port surface stays
206    /// minimal — these are only consumed by the import wiring layer.
207    pub fn find_id_by_arxiv_id(&self, arxiv_id: &str) -> Result<Option<String>, CoreError> {
208        let conn = self.db.conn()?;
209        let id: Option<String> = conn
210            .query_row(
211                "SELECT id FROM papers WHERE arxiv_id = ?1",
212                params![arxiv_id],
213                |r| r.get(0),
214            )
215            .optional()
216            .map_err(DbError::Sqlite)?;
217        Ok(id)
218    }
219
220    pub fn find_id_by_pubmed_id(&self, pubmed_id: &str) -> Result<Option<String>, CoreError> {
221        let conn = self.db.conn()?;
222        let id: Option<String> = conn
223            .query_row(
224                "SELECT id FROM papers WHERE pubmed_id = ?1",
225                params![pubmed_id],
226                |r| r.get(0),
227            )
228            .optional()
229            .map_err(DbError::Sqlite)?;
230        Ok(id)
231    }
232
233    pub fn find_id_by_openalex_id(&self, openalex_id: &str) -> Result<Option<String>, CoreError> {
234        let conn = self.db.conn()?;
235        let id: Option<String> = conn
236            .query_row(
237                "SELECT id FROM papers WHERE openalex_id = ?1",
238                params![openalex_id],
239                |r| r.get(0),
240            )
241            .optional()
242            .map_err(DbError::Sqlite)?;
243        Ok(id)
244    }
245
246    pub fn find_id_by_bibtex_key(&self, key: &str) -> Result<Option<String>, CoreError> {
247        let conn = self.db.conn()?;
248        let id: Option<String> = conn
249            .query_row(
250                "SELECT id FROM papers WHERE bibtex_key = ?1",
251                params![key],
252                |r| r.get(0),
253            )
254            .optional()
255            .map_err(DbError::Sqlite)?;
256        Ok(id)
257    }
258
259    /// Case-insensitive title match, optionally constrained by year.
260    /// Year `None` means "any year". Title comparison uses
261    /// `unicode_lower()` (a Rust-side `to_lowercase` registered as a
262    /// SQL scalar function in `Database::open` — see #159) so case
263    /// folds work for non-ASCII characters like `Ü/ü`. Whitespace
264    /// normalization is the caller's responsibility (the matcher
265    /// passes title verbatim today — good enough for the issue's
266    /// stated "exact match only" intent).
267    pub fn find_id_by_title_and_year(
268        &self,
269        title: &str,
270        year: Option<i32>,
271    ) -> Result<Option<String>, CoreError> {
272        let conn = self.db.conn()?;
273        let lowered = title.to_lowercase();
274        let id: Option<String> = match year {
275            Some(y) => conn
276                .query_row(
277                    "SELECT id FROM papers WHERE unicode_lower(title) = ?1 AND year = ?2",
278                    params![lowered, y],
279                    |r| r.get(0),
280                )
281                .optional(),
282            None => conn
283                .query_row(
284                    "SELECT id FROM papers WHERE unicode_lower(title) = ?1",
285                    params![lowered],
286                    |r| r.get(0),
287                )
288                .optional(),
289        }
290        .map_err(DbError::Sqlite)?;
291        Ok(id)
292    }
293}
294
295fn row_to_paper(row: &rusqlite::Row) -> rusqlite::Result<Paper> {
296    let id: String = row.get("id")?;
297    let authors_json: String = row.get("authors")?;
298    let source_urls_json: String = row.get("source_urls")?;
299    let created_at: String = row.get("created_at")?;
300    let updated_at: String = row.get("updated_at")?;
301
302    let local_path: Option<String> = row.get("local_path").ok();
303    let download_status_raw: Option<String> = row.get("download_status").ok();
304    let last_attempt_at_raw: Option<String> = row.get("last_attempt_at").ok();
305    let bibtex_key: Option<String> = row.get("bibtex_key").ok();
306
307    Ok(Paper {
308        id: PaperId::from(id),
309        title: row.get("title")?,
310        authors: serde_json::from_str(&authors_json).unwrap_or_default(),
311        r#abstract: row.get("abstract")?,
312        full_text: row.get("full_text")?,
313        summary: row.get("summary")?,
314        doi: row.get("doi")?,
315        arxiv_id: row.get("arxiv_id")?,
316        pubmed_id: row.get("pubmed_id")?,
317        inspire_id: row.get("inspire_id")?,
318        openalex_id: row.get("openalex_id")?,
319        year: row.get("year")?,
320        journal: row.get("journal")?,
321        url: row.get("url")?,
322        source_urls: serde_json::from_str(&source_urls_json).unwrap_or_default(),
323        created_at: super::parse_rfc3339_or_now(&created_at),
324        updated_at: super::parse_rfc3339_or_now(&updated_at),
325        local_path,
326        download_status: download_status_raw
327            .as_deref()
328            .and_then(DownloadStatus::parse),
329        last_attempt_at: last_attempt_at_raw
330            .as_deref()
331            .and_then(|s| chrono::DateTime::parse_from_rfc3339(s).ok())
332            .map(|dt| dt.with_timezone(&chrono::Utc)),
333        bibtex_key,
334    })
335}
336
337impl PaperRepository for SqlitePaperRepository {
338    fn save(&self, paper: &Paper) -> Result<(), CoreError> {
339        let conn = self.db.conn()?;
340        let paper = Self::resolve_doi_conflict(&conn, paper)?;
341        let p = Self::paper_params(&paper);
342        let params: Vec<&dyn rusqlite::types::ToSql> = p.iter().map(|b| b.as_ref()).collect();
343        match conn.execute(UPSERT_SQL, params.as_slice()) {
344            Ok(_) => Ok(()),
345            Err(rusqlite::Error::SqliteFailure(err, _))
346                if err.code == rusqlite::ErrorCode::ConstraintViolation =>
347            {
348                // DOI collision — retry with existing paper's ID
349                if let Some(doi) = &paper.doi {
350                    let existing_id: Option<String> = conn
351                        .query_row(
352                            "SELECT id FROM papers WHERE doi = ?1",
353                            params![doi],
354                            |row| row.get(0),
355                        )
356                        .optional()
357                        .map_err(DbError::Sqlite)?;
358                    if let Some(eid) = existing_id {
359                        let mut retry = paper.clone();
360                        retry.id = PaperId::from(eid);
361                        let p2 = Self::paper_params(&retry);
362                        let params2: Vec<&dyn rusqlite::types::ToSql> =
363                            p2.iter().map(|b| b.as_ref()).collect();
364                        conn.execute(UPSERT_SQL, params2.as_slice())
365                            .map_err(DbError::Sqlite)?;
366                    }
367                }
368                Ok(())
369            }
370            Err(e) => Err(DbError::Sqlite(e).into()),
371        }
372    }
373
374    fn save_many(&self, papers: &[Paper]) -> Result<HashMap<PaperId, PaperId>, CoreError> {
375        let mut conn = self.db.conn()?;
376        let mut id_remap = HashMap::new();
377        let tx = conn.transaction().map_err(DbError::Sqlite)?;
378        for paper in papers {
379            let resolved = Self::resolve_doi_conflict_tx(&tx, paper)?;
380            if resolved.id != paper.id {
381                id_remap.insert(paper.id.clone(), resolved.id.clone());
382            }
383            let p = Self::paper_params(&resolved);
384            let params: Vec<&dyn rusqlite::types::ToSql> = p.iter().map(|b| b.as_ref()).collect();
385            match tx.execute(UPSERT_SQL, params.as_slice()) {
386                Ok(_) => {}
387                Err(rusqlite::Error::SqliteFailure(err, _))
388                    if err.code == rusqlite::ErrorCode::ConstraintViolation =>
389                {
390                    // DOI unique-index collision that resolve_doi_conflict missed
391                    // (e.g. case variation, concurrent insert, or within-batch dup).
392                    // Look up the existing paper by DOI and retry as an update.
393                    if let Some(doi) = &resolved.doi {
394                        let existing_id: Option<String> = tx
395                            .query_row(
396                                "SELECT id FROM papers WHERE doi = ?1",
397                                params![doi],
398                                |row| row.get(0),
399                            )
400                            .optional()
401                            .map_err(DbError::Sqlite)?;
402                        if let Some(eid) = existing_id {
403                            id_remap.insert(paper.id.clone(), PaperId::from(eid.clone()));
404                            let mut retry = resolved.clone();
405                            retry.id = PaperId::from(eid);
406                            let p2 = Self::paper_params(&retry);
407                            let params2: Vec<&dyn rusqlite::types::ToSql> =
408                                p2.iter().map(|b| b.as_ref()).collect();
409                            tx.execute(UPSERT_SQL, params2.as_slice())
410                                .map_err(DbError::Sqlite)?;
411                        }
412                        // If no DOI match found either, skip silently — paper
413                        // may have been blocked by another unique constraint.
414                    }
415                }
416                Err(e) => return Err(DbError::Sqlite(e).into()),
417            }
418        }
419        tx.commit().map_err(DbError::Sqlite)?;
420        Ok(id_remap)
421    }
422
423    fn get(&self, paper_id: &str) -> Result<Option<Paper>, CoreError> {
424        let conn = self.db.conn()?;
425        let mut stmt = conn
426            .prepare("SELECT * FROM papers WHERE id = ?1")
427            .map_err(DbError::Sqlite)?;
428        let result = stmt
429            .query_row(params![paper_id], row_to_paper)
430            .optional()
431            .map_err(DbError::Sqlite)?;
432        Ok(result)
433    }
434
435    fn find_by_doi(&self, doi: &str) -> Result<Option<Paper>, CoreError> {
436        let conn = self.db.conn()?;
437        let mut stmt = conn
438            .prepare("SELECT * FROM papers WHERE doi = ?1")
439            .map_err(DbError::Sqlite)?;
440        let result = stmt
441            .query_row(params![doi], row_to_paper)
442            .optional()
443            .map_err(DbError::Sqlite)?;
444        Ok(result)
445    }
446
447    fn find_by_title(&self, title: &str) -> Result<Option<Paper>, CoreError> {
448        let conn = self.db.conn()?;
449        let mut stmt = conn
450            .prepare("SELECT * FROM papers WHERE unicode_lower(title) = ?1")
451            .map_err(DbError::Sqlite)?;
452        let result = stmt
453            .query_row(params![title.to_lowercase()], row_to_paper)
454            .optional()
455            .map_err(DbError::Sqlite)?;
456        Ok(result)
457    }
458
459    fn list_all(&self, limit: i64, offset: i64) -> Result<Vec<Paper>, CoreError> {
460        let conn = self.db.conn()?;
461        let mut stmt = conn
462            .prepare("SELECT * FROM papers ORDER BY created_at DESC LIMIT ?1 OFFSET ?2")
463            .map_err(DbError::Sqlite)?;
464        let papers = stmt
465            .query_map(params![limit, offset], row_to_paper)
466            .map_err(DbError::Sqlite)?
467            .filter_map(Result::ok)
468            .collect();
469        Ok(papers)
470    }
471
472    fn update_full_text(&self, paper_id: &str, text: &str) -> Result<(), CoreError> {
473        let conn = self.db.conn()?;
474        conn.execute(
475            "UPDATE papers SET full_text = ?1, updated_at = ?2 WHERE id = ?3",
476            params![text, chrono::Utc::now().to_rfc3339(), paper_id],
477        )
478        .map_err(DbError::Sqlite)?;
479        Ok(())
480    }
481
482    fn update_download_state(
483        &self,
484        paper_id: &str,
485        local_path: Option<&str>,
486        status: DownloadStatus,
487    ) -> Result<(), CoreError> {
488        let conn = self.db.conn()?;
489        let now = chrono::Utc::now().to_rfc3339();
490        conn.execute(
491            "UPDATE papers SET local_path = ?1, download_status = ?2, last_attempt_at = ?3, \
492             updated_at = ?3 WHERE id = ?4",
493            params![local_path, status.as_str(), now, paper_id],
494        )
495        .map_err(DbError::Sqlite)?;
496        Ok(())
497    }
498
499    fn update_bibtex_key(&self, paper_id: &str, key: &str) -> Result<(), CoreError> {
500        let conn = self.db.conn()?;
501        conn.execute(
502            "UPDATE papers SET bibtex_key = ?1 WHERE id = ?2",
503            params![key, paper_id],
504        )
505        .map_err(DbError::Sqlite)?;
506        Ok(())
507    }
508}
509
510// Need this import for .optional()
511use rusqlite::OptionalExtension;
512
513#[cfg(test)]
514mod tests {
515    use super::*;
516    use crate::sqlite::Database;
517
518    fn setup() -> (Database, SqlitePaperRepository) {
519        let db = Database::open_in_memory().unwrap();
520        db.migrate().unwrap();
521        let repo = SqlitePaperRepository::new(db.clone());
522        (db, repo)
523    }
524
525    #[test]
526    fn test_save_and_get() {
527        let (_, repo) = setup();
528        let paper = Paper::new("Test Paper");
529        repo.save(&paper).unwrap();
530
531        let loaded = repo.get(paper.id.as_str()).unwrap().unwrap();
532        assert_eq!(loaded.title, "Test Paper");
533    }
534
535    #[test]
536    fn test_find_by_doi() {
537        let (_, repo) = setup();
538        let mut paper = Paper::new("DOI Paper");
539        paper.doi = Some("10.1234/test".to_string());
540        repo.save(&paper).unwrap();
541
542        let found = repo.find_by_doi("10.1234/test").unwrap().unwrap();
543        assert_eq!(found.id, paper.id);
544    }
545
546    #[test]
547    fn test_upsert_merges() {
548        let (_, repo) = setup();
549        let mut paper = Paper::new("Merge Test");
550        paper.doi = Some("10.1234/merge".to_string());
551        repo.save(&paper).unwrap();
552
553        let mut updated = paper.clone();
554        updated.arxiv_id = Some("2301.00001".to_string());
555        repo.save(&updated).unwrap();
556
557        let loaded = repo.get(paper.id.as_str()).unwrap().unwrap();
558        assert_eq!(loaded.arxiv_id, Some("2301.00001".to_string()));
559    }
560
561    #[test]
562    fn test_doi_conflict_across_papers() {
563        let (_, repo) = setup();
564
565        // First paper with a DOI
566        let mut paper1 = Paper::new("Original Paper");
567        paper1.doi = Some("10.1234/conflict".to_string());
568        repo.save(&paper1).unwrap();
569
570        // Second paper with same DOI but different ID (simulates a second search)
571        let mut paper2 = Paper::new("Updated Title");
572        paper2.doi = Some("10.1234/conflict".to_string());
573        paper2.arxiv_id = Some("2301.99999".to_string());
574        repo.save(&paper2).unwrap();
575
576        // Should have merged into the original, not created a second row
577        let loaded = repo.find_by_doi("10.1234/conflict").unwrap().unwrap();
578        assert_eq!(loaded.id, paper1.id, "should reuse original paper ID");
579        assert_eq!(loaded.title, "Updated Title", "should update title");
580        assert_eq!(
581            loaded.arxiv_id,
582            Some("2301.99999".to_string()),
583            "should merge arxiv_id"
584        );
585    }
586
587    #[test]
588    fn test_doi_conflict_in_save_many() {
589        let (_, repo) = setup();
590
591        let mut existing = Paper::new("Existing Paper");
592        existing.doi = Some("10.1234/batch".to_string());
593        repo.save(&existing).unwrap();
594
595        // Batch save with a colliding DOI
596        let mut new_paper = Paper::new("Batch Paper");
597        new_paper.doi = Some("10.1234/batch".to_string());
598        new_paper.pubmed_id = Some("12345".to_string());
599        repo.save_many(&[new_paper]).unwrap();
600
601        let loaded = repo.find_by_doi("10.1234/batch").unwrap().unwrap();
602        assert_eq!(loaded.id, existing.id);
603        assert_eq!(loaded.pubmed_id, Some("12345".to_string()));
604    }
605
606    #[test]
607    fn test_list_all() {
608        let (_, repo) = setup();
609        for i in 0..5 {
610            let paper = Paper::new(format!("Paper {i}"));
611            repo.save(&paper).unwrap();
612        }
613
614        let papers = repo.list_all(3, 0).unwrap();
615        assert_eq!(papers.len(), 3);
616    }
617
618    /// Integration test: simulate two searches with overlapping DOIs going
619    /// through dedup → save_many, the same flow as the MCP search tool.
620    #[test]
621    fn test_cross_search_dedup_save_roundtrip() {
622        use scitadel_core::models::CandidatePaper;
623        use scitadel_core::services::dedup::deduplicate;
624
625        let (_, repo) = setup();
626
627        // --- First search: returns 3 papers ---
628        let candidates_1 = vec![
629            CandidatePaper {
630                doi: Some("10.1000/alpha".into()),
631                ..CandidatePaper::new("openalex", "oa-1", "Alpha Paper")
632            },
633            CandidatePaper {
634                doi: Some("10.1000/beta".into()),
635                ..CandidatePaper::new("openalex", "oa-2", "Beta Paper")
636            },
637            CandidatePaper {
638                doi: Some("10.1000/gamma".into()),
639                ..CandidatePaper::new("pubmed", "pm-1", "Gamma Paper")
640            },
641        ];
642        let (papers_1, _results_1) = deduplicate(&candidates_1, 0.85);
643        assert_eq!(papers_1.len(), 3);
644        let remap_1 = repo.save_many(&papers_1).unwrap();
645        assert!(remap_1.is_empty(), "no conflicts on first save");
646
647        // --- Second search: 2 overlapping DOIs + 1 new ---
648        let candidates_2 = vec![
649            CandidatePaper {
650                doi: Some("10.1000/alpha".into()),
651                arxiv_id: Some("2301.00001".into()),
652                ..CandidatePaper::new("arxiv", "ax-1", "Alpha Paper (arxiv)")
653            },
654            CandidatePaper {
655                doi: Some("10.1000/gamma".into()),
656                pubmed_id: Some("99999".into()),
657                ..CandidatePaper::new("pubmed", "pm-2", "Gamma Paper Revised")
658            },
659            CandidatePaper {
660                doi: Some("10.1000/delta".into()),
661                ..CandidatePaper::new("openalex", "oa-3", "Delta Paper")
662            },
663        ];
664        let (papers_2, results_2) = deduplicate(&candidates_2, 0.85);
665        assert_eq!(
666            papers_2.len(),
667            3,
668            "dedup sees them as distinct (different IDs)"
669        );
670
671        let remap_2 = repo.save_many(&papers_2).unwrap();
672        assert_eq!(
673            remap_2.len(),
674            2,
675            "alpha and gamma should remap to existing IDs"
676        );
677
678        // Verify the remap points to the original paper IDs
679        let alpha_original = papers_1
680            .iter()
681            .find(|p| p.doi.as_deref() == Some("10.1000/alpha"))
682            .unwrap();
683        let alpha_new = papers_2
684            .iter()
685            .find(|p| p.doi.as_deref() == Some("10.1000/alpha"))
686            .unwrap();
687        assert_eq!(remap_2[&alpha_new.id], alpha_original.id);
688
689        // Verify DB state: should have 4 papers total, not 6
690        let all = repo.list_all(100, 0).unwrap();
691        assert_eq!(all.len(), 4, "3 from first search + 1 new from second");
692
693        // Verify metadata was merged
694        let alpha = repo.find_by_doi("10.1000/alpha").unwrap().unwrap();
695        assert_eq!(alpha.id, alpha_original.id, "kept original ID");
696        assert_eq!(
697            alpha.arxiv_id,
698            Some("2301.00001".into()),
699            "merged arxiv_id from second search"
700        );
701
702        // Verify search_results can be remapped correctly
703        for sr in &results_2 {
704            let resolved_id = remap_2.get(&sr.paper_id).unwrap_or(&sr.paper_id);
705            assert!(
706                repo.get(resolved_id.as_str()).unwrap().is_some(),
707                "remapped paper_id should exist in DB"
708            );
709        }
710    }
711
712    #[test]
713    fn download_state_round_trips() {
714        let (_, repo) = setup();
715        let paper = Paper::new("DL state");
716        repo.save(&paper).unwrap();
717
718        // Pristine row: no download attempted yet.
719        let initial = repo.get(paper.id.as_str()).unwrap().unwrap();
720        assert!(initial.local_path.is_none());
721        assert!(initial.download_status.is_none());
722        assert!(initial.last_attempt_at.is_none());
723
724        // Successful download.
725        repo.update_download_state(
726            paper.id.as_str(),
727            Some("/tmp/foo.pdf"),
728            DownloadStatus::Downloaded,
729        )
730        .unwrap();
731        let after = repo.get(paper.id.as_str()).unwrap().unwrap();
732        assert_eq!(after.local_path.as_deref(), Some("/tmp/foo.pdf"));
733        assert_eq!(after.download_status, Some(DownloadStatus::Downloaded));
734        assert!(after.last_attempt_at.is_some());
735
736        // Subsequent failure overwrites cleanly (path None, status Failed).
737        repo.update_download_state(paper.id.as_str(), None, DownloadStatus::Failed)
738            .unwrap();
739        let failed = repo.get(paper.id.as_str()).unwrap().unwrap();
740        assert!(failed.local_path.is_none());
741        assert_eq!(failed.download_status, Some(DownloadStatus::Failed));
742    }
743
744    /// Regression for #159: SQLite's built-in `LOWER()` is ASCII-only,
745    /// so a re-import of a `.bib` whose title differs only by case on
746    /// a non-ASCII letter (`Ü` vs `ü`, `Ñ` vs `ñ`, `Ï` vs `ï`) used to
747    /// miss the title+year fallback and create a duplicate paper. The
748    /// `unicode_lower` SQL function registered at connection init lifts
749    /// this — both query and stored title are folded with Rust's
750    /// Unicode-aware `to_lowercase`.
751    #[test]
752    fn find_id_by_title_and_year_unicode_case_insensitive() {
753        let (_, repo) = setup();
754        let mut paper = Paper::new("Über die naïve Quantenfeldtheorie");
755        paper.year = Some(1925);
756        repo.save(&paper).unwrap();
757
758        // Same title, different case on the non-ASCII letters: must
759        // resolve to the existing paper, not miss.
760        let found = repo
761            .find_id_by_title_and_year("ÜBER DIE NAÏVE QUANTENFELDTHEORIE", Some(1925))
762            .unwrap();
763        assert_eq!(found.as_deref(), Some(paper.id.as_str()));
764
765        // Lowercased query against title-cased stored row also matches.
766        let found_lower = repo
767            .find_id_by_title_and_year("über die naïve quantenfeldtheorie", Some(1925))
768            .unwrap();
769        assert_eq!(found_lower.as_deref(), Some(paper.id.as_str()));
770
771        // Year mismatch still misses.
772        let none = repo
773            .find_id_by_title_and_year("über die naïve quantenfeldtheorie", Some(1926))
774            .unwrap();
775        assert!(none.is_none());
776
777        // `find_by_title` (no year) follows the same unicode-aware path.
778        let by_title = repo
779            .find_by_title("ÜBER DIE NAÏVE QUANTENFELDTHEORIE")
780            .unwrap()
781            .unwrap();
782        assert_eq!(by_title.id, paper.id);
783    }
784}