Skip to main content

verso/library/
scan.rs

1use crate::{
2    library::{epub_guard, epub_meta, hashing, normalise, reanchor},
3    store::{
4        books::{resolve_identity, upsert, BookRow, IdentityMatch},
5        db::Db,
6    },
7};
8use std::path::Path;
9
/// Summary of a single [`scan_folder`] pass over a library directory.
///
/// Counters are best-effort: the current implementation counts every
/// successfully upserted book as `inserted` (see the v1 note in
/// `scan_folder`), so `updated`/`skipped` stay 0 for now.
#[derive(Debug, Default)]
pub struct ScanReport {
    // Books upserted during this scan (v1: includes re-imports of known books).
    pub inserted: usize,
    // Reserved for a future insert/update distinction; currently never incremented.
    pub updated: usize,
    // Reserved; currently never incremented.
    pub skipped: usize,
    // Per-file failures (validation or metadata extraction), keyed by path.
    // These files are still recorded in the DB as "broken" via `record_broken`.
    pub errors: Vec<(std::path::PathBuf, String)>,
}
17
18pub fn scan_folder(dir: &Path, db: &Db) -> anyhow::Result<ScanReport> {
19    let mut report = ScanReport::default();
20    let mut conn = db.conn()?;
21    for entry in walkdir(dir) {
22        let path = entry;
23        if path.extension().and_then(|s| s.to_str()) != Some("epub") {
24            continue;
25        }
26
27        if let Err(e) = epub_guard::validate_archive(&path, epub_guard::Limits::default()) {
28            let err_string = e.to_string();
29            record_broken(&mut conn, &path, &err_string);
30            report.errors.push((path.clone(), err_string));
31            continue;
32        }
33
34        let meta = match epub_meta::extract(&path) {
35            Ok(m) => m,
36            Err(e) => {
37                let err_string = e.to_string();
38                record_broken(&mut conn, &path, &err_string);
39                report.errors.push((path.clone(), err_string));
40                continue;
41            }
42        };
43
44        let file_hash = hashing::sha256_file(&path).ok();
45        let row = BookRow {
46            stable_id: meta.stable_id.clone(),
47            file_hash,
48            title_norm: normalise::normalise_text(&meta.title),
49            author_norm: meta.author.as_deref().map(normalise::normalise_author),
50            path: path.to_string_lossy().to_string(),
51            title: meta.title,
52            author: meta.author,
53            language: meta.language,
54            publisher: meta.publisher,
55            published_at: meta.published_at,
56            word_count: meta.word_count,
57            page_count: meta.word_count.map(|w| (w / 275).max(1)),
58            parse_error: None,
59        };
60        // Detect pre-upsert hash state so we can trigger a highlight reanchor if
61        // a re-imported edition's bytes have changed under an existing identity.
62        let pre_hash: Option<String> = match resolve_identity(&conn, &row)? {
63            Some(
64                IdentityMatch::ById(id) | IdentityMatch::ByHash(id) | IdentityMatch::ByNorm(id),
65            ) => conn
66                .query_row(
67                    "SELECT file_hash FROM books WHERE id = ?",
68                    rusqlite::params![id],
69                    |r| r.get::<_, Option<String>>(0),
70                )
71                .ok()
72                .flatten(),
73            None => None,
74        };
75
76        // For v1 we just count all as "inserted"; refine later.
77        let book_id = upsert(&mut conn, &row)?;
78
79        // If the row existed previously with a different hash, re-run anchor
80        // resolution so highlights don't silently drift/go lost.
81        if let (Some(pre), Some(post)) = (pre_hash, row.file_hash.as_ref()) {
82            if pre != *post {
83                let _ = reanchor::reanchor_book(db, book_id, &path);
84            }
85        }
86
87        report.inserted += 1;
88    }
89
90    // Soft-delete books whose on-disk file has vanished (only books under this scan dir).
91    let dir_prefix = dir.to_string_lossy().to_string();
92    let orphaned: Vec<(i64, String)> = conn
93        .prepare("SELECT id, path FROM books WHERE deleted_at IS NULL AND path LIKE ? || '%'")?
94        .query_map(rusqlite::params![dir_prefix], |r| {
95            Ok((r.get(0)?, r.get(1)?))
96        })?
97        .collect::<Result<_, _>>()?;
98
99    for (id, p) in orphaned {
100        if !std::path::Path::new(&p).exists() {
101            conn.execute(
102                "UPDATE books SET deleted_at = CURRENT_TIMESTAMP WHERE id = ?",
103                rusqlite::params![id],
104            )?;
105        }
106    }
107
108    Ok(report)
109}
110
111/// Upsert a minimal row marking this file as unparseable so it appears under
112/// the "broken" library filter. Best-effort: a DB error here must not tank the
113/// whole scan, so we discard the result.
114fn record_broken(conn: &mut rusqlite::Connection, path: &Path, err_string: &str) {
115    let title = path
116        .file_name()
117        .and_then(|s| s.to_str())
118        .unwrap_or("unknown.epub")
119        .to_string();
120    let row = BookRow {
121        stable_id: None,
122        file_hash: hashing::sha256_file(path).ok(),
123        title_norm: normalise::normalise_text(&title),
124        author_norm: None,
125        path: path.to_string_lossy().to_string(),
126        title,
127        author: None,
128        language: None,
129        publisher: None,
130        published_at: None,
131        word_count: None,
132        page_count: None,
133        parse_error: Some(err_string.to_string()),
134    };
135    let _ = upsert(conn, &row);
136}
137
/// Recursively collect every non-directory path under `dir`, depth-first.
///
/// Directories are expanded in place, so non-directory entries keep the
/// encounter order produced by `read_dir`. Unreadable directories are
/// silently skipped — the scan is best-effort by design.
fn walkdir(dir: &Path) -> Vec<std::path::PathBuf> {
    match std::fs::read_dir(dir) {
        Ok(entries) => entries
            .flatten()
            .flat_map(|entry| {
                let child = entry.path();
                if child.is_dir() {
                    walkdir(&child)
                } else {
                    vec![child]
                }
            })
            .collect(),
        Err(_) => Vec::new(),
    }
}