rustdoc_processor 0.1.0

Compute, cache, index, and query rustdoc JSON documentation
Documentation
//! SQLite-based caching for rustdoc JSON documentation.

pub(crate) mod checksum;
pub mod entry;
mod third_party;
mod toolchain;
pub mod utils;

pub use entry::{CacheEntry, SecondaryIndexes};
use serde::de::DeserializeOwned;
pub use utils::RkyvCowBytes;

use std::collections::BTreeSet;
use std::marker::PhantomData;

use anyhow::Context;
use guppy::PackageId;
use guppy::graph::{PackageGraph, PackageMetadata};
use itertools::Itertools;
use r2d2_sqlite::SqliteConnectionManager;
use rusqlite::params;

use crate::TOOLCHAIN_CRATES;
use crate::crate_data::CrateData;
use crate::indexing::{ExternalReExports, ImportIndex, ImportPath2Id};

use third_party::ThirdPartyCrateCache;
use toolchain::ToolchainCache;

/// The `bincode` configuration (`bincode::config::standard()`) used in this module to
/// encode and decode the per-project access log.
///
/// It is `pub(crate)`, so other modules of the crate can refer to the same configuration.
pub(crate) static BINCODE_CONFIG: bincode::config::Configuration = bincode::config::standard();

/// A cache for storing and retrieving pre-computed JSON documentation generated by `rustdoc`.
///
/// The cache is shared across all projects of the current user.
/// It is stored on disk, using a SQLite database.
///
/// The type parameter `A` represents the annotation type associated with cached entries.
#[derive(Debug)]
pub struct RustdocGlobalFsCache<A> {
    /// The output of `cargo_fingerprint` for the toolchain passed to `new`.
    /// It is handed to both sub-caches on every lookup and insertion.
    cargo_fingerprint: String,
    /// Handles entries keyed by `RustdocCacheKey::ThirdPartyCrate`.
    third_party_cache: ThirdPartyCrateCache,
    /// Handles entries keyed by `RustdocCacheKey::ToolchainCrate`.
    toolchain_cache: ToolchainCache,
    /// Pool of connections to the SQLite database file backing the cache.
    connection_pool: r2d2::Pool<SqliteConnectionManager>,
    /// Ties the cache to the annotation type `A` without storing a value of that type.
    _annotation: PhantomData<A>,
}

// `Clone` is written by hand: `#[derive(Clone)]` would require `A: Clone`,
// even though no value of type `A` is ever stored in the cache handle.
impl<A> Clone for RustdocGlobalFsCache<A> {
    fn clone(&self) -> Self {
        // Exhaustive destructuring: adding a field without updating this impl
        // becomes a compile error rather than a silently skipped field.
        let Self {
            cargo_fingerprint,
            third_party_cache,
            toolchain_cache,
            connection_pool,
            _annotation: _,
        } = self;

        Self {
            cargo_fingerprint: cargo_fingerprint.clone(),
            third_party_cache: third_party_cache.clone(),
            toolchain_cache: toolchain_cache.clone(),
            connection_pool: connection_pool.clone(),
            _annotation: PhantomData,
        }
    }
}

/// The key used to look up (or store) the documentation of a crate in the cache.
///
/// Use [`RustdocCacheKey::new`] to pick the right variant for a given package ID.
pub enum RustdocCacheKey<'a> {
    /// A package that is looked up through its `guppy` metadata.
    ///
    /// `RustdocCacheKey::new` chooses this variant for every package ID that is
    /// not listed in `TOOLCHAIN_CRATES`.
    ThirdPartyCrate(PackageMetadata<'a>),
    /// A crate identified by name alone.
    ///
    /// `RustdocCacheKey::new` chooses this variant when the package ID is
    /// listed in `TOOLCHAIN_CRATES`; the wrapped string is that package ID.
    ToolchainCrate(&'a str),
}

impl std::fmt::Debug for RustdocCacheKey<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            RustdocCacheKey::ThirdPartyCrate(metadata) => f
                .debug_struct("ThirdPartyCrate")
                .field("id", &metadata.id())
                .field("name", &metadata.name())
                .field("version", &metadata.version())
                .finish(),
            RustdocCacheKey::ToolchainCrate(name) => f
                .debug_struct("ToolchainCrate")
                .field("name", name)
                .finish(),
        }
    }
}

/// An entry retrieved from the on-disk cache.
///
/// `A` is the annotation type carried by the [`ProcessedCacheEntry`] variant payload.
pub enum HydratedCacheEntry<A> {
    /// Only the "raw" output returned by `rustdoc` was stored in the cache.
    ///
    /// This happens when the indexing phase emitted one or more diagnostics:
    /// the indexing step must then be repeated (and those errors reported again)
    /// every single time we attempt a compilation.
    Raw(CrateData),
    /// The cache holds both the raw `rustdoc` output and our secondary indexes.
    /// It's ready to be used as is!
    Processed(ProcessedCacheEntry<A>),
}

/// A fully processed cache entry with all secondary indexes.
///
/// `A` is the type of the annotation data stored alongside the crate.
pub struct ProcessedCacheEntry<A> {
    /// The ID of the package this entry belongs to.
    pub package_id: PackageId,
    /// The crate data; `into_crate` uses it as the `krate` field of `CrateCore`.
    pub crate_data: CrateData,
    /// Secondary index handed to `Crate::new` by `into_crate`.
    pub import_path2id: ImportPath2Id,
    /// Secondary index handed to `Crate::new` by `into_crate`.
    pub import_index: ImportIndex,
    /// Secondary index handed to `Crate::new` by `into_crate`.
    pub external_re_exports: ExternalReExports,
    /// The annotation data; `into_crate` returns it unchanged, next to the `Crate`.
    pub annotated_items: A,
}

impl<A> ProcessedCacheEntry<A> {
    /// Consume the entry, assembling a `Crate` out of its parts.
    ///
    /// Returns the assembled crate together with the annotation data, which is
    /// handed back untouched.
    pub fn into_crate(self) -> (crate::queries::Crate, A) {
        let Self {
            package_id,
            crate_data,
            import_path2id,
            import_index,
            external_re_exports,
            annotated_items,
        } = self;

        let core = crate::queries::CrateCore {
            package_id,
            krate: crate_data,
        };
        let krate =
            crate::queries::Crate::new(core, import_path2id, external_re_exports, import_index);
        (krate, annotated_items)
    }
}

impl<'a> RustdocCacheKey<'a> {
    /// Build the cache key for the package identified by `package_id`.
    ///
    /// If the package ID is listed in `TOOLCHAIN_CRATES`, the key is
    /// [`RustdocCacheKey::ToolchainCrate`] and `package_graph` is not consulted.
    /// Otherwise the package metadata is looked up in `package_graph` and wrapped
    /// in [`RustdocCacheKey::ThirdPartyCrate`].
    ///
    /// # Panics
    ///
    /// Panics if `package_id` is not a toolchain crate and `package_graph` has no
    /// metadata for it.
    pub fn new(package_id: &'a PackageId, package_graph: &'a PackageGraph) -> RustdocCacheKey<'a> {
        let repr = package_id.repr();
        if TOOLCHAIN_CRATES.contains(&repr) {
            return RustdocCacheKey::ToolchainCrate(repr);
        }

        // The signature cannot report a failure, so a missing package is treated as a
        // caller bug; the panic names the offending package to make it diagnosable.
        let metadata = package_graph.metadata(package_id).unwrap_or_else(|e| {
            panic!("Failed to find the metadata for `{repr}` in the package graph: {e:?}")
        });
        RustdocCacheKey::ThirdPartyCrate(metadata)
    }
}

impl<A: DeserializeOwned + Default> RustdocGlobalFsCache<A> {
    /// Create a handle to the on-disk cache, setting up the database if needed.
    ///
    /// - `cache_fingerprint` determines the database file name.
    ///   It should change whenever the caching logic changes.
    /// - `toolchain_name` is the toolchain whose `cargo` fingerprint is attached to
    ///   every lookup and insertion.
    /// - `cache_workspace_package_docs` and `package_graph` are forwarded to the
    ///   third-party crate cache.
    /// - `cache_dir` is the directory where the SQLite database file will be stored.
    ///
    /// The `cargo` fingerprint is computed on a scoped thread, concurrently with the
    /// database setup.
    #[tracing::instrument(name = "Initialize on-disk rustdoc cache", skip_all)]
    pub fn new(
        cache_fingerprint: &str,
        toolchain_name: &str,
        cache_workspace_package_docs: bool,
        package_graph: &PackageGraph,
        cache_dir: &std::path::Path,
    ) -> Result<Self, anyhow::Error> {
        std::thread::scope(|s| {
            // Started first, so that it overlaps with the database work below.
            let fingerprint_task = s.spawn(|| cargo_fingerprint(toolchain_name));

            let connection_pool = Self::setup_database(cache_fingerprint, cache_dir)?;
            let conn = connection_pool.get()?;
            let third_party_cache =
                ThirdPartyCrateCache::new(&conn, cache_workspace_package_docs, package_graph)?;
            let toolchain_cache = ToolchainCache::new(&conn)?;

            // `join` only fails if the spawned thread panicked; the inner `Result`
            // carries the ordinary failure of the fingerprint computation.
            let fingerprint = fingerprint_task
                .join()
                .expect("Failed to compute on `cargo`'s fingerprint")?;

            Ok(Self {
                cargo_fingerprint: fingerprint,
                third_party_cache,
                toolchain_cache,
                connection_pool,
                _annotation: PhantomData,
            })
        })
    }

    /// Look up the cached documentation for the package identified by `cache_key`,
    /// if available.
    ///
    /// The lookup is delegated to the third-party or the toolchain cache, depending
    /// on the kind of key; the `cargo` fingerprint of this cache is passed along.
    pub fn get(
        &self,
        cache_key: &RustdocCacheKey,
        package_graph: &PackageGraph,
    ) -> Result<Option<HydratedCacheEntry<A>>, anyhow::Error> {
        let conn = self.connection_pool.get()?;
        let fingerprint = &self.cargo_fingerprint;
        match cache_key {
            RustdocCacheKey::ToolchainCrate(name) => {
                self.toolchain_cache.get::<A>(name, fingerprint, &conn)
            }
            RustdocCacheKey::ThirdPartyCrate(metadata) => {
                self.third_party_cache
                    .get::<A>(metadata, fingerprint, &conn, package_graph)
            }
        }
    }
}

impl<A> RustdocGlobalFsCache<A> {
    /// Store `cache_entry` in the cache, under the slot identified by `cache_key`.
    ///
    /// For a third-party crate, the storage key is derived first; if no key can be
    /// derived (`cache_key` on the third-party cache returns `None`), nothing is
    /// stored and `Ok(())` is returned.
    pub fn insert(
        &self,
        cache_key: &RustdocCacheKey,
        cache_entry: CacheEntry,
        package_graph: &PackageGraph,
    ) -> Result<(), anyhow::Error> {
        let conn = self.connection_pool.get()?;
        let fingerprint = &self.cargo_fingerprint;
        match cache_key {
            RustdocCacheKey::ToolchainCrate(name) => {
                self.toolchain_cache
                    .insert(name, cache_entry, fingerprint, &conn)
            }
            RustdocCacheKey::ThirdPartyCrate(metadata) => {
                let maybe_key =
                    self.third_party_cache
                        .cache_key(metadata, fingerprint, package_graph);
                match maybe_key {
                    Some(key) => self.third_party_cache.insert(key, &conn, cache_entry),
                    // No key, no entry: skipping the insertion is not an error.
                    None => Ok(()),
                }
            }
        }
    }

    #[tracing::instrument(skip_all, level = "trace")]
    /// Persist the list of package IDs that were accessed during the processing of the
    /// application blueprint for this project.
    pub fn persist_access_log(
        &self,
        package_ids: &BTreeSet<PackageId>,
        project_fingerprint: &str,
    ) -> Result<(), anyhow::Error> {
        let connection = self.connection_pool.get()?;

        let mut stmt = connection.prepare_cached(
            "INSERT INTO project2package_id_access_log (
                project_fingerprint,
                package_ids
            ) VALUES (?, ?)
            ON CONFLICT(project_fingerprint) DO UPDATE SET package_ids=excluded.package_ids;
            ",
        )?;
        stmt.execute(params![
            project_fingerprint,
            bincode::encode_to_vec(
                package_ids.iter().map(|s| s.repr()).collect_vec(),
                BINCODE_CONFIG
            )?
        ])?;

        Ok(())
    }

    #[tracing::instrument(skip_all, level = "trace")]
    /// Retrieve the list of package IDs that were accessed during the last time we processed the
    /// application blueprint for this project.
    ///
    /// Returns an empty set if no access log is found for the given project fingerprint.
    ///
    /// # Errors
    ///
    /// Returns an error if a connection cannot be obtained from the pool, if the query
    /// fails, if the stored value cannot be read as bytes, or if those bytes cannot be
    /// decoded into a list of package IDs.
    pub fn get_access_log(
        &self,
        project_fingerprint: &str,
    ) -> Result<BTreeSet<PackageId>, anyhow::Error> {
        let connection = self.connection_pool.get()?;

        let mut stmt = connection.prepare_cached(
            "SELECT package_ids FROM project2package_id_access_log WHERE project_fingerprint = ?",
        )?;
        let mut rows = stmt.query(params![project_fingerprint])?;
        let Some(row) = rows.next()? else {
            return Ok(BTreeSet::new());
        };

        // This function already returns a `Result`: report a failed column access
        // through it rather than panicking.
        let stored = row.get_ref(0)?;
        // The decoded strings borrow from the row's bytes; they are turned into owned
        // `PackageId`s before the row goes out of scope.
        let (package_ids, _bytes_read): (Vec<&str>, usize) =
            bincode::borrow_decode_from_slice(stored.as_bytes()?, BINCODE_CONFIG)
                .context("Failed to decode the package access log stored in the rustdoc cache")?;
        Ok(package_ids.into_iter().map(PackageId::new).collect())
    }

    /// Open the SQLite database backing the cache and return a connection pool for it.
    ///
    /// `cache_dir` is created if it doesn't exist yet; the database file is named
    /// `<cache_fingerprint>.db` and lives inside it. The access log table is created
    /// if missing.
    fn setup_database(
        cache_fingerprint: &str,
        cache_dir: &std::path::Path,
    ) -> Result<r2d2::Pool<SqliteConnectionManager>, anyhow::Error> {
        /// Connection customizer registered on the pool to set connection-level pragmas.
        #[derive(Debug)]
        struct SqlitePragmas;

        impl r2d2::CustomizeConnection<rusqlite::Connection, rusqlite::Error> for SqlitePragmas {
            fn on_acquire(&self, conn: &mut rusqlite::Connection) -> Result<(), rusqlite::Error> {
                // 250MB memory-mapped, more than enough.
                conn.execute_batch("PRAGMA mmap_size=262144000;")
            }
        }

        fs_err::create_dir_all(cache_dir).with_context(|| {
            format!(
                "Failed to create the cache directory at {}",
                cache_dir.to_string_lossy()
            )
        })?;

        // One database file per cache fingerprint: a new fingerprint starts from an
        // empty database, so schema migrations are never needed.
        // The price: users must re-generate the documentation for all their crates
        // when they upgrade the tool using this cache.
        let db_file = cache_dir.join(format!("{cache_fingerprint}.db"));

        let pool = r2d2::Pool::builder()
            .max_size(num_cpus::get() as u32)
            .connection_customizer(Box::new(SqlitePragmas))
            .build(SqliteConnectionManager::file(db_file))
            .context("Failed to open/create a SQLite database to store the rustdoc cache")?;

        let conn = pool.get()?;
        conn.execute_batch(
            "PRAGMA journal_mode=WAL;
            PRAGMA synchronous=NORMAL;",
        )?;
        conn.execute(
            "CREATE TABLE IF NOT EXISTS project2package_id_access_log (
                project_fingerprint TEXT NOT NULL,
                package_ids BLOB NOT NULL,
                PRIMARY KEY (project_fingerprint)
            )",
            [],
        )?;

        Ok(pool)
    }
}

/// Return the output of `cargo --verbose --version` for the given toolchain,
/// which can be used to fingerprint that toolchain.
///
/// The command is run as `rustup run <toolchain_name> cargo --verbose --version`.
/// Its standard output is returned as is (it is not trimmed).
///
/// # Errors
///
/// Returns an error if the command cannot be started, if it exits with a non-zero
/// status (whatever it printed on its standard error is appended to the error message)
/// or if its standard output is not valid UTF-8.
pub(crate) fn cargo_fingerprint(toolchain_name: &str) -> Result<String, anyhow::Error> {
    let err_msg = || {
        // The four spaces ahead of the line continuation indent the suggested command.
        format!(
            "Failed to run `cargo --verbose --version` on `{toolchain_name}`.\n\
             Is the `{toolchain_name}` toolchain installed?\n\
             If not, invoke\n\n    \
             rustup toolchain install {toolchain_name} -c rust-docs-json\n\n\
             to fix it."
        )
    };
    let mut cmd = std::process::Command::new("rustup");
    cmd.arg("run")
        .arg(toolchain_name)
        .arg("cargo")
        .arg("--verbose")
        .arg("--version");
    let output = cmd.output().with_context(err_msg)?;
    if !output.status.success() {
        // Surface what the command itself reported, if anything.
        let stderr = String::from_utf8_lossy(&output.stderr);
        let stderr = stderr.trim();
        if stderr.is_empty() {
            anyhow::bail!(err_msg());
        }
        anyhow::bail!("{}\n\nThe command reported:\n{stderr}", err_msg());
    }
    let output = String::from_utf8(output.stdout).with_context(|| {
        format!("An invocation of `cargo --verbose --version` for the `{toolchain_name}` toolchain returned non-UTF8 data as output.")
    })?;
    Ok(output)
}