mago-database 1.0.0-alpha.5

Provides a high-performance, in-memory database for source code analysis, featuring distinct mutable and immutable states and transactional updates.
Documentation
use std::borrow::Cow;
use std::hash::DefaultHasher;
use std::hash::Hash;
use std::hash::Hasher;
use std::path::Path;
use std::path::PathBuf;

use serde::Deserialize;
use serde::Serialize;

use crate::error::DatabaseError;
use crate::utils::read_file;

/// A stable, unique identifier for a file.
///
/// This ID is generated by hashing the file's logical name (see [`FileId::new`]), ensuring
/// it remains consistent across application runs and is unaffected by content modifications.
/// Distinct names are expected to yield distinct IDs, barring 64-bit hash collisions.
///
/// The type is a `#[repr(transparent)]` wrapper around the raw `u64` hash value.
///
/// NOTE(review): [`FileId::new`] hashes with `std::hash::DefaultHasher`, whose algorithm the
/// standard library leaves unspecified across Rust releases. Confirm that IDs are never
/// persisted or compared across binaries built with different toolchains.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
#[repr(transparent)]
pub struct FileId(u64);

/// Distinguishes between the origins of source files.
///
/// The enum is `#[repr(u8)]`, and the derived ordering follows declaration order:
/// `Host < Vendored < Builtin`. Reordering the variants therefore changes both the
/// discriminant values and the results of comparisons.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
#[repr(u8)]
pub enum FileType {
    /// The file is part of the primary project source code.
    /// These files typically reside on the filesystem and are actively developed.
    Host,

    /// The file belongs to a third-party dependency (e.g., a Composer package).
    /// These files exist on the filesystem within the project (e.g., in `vendor/`)
    /// but are not considered part of the primary source code.
    Vendored,

    /// The file represents a built-in language construct (e.g., a core PHP function or class).
    /// These "files" do not exist on the filesystem and their content is typically
    /// provided as pre-defined stubs for analysis.
    Builtin,
}

/// A source file together with its contents and derived metadata.
///
/// A file may live on the host filesystem (project or vendored code) or exist only in
/// memory (see [`FileType`] and [`File::ephemeral`]). This struct bundles the file's
/// identity, location, contents, and a precomputed line table used to translate byte
/// offsets into line and column positions.
#[derive(Debug, Eq, PartialEq, Hash)]
pub struct File {
    /// A stable, unique identifier for the file, generated from its logical name.
    /// This ID persists across application runs and content modifications.
    pub id: FileId,

    /// The logical name of the file, typically the path relative to the root of the project.
    pub name: Cow<'static, str>,

    /// The absolute path of the file on the host's filesystem, if it exists there.
    /// This is `None` for files without a physical counterpart, such as built-in stubs
    /// or files created with [`File::ephemeral`].
    pub path: Option<PathBuf>,

    /// The type of the file, indicating its origin.
    pub file_type: FileType,

    /// The contents of the file.
    pub contents: Cow<'static, str>,

    /// The size of the file's contents in bytes.
    ///
    /// [`File::new`] computes this as `contents.len()` narrowed to `u32`.
    pub size: u32,

    /// A vector containing the starting byte offsets of each line in `contents`.
    /// When built by [`File::new`], the first line always starts at offset 0 and the
    /// offsets are strictly increasing. This is useful for quickly navigating to a
    /// specific line number without scanning the whole file.
    pub lines: Vec<u32>,
}

/// Implemented by types that are associated with a file and can report its [`FileId`].
pub trait HasFileId {
    /// Returns the unique identifier of the file.
    fn file_id(&self) -> FileId;
}

impl File {
    /// Creates a new `File` instance from its name, type, path, and contents.
    ///
    /// The identifier is derived from `name`, while the size and the line start
    /// offsets are computed from `contents`.
    ///
    /// Sizes and offsets are stored as `u32`, so `contents` is expected to be smaller
    /// than 4 GiB; for larger inputs the stored size and offsets are truncated.
    pub fn new(
        name: Cow<'static, str>,
        file_type: FileType,
        path: Option<PathBuf>,
        contents: Cow<'static, str>,
    ) -> Self {
        let id = FileId::new(&name);
        // Deliberately narrowing: every offset in this type is a `u32`.
        let size = contents.len() as u32;
        let lines = line_starts(contents.as_ref()).collect::<Vec<_>>();

        Self { id, name, path, file_type, contents, size, lines }
    }

    /// Creates a new `File` instance by reading its contents from the filesystem.
    ///
    /// This is the primary factory function for creating a `File` from a disk source.
    /// All of the work is delegated to `crate::utils::read_file`, which handles
    /// determining the file's logical name relative to the workspace, reading its
    /// contents, and handling non-UTF-8 text via lossy conversion.
    ///
    /// # Arguments
    ///
    /// * `workspace`: The root directory of the project, used to calculate the logical name.
    /// * `path`: The absolute path to the file to read from disk.
    /// * `file_type`: The [`FileType`] to assign to the created file.
    ///
    /// # Errors
    ///
    /// Returns a [`DatabaseError::IOError`] if the file cannot be read from the disk.
    pub fn read(workspace: &Path, path: &Path, file_type: FileType) -> Result<Self, DatabaseError> {
        read_file(workspace, path, file_type)
    }

    /// Creates an ephemeral, in-memory `File` from a name and content.
    ///
    /// This is a convenience method for situations like testing or formatting where
    /// a full file context (e.g., a real path) is not required. It defaults to
    /// `FileType::Host` and a `path` of `None`.
    pub fn ephemeral(name: Cow<'static, str>, contents: Cow<'static, str>) -> Self {
        Self::new(name, FileType::Host, None, contents)
    }

    /// Retrieve the line number for the given byte offset.
    ///
    /// # Parameters
    ///
    /// - `offset`: The byte offset to retrieve the line number for.
    ///
    /// # Returns
    ///
    /// The 0-based number of the line containing `offset`. Offsets past the end of the
    /// contents are attributed to the last line. If the line table is empty (only
    /// possible for a hand-built `File`), `0` is returned.
    #[inline]
    pub fn line_number(&self, offset: u32) -> u32 {
        match self.lines.binary_search(&offset) {
            // `offset` is exactly the start of this line.
            Ok(line) => line as u32,
            // `next_line` is the index of the first line starting after `offset`, so the
            // containing line is the one before it. Saturating keeps an empty line table
            // from underflowing.
            Err(next_line) => next_line.saturating_sub(1) as u32,
        }
    }

    /// Retrieve the byte offset for the start of the given line.
    ///
    /// # Parameters
    ///
    /// - `line`: The 0-based line number to retrieve the start offset for.
    ///
    /// # Returns
    ///
    /// The byte offset at which the line starts, or `None` if `line` is out of range.
    pub fn get_line_start_offset(&self, line: u32) -> Option<u32> {
        self.lines.get(line as usize).copied()
    }

    /// Retrieve the byte offset for the end of the given line.
    ///
    /// # Parameters
    ///
    /// - `line`: The 0-based line number to retrieve the end offset for.
    ///
    /// # Returns
    ///
    /// For every line but the last, one less than the start offset of the following
    /// line; for the last line, the size of the contents. Returns `None` if `line` is
    /// out of range.
    pub fn get_line_end_offset(&self, line: u32) -> Option<u32> {
        let index = line as usize;
        if index >= self.lines.len() {
            return None;
        }

        let end = match self.lines.get(index + 1) {
            Some(&next_start) => next_start.saturating_sub(1),
            // No following line: the last line runs to the end of the contents.
            None => self.size,
        };

        Some(end)
    }

    /// Retrieve the column number for the given byte offset.
    ///
    /// # Parameters
    ///
    /// - `offset`: The byte offset to retrieve the column number for.
    ///
    /// # Returns
    ///
    /// The 0-based column of `offset`, measured in bytes from the start of the line
    /// that contains it. An offset that is exactly a line start has column `0`.
    #[inline]
    pub fn column_number(&self, offset: u32) -> u32 {
        let line_start = match self.lines.binary_search(&offset) {
            // `Ok` carries the *index* of the matching line, not its byte offset; an
            // exact hit means `offset` is itself the start of a line.
            Ok(_) => offset,
            // The containing line is the one before the insertion point. With an empty
            // line table there is no such line, so measure from the start of the contents.
            Err(next_line) => next_line.checked_sub(1).map_or(0, |line| self.lines[line]),
        };

        offset - line_start
    }
}

impl FileType {
    /// Returns `true` if the file is a host file, meaning it is part of the project's source code.
    pub const fn is_host(self) -> bool {
        matches!(self, FileType::Host)
    }

    /// Returns `true` if the file is a vendored file, meaning it comes from an external library or dependency.
    pub const fn is_vendored(self) -> bool {
        matches!(self, FileType::Vendored)
    }

    /// Returns `true` if the file is a built-in file, meaning it represents a core language construct.
    pub const fn is_builtin(self) -> bool {
        matches!(self, FileType::Builtin)
    }
}

impl FileId {
    pub fn new(logical_name: &str) -> Self {
        let mut hasher = DefaultHasher::new();
        logical_name.hash(&mut hasher);
        Self(hasher.finish())
    }

    pub const fn zero() -> Self {
        Self(0)
    }

    pub const fn is_zero(self) -> bool {
        self.0 == 0
    }

    #[must_use]
    pub fn as_u64(self) -> u64 {
        self.0
    }
}

impl HasFileId for File {
    /// Returns a copy of the identifier stored in the file's `id` field.
    fn file_id(&self) -> FileId {
        let Self { id, .. } = self;

        *id
    }
}

impl std::fmt::Display for FileId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
}

/// Returns an iterator over the starting byte offsets of each line in `source`.
///
/// The first item is always `0`. After that, one offset is produced per `\n` byte:
///
/// - for a bare `\n`, the offset of the byte that follows it;
/// - for a `\n` immediately preceded by `\r`, the offset of the `\n` byte itself.
///
/// Offsets are narrowed to `u32`.
#[inline]
pub(crate) fn line_starts(source: &str) -> impl Iterator<Item = u32> + '_ {
    let haystack = source.as_bytes();

    let after_newlines = memchr::memchr_iter(b'\n', haystack).map(move |newline| {
        let follows_carriage_return = newline > 0 && haystack[newline - 1] == b'\r';
        let start = if follows_carriage_return { newline } else { newline + 1 };

        start as u32
    });

    std::iter::once(0u32).chain(after_newlines)
}