// vectorless 0.1.21
//
// Hierarchical, reasoning-native document intelligence engine
// Documentation
// Copyright (c) 2026 vectorless developers
// SPDX-License-Identifier: Apache-2.0

//! Document type definitions.
//!
//! This module defines the types used for document parsing:
//! - [`RawNode`] - A raw node extracted from a document before tree construction
//! - [`DocumentMeta`] - Metadata about a document
//! - [`DocumentFormat`] - Supported document formats
//! - [`ParseResult`] - The metadata and raw nodes produced by parsing one document

use serde::{Deserialize, Serialize};

/// Supported document formats.
///
/// Each variant identifies one kind of source document. The extensions listed
/// on the variants are the ones [`DocumentFormat::from_extension`] maps to it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DocumentFormat {
    /// Markdown files (`.md`, `.markdown`).
    Markdown,
    /// PDF files (`.pdf`).
    Pdf,
    /// HTML files (`.html`, `.htm`).
    Html,
    /// Word documents (`.docx`).
    Docx,
}

impl DocumentFormat {
    /// Recognized extensions (lowercase, without a leading dot) paired with
    /// the format each one selects.
    const EXTENSIONS: [(&'static str, Self); 6] = [
        ("md", Self::Markdown),
        ("markdown", Self::Markdown),
        ("pdf", Self::Pdf),
        ("html", Self::Html),
        ("htm", Self::Html),
        ("docx", Self::Docx),
    ];

    /// Detect format from file extension.
    ///
    /// `ext` is the bare extension without a leading dot (`"md"`, not
    /// `".md"`). Matching ignores ASCII case, so `"PDF"` and `"Pdf"` both
    /// yield [`DocumentFormat::Pdf`].
    ///
    /// Returns `None` for any extension that is not recognized, including
    /// the empty string.
    pub fn from_extension(ext: &str) -> Option<Self> {
        // ASCII-only comparison: no per-call `String` allocation, and
        // non-ASCII look-alikes (e.g. U+212A KELVIN SIGN, which Unicode
        // lowercasing folds to `k`) are not mistaken for a known extension.
        Self::EXTENSIONS
            .iter()
            .find(|(known, _)| ext.eq_ignore_ascii_case(known))
            .map(|&(_, format)| format)
    }

    /// Get the file extension for this format.
    ///
    /// Returns the lowercase extension without a leading dot. Formats that
    /// are recognized under several extensions report a single one:
    /// `"md"` for Markdown and `"html"` for HTML.
    pub fn extension(&self) -> &'static str {
        match self {
            Self::Markdown => "md",
            Self::Pdf => "pdf",
            Self::Html => "html",
            Self::Docx => "docx",
        }
    }
}

/// A raw node extracted from a document.
///
/// This represents a section or element before it's organized into a tree.
/// Raw nodes are produced by parsers and consumed by the indexer.
///
/// A node can be built with [`RawNode::new`] followed by the `with_*` builder
/// methods, or by filling the public fields directly.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawNode {
    /// Title or heading of this node.
    pub title: String,

    // NOTE(review): this says the text includes the children's content, while
    // `token_count` is documented as covering only the node's own content —
    // confirm which convention the parsers actually follow.
    /// Text content of this node (including all children's content).
    pub content: String,

    /// Level in the hierarchy (0 = root, 1 = top-level section, etc.).
    pub level: usize,

    /// Line number where this node starts (1-based).
    pub line_start: usize,

    // NOTE(review): presumably inclusive — confirm against the parsers.
    /// Line number where this node ends (1-based).
    pub line_end: usize,

    /// Page number for PDF documents (1-based).
    ///
    /// `None` when no page has been recorded for the node.
    pub page: Option<usize>,

    /// Estimated token count for this node's own content.
    ///
    /// `None` in the default value.
    pub token_count: Option<usize>,

    /// Total token count including all children (recursive, computed by thinner).
    ///
    /// `#[serde(default)]` allows the field to be missing from serialized
    /// input, in which case it deserializes as `None`.
    #[serde(default)]
    pub total_token_count: Option<usize>,
}

impl Default for RawNode {
    fn default() -> Self {
        Self {
            title: String::new(),
            content: String::new(),
            level: 0,
            line_start: 1,
            line_end: 1,
            page: None,
            token_count: None,
            total_token_count: None,
        }
    }
}

impl RawNode {
    /// Create a new raw node with the given title.
    ///
    /// Every other field takes its [`Default`] value: empty content, level 0,
    /// a line range starting and ending at line 1, and no page or token counts.
    #[must_use]
    pub fn new(title: impl Into<String>) -> Self {
        Self {
            title: title.into(),
            ..Default::default()
        }
    }

    /// Set the content of this node.
    ///
    /// Consumes the node and returns it with `content` replaced, so calls can
    /// be chained. The returned node must be used; discarding it loses the node.
    #[must_use]
    pub fn with_content(mut self, content: impl Into<String>) -> Self {
        self.content = content.into();
        self
    }

    /// Set the level of this node.
    ///
    /// Consumes the node and returns it with `level` replaced.
    #[must_use]
    pub fn with_level(mut self, level: usize) -> Self {
        self.level = level;
        self
    }

    /// Set the line range of this node.
    ///
    /// Consumes the node and returns it with `line_start` set to `start` and
    /// `line_end` set to `end`. The bounds are stored as given; no check is
    /// made that `start <= end`.
    #[must_use]
    pub fn with_lines(mut self, start: usize, end: usize) -> Self {
        self.line_start = start;
        self.line_end = end;
        self
    }

    /// Set the page number of this node.
    ///
    /// Consumes the node and returns it with `page` set to `Some(page)`.
    #[must_use]
    pub fn with_page(mut self, page: usize) -> Self {
        self.page = Some(page);
        self
    }

    /// Check if this node has any content.
    ///
    /// Content consisting only of whitespace counts as empty.
    pub fn has_content(&self) -> bool {
        !self.content.trim().is_empty()
    }

    /// Get the character count of the content.
    ///
    /// Counts Unicode scalar values (`char`s), not bytes.
    pub fn char_count(&self) -> usize {
        self.content.chars().count()
    }

    /// Get the word count (approximate) of the content.
    ///
    /// A word is any run of characters delimited by whitespace.
    pub fn word_count(&self) -> usize {
        self.content.split_whitespace().count()
    }
}

/// Document metadata.
///
/// Describes a document as a whole, as opposed to the individual nodes
/// extracted from it. All fields are public and can be set directly.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMeta {
    /// Document name or title.
    pub name: String,

    /// Document format.
    pub format: DocumentFormat,

    /// Total number of pages (for PDF).
    ///
    /// `None` when no page count has been recorded.
    pub page_count: Option<usize>,

    /// Total number of lines.
    pub line_count: usize,

    /// Source file path (if applicable).
    pub source_path: Option<String>,

    /// Document description (generated by LLM).
    ///
    /// `None` when no description has been recorded.
    pub description: Option<String>,
}

impl Default for DocumentMeta {
    fn default() -> Self {
        Self {
            name: String::new(),
            format: DocumentFormat::Markdown,
            page_count: None,
            line_count: 0,
            source_path: None,
            description: None,
        }
    }
}

/// Result of parsing a document.
///
/// Pairs the document-level metadata with the flat list of raw nodes that
/// were extracted from the document.
#[derive(Debug, Clone)]
pub struct ParseResult {
    /// Document metadata.
    pub meta: DocumentMeta,

    /// Raw nodes extracted from the document.
    pub nodes: Vec<RawNode>,
}

impl ParseResult {
    /// Create a new parse result.
    pub fn new(meta: DocumentMeta, nodes: Vec<RawNode>) -> Self {
        Self { meta, nodes }
    }

    /// Get the number of nodes.
    pub fn node_count(&self) -> usize {
        self.nodes.len()
    }

    /// Check if there are no nodes.
    pub fn is_empty(&self) -> bool {
        self.nodes.is_empty()
    }
}