kreuzberg 4.4.2

//! Native Rust Typst document extractor.
//!
//! This extractor provides Typst document parsing and text extraction.
//! It uses a hybrid approach combining regex patterns and character-level parsing
//! to extract text while preserving document structure.
//!
//! Features:
//! - Metadata extraction: title, author, date, subject, keywords from `#set document()`
//! - Section hierarchy: `=`, `==`, `===`, etc. heading levels
//! - Inline formatting: `*bold*`, `_italic_`, `` `code` ``
//! - Lists: extraction of list content (both `+` and `-` markers)
//! - Links: extraction of URLs and link text from `#link("url")[text]` syntax
//! - Math: inline (`$...$`) and display math preservation
//! - Code blocks: triple-backtick code blocks with language specifiers
//! - Tables: extraction of `#table()` function content
//! - Complex formatting: handling of nested and combined formatting
//!
//! Requires the `office` feature.

#[cfg(feature = "office")]
use crate::Result;
#[cfg(feature = "office")]
use crate::core::config::ExtractionConfig;
#[cfg(feature = "office")]
use crate::plugins::{DocumentExtractor, Plugin};
#[cfg(feature = "office")]
use crate::types::{ExtractionResult, Metadata};
#[cfg(feature = "office")]
use async_trait::async_trait;
#[cfg(feature = "office")]
use regex::Regex;

/// Typst document extractor
#[cfg(feature = "office")]
pub struct TypstExtractor;

#[cfg(feature = "office")]
impl TypstExtractor {
    /// Create a new Typst extractor.
    pub fn new() -> Self {
        Self
    }

    /// Parse Typst content and extract text.
    fn extract_from_typst(content: &str) -> (String, Metadata) {
        let mut extractor = TypstParser::new(content);
        let text = extractor.parse();
        let metadata = extractor.metadata;

        (text, metadata)
    }
}

#[cfg(feature = "office")]
impl Default for TypstExtractor {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(feature = "office")]
impl Plugin for TypstExtractor {
    fn name(&self) -> &str {
        "typst-extractor"
    }

    fn version(&self) -> String {
        env!("CARGO_PKG_VERSION").to_string()
    }

    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    fn shutdown(&self) -> Result<()> {
        Ok(())
    }

    fn description(&self) -> &str {
        "Native Rust Typst document extractor with metadata support"
    }

    fn author(&self) -> &str {
        "Kreuzberg Team"
    }
}

#[cfg(feature = "office")]
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
impl DocumentExtractor for TypstExtractor {
    #[cfg_attr(feature = "otel", tracing::instrument(
        skip(self, content, _config),
        fields(
            extractor.name = self.name(),
            content.size_bytes = content.len(),
        )
    ))]
    async fn extract_bytes(
        &self,
        content: &[u8],
        mime_type: &str,
        _config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        let typst_str = String::from_utf8_lossy(content).to_string();
        let (text, metadata) = Self::extract_from_typst(&typst_str);

        Ok(ExtractionResult {
            content: text,
            mime_type: mime_type.to_string().into(),
            metadata,
            tables: Vec::new(),
            detected_languages: None,
            chunks: None,
            images: None,
            djot_content: None,
            pages: None,
            elements: None,
            ocr_elements: None,
            document: None,
            #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
            extracted_keywords: None,
            quality_score: None,
            processing_warnings: Vec::new(),
            annotations: None,
        })
    }

    fn supported_mime_types(&self) -> &[&str] {
        &["application/x-typst", "text/x-typst"]
    }

    fn priority(&self) -> i32 {
        50
    }
}

/// Internal Typst parser
#[cfg(feature = "office")]
struct TypstParser {
    content: String,
    metadata: Metadata,
}

#[cfg(feature = "office")]
impl TypstParser {
    fn new(content: &str) -> Self {
        Self {
            content: content.to_string(),
            metadata: Metadata::default(),
        }
    }

    fn parse(&mut self) -> String {
        self.extract_metadata();

        self.extract_content()
    }

    fn extract_metadata(&mut self) {
        if let Some(title) = self.extract_quoted_value("title") {
            self.metadata.title = Some(title);
        }

        if let Some(author) = self.extract_quoted_value("author") {
            self.metadata.authors = Some(vec![author]);
        }

        if let Some(date) = self.extract_quoted_value("date") {
            self.metadata.created_at = Some(date);
        }

        if let Some(subject) = self.extract_quoted_value("subject") {
            self.metadata.subject = Some(subject);
        }

        if let Some(keywords) = self.extract_keywords() {
            self.metadata.keywords = Some(keywords);
        }
    }

    fn extract_quoted_value(&self, field: &str) -> Option<String> {
        let pattern = format!(r#"{}:\s*"([^"]*)""#, regex::escape(field));
        if let Ok(re) = Regex::new(&pattern)
            && let Some(caps) = re.captures(&self.content)
        {
            return caps.get(1).map(|m| m.as_str().to_string());
        }
        None
    }

    fn extract_keywords(&self) -> Option<Vec<String>> {
        let pattern = r#"keywords:\s*(?:"([^"]*)"|(\([^)]*\)))"#;
        if let Ok(re) = Regex::new(pattern)
            && let Some(caps) = re.captures(&self.content)
        {
            // Single quoted string: split by comma
            if let Some(m) = caps.get(1) {
                let keywords: Vec<String> = m
                    .as_str()
                    .split(',')
                    .map(|s| s.trim().to_string())
                    .filter(|s| !s.is_empty())
                    .collect();
                if !keywords.is_empty() {
                    return Some(keywords);
                }
            }
            // Array form: ("keyword1", "keyword2")
            if let Some(m) = caps.get(2) {
                let array_str = m.as_str();
                let mut keywords = Vec::new();
                let item_pattern = r#""([^"]*)""#;
                if let Ok(item_re) = Regex::new(item_pattern) {
                    for item_caps in item_re.captures_iter(array_str) {
                        if let Some(keyword) = item_caps.get(1) {
                            keywords.push(keyword.as_str().to_string());
                        }
                    }
                }
                if !keywords.is_empty() {
                    return Some(keywords);
                }
            }
        }
        None
    }

    fn extract_content(&self) -> String {
        let mut output = String::new();
        let mut lines = self.content.lines().peekable();
        let mut in_code_block = false;
        let mut code_block_fence = String::new();
        let mut in_set_document = false;
        let mut paren_depth: i32 = 0;

        while let Some(line) = lines.next() {
            let trimmed = line.trim();

            // Skip multi-line #set document(...) blocks
            if in_set_document {
                for ch in trimmed.chars() {
                    match ch {
                        '(' => paren_depth += 1,
                        ')' => paren_depth -= 1,
                        _ => {}
                    }
                }
                if paren_depth <= 0 {
                    in_set_document = false;
                    paren_depth = 0;
                }
                continue;
            }

            if trimmed.starts_with("```") {
                if in_code_block {
                    if trimmed == "```" {
                        in_code_block = false;
                        code_block_fence.clear();
                        output.push_str("```\n");
                        continue;
                    }
                } else {
                    in_code_block = true;
                    code_block_fence = "```".to_string();
                    output.push_str("```");
                    if let Some(lang) = trimmed.strip_prefix("```") {
                        let lang = lang.trim();
                        if !lang.is_empty() {
                            output.push_str(lang);
                        }
                    }
                    output.push('\n');
                    continue;
                }
            }

            if in_code_block {
                output.push_str(line);
                output.push('\n');
                continue;
            }

            // Skip #set document(...) - may span multiple lines
            if trimmed.starts_with("#set document(") {
                paren_depth = 0;
                for ch in trimmed.chars() {
                    match ch {
                        '(' => paren_depth += 1,
                        ')' => paren_depth -= 1,
                        _ => {}
                    }
                }
                if paren_depth > 0 {
                    in_set_document = true;
                }
                continue;
            }

            if trimmed.starts_with("#set ") || trimmed.starts_with("#let ") {
                continue;
            }

            if trimmed.starts_with("#import ") || trimmed.starts_with("#include ") {
                continue;
            }

            // Skip layout directives
            if trimmed.starts_with("#pagebreak")
                || trimmed.starts_with("#colbreak")
                || trimmed.starts_with("#v(")
                || trimmed.starts_with("#h(")
            {
                continue;
            }

            if trimmed.starts_with("#table(") {
                let table_content = self.extract_table_content(trimmed, &mut lines);
                output.push_str(&table_content);
                output.push('\n');
                continue;
            }

            if trimmed.starts_with('=') {
                let next_char_pos = trimmed.find(|c: char| c != '=');
                if next_char_pos.is_some() {
                    let heading_level = trimmed.chars().take_while(|&c| c == '=').count();
                    let heading_text = trimmed[heading_level..].trim();

                    output.push_str(heading_text);
                    let _ = heading_level;
                    output.push('\n');
                    continue;
                }
            }

            if (trimmed.starts_with('+') || trimmed.starts_with('-'))
                && trimmed.len() > 1
                && trimmed.chars().nth(1).is_some_and(|c| !c.is_alphanumeric())
            {
                output.push_str("- ");
                output.push_str(trimmed[1..].trim());
                output.push('\n');
                continue;
            }

            if trimmed.starts_with('#')
                && !trimmed.starts_with("#set")
                && !trimmed.starts_with("#let")
                && !trimmed.starts_with("#import")
                && !trimmed.starts_with("#include")
            {
                if trimmed.contains('[')
                    && trimmed.contains(']')
                    && let Some(content) = self.extract_text_from_brackets(trimmed)
                {
                    let processed = self.process_line(&content);
                    if !processed.is_empty() {
                        output.push_str(&processed);
                        output.push('\n');
                    }
                }
                continue;
            }

            if !trimmed.is_empty() {
                let processed = self.process_line(trimmed);
                if !processed.is_empty() {
                    output.push_str(&processed);
                    output.push('\n');
                }
            } else {
                output.push('\n');
            }
        }

        output
    }

    /// Extract content from #table() function calls
    fn extract_table_content<'a, I>(&self, first_line: &str, lines: &mut std::iter::Peekable<I>) -> String
    where
        I: Iterator<Item = &'a str>,
    {
        let mut content = first_line.to_string();
        let mut bracket_depth = 0;
        let mut paren_depth = 0;

        for ch in first_line.chars() {
            match ch {
                '(' => paren_depth += 1,
                ')' => paren_depth -= 1,
                '[' => bracket_depth += 1,
                ']' => bracket_depth -= 1,
                _ => {}
            }
        }

        while paren_depth > 0 || bracket_depth > 0 {
            if let Some(next_line) = lines.next() {
                content.push('\n');
                content.push_str(next_line);
                for ch in next_line.chars() {
                    match ch {
                        '(' => paren_depth += 1,
                        ')' => paren_depth -= 1,
                        '[' => bracket_depth += 1,
                        ']' => bracket_depth -= 1,
                        _ => {}
                    }
                }
            } else {
                break;
            }
        }

        // Extract column count from `columns: N`
        let num_cols = {
            let col_re = Regex::new(r"columns:\s*(\d+)").ok();
            col_re
                .and_then(|re| re.captures(&content))
                .and_then(|caps| caps.get(1))
                .and_then(|m| m.as_str().parse::<usize>().ok())
                .unwrap_or(0)
        };

        // Collect all cell texts
        let mut cells: Vec<String> = Vec::new();
        let mut in_bracket = false;
        let mut cell = String::new();
        for ch in content.chars() {
            match ch {
                '[' => {
                    in_bracket = true;
                    cell.clear();
                }
                ']' => {
                    if in_bracket {
                        let trimmed = cell.trim().to_string();
                        cells.push(trimmed);
                        in_bracket = false;
                        cell.clear();
                    }
                }
                _ if in_bracket => {
                    cell.push(ch);
                }
                _ => {}
            }
        }

        // Arrange cells into rows
        let mut table_content = String::new();
        if num_cols > 0 && !cells.is_empty() {
            for (i, cell_text) in cells.iter().enumerate() {
                if i > 0 && i % num_cols == 0 {
                    table_content.push('\n');
                }
                if i % num_cols > 0 {
                    table_content.push('\t');
                }
                table_content.push_str(cell_text);
            }
        } else {
            // Fallback: join all cells with separator
            table_content = cells.join(" | ");
        }

        table_content
    }

    fn process_line(&self, line: &str) -> String {
        let mut result = String::new();
        let mut chars = line.chars().peekable();

        while let Some(ch) = chars.next() {
            match ch {
                '`' => {
                    result.push('`');
                    for c in chars.by_ref() {
                        result.push(c);
                        if c == '`' {
                            break;
                        }
                    }
                }
                '$' => {
                    result.push('$');
                    for c in chars.by_ref() {
                        result.push(c);
                        if c == '$' {
                            break;
                        }
                    }
                }
                '*' => {
                    result.push('*');
                    for c in chars.by_ref() {
                        result.push(c);
                        if c == '*' {
                            break;
                        }
                    }
                }
                '_' => {
                    result.push('_');
                    for c in chars.by_ref() {
                        result.push(c);
                        if c == '_' {
                            break;
                        }
                    }
                }
                '#' => {
                    // Skip the # prefix for Typst function calls like #link
                    // The link extraction happens later in extract_link_text
                    result.push(ch);
                }
                _ => {
                    result.push(ch);
                }
            }
        }

        self.extract_link_text(&result)
    }

    fn extract_link_text(&self, line: &str) -> String {
        // Handle #link("url")[text] pattern - extract just the display text
        let pattern = r#"#?link\("([^"]*)"\)\[([^\]]*)\]"#;
        if let Ok(re) = Regex::new(pattern) {
            return re
                .replace_all(line, |caps: &regex::Captures| {
                    let text = caps.get(2).map(|m| m.as_str()).unwrap_or("");
                    text.to_string()
                })
                .to_string();
        }
        line.to_string()
    }

    fn extract_text_from_brackets(&self, line: &str) -> Option<String> {
        if let Some(start) = line.find('[')
            && let Some(end) = line.rfind(']')
            && end > start
        {
            let text = &line[start + 1..end];
            return Some(text.to_string());
        }
        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_metadata() {
        let content = r#"#set document(
            title: "Test Document",
            author: "Test Author"
        )

        = Heading
        Some text
        "#;

        let (_, metadata) = TypstExtractor::extract_from_typst(content);

        assert!(metadata.title.is_some(), "Title should be extracted");
        assert_eq!(metadata.title.as_deref(), Some("Test Document"));
        assert!(metadata.authors.is_some(), "Author should be extracted");
        assert_eq!(metadata.authors.as_deref(), Some(&["Test Author".to_string()][..]));
    }

    #[test]
    fn test_extract_headings() {
        let content = r#"= Level 1
Content

== Level 2
More content
"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("Level 1"));
        assert!(output.contains("Level 2"));
    }

    #[test]
    fn test_extract_formatting() {
        let content = r#"Some *bold* and _italic_ text with `code`."#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("*bold*") || output.contains("bold"));
        assert!(output.contains("_italic_") || output.contains("italic"));
        assert!(output.contains("`code`") || output.contains("code"));
    }

    #[test]
    fn test_extract_code_blocks() {
        let content = r#"Here is code:

```python
def hello():
    print("world")
```

Done."#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("```python"));
        assert!(output.contains("def hello"));
        assert!(output.contains("print"));
    }

    #[test]
    fn test_extract_links() {
        let content = r#"Visit #link("https://example.com")[example site] for info."#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(
            output.contains("example.com")
                || output.contains("example site")
                || output.contains("[example site](https://example.com)")
        );
    }

    #[test]
    fn test_extract_list_items() {
        let content = r#"= Lists

+ First item
+ Second item
+ Third item"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("First item"));
        assert!(output.contains("Second item"));
        assert!(output.contains("Third item"));
    }

    #[test]
    fn test_extract_tables() {
        let content = r#"== Tables

#table(
  columns: 2,
  [Name], [Age],
  [Alice], [30],
)"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("TABLE:") || output.contains("Name") || output.contains("Alice"));
    }

    #[test]
    fn test_extract_math() {
        let content = r#"The formula $E = mc^2$ is important.

Display:
$ a^2 + b^2 = c^2 $"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("$") && output.contains("mc"));
    }

    #[test]
    fn test_metadata_extraction_comprehensive() {
        let content = r#"#set document(
            title: "Advanced Document",
            author: "John Doe",
            date: "2024-12-06",
            subject: "Test Subject",
            keywords: ("test", "example", "rust")
        )

        Content here."#;

        let (_, metadata) = TypstExtractor::extract_from_typst(content);

        assert_eq!(
            metadata.title.as_deref(),
            Some("Advanced Document"),
            "Title should be extracted"
        );
        assert!(metadata.authors.is_some(), "Author should be extracted");
        assert_eq!(metadata.authors.as_deref(), Some(&["John Doe".to_string()][..]));
        assert!(metadata.created_at.is_some(), "Date should be extracted");
        assert_eq!(
            metadata.subject.as_deref(),
            Some("Test Subject"),
            "Subject should be extracted"
        );
        assert!(metadata.keywords.is_some(), "Keywords should be extracted");
        let keywords = metadata.keywords.unwrap();
        assert_eq!(keywords, vec!["test", "example", "rust"]);
    }

    #[test]
    fn test_skip_directives() {
        let content = r#"#set heading(numbering: "1.")
#let x = 5
#import "@preview/foo:1.0"
#include "other.typ"

= Heading
Actual content"#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(!output.contains("#set"));
        assert!(!output.contains("#let"));
        assert!(!output.contains("#import"));
        assert!(!output.contains("#include"));
        assert!(output.contains("Heading"));
        assert!(output.contains("content"));
    }

    #[test]
    fn test_combined_formatting() {
        let content = r#"This is *bold with _nested italic_* and more."#;

        let (output, _) = TypstExtractor::extract_from_typst(content);

        assert!(output.contains("*") || output.contains("_") || (output.contains("bold") && output.contains("italic")));
    }
}