gobby-wiki 0.2.0

use std::fs::File;
use std::io::{Cursor, Read, Write};
use std::path::{Path, PathBuf};

use calamine::{Data, Reader, open_workbook_auto_from_rs};
use quick_xml::Reader as XmlReader;
use quick_xml::events::Event;
use scraper::{ElementRef, Html, Node, Selector};
use tempfile::{Builder, NamedTempFile};
use zip::ZipArchive;

use crate::document::{
    DocumentDegradation, DocumentDegradationMatrix, DocumentFailureMode, DocumentUnitCount,
};
use crate::ingest::{
    IngestResult, index_after_ingest, markdown_metadata, markdown_title, path_to_string,
    single_line, text_from_utf8_lossy, write_asset, write_raw_markdown,
};
use crate::sources::{CompileStatus, IngestionMethod, SourceDraftRef, SourceKind, SourceManifest};
use crate::store::WikiIndexStore;
use crate::{ScopeIdentity, WikiError};

/// Maximum spreadsheet sheets rendered into markdown during bounded extraction;
/// only the first `MAX_SHEETS` sheet names are visited.
const MAX_SHEETS: usize = 8;
/// Maximum rows rendered per spreadsheet sheet before truncation is reported.
const MAX_ROWS_PER_SHEET: usize = 64;
/// Maximum columns rendered per spreadsheet sheet before truncation is reported.
const MAX_COLUMNS_PER_SHEET: usize = 16;
/// Maximum PPTX slides parsed during bounded extraction; when more slide
/// entries exist, the rendered markdown ends with a truncation note.
const MAX_SLIDES: usize = 200;
/// Maximum uncompressed XML bytes read from a ZIP entry (2 MiB); larger
/// entries are rejected with an error instead of being read.
const MAX_ENTRY_BYTES: u64 = 2 * 1024 * 1024;

/// In-memory capture of a document handed to the ingest pipeline.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentSnapshot {
    /// Where the document came from; recorded as the source location and
    /// reused as the citation when the source is registered.
    pub location: String,
    /// Original file name; its extension selects the office extractor and it
    /// seeds the fallback title when extraction yields none.
    pub file_name: String,
    /// Fetch timestamp, passed through unchanged to the source record.
    pub fetched_at: String,
    /// Raw document bytes; registered as the source content and written out
    /// as the stored asset.
    pub bytes: Vec<u8>,
    /// Source kind used to choose the extractor.
    pub kind: SourceKind,
}

/// Outcome of ingesting one document: the registered record plus the paths of
/// every artifact written for it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentIngestResult {
    /// Source record returned by registering the snapshot.
    pub record: crate::sources::SourceRecord,
    /// Path returned by `write_raw_markdown` for the raw markdown stub.
    pub raw_path: PathBuf,
    /// Path returned by `write_asset` for the stored original document.
    pub asset_path: PathBuf,
    /// Vault-relative path of the derived markdown (`wiki/sources/<id>.md`).
    pub derived_path: PathBuf,
    /// Present when extraction failed or the endpoint was unavailable;
    /// `None` when text was extracted.
    pub document_degradation: Option<DocumentDegradation>,
}

impl From<DocumentIngestResult> for IngestResult {
    fn from(result: DocumentIngestResult) -> Self {
        Self {
            record: result.record,
            raw_path: result.raw_path,
            asset_path: Some(result.asset_path),
        }
    }
}

/// Borrowed view of a snapshot handed to a [`DocumentExtractor`].
#[derive(Debug, Clone, Copy)]
pub struct DocumentRequest<'a> {
    /// File name of the document; the local extractor reads its extension to
    /// pick an office format.
    pub file_name: &'a str,
    /// Source kind of the document.
    pub kind: &'a SourceKind,
    /// Raw document bytes to extract from.
    pub bytes: &'a [u8],
}

/// Text extracted from a document, ready to embed in derived markdown.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentExtraction {
    /// Title found in the document, if any; callers fall back to the file
    /// name when this is `None`.
    pub title: Option<String>,
    /// Extracted body rendered as markdown.
    pub markdown: String,
    /// Metadata key under which `units_count` is reported
    /// (for example `"paragraph_count"` or `"slide_count"`).
    pub units_label: &'static str,
    /// Number of units (paragraphs, slides, sheets, sections) extracted.
    pub units_count: usize,
}

/// Strategy for turning document bytes into markdown.
pub trait DocumentExtractor {
    /// Extracts a title and markdown body from `request`.
    ///
    /// # Errors
    ///
    /// Returns a [`WikiError`] when the document cannot be parsed; the ingest
    /// path converts such an error into a recorded degradation rather than
    /// failing the ingest.
    fn extract(&self, request: &DocumentRequest<'_>) -> Result<DocumentExtraction, WikiError>;
}

/// Extraction backend selected for an ingest call.
pub enum DocumentEndpoint<'a> {
    /// An extractor is available and will be asked to parse the document.
    Available(&'a dyn DocumentExtractor),
    /// No extractor is available; the supplied degradation is recorded as-is
    /// and no extraction is attempted.
    Unavailable(DocumentDegradation),
}

/// Built-in extractor that parses HTML and office documents in-process.
struct LocalDocumentExtractor;

/// Ingests a document with the built-in local extractor and then refreshes
/// the index.
///
/// # Errors
///
/// Returns any error raised while ingesting the document or while indexing
/// the vault afterwards.
pub fn ingest_document(
    vault_root: &Path,
    store: &mut impl WikiIndexStore,
    scope: ScopeIdentity,
    snapshot: DocumentSnapshot,
) -> Result<DocumentIngestResult, WikiError> {
    ingest_document_without_index(vault_root, scope, snapshot).and_then(|ingested| {
        index_after_ingest(vault_root, store)?;
        Ok(ingested)
    })
}

/// Ingests a document with the built-in local extractor, leaving the index
/// untouched so the caller can decide when to refresh it.
///
/// # Errors
///
/// Propagates any error from the endpoint-based ingest.
pub(crate) fn ingest_document_without_index(
    vault_root: &Path,
    scope: ScopeIdentity,
    snapshot: DocumentSnapshot,
) -> Result<DocumentIngestResult, WikiError> {
    // A static instance gives the endpoint a reference that outlives the call.
    static LOCAL_EXTRACTOR: LocalDocumentExtractor = LocalDocumentExtractor;
    let endpoint = DocumentEndpoint::Available(&LOCAL_EXTRACTOR);
    ingest_document_with_endpoint_without_index(vault_root, scope, snapshot, endpoint)
}

/// Ingests a document through the given endpoint and then refreshes the index.
///
/// # Errors
///
/// Returns any error raised while ingesting the document or while indexing
/// the vault afterwards.
pub fn ingest_document_with_endpoint(
    vault_root: &Path,
    store: &mut impl WikiIndexStore,
    scope: ScopeIdentity,
    snapshot: DocumentSnapshot,
    endpoint: DocumentEndpoint<'_>,
) -> Result<DocumentIngestResult, WikiError> {
    ingest_document_with_endpoint_without_index(vault_root, scope, snapshot, endpoint).and_then(
        |ingested| {
            index_after_ingest(vault_root, store)?;
            Ok(ingested)
        },
    )
}

/// Registers the snapshot as a source and writes its asset, raw markdown and
/// derived markdown, without touching the index.
///
/// An extraction failure does not fail the ingest: it is converted into a
/// degradation that is recorded in the derived markdown and in the result.
///
/// # Errors
///
/// Returns an error when registering the source or writing any of the
/// artifacts fails.
pub(crate) fn ingest_document_with_endpoint_without_index(
    vault_root: &Path,
    scope: ScopeIdentity,
    snapshot: DocumentSnapshot,
    endpoint: DocumentEndpoint<'_>,
) -> Result<DocumentIngestResult, WikiError> {
    let request = DocumentRequest {
        file_name: &snapshot.file_name,
        kind: &snapshot.kind,
        bytes: &snapshot.bytes,
    };
    // Exactly one side of the pair ends up populated.
    let (extraction, document_degradation) = match endpoint {
        DocumentEndpoint::Unavailable(reported) => (None, Some(reported)),
        DocumentEndpoint::Available(extractor) => match extractor.extract(&request) {
            Ok(extracted) => (Some(extracted), None),
            Err(failure) => {
                let degraded = document_degradation_for_error(&request, failure.to_string());
                (None, Some(degraded))
            }
        },
    };

    // Prefer the title found in the document; otherwise derive one from the file name.
    let title = match extraction.as_ref().and_then(|found| found.title.as_ref()) {
        Some(extracted_title) => extracted_title.clone(),
        None => markdown_title(&snapshot.file_name),
    };

    let draft = SourceDraftRef {
        location: snapshot.location.clone(),
        kind: snapshot.kind.clone(),
        fetched_at: snapshot.fetched_at.clone(),
        content: &snapshot.bytes,
        title: Some(title.clone()),
        citation: Some(snapshot.location.clone()),
        license: None,
        ingestion_method: IngestionMethod::Manual,
        compile_status: CompileStatus::Pending,
    };
    let record = SourceManifest::register_borrowed(vault_root, draft)?;

    let asset_path = write_asset(vault_root, &record, &snapshot.file_name, &snapshot.bytes)?;
    let raw_body = render_raw_document_markdown(&snapshot, &record.content_hash, &asset_path);
    let raw_path = write_raw_markdown(vault_root, &record, &raw_body)?;
    let derived_path = write_document_derived_markdown(
        vault_root,
        &scope,
        &record,
        &snapshot,
        &title,
        &asset_path,
        extraction.as_ref(),
        document_degradation.as_ref(),
    )?;

    Ok(DocumentIngestResult {
        record,
        raw_path,
        asset_path,
        derived_path,
        document_degradation,
    })
}

impl DocumentExtractor for LocalDocumentExtractor {
    fn extract(&self, request: &DocumentRequest<'_>) -> Result<DocumentExtraction, WikiError> {
        match request.kind {
            SourceKind::Html => extract_html_document(request.bytes),
            SourceKind::Office => extract_office_document(request.file_name, request.bytes),
            _ => Err(document_error("unsupported document kind")),
        }
    }
}

/// Picks the office extractor that matches the file name's extension.
///
/// # Errors
///
/// Returns an invalid-input error when the extension is missing or is not one
/// of `docx`, `pptx`, `xlsx`, `xls` or `ods`; otherwise propagates the chosen
/// extractor's error.
fn extract_office_document(file_name: &str, bytes: &[u8]) -> Result<DocumentExtraction, WikiError> {
    let Some(ext) = extension(file_name) else {
        return Err(document_error("missing office extension"));
    };
    match ext.as_str() {
        "docx" => extract_docx(bytes),
        "pptx" => extract_pptx(bytes),
        "xlsx" | "xls" | "ods" => extract_spreadsheet(bytes),
        other => Err(document_error(format!(
            "unsupported office extension .{other}"
        ))),
    }
}

/// Extracts paragraph text from `word/document.xml` inside a DOCX archive.
///
/// The first paragraph doubles as the title and paragraphs are separated by
/// blank lines in the markdown.
///
/// # Errors
///
/// Fails when the archive or entry cannot be read, the XML cannot be parsed,
/// or no paragraph text is found.
fn extract_docx(bytes: &[u8]) -> Result<DocumentExtraction, WikiError> {
    let document_xml = read_zip_entry(bytes, "word/document.xml")?;
    let paragraphs = extract_xml_paragraphs(&document_xml)?;
    let Some(first_paragraph) = paragraphs.first() else {
        return Err(document_error("docx contained no paragraph text"));
    };
    let title = Some(first_paragraph.clone());
    let units_count = paragraphs.len();
    Ok(DocumentExtraction {
        title,
        markdown: paragraphs.join("\n\n"),
        units_label: "paragraph_count",
        units_count,
    })
}

/// Extracts slide text from a PPTX archive, one `## Slide N` section per
/// slide that contains text.
///
/// Slide entries are ordered by the number in their file name and capped at
/// `MAX_SLIDES`; `N` is the slide's position in that ordered list. The first
/// paragraph of the first slide with text becomes the title, and the unit
/// count is the number of slides that produced text.
///
/// # Errors
///
/// Fails when the archive cannot be opened, holds no slide XML, a slide
/// cannot be read or parsed, or no slide contains text.
fn extract_pptx(bytes: &[u8]) -> Result<DocumentExtraction, WikiError> {
    let mut archive = zip_archive(bytes)?;
    let mut slide_entries: Vec<String> = archive
        .file_names()
        .filter(|entry| entry.starts_with("ppt/slides/slide") && entry.ends_with(".xml"))
        .map(String::from)
        .collect();
    if slide_entries.is_empty() {
        return Err(document_error("pptx contained no slide XML"));
    }
    // Entries whose number cannot be parsed sort after all numbered slides.
    slide_entries.sort_by_key(|entry| slide_number(entry).unwrap_or(usize::MAX));
    let slides_truncated = slide_entries.len() > MAX_SLIDES;
    slide_entries.truncate(MAX_SLIDES);

    let mut body = String::new();
    let mut title: Option<String> = None;
    let mut slides_with_text = 0usize;
    for (position, entry) in slide_entries.iter().enumerate() {
        let slide_xml = read_zip_entry_from_archive(&mut archive, entry)?;
        let paragraphs = extract_xml_paragraphs(&slide_xml)?;
        let Some(first_paragraph) = paragraphs.first() else {
            continue;
        };
        slides_with_text += 1;
        if title.is_none() {
            title = Some(first_paragraph.clone());
        }
        if !body.is_empty() {
            body.push('\n');
        }
        let heading = format!("## Slide {}\n\n", position + 1);
        body.push_str(&heading);
        body.push_str(&paragraphs.join("\n\n"));
        body.push_str("\n\n");
    }
    if body.trim().is_empty() {
        return Err(document_error("pptx contained no slide text"));
    }
    if slides_truncated {
        body.push_str("_Slides truncated for bounded extraction._\n\n");
    }
    Ok(DocumentExtraction {
        title,
        markdown: body.trim_end().to_string(),
        units_label: "slide_count",
        units_count: slides_with_text,
    })
}

/// Renders the leading sheets of a spreadsheet as markdown tables.
///
/// At most `MAX_SHEETS` sheets are visited, each limited to
/// `MAX_ROWS_PER_SHEET` rows and `MAX_COLUMNS_PER_SHEET` columns. Rows whose
/// rendered cells are all empty are dropped, and sheets left with no rows are
/// skipped. The title comes from the first rendered sheet's name and the unit
/// count is the number of rendered sheets.
///
/// Every truncation is reported in the output: a per-sheet note when rows or
/// columns were cut, and a trailing note when the workbook holds more sheets
/// than were visited.
///
/// # Errors
///
/// Fails when the workbook cannot be opened, has no sheets, a visited sheet
/// cannot be read, or no visited sheet contains cell text.
fn extract_spreadsheet(bytes: &[u8]) -> Result<DocumentExtraction, WikiError> {
    let cursor = Cursor::new(bytes);
    let mut workbook = open_workbook_auto_from_rs(cursor)
        .map_err(|error| document_error(format!("open spreadsheet: {error}")))?;
    let sheet_names = workbook.sheet_names().to_vec();
    if sheet_names.is_empty() {
        return Err(document_error("spreadsheet contained no sheets"));
    }
    // Sheets past the cap are never read, so remember that they exist and say
    // so in the output, the same way slide/row/column truncation is reported.
    let sheets_truncated = sheet_names.len() > MAX_SHEETS;

    let mut markdown = String::new();
    let mut rendered_sheets = 0;
    let mut title = None;
    for sheet_name in sheet_names.iter().take(MAX_SHEETS) {
        let range = workbook
            .worksheet_range(sheet_name)
            .map_err(|error| document_error(format!("read sheet {sheet_name}: {error}")))?;
        let rows = range
            .rows()
            .take(MAX_ROWS_PER_SHEET)
            .map(|row| {
                row.iter()
                    .take(MAX_COLUMNS_PER_SHEET)
                    .map(cell_text)
                    .collect::<Vec<_>>()
            })
            .filter(|row| row.iter().any(|cell| !cell.is_empty()))
            .collect::<Vec<_>>();
        if rows.is_empty() {
            continue;
        }
        if title.is_none() {
            title = Some(markdown_title(sheet_name));
        }
        rendered_sheets += 1;
        if !markdown.is_empty() {
            markdown.push('\n');
        }
        markdown.push_str("## Sheet: ");
        markdown.push_str(&single_line(sheet_name));
        markdown.push_str("\n\n");
        markdown.push_str(&markdown_table(&rows));
        markdown.push('\n');
        if range.height() > MAX_ROWS_PER_SHEET || range.width() > MAX_COLUMNS_PER_SHEET {
            markdown.push_str("\n_Table truncated for bounded extraction._\n");
        }
    }
    if markdown.trim().is_empty() {
        return Err(document_error("spreadsheet contained no cell text"));
    }
    if sheets_truncated {
        markdown.push_str("\n_Sheets truncated for bounded extraction._\n");
    }
    Ok(DocumentExtraction {
        title,
        markdown: markdown.trim_end().to_string(),
        units_label: "sheet_count",
        units_count: rendered_sheets,
    })
}

/// Extracts the visible text of an HTML document as markdown paragraphs.
///
/// Text is collected from `<body>` when one is found, otherwise from the
/// document root. The whole document counts as a single section.
///
/// # Errors
///
/// Returns an invalid-input error when no readable text remains after
/// normalization.
fn extract_html_document(bytes: &[u8]) -> Result<DocumentExtraction, WikiError> {
    let html = text_from_utf8_lossy(bytes);
    let document = Html::parse_document(&html);
    let title = extract_html_title(&document);

    let body = match Selector::parse("body") {
        Ok(selector) => document.select(&selector).next(),
        Err(_) => None,
    };
    let root = body.unwrap_or_else(|| document.root_element());

    let mut parts = Vec::new();
    collect_visible_text(root, &mut parts);
    let markdown = normalize_markdown_text(&parts.join("\n"));
    if markdown.is_empty() {
        Err(document_error("html contained no readable text"))
    } else {
        Ok(DocumentExtraction {
            title,
            markdown,
            units_label: "section_count",
            units_count: 1,
        })
    }
}

/// Builds the raw markdown stub for a document: a metadata header, a heading
/// derived from the file name, and a pointer to the stored original.
fn render_raw_document_markdown(
    snapshot: &DocumentSnapshot,
    source_hash: &str,
    asset_path: &Path,
) -> String {
    let asset = path_to_string(asset_path);
    let heading = markdown_title(&snapshot.file_name);
    let fields = vec![
        ("source_kind", snapshot.kind.to_string()),
        ("source_location", snapshot.location.clone()),
        ("fetched_at", snapshot.fetched_at.clone()),
        ("source_hash", source_hash.to_string()),
        ("source_asset", asset.clone()),
    ];

    let mut markdown = markdown_metadata(&fields);
    let body = format!("# {heading}\n\nOriginal document stored under `{asset}`.\n");
    markdown.push_str(&body);
    markdown
}

/// Renders the derived markdown for a document and writes it atomically under
/// the vault, creating the parent directory when needed.
///
/// Returns the vault-relative path of the written file.
///
/// # Errors
///
/// Returns an I/O error when the directory cannot be created or the file
/// cannot be written.
// The inputs are passed straight through to the renderer; bundling them into a
// struct would only add a type used by this one call.
#[allow(clippy::too_many_arguments)]
fn write_document_derived_markdown(
    vault_root: &Path,
    scope: &ScopeIdentity,
    record: &crate::sources::SourceRecord,
    snapshot: &DocumentSnapshot,
    title: &str,
    asset_path: &Path,
    extraction: Option<&DocumentExtraction>,
    degradation: Option<&DocumentDegradation>,
) -> Result<PathBuf, WikiError> {
    let relative_path = derived_markdown_path(record);
    let target = vault_root.join(&relative_path);
    let rendered = render_document_derived_markdown(
        scope,
        record,
        snapshot,
        title,
        asset_path,
        extraction,
        degradation,
    );

    if let Some(directory) = target.parent() {
        std::fs::create_dir_all(directory).map_err(|source| WikiError::Io {
            action: "create document derived markdown directory",
            path: Some(directory.to_path_buf()),
            source,
        })?;
    }
    write_document_markdown_atomic(&target, rendered.as_bytes())?;
    Ok(relative_path)
}

/// Writes `contents` to `path` via a temporary file in the same directory:
/// write, sync the file, persist it over the target, then sync the parent
/// directory.
///
/// # Errors
///
/// Returns an I/O error naming the step that failed.
fn write_document_markdown_atomic(path: &Path, contents: &[u8]) -> Result<(), WikiError> {
    let mut staged = create_document_temp_file(path)?;
    staged.write_all(contents).map_err(|source| WikiError::Io {
        action: "write document derived markdown temp file",
        path: Some(staged.path().to_path_buf()),
        source,
    })?;
    staged.as_file().sync_all().map_err(|source| WikiError::Io {
        action: "sync document derived markdown temp file",
        path: Some(staged.path().to_path_buf()),
        source,
    })?;
    // `persist` consumes the temp file; only the underlying I/O error is kept on failure.
    staged.persist(path).map_err(|failure| WikiError::Io {
        action: "write document derived markdown",
        path: Some(path.to_path_buf()),
        source: failure.error,
    })?;
    sync_parent_dir(path)
}

/// Creates a temporary file next to `path`, named `.<file name>.<random>.tmp`.
///
/// # Errors
///
/// Returns an I/O error when `path` has no non-empty parent directory or the
/// temporary file cannot be created there.
fn create_document_temp_file(path: &Path) -> Result<NamedTempFile, WikiError> {
    let parent = match path.parent() {
        Some(directory) if !directory.as_os_str().is_empty() => directory,
        _ => {
            return Err(WikiError::Io {
                action: "create document derived markdown temp file",
                path: Some(path.to_path_buf()),
                source: std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "document derived markdown target has no parent directory",
                ),
            });
        }
    };
    // Fall back to a fixed name when the file name is missing or not valid UTF-8.
    let target_name = path
        .file_name()
        .and_then(|name| name.to_str())
        .unwrap_or("document.md");
    let prefix = format!(".{target_name}.");
    Builder::new()
        .prefix(&prefix)
        .suffix(".tmp")
        .tempfile_in(parent)
        .map_err(|source| WikiError::Io {
            action: "create document derived markdown temp file",
            path: Some(parent.to_path_buf()),
            source,
        })
}

/// Syncs the directory containing `path` on Unix; does nothing elsewhere.
///
/// # Errors
///
/// On Unix, returns an I/O error when the parent directory cannot be opened
/// or synced.
fn sync_parent_dir(path: &Path) -> Result<(), WikiError> {
    #[cfg(not(unix))]
    {
        // Nothing to sync here; the binding only marks `path` as used.
        let _ = path;
        Ok(())
    }
    #[cfg(unix)]
    {
        let Some(directory) = path.parent() else {
            return Ok(());
        };
        let synced = File::open(directory).and_then(|handle| handle.sync_all());
        synced.map_err(|source| WikiError::Io {
            action: "sync document derived markdown directory",
            path: Some(directory.to_path_buf()),
            source,
        })
    }
}

/// Renders the derived markdown page for a document.
///
/// The page consists of a metadata header, a title heading, pointers to the
/// stored asset and raw source, then either the extracted text or the
/// degradation section, and finally a "Source References" list.
fn render_document_derived_markdown(
    scope: &ScopeIdentity,
    record: &crate::sources::SourceRecord,
    snapshot: &DocumentSnapshot,
    title: &str,
    asset_path: &Path,
    extraction: Option<&DocumentExtraction>,
    degradation: Option<&DocumentDegradation>,
) -> String {
    let asset = path_to_string(asset_path);
    let raw = format!("raw/{}.md", record.id);
    let status = if extraction.is_some() {
        "extracted"
    } else {
        "unavailable"
    };
    let byte_len = snapshot.bytes.len();
    let field = |key: &str, value: String| (key.to_string(), value);

    let mut fields: Vec<(String, String)> = vec![
        field("title", title.to_string()),
        field("source_kind", snapshot.kind.to_string()),
        field("source_location", record.location.clone()),
        field("source_hash", record.content_hash.clone()),
        field("source_asset", asset.clone()),
        field("source_raw", raw.clone()),
        field("fetched_at", record.fetched_at.clone()),
        field("scope_kind", scope.kind.as_str().to_string()),
        field("scope_id", scope.id.clone()),
        field("document_status", status.to_string()),
    ];
    if let Some(extracted) = extraction {
        fields.push(field(extracted.units_label, extracted.units_count.to_string()));
    }
    // A degradation supplies its own metadata for the byte length; otherwise
    // only the plain size field is recorded.
    match degradation {
        Some(details) => fields.extend(DocumentDegradationMatrix::metadata(details, byte_len)),
        None => fields.push(field("file_size_bytes", byte_len.to_string())),
    }

    let field_refs: Vec<(&str, String)> = fields
        .iter()
        .map(|(key, value)| (key.as_str(), value.clone()))
        .collect();
    let mut markdown = markdown_metadata(&field_refs);

    let intro = format!("# {title}\n\nOriginal document: `{asset}`\n\nRaw source: `{raw}`\n\n");
    markdown.push_str(&intro);

    match (extraction, degradation) {
        (Some(extracted), _) => {
            markdown.push_str(&extracted.markdown);
            if !markdown.ends_with('\n') {
                markdown.push('\n');
            }
        }
        (None, Some(details)) => {
            markdown.push_str(&DocumentDegradationMatrix::markdown_section(details));
        }
        (None, None) => {}
    }

    let references = format!(
        "## Source References\n\n- Raw source: `{raw}`\n- Original document: `{asset}`\n"
    );
    markdown.push_str(&references);
    if let Some(citation) = &record.citation {
        markdown.push_str("- Citation: ");
        markdown.push_str(&single_line(citation));
        markdown.push('\n');
    }
    markdown
}

/// Vault-relative location of a record's derived markdown:
/// `wiki/sources/<record id>.md`.
fn derived_markdown_path(record: &crate::sources::SourceRecord) -> PathBuf {
    let mut relative = PathBuf::from("wiki");
    relative.push("sources");
    relative.push(format!("{}.md", record.id));
    relative
}

/// Converts an extraction error into a degradation record.
///
/// HTML requests map to the HTML parse failure mode; every other kind maps to
/// the office parse failure mode.
fn document_degradation_for_error(
    request: &DocumentRequest<'_>,
    error: String,
) -> DocumentDegradation {
    let mode = if matches!(request.kind, SourceKind::Html) {
        DocumentFailureMode::HtmlParseError
    } else {
        DocumentFailureMode::OfficeParseError
    };
    let unit_count = document_unit_count_for_failure(request.file_name, request.kind);
    let message = format!("Document parsing failed: {error}; original asset is preserved.");
    DocumentDegradation::new(mode, unit_count, message)
}

/// Chooses the unit count reported for a document whose extraction failed.
///
/// HTML reports one page. Office documents report zero slides (`pptx`), zero
/// sheets (`xlsx`, `xls`, `ods`) or zero pages (anything else). Every other
/// kind reports zero pages.
fn document_unit_count_for_failure(file_name: &str, kind: &SourceKind) -> DocumentUnitCount {
    if matches!(kind, SourceKind::Html) {
        return DocumentUnitCount::pages(1);
    }
    if !matches!(kind, SourceKind::Office) {
        return DocumentUnitCount::pages(0);
    }
    let ext = extension(file_name);
    match ext.as_deref() {
        Some("pptx") => DocumentUnitCount::slides(0),
        Some("xlsx" | "xls" | "ods") => DocumentUnitCount::sheets(0),
        _ => DocumentUnitCount::pages(0),
    }
}

/// Opens `bytes` as a ZIP archive and reads the entry called `name` as text.
///
/// # Errors
///
/// Fails when the archive cannot be opened or the entry cannot be read.
fn read_zip_entry(bytes: &[u8], name: &str) -> Result<String, WikiError> {
    zip_archive(bytes).and_then(|mut archive| read_zip_entry_from_archive(&mut archive, name))
}

fn read_zip_entry_from_archive(
    archive: &mut ZipArchive<Cursor<&[u8]>>,
    name: &str,
) -> Result<String, WikiError> {
    let mut xml = String::new();
    let mut entry = archive
        .by_name(name)
        .map_err(|error| document_error(format!("read {name}: {error}")))?;
    if entry.size() > MAX_ENTRY_BYTES {
        return Err(document_error(format!(
            "{name} is {} bytes; maximum supported XML entry is {MAX_ENTRY_BYTES} bytes",
            entry.size()
        )));
    }
    entry
        .by_ref()
        .take(MAX_ENTRY_BYTES + 1)
        .read_to_string(&mut xml)
        .map_err(|error| document_error(format!("read {name}: {error}")))?;
    if xml.len() as u64 > MAX_ENTRY_BYTES {
        return Err(document_error(format!(
            "{name} exceeds maximum supported XML entry size of {MAX_ENTRY_BYTES} bytes"
        )));
    }
    Ok(xml)
}

/// Opens an in-memory byte slice as a ZIP archive.
///
/// # Errors
///
/// Returns an invalid-input error when the bytes are not a readable archive.
fn zip_archive(bytes: &[u8]) -> Result<ZipArchive<Cursor<&[u8]>>, WikiError> {
    let reader = Cursor::new(bytes);
    ZipArchive::new(reader).map_err(|source| document_error(format!("open zip: {source}")))
}

/// Collects paragraph text from OOXML-style markup.
///
/// A paragraph is the text of every `t` element found between the start and
/// end of a `p` element, matched by local name so any namespace prefix works
/// (`w:p`/`w:t` in DOCX, `a:p`/`a:t` in PPTX). Paragraphs that end up empty
/// are dropped.
///
/// Whitespace inside text runs is kept while reading, because runs are often
/// split mid-sentence (`"rose "` + `"in"`); trimming each run would glue the
/// words together. Only the finished paragraph is trimmed. Self-closing
/// `br`, `tab` and `cr` elements contribute a single space for the same
/// reason.
///
/// # Errors
///
/// Returns an invalid-input error when the XML cannot be parsed.
fn extract_xml_paragraphs(xml: &str) -> Result<Vec<String>, WikiError> {
    // Text trimming is deliberately left at the reader's default (off): text
    // outside `t` elements is ignored anyway, and text inside must keep its
    // edges so adjacent runs do not merge.
    let mut reader = XmlReader::from_str(xml);
    let mut paragraphs = Vec::new();
    let mut current = String::new();
    let mut in_text = false;

    loop {
        match reader.read_event() {
            Ok(Event::Start(event)) => match local_name(event.name().as_ref()) {
                b"p" => current.clear(),
                b"t" => in_text = true,
                _ => {}
            },
            Ok(Event::Empty(event)) => {
                // Breaks and tabs separate words but carry no text of their own.
                if matches!(
                    local_name(event.name().as_ref()),
                    b"br" | b"tab" | b"cr"
                ) {
                    current.push(' ');
                }
            }
            Ok(Event::End(event)) => match local_name(event.name().as_ref()) {
                b"p" => {
                    let paragraph = single_line(current.trim());
                    if !paragraph.is_empty() {
                        paragraphs.push(paragraph);
                    }
                    current.clear();
                }
                b"t" => in_text = false,
                _ => {}
            },
            Ok(Event::Text(text)) if in_text => {
                // Text events carry escaped content, so entities are decoded here.
                let raw = String::from_utf8_lossy(text.as_ref());
                current.push_str(&decode_xml_entities(&raw));
            }
            Ok(Event::CData(text)) if in_text => {
                // CDATA is literal: anything that looks like an entity stays as written.
                current.push_str(&String::from_utf8_lossy(text.as_ref()));
            }
            Ok(Event::Eof) => break,
            Ok(_) => {}
            Err(error) => return Err(document_error(format!("parse xml: {error}"))),
        }
    }
    Ok(paragraphs)
}

/// Renders rows as a markdown table, using the first row as the header.
///
/// Every row is padded to the width of the widest row. An empty input
/// produces an empty string, and the result carries no trailing whitespace.
fn markdown_table(rows: &[Vec<String>]) -> String {
    let Some((header, body_rows)) = rows.split_first() else {
        return String::new();
    };
    let column_count = rows.iter().map(Vec::len).max().unwrap_or(1);
    let separators = vec!["---".to_string(); column_count];

    let mut table = String::new();
    push_table_row(&mut table, header, column_count);
    push_table_row(&mut table, &separators, column_count);
    for row in body_rows {
        push_table_row(&mut table, row, column_count);
    }
    table.trim_end().to_string()
}

/// Appends one table row of exactly `column_count` cells, followed by a
/// newline. Cells missing from `row` are left blank.
fn push_table_row(markdown: &mut String, row: &[String], column_count: usize) {
    markdown.push('|');
    for index in 0..column_count {
        let cell = match row.get(index) {
            Some(value) => escape_table_cell(value),
            None => String::new(),
        };
        markdown.push(' ');
        markdown.push_str(&cell);
        markdown.push_str(" |");
    }
    markdown.push('\n');
}

/// Renders a spreadsheet cell as single-line text; empty cells yield an
/// empty string.
fn cell_text(cell: &Data) -> String {
    if matches!(cell, Data::Empty) {
        String::new()
    } else {
        single_line(&cell.to_string())
    }
}

/// Makes cell text safe for a markdown table: flattens it with `single_line`
/// and escapes every `|` so it cannot end the cell.
fn escape_table_cell(cell: &str) -> String {
    let flattened = single_line(cell);
    flattened.replace('|', "\\|")
}

/// Returns the text of the first `<title>` element, passed through
/// `markdown_title`, or `None` when there is no title or it ends up empty.
///
/// The parsed text is used as-is: the HTML parser has already resolved
/// character references, so decoding entities again would turn a literal
/// `&amp;lt;` (displayed as `&lt;`) into `<`.
fn extract_html_title(document: &Html) -> Option<String> {
    let selector = Selector::parse("title").ok()?;
    let raw_title = document
        .select(&selector)
        .next()?
        .text()
        .collect::<Vec<_>>()
        .join(" ");
    let title = markdown_title(&raw_title);
    (!title.is_empty()).then_some(title)
}

/// Walks `element` and appends its visible text to `parts`, one entry per
/// block-level chunk.
///
/// Inline content accumulates until a block element is met; at that point
/// the pending text is flushed and the block is visited recursively.
/// `head`, `script` and `style` elements contribute nothing.
fn collect_visible_text(element: ElementRef<'_>, parts: &mut Vec<String>) {
    if matches!(element.value().name(), "head" | "script" | "style") {
        return;
    }
    let mut pending = String::new();
    for child in element.children() {
        if let Node::Text(text) = child.value() {
            append_inline_text(&mut pending, &text.text);
            continue;
        }
        // Anything that is neither text nor an element (e.g. a comment) is skipped.
        let Some(child_element) = ElementRef::wrap(child) else {
            continue;
        };
        if is_block_element(child_element.value().name()) {
            push_visible_part(parts, &mut pending);
            collect_visible_text(child_element, parts);
        } else {
            let nested = collect_inline_text(child_element);
            append_inline_text(&mut pending, &nested);
        }
    }
    push_visible_part(parts, &mut pending);
}

/// Gathers all text beneath an inline element into one string, recursing
/// into child elements. `head`, `script` and `style` yield an empty string.
fn collect_inline_text(element: ElementRef<'_>) -> String {
    let mut gathered = String::new();
    if matches!(element.value().name(), "head" | "script" | "style") {
        return gathered;
    }
    for child in element.children() {
        if let Node::Text(node_text) = child.value() {
            append_inline_text(&mut gathered, &node_text.text);
            continue;
        }
        // Non-element, non-text nodes are ignored.
        let Some(child_element) = ElementRef::wrap(child) else {
            continue;
        };
        let nested = collect_inline_text(child_element);
        append_inline_text(&mut gathered, &nested);
    }
    gathered
}

/// Appends trimmed `text` to `output`, inserting a single space first when
/// the two would otherwise run together.
///
/// No space is added when `output` is empty, already ends in whitespace, or
/// the new text begins with closing punctuation. Blank text is ignored.
fn append_inline_text(output: &mut String, text: &str) {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return;
    }
    let needs_separator = match output.chars().next_back() {
        Some(last) => !last.is_whitespace() && !starts_with_closing_punctuation(trimmed),
        None => false,
    };
    if needs_separator {
        output.push(' ');
    }
    output.push_str(trimmed);
}

/// Reports whether `text` begins with punctuation that attaches to the
/// preceding word: sentence punctuation, closing brackets and quotes, and
/// their CJK / typographic counterparts. Empty text yields `false`.
fn starts_with_closing_punctuation(text: &str) -> bool {
    const CLOSING: &[char] = &[
        '.', ',', ';', ':', '!', '?', ')', ']', '}', '"', '\'', '\u{201d}', '\u{2019}',
        '\u{203a}', '\u{00bb}', '\u{3002}', '\u{ff0c}', '\u{3001}', '\u{ff09}', '\u{3011}',
        '\u{3015}', '\u{3017}', '\u{300b}',
    ];
    text.starts_with(CLOSING)
}

/// Flushes the pending inline text into `parts` as one entry (skipping it
/// when it flattens to nothing) and resets the buffer.
fn push_visible_part(parts: &mut Vec<String>, inline: &mut String) {
    let flattened = single_line(inline.as_str());
    inline.clear();
    if !flattened.is_empty() {
        parts.push(flattened);
    }
}

/// Reports whether an HTML tag name starts a new text block during
/// extraction. The comparison is exact and case-sensitive.
fn is_block_element(name: &str) -> bool {
    const BLOCK_TAGS: &[&str] = &[
        "address",
        "article",
        "aside",
        "blockquote",
        "br",
        "dd",
        "div",
        "dl",
        "dt",
        "figcaption",
        "figure",
        "footer",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hr",
        "li",
        "main",
        "nav",
        "ol",
        "p",
        "pre",
        "section",
        "table",
        "td",
        "th",
        "tr",
        "ul",
    ];
    BLOCK_TAGS.contains(&name)
}

fn normalize_markdown_text(text: &str) -> String {
    decode_xml_entities(text)
        .lines()
        .map(single_line)
        .filter(|line| !line.is_empty())
        .fold(Vec::<String>::new(), |mut lines, line| {
            if lines.last() != Some(&line) {
                lines.push(line);
            }
            lines
        })
        .join("\n\n")
}

/// Strips the namespace prefix from a qualified XML name: returns everything
/// after the first `:`, or the whole name when there is no colon.
fn local_name(name: &[u8]) -> &[u8] {
    match name.iter().position(|&byte| byte == b':') {
        Some(colon) => &name[colon + 1..],
        None => name,
    }
}

/// Parses the slide number out of an entry name of the form
/// `ppt/slides/slide<N>.xml`; returns `None` for any other shape.
fn slide_number(name: &str) -> Option<usize> {
    let without_prefix = name.strip_prefix("ppt/slides/slide")?;
    let digits = without_prefix.strip_suffix(".xml")?;
    digits.parse::<usize>().ok()
}

/// Returns the file name's extension in ASCII lowercase, or `None` when
/// there is no extension.
fn extension(file_name: &str) -> Option<String> {
    let raw = Path::new(file_name).extension()?;
    let text = raw.to_str()?;
    Some(text.to_ascii_lowercase())
}

/// Decodes character entities in `text` with the HTML entity decoder and
/// returns the result as an owned string.
fn decode_xml_entities(text: &str) -> String {
    let decoded = html_escape::decode_html_entities(text);
    decoded.into_owned()
}

/// Builds a `WikiError::InvalidInput` whose `field` is `"document"` and whose
/// message is the converted `message` argument.
fn document_error(message: impl Into<String>) -> WikiError {
    let message: String = message.into();
    WikiError::InvalidInput {
        field: "document",
        message,
    }
}

#[cfg(test)]
mod tests {
    use std::io::Write;
    use std::path::Path;

    use super::*;
    use crate::ScopeIdentity;
    use crate::sources::SourceKind;
    use crate::store::MemoryWikiStore;

    /// Builds an in-memory ZIP archive containing one file per
    /// `(path, contents)` pair, written in the given order with default
    /// file options, and returns the archive bytes.
    fn zip_bytes(entries: &[(&str, &str)]) -> Vec<u8> {
        let mut archive = zip::ZipWriter::new(std::io::Cursor::new(Vec::new()));
        let file_options = zip::write::SimpleFileOptions::default();

        for &(entry_path, entry_body) in entries {
            archive
                .start_file(entry_path, file_options)
                .expect("start zip entry");
            archive
                .write_all(entry_body.as_bytes())
                .expect("write zip entry");
        }

        let finished = archive.finish().expect("finish zip");
        finished.into_inner()
    }

    /// Returns a ZIP archive holding a single `word/document.xml` entry with
    /// two paragraphs: "Quarterly Brief" and "Revenue rose in Duluth.".
    fn sample_docx() -> Vec<u8> {
        let document_xml = r#"<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>Quarterly Brief</w:t></w:r></w:p><w:p><w:r><w:t>Revenue rose in Duluth.</w:t></w:r></w:p></w:body></w:document>"#;
        let entries = [("word/document.xml", document_xml)];
        zip_bytes(&entries)
    }

    /// Returns a ZIP archive with two slide entries.
    ///
    /// `slide2.xml` is deliberately written before `slide1.xml`, so the
    /// archive order differs from the numeric slide order.
    fn sample_pptx() -> Vec<u8> {
        let second_slide = r#"<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"><p:cSld><p:spTree><p:sp><p:txBody><a:p><a:r><a:t>Second slide summary</a:t></a:r></a:p></p:txBody></p:sp></p:spTree></p:cSld></p:sld>"#;
        let first_slide = r#"<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"><p:cSld><p:spTree><p:sp><p:txBody><a:p><a:r><a:t>First slide title</a:t></a:r></a:p></p:txBody></p:sp></p:spTree></p:cSld></p:sld>"#;

        let entries = [
            ("ppt/slides/slide2.xml", second_slide),
            ("ppt/slides/slide1.xml", first_slide),
        ];
        zip_bytes(&entries)
    }

    /// Returns a ZIP archive with `slide_count` identical slide entries named
    /// `ppt/slides/slide1.xml` through `ppt/slides/slide{slide_count}.xml`.
    fn oversized_pptx(slide_count: usize) -> Vec<u8> {
        // Every slide shares the same minimal body; only the entry name varies.
        let slide_xml: &[u8] = br#"<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"><p:cSld><p:spTree><p:sp><p:txBody><a:p><a:r><a:t>Slide</a:t></a:r></a:p></p:txBody></p:sp></p:spTree></p:cSld></p:sld>"#;

        let mut archive = zip::ZipWriter::new(std::io::Cursor::new(Vec::new()));
        let file_options = zip::write::SimpleFileOptions::default();

        for index in 1..=slide_count {
            let entry_name = format!("ppt/slides/slide{index}.xml");
            archive
                .start_file(entry_name, file_options)
                .expect("start slide");
            archive.write_all(slide_xml).expect("write slide");
        }

        let finished = archive.finish().expect("finish zip");
        finished.into_inner()
    }

    /// Returns a ZIP archive with the five package parts of a one-sheet
    /// workbook: content types, package relationships, the workbook, the
    /// workbook relationships, and a `Data` sheet with a `City`/`Count`
    /// header row followed by a `Duluth`/`3` row.
    fn sample_xlsx() -> Vec<u8> {
        let content_types = r#"<?xml version="1.0" encoding="UTF-8"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Default Extension="xml" ContentType="application/xml"/><Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/><Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/></Types>"#;
        let package_rels = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/></Relationships>"#;
        let workbook = r#"<?xml version="1.0" encoding="UTF-8"?><workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"><sheets><sheet name="Data" sheetId="1" r:id="rId1"/></sheets></workbook>"#;
        let workbook_rels = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/></Relationships>"#;
        let sheet = r#"<?xml version="1.0" encoding="UTF-8"?><worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"><sheetData><row r="1"><c r="A1" t="inlineStr"><is><t>City</t></is></c><c r="B1" t="inlineStr"><is><t>Count</t></is></c></row><row r="2"><c r="A2" t="inlineStr"><is><t>Duluth</t></is></c><c r="B2"><v>3</v></c></row></sheetData></worksheet>"#;

        let entries = [
            ("[Content_Types].xml", content_types),
            ("_rels/.rels", package_rels),
            ("xl/workbook.xml", workbook),
            ("xl/_rels/workbook.xml.rels", workbook_rels),
            ("xl/worksheets/sheet1.xml", sheet),
        ];
        zip_bytes(&entries)
    }

    /// Ingests `bytes` as a document named `file_name` of the given `kind`
    /// into `store`, using the fixed `project-123` project scope, a
    /// `/tmp/{file_name}` location and a fixed fetch timestamp.
    ///
    /// Panics with "ingest document" if `ingest_document` returns an error.
    fn ingest_sample(
        temp: &Path,
        store: &mut MemoryWikiStore,
        file_name: &str,
        kind: SourceKind,
        bytes: Vec<u8>,
    ) -> DocumentIngestResult {
        let snapshot = DocumentSnapshot {
            location: format!("/tmp/{file_name}"),
            file_name: file_name.to_owned(),
            fetched_at: String::from("2026-05-31T20:00:00Z"),
            bytes,
            kind,
        };
        let scope = ScopeIdentity::project("project-123");

        ingest_document(temp, store, scope, snapshot).expect("ingest document")
    }

    /// Ingests one XLSX, DOCX, PPTX and HTML sample and checks that each keeps
    /// its original asset bytes and indexes a derived document containing
    /// both expected snippets; then checks that a non-ZIP `.docx` is still
    /// stored and indexed with the office parse-error degradation markers.
    #[test]
    fn extracts_office_html_and_degrades() {
        let temp = tempfile::tempdir().expect("tempdir");
        let root = temp.path();
        let mut store = MemoryWikiStore::default();

        let html_page = b"<!doctype html><html><head><title>Readable &amp; Useful</title><script>drop()</script></head><body><main><h1>Readable &amp; Useful</h1><p>Keep this body text.</p></main></body></html>".to_vec();

        // (file name, source kind, payload, first snippet, second snippet)
        let cases = [
            (
                "data.xlsx",
                SourceKind::Office,
                sample_xlsx(),
                "| City | Count |",
                "Duluth",
            ),
            (
                "brief.docx",
                SourceKind::Office,
                sample_docx(),
                "Quarterly Brief",
                "Revenue rose in Duluth.",
            ),
            (
                "deck.pptx",
                SourceKind::Office,
                sample_pptx(),
                "## Slide 1",
                "Second slide summary",
            ),
            (
                "page.html",
                SourceKind::Html,
                html_page,
                "# Readable & Useful",
                "Keep this body text.",
            ),
        ];

        for (file_name, kind, bytes, first_expected, second_expected) in cases {
            let result = ingest_sample(root, &mut store, file_name, kind.clone(), bytes.clone());
            assert_eq!(result.record.kind, kind);

            let stored_asset = std::fs::read(root.join(&result.asset_path)).expect("asset bytes");
            assert_eq!(stored_asset, bytes);

            let document = store
                .documents
                .get(&result.derived_path)
                .expect("derived document indexed");
            for expected in [first_expected, second_expected] {
                assert!(document.body.contains(expected), "{file_name}");
            }
        }

        // A payload that is not a ZIP archive must degrade rather than fail.
        let broken = b"not a zip";
        let degraded = ingest_sample(
            root,
            &mut store,
            "broken.docx",
            SourceKind::Office,
            broken.to_vec(),
        );

        let degraded_asset =
            std::fs::read(root.join(&degraded.asset_path)).expect("degraded asset");
        assert_eq!(degraded_asset, broken);
        assert!(degraded.document_degradation.is_some());

        let document = store
            .documents
            .get(&degraded.derived_path)
            .expect("degraded document indexed");
        let body = &document.body;
        assert!(body.contains("media_degradation: office_parse_error"));
        assert!(body.contains("## Document Parse Unavailable"));
    }

    /// Checks that a non-ZIP `.xlsx` and an HTML page whose body holds only a
    /// script element both keep their asset bytes and index a derived
    /// document carrying a `media_degradation` marker, the payload's
    /// `file_size_bytes`, and a unit count line.
    #[test]
    fn office_html_degradation_uses_uniform_metadata() {
        let temp = tempfile::tempdir().expect("tempdir");
        let root = temp.path();
        let mut store = MemoryWikiStore::default();

        // Office payload that cannot be opened as a ZIP archive.
        let office_bytes = b"not a zip".to_vec();
        let office_size_line = format!("file_size_bytes: {}", office_bytes.len());
        let office = ingest_sample(
            root,
            &mut store,
            "broken.xlsx",
            SourceKind::Office,
            office_bytes.clone(),
        );
        let office_asset = std::fs::read(root.join(&office.asset_path)).expect("office asset");
        assert_eq!(office_asset, office_bytes);

        let office_doc = store
            .documents
            .get(&office.derived_path)
            .expect("office derived document indexed");
        let office_body = &office_doc.body;
        assert!(office_body.contains("media_degradation: office_parse_error"));
        assert!(office_body.contains(office_size_line.as_str()));
        assert!(office_body.contains("sheet_count: 0"));

        // HTML payload whose body contains nothing but a script element.
        let html_bytes = b"<html><head></head><body><script>drop()</script></body></html>".to_vec();
        let html_size_line = format!("file_size_bytes: {}", html_bytes.len());
        let html = ingest_sample(
            root,
            &mut store,
            "empty.html",
            SourceKind::Html,
            html_bytes.clone(),
        );
        let html_asset = std::fs::read(root.join(&html.asset_path)).expect("html asset");
        assert_eq!(html_asset, html_bytes);

        let html_doc = store
            .documents
            .get(&html.derived_path)
            .expect("html derived document indexed");
        let html_body = &html_doc.body;
        assert!(html_body.contains("media_degradation: html_parse_error"));
        assert!(html_body.contains(html_size_line.as_str()));
        assert!(html_body.contains("page_count: 1"));
    }

    /// Rendering a table from zero rows must produce an empty string.
    #[test]
    fn markdown_table_handles_empty_rows() {
        let rendered = markdown_table(&[]);

        assert_eq!(rendered, "");
    }

    /// A `word/document.xml` entry one byte larger than `MAX_ENTRY_BYTES`
    /// must be rejected with an `InvalidInput` error that mentions the
    /// maximum supported XML entry size.
    #[test]
    fn office_zip_reads_are_bounded() {
        let entry_len = MAX_ENTRY_BYTES as usize + 1;
        let oversized_xml = "x".repeat(entry_len);
        let archive = zip_bytes(&[("word/document.xml", oversized_xml.as_str())]);

        let error = extract_docx(&archive).expect_err("oversized docx XML rejected");

        assert!(matches!(error, WikiError::InvalidInput { .. }));
        let rendered = error.to_string();
        assert!(rendered.contains("maximum supported XML entry"));
    }

    /// A deck with one slide more than `MAX_SLIDES` must still extract,
    /// report exactly `MAX_SLIDES` units, and include the truncation notice.
    #[test]
    fn pptx_slide_count_is_bounded() {
        let deck = oversized_pptx(MAX_SLIDES + 1);

        let extraction = extract_pptx(&deck).expect("oversized slide deck is truncated");

        assert_eq!(extraction.units_count, MAX_SLIDES);
        let truncation_notice = "_Slides truncated for bounded extraction._";
        assert!(extraction.markdown.contains(truncation_notice));
    }

    /// Text split across inline elements inside one paragraph must come out
    /// as a single line, with separate paragraphs divided by a blank line.
    #[test]
    fn html_extraction_combines_inline_text_nodes() {
        let page =
            b"<html><body><p>Hello <strong>world</strong>.</p><p>Next line.</p></body></html>";

        let extraction = extract_html_document(page).expect("html extracts");

        assert_eq!(extraction.markdown, "Hello world.\n\nNext line.");
    }

    /// A closing double quotation mark (U+201D) held in its own inline
    /// element must attach to the preceding word without an inserted space.
    #[test]
    fn html_extraction_avoids_spaces_before_closing_quotes() {
        let page =
            b"<html><body><p>Hello <span>world</span><span>\xe2\x80\x9d</span>.</p></body></html>";

        let extraction = extract_html_document(page).expect("html extracts");

        assert_eq!(extraction.markdown, "Hello world\u{201d}.");
    }
}