// office_oxide 0.1.1

use std::collections::HashMap;
use std::fs::File;
use std::hash::{Hash, Hasher};
use std::io::{Read, Seek, Write};
use std::path::Path;

use log::{debug, trace};
use zip::CompressionMethod;
use zip::read::ZipArchive;
use zip::write::{SimpleFileOptions, ZipWriter};

use super::content_types::{ContentTypes, ContentTypesBuilder};
use super::error::{Error, Result};
use super::properties::{AppProperties, CoreProperties};
use super::relationships::{Relationships, RelationshipsBuilder, TargetMode, rel_types};

// ---------------------------------------------------------------------------
// PartName
// ---------------------------------------------------------------------------

/// A validated, normalized OPC part name.
///
/// Part names always start with '/' and are compared case-insensitively per OPC §5.
///
/// The wrapped string keeps the spelling it was created with (after any
/// percent-decoding performed by [`PartName::new`]); only the `PartialEq` and
/// `Hash` implementations fold ASCII case, so `as_str` and `Display` return the
/// stored spelling unchanged.
#[derive(Debug, Clone)]
pub struct PartName(String);

/// Decode percent-encoded UTF-8 sequences (e.g., `%C3%A1` → `á`).
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// verbatim. If the decoded bytes are not valid UTF-8, the original `input`
/// is returned unchanged.
fn decode_percent_encoding(input: &str) -> String {
    let mut rest = input.as_bytes();
    let mut out = Vec::with_capacity(rest.len());

    while let Some((&byte, tail)) = rest.split_first() {
        if byte == b'%' {
            // An escape needs two more bytes, both of them hex digits.
            if let [hi, lo, after @ ..] = tail {
                if let (Some(high), Some(low)) = (hex_digit(*hi), hex_digit(*lo)) {
                    out.push(high << 4 | low);
                    rest = after;
                    continue;
                }
            }
        }
        // Ordinary byte, or a malformed escape: keep it as-is.
        out.push(byte);
        rest = tail;
    }

    match String::from_utf8(out) {
        Ok(decoded) => decoded,
        Err(_) => input.to_string(),
    }
}

/// Convert an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`) to its value 0–15.
///
/// Returns `None` for any other byte.
fn hex_digit(b: u8) -> Option<u8> {
    // `to_digit(16)` yields at most 15, so narrowing back to `u8` is lossless.
    char::from(b).to_digit(16).map(|value| value as u8)
}

impl PartName {
    /// Validate and create a new PartName per OPC part naming rules.
    /// Percent-encoded sequences are decoded before validation (tolerant parsing
    /// for files generated by LibreOffice which incorrectly percent-encodes non-ASCII).
    pub fn new(name: &str) -> Result<Self> {
        // Decode percent-encoding if present (tolerant of non-spec-compliant producers)
        let name = if name.contains('%') {
            decode_percent_encoding(name)
        } else {
            name.to_string()
        };

        // Must start with '/'
        if !name.starts_with('/') {
            return Err(Error::InvalidPartName(format!("must start with '/': {name}")));
        }
        // Must not end with '/'
        if name.len() > 1 && name.ends_with('/') {
            return Err(Error::InvalidPartName(format!("must not end with '/': {name}")));
        }
        // No empty segments (//)
        if name.contains("//") {
            return Err(Error::InvalidPartName(format!("empty segment (//): {name}")));
        }
        // No dot segments
        for segment in name.split('/').skip(1) {
            if segment == "." || segment == ".." {
                return Err(Error::InvalidPartName(format!("dot segment not allowed: {name}")));
            }
            // Segments must not end with '.'
            if segment.ends_with('.') {
                return Err(Error::InvalidPartName(format!("segment ends with dot: {name}")));
            }
        }
        // No query/fragment
        if name.contains('?') || name.contains('#') {
            return Err(Error::InvalidPartName(format!("query/fragment not allowed: {name}")));
        }
        // No backslashes
        if name.contains('\\') {
            return Err(Error::InvalidPartName(format!("backslash not allowed: {name}")));
        }

        Ok(Self(name))
    }

    /// Return the part name as a string slice (always starts with `/`).
    pub fn as_str(&self) -> &str {
        &self.0
    }

    /// Extract the file extension (without leading dot), if any.
    pub fn extension(&self) -> Option<&str> {
        let filename = self.filename();
        filename.rfind('.').map(|i| &filename[i + 1..])
    }

    /// The directory portion, e.g. "/word/" for "/word/document.xml".
    pub fn directory(&self) -> &str {
        match self.0.rfind('/') {
            Some(i) => &self.0[..=i],
            None => "/",
        }
    }

    /// The filename portion, e.g. "document.xml" for "/word/document.xml".
    pub fn filename(&self) -> &str {
        match self.0.rfind('/') {
            Some(i) => &self.0[i + 1..],
            None => &self.0,
        }
    }

    /// Compute the path to this part's relationships file.
    /// e.g. "/word/document.xml" -> "/word/_rels/document.xml.rels"
    pub fn rels_path(&self) -> String {
        let dir = self.directory();
        let file = self.filename();
        format!("{dir}_rels/{file}.rels")
    }

    /// Resolve a relative URI against this part name's directory.
    pub fn resolve_relative(&self, relative: &str) -> Result<PartName> {
        if relative.starts_with('/') {
            // Absolute path
            return PartName::new(relative);
        }

        let base = self.directory();
        let combined = format!("{base}{relative}");

        // Normalize path by resolving .. and .
        let normalized = normalize_path(&combined);
        PartName::new(&normalized)
    }
}

/// Case-insensitive equality per OPC spec §5.
///
/// Only ASCII letters are case-folded; all other bytes must match exactly.
impl PartialEq for PartName {
    fn eq(&self, other: &Self) -> bool {
        let (lhs, rhs) = (self.0.as_bytes(), other.0.as_bytes());
        lhs.len() == rhs.len() && lhs.iter().zip(rhs).all(|(a, b)| a.eq_ignore_ascii_case(b))
    }
}

// Marker impl: the ASCII case-insensitive comparison above is reflexive,
// symmetric and transitive, which is what `Eq` (and use as a hash key) requires.
impl Eq for PartName {}

/// Case-insensitive hashing to match case-insensitive equality.
///
/// Each byte is fed to the hasher after ASCII lower-casing, so names that
/// compare equal always produce the same hash.
impl Hash for PartName {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.0
            .bytes()
            .map(|raw| raw.to_ascii_lowercase())
            .for_each(|folded| state.write_u8(folded));
    }
}

impl std::fmt::Display for PartName {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}

/// Normalize a path by resolving `.` and `..` segments.
///
/// Empty segments (from `//` or a trailing `/`) and `.` are dropped, `..`
/// removes the previous segment but never climbs above the root, and a path
/// that collapses to nothing becomes `/`.
fn normalize_path(path: &str) -> String {
    let mut stack: Vec<&str> = Vec::new();

    for part in path.split('/') {
        if part == ".." {
            // Index 0 holds the root marker; never pop it.
            if stack.len() > 1 {
                stack.pop();
            }
        } else if part.is_empty() || part == "." {
            // The first skipped segment doubles as the root marker, which is
            // what makes `join` emit the leading slash.
            if stack.is_empty() {
                stack.push("");
            }
        } else {
            stack.push(part);
        }
    }

    if stack == [""] {
        "/".to_string()
    } else {
        stack.join("/")
    }
}

// ---------------------------------------------------------------------------
// OpcReader
// ---------------------------------------------------------------------------

/// Reader for OPC (Open Packaging Conventions) packages (ZIP-based).
///
/// `[Content_Types].xml` and `_rels/.rels` are parsed once when the reader is
/// constructed; individual parts are read from the archive on demand.
pub struct OpcReader<R: Read + Seek> {
    /// Underlying ZIP archive that part bytes are read from.
    archive: ZipArchive<R>,
    /// Content type table parsed from `[Content_Types].xml`.
    content_types: ContentTypes,
    /// Package-level relationships parsed from `_rels/.rels`
    /// (empty when that entry could not be read or has no content).
    package_rels: Relationships,
}

impl OpcReader<File> {
    /// Open the OPC package stored at `path`.
    ///
    /// # Errors
    /// Fails if the file cannot be opened or if [`OpcReader::new`] rejects its
    /// contents.
    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
        Self::new(File::open(path)?)
    }
}

#[cfg(feature = "mmap")]
impl OpcReader<std::io::Cursor<memmap2::Mmap>> {
    /// Open an OPC package using memory-mapped I/O for better performance on large files.
    ///
    /// # Safety
    /// Uses `unsafe` for memory mapping. The file must not be modified while mapped.
    ///
    /// # Errors
    /// Fails if the file cannot be opened or mapped, or if [`OpcReader::new`]
    /// rejects its contents.
    pub fn open_mmap(path: impl AsRef<Path>) -> Result<Self> {
        let handle = File::open(path)?;
        // SAFETY: the mapping is only ever read. Nothing here can stop another
        // process from modifying or truncating the file while it is mapped;
        // that requirement is passed on to the caller via the doc comment above.
        let mapping = unsafe { memmap2::Mmap::map(&handle) }?;
        debug!("OPC package opened via mmap ({} bytes)", mapping.len());
        Self::new(std::io::Cursor::new(mapping))
    }
}

impl<R: Read + Seek> OpcReader<R> {
    /// Create an OPC reader from any `Read + Seek` source.
    pub fn new(reader: R) -> Result<Self> {
        let mut archive = ZipArchive::new(reader)?;
        debug!("OPC package opened, {} entries", archive.len());

        // Eagerly parse [Content_Types].xml
        let ct_data = read_zip_entry(&mut archive, "[Content_Types].xml")?;
        let content_types = ContentTypes::parse(&ct_data)?;

        // Eagerly parse _rels/.rels
        let rels_data = read_zip_entry(&mut archive, "_rels/.rels").unwrap_or_default();
        let package_rels = if rels_data.is_empty() {
            Relationships::empty()
        } else {
            Relationships::parse(&rels_data)?
        };

        Ok(Self {
            archive,
            content_types,
            package_rels,
        })
    }

    /// Return the parsed `[Content_Types].xml` table.
    pub fn content_types(&self) -> &ContentTypes {
        &self.content_types
    }

    /// Return the package-level relationships from `_rels/.rels`.
    pub fn package_rels(&self) -> &Relationships {
        &self.package_rels
    }

    /// Read a part's raw bytes.
    /// XML parts with non-UTF-8 encoding declarations are automatically transcoded to UTF-8.
    pub fn read_part(&mut self, name: &PartName) -> Result<Vec<u8>> {
        let zip_path = &name.as_str()[1..]; // strip leading /
        let data = read_zip_entry(&mut self.archive, zip_path)?;
        trace!("read_part '{}' ({} bytes)", name, data.len());

        // Transcode non-UTF-8 XML to UTF-8 (handles ISO-8859-1, Windows-1252, etc.)
        if name.as_str().ends_with(".xml") || name.as_str().ends_with(".rels") {
            if let Some(utf8_data) = super::xml::ensure_utf8(&data) {
                trace!("read_part '{}': transcoded to UTF-8", name);
                return Ok(utf8_data);
            }
        }
        Ok(data)
    }

    /// Read and parse the relationships for a given part.
    pub fn read_rels_for(&mut self, part: &PartName) -> Result<Relationships> {
        let rels_zip_path = part.rels_path();
        let zip_path = &rels_zip_path[1..]; // strip leading /
        match read_zip_entry(&mut self.archive, zip_path) {
            Ok(data) => {
                trace!("read_rels_for '{}' ({} bytes)", part, data.len());
                Relationships::parse(&data)
            },
            Err(Error::Zip(zip::result::ZipError::FileNotFound)) => {
                trace!("read_rels_for '{}': no rels file", part);
                Ok(Relationships::empty())
            },
            Err(Error::MissingPart(_)) => {
                trace!("read_rels_for '{}': no rels file", part);
                Ok(Relationships::empty())
            },
            Err(e) => Err(e),
        }
    }

    /// Find the main document part via the officeDocument relationship.
    /// Falls back to scanning `[Content_Types].xml` overrides when `_rels/.rels` is
    /// missing or corrupted.
    pub fn main_document_part(&self) -> Result<PartName> {
        // Primary: use the officeDocument relationship from _rels/.rels
        if let Some(rel) = self.package_rels.first_by_type(rel_types::OFFICE_DOCUMENT) {
            let target = if rel.target.starts_with('/') {
                rel.target.clone()
            } else {
                format!("/{}", rel.target)
            };
            return PartName::new(&target);
        }

        // Fallback: scan [Content_Types].xml overrides for a main document content type
        const MAIN_CONTENT_TYPES: &[&str] = &[
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml",
        ];
        for (part_name, ct) in self.content_types.overrides() {
            if MAIN_CONTENT_TYPES.iter().any(|&expected| ct == expected) {
                debug!(
                    "main_document_part: fallback to Content_Types override '{}' ({})",
                    part_name, ct
                );
                return Ok(part_name.clone());
            }
        }

        Err(Error::RelationshipNotFound("officeDocument relationship not found".to_string()))
    }

    /// List all part names in the package (excluding special OPC files).
    pub fn part_names(&self) -> Vec<PartName> {
        self.archive
            .file_names()
            .filter_map(|name| {
                // Normalize backslash separators (Windows-created ZIPs)
                let name = name.replace('\\', "/");
                // Skip directories
                if name.ends_with('/') {
                    return None;
                }
                // Skip [Content_Types].xml
                if name.eq_ignore_ascii_case("[Content_Types].xml") {
                    return None;
                }
                // Skip .rels files in _rels directories
                if name.contains("_rels/") {
                    return None;
                }
                let part_name = format!("/{name}");
                PartName::new(&part_name).ok()
            })
            .collect()
    }

    /// Check if a part exists in the package.
    pub fn has_part(&self, name: &PartName) -> bool {
        let zip_path = &name.as_str()[1..];
        self.archive.file_names().any(|n| {
            let normalized = n.replace('\\', "/");
            normalized.eq_ignore_ascii_case(zip_path)
        })
    }
}

/// Read a ZIP entry by name, returning its bytes.
/// Falls back to case-insensitive and backslash-normalized lookup if exact match fails.
///
/// A checksum failure reported after data was produced is tolerated and the
/// decompressed bytes are returned anyway.
///
/// # Errors
/// Returns `Error::MissingPart` when no entry matches `name` (or the matched
/// entry cannot be opened), and propagates any other read error.
pub(crate) fn read_zip_entry<R: Read + Seek>(
    archive: &mut ZipArchive<R>,
    name: &str,
) -> Result<Vec<u8>> {
    // Upper bound for the up-front buffer reservation. The uncompressed size is
    // taken from archive metadata and may be wrong or hostile; reserving it
    // blindly lets a tiny crafted file request an enormous allocation.
    // `read_to_end` still grows the buffer as far as the real data requires.
    const MAX_PREALLOC_BYTES: u64 = 16 * 1024 * 1024;

    // Try exact match first
    let index = match archive.index_for_name(name) {
        Some(i) => i,
        None => {
            // Fall back to fuzzy match: case-insensitive + backslash normalization.
            // Handles [Content_Types].xml case variants and Windows-created ZIPs
            // that use backslash separators (e.g., `_rels\.rels`).
            let normalized = name.replace('\\', "/");
            let mut found = None;
            for i in 0..archive.len() {
                if let Ok(entry) = archive.by_index_raw(i) {
                    let entry_name = entry.name().replace('\\', "/");
                    if entry_name.eq_ignore_ascii_case(&normalized) {
                        found = Some(i);
                        break;
                    }
                }
            }
            found.ok_or_else(|| Error::MissingPart(name.to_string()))?
        },
    };
    let mut file = archive
        .by_index(index)
        .map_err(|_| Error::MissingPart(name.to_string()))?;

    // The value is capped at 16 MiB, so the narrowing cast cannot truncate.
    let reserve = file.size().min(MAX_PREALLOC_BYTES) as usize;
    let mut buf = Vec::with_capacity(reserve);
    match file.read_to_end(&mut buf) {
        Ok(_) => Ok(buf),
        Err(e)
            if e.kind() == std::io::ErrorKind::InvalidData
                && e.to_string().contains("checksum")
                && !buf.is_empty() =>
        {
            // CRC32 mismatch — the data was fully decompressed but the checksum
            // doesn't match. Accept the data anyway for tolerance of real-world
            // files with minor corruption (e.g., re-saved without recomputing CRC).
            trace!("read_zip_entry '{}': ignoring CRC mismatch", name);
            Ok(buf)
        },
        Err(e) => Err(e.into()),
    }
}

// ---------------------------------------------------------------------------
// OpcWriter
// ---------------------------------------------------------------------------

/// Writer for creating OPC packages (ZIP-based).
///
/// Part data is written to the ZIP as soon as it is added; the content-type
/// table and all relationships are collected in memory and only written out
/// by `finish`.
pub struct OpcWriter<W: Write + Seek> {
    /// Underlying ZIP writer that receives the package entries.
    writer: ZipWriter<W>,
    /// Collects the content-type overrides serialized into `[Content_Types].xml`.
    content_types: ContentTypesBuilder,
    /// Collects the package-level relationships serialized into `_rels/.rels`.
    package_rels: RelationshipsBuilder,
    /// Part-level relationship builders, keyed by the source part name string
    /// (e.g. `/word/document.xml`).
    part_rels: HashMap<String, RelationshipsBuilder>,
}

impl OpcWriter<File> {
    /// Create (or truncate) the file at `path` and start a new OPC package in it.
    ///
    /// # Errors
    /// Fails if the file cannot be created.
    pub fn create(path: impl AsRef<Path>) -> Result<Self> {
        Self::new(File::create(path)?)
    }
}

impl<W: Write + Seek> OpcWriter<W> {
    /// Create a new OPC package writer wrapping an arbitrary `Write + Seek` sink.
    pub fn new(writer: W) -> Result<Self> {
        Ok(Self {
            writer: ZipWriter::new(writer),
            content_types: ContentTypesBuilder::new(),
            package_rels: RelationshipsBuilder::new(),
            part_rels: HashMap::new(),
        })
    }

    /// Add a part to the package.
    pub fn add_part(&mut self, name: &PartName, content_type: &str, data: &[u8]) -> Result<()> {
        // Register content type override
        self.content_types.add_override(name.clone(), content_type);

        // Write to ZIP
        let zip_path = &name.as_str()[1..]; // strip leading /
        let options = SimpleFileOptions::default().compression_method(CompressionMethod::Deflated);
        self.writer.start_file(zip_path, options)?;
        self.writer.write_all(data)?;
        Ok(())
    }

    /// Add a package-level relationship and return its rId.
    pub fn add_package_rel(&mut self, rel_type: &str, target: &str) -> String {
        self.package_rels.add(rel_type, target)
    }

    /// Add a part-level relationship and return its rId.
    pub fn add_part_rel(&mut self, source: &PartName, rel_type: &str, target: &str) -> String {
        self.part_rels
            .entry(source.as_str().to_string())
            .or_default()
            .add(rel_type, target)
    }

    /// Add a part-level relationship with an explicit target mode and return its rId.
    pub fn add_part_rel_with_mode(
        &mut self,
        source: &PartName,
        rel_type: &str,
        target: &str,
        target_mode: TargetMode,
    ) -> String {
        self.part_rels
            .entry(source.as_str().to_string())
            .or_default()
            .add_with_mode(rel_type, target, target_mode)
    }

    /// Set core properties (Dublin Core metadata) for the package.
    pub fn set_core_properties(&mut self, props: &CoreProperties) -> Result<()> {
        let data = props.serialize();
        let name = PartName::new("/docProps/core.xml")?;
        self.content_types.add_override(
            name.clone(),
            "application/vnd.openxmlformats-package.core-properties+xml",
        );
        let zip_path = &name.as_str()[1..];
        let options = SimpleFileOptions::default().compression_method(CompressionMethod::Deflated);
        self.writer.start_file(zip_path, options)?;
        self.writer.write_all(&data)?;
        self.package_rels
            .add(rel_types::CORE_PROPERTIES, "docProps/core.xml");
        Ok(())
    }

    /// Set app/extended properties for the package.
    pub fn set_app_properties(&mut self, props: &AppProperties) -> Result<()> {
        let data = props.serialize();
        let name = PartName::new("/docProps/app.xml")?;
        self.content_types.add_override(
            name.clone(),
            "application/vnd.openxmlformats-officedocument.extended-properties+xml",
        );
        let zip_path = &name.as_str()[1..];
        let options = SimpleFileOptions::default().compression_method(CompressionMethod::Deflated);
        self.writer.start_file(zip_path, options)?;
        self.writer.write_all(&data)?;
        self.package_rels
            .add(rel_types::EXTENDED_PROPERTIES, "docProps/app.xml");
        Ok(())
    }

    /// Finalize the OPC package: writes `[Content_Types].xml`, `_rels/.rels`,
    /// part-level `.rels` files, and closes the ZIP archive.
    pub fn finish(mut self) -> Result<W> {
        let options = SimpleFileOptions::default().compression_method(CompressionMethod::Deflated);

        // Write part-level .rels files
        for (source_path, builder) in &self.part_rels {
            if builder.is_empty() {
                continue;
            }
            let source = PartName::new(source_path)?;
            let rels_path = source.rels_path();
            let zip_path = &rels_path[1..]; // strip leading /
            let data = builder.serialize();
            self.writer.start_file(zip_path, options)?;
            self.writer.write_all(&data)?;
        }

        // Write _rels/.rels
        let rels_data = self.package_rels.serialize();
        self.writer.start_file("_rels/.rels", options)?;
        self.writer.write_all(&rels_data)?;

        // Write [Content_Types].xml
        let ct_data = self.content_types.serialize();
        self.writer.start_file("[Content_Types].xml", options)?;
        self.writer.write_all(&ct_data)?;

        Ok(self.writer.finish()?)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn valid_part_names() {
        assert!(PartName::new("/word/document.xml").is_ok());
        assert!(PartName::new("/xl/worksheets/sheet1.xml").is_ok());
        assert!(PartName::new("/docProps/core.xml").is_ok());
        assert!(PartName::new("/word/media/image1.png").is_ok());
    }

    #[test]
    fn invalid_part_names() {
        assert!(PartName::new("").is_err()); // empty
        assert!(PartName::new("word/document.xml").is_err()); // no leading /
        assert!(PartName::new("/word/document.xml/").is_err()); // trailing /
        assert!(PartName::new("/word//document.xml").is_err()); // empty segment
        assert!(PartName::new("/word/./document.xml").is_err()); // dot segment
        assert!(PartName::new("/word/../document.xml").is_err()); // dotdot segment
        assert!(PartName::new("/word/document.").is_err()); // segment ends with dot
        assert!(PartName::new("/word/doc.xml?v=1").is_err()); // query string
        assert!(PartName::new("/word/doc.xml#frag").is_err()); // fragment
        assert!(PartName::new("/word\\document.xml").is_err()); // backslash
    }

    /// Percent-encoding is decoded tolerantly instead of being rejected.
    #[test]
    fn percent_encoded_part_names_are_decoded() {
        let pn = PartName::new("/word/my%20doc.xml").unwrap();
        assert_eq!(pn.as_str(), "/word/my doc.xml");

        // Multi-byte UTF-8 sequence
        let pn = PartName::new("/word/%C3%A1.xml").unwrap();
        assert_eq!(pn.as_str(), "/word/á.xml");

        // A '%' that does not start a valid escape is kept as-is
        let pn = PartName::new("/word/100%.xml").unwrap();
        assert_eq!(pn.as_str(), "/word/100%.xml");
    }

    #[test]
    fn part_name_case_insensitive_eq() {
        let a = PartName::new("/Word/Document.xml").unwrap();
        let b = PartName::new("/word/document.xml").unwrap();
        assert_eq!(a, b);
    }

    #[test]
    fn part_name_case_insensitive_hash() {
        use std::collections::HashSet;
        let mut set = HashSet::new();
        set.insert(PartName::new("/Word/Document.xml").unwrap());
        assert!(set.contains(&PartName::new("/word/document.xml").unwrap()));
    }

    #[test]
    fn part_name_components() {
        let pn = PartName::new("/word/document.xml").unwrap();
        assert_eq!(pn.directory(), "/word/");
        assert_eq!(pn.filename(), "document.xml");
        assert_eq!(pn.extension(), Some("xml"));
        assert_eq!(pn.rels_path(), "/word/_rels/document.xml.rels");
    }

    #[test]
    fn normalize_path_resolves_dot_segments() {
        assert_eq!(normalize_path("/word/../docProps/core.xml"), "/docProps/core.xml");
        assert_eq!(normalize_path("/word/./media/image1.png"), "/word/media/image1.png");
        assert_eq!(normalize_path("/../a"), "/a"); // never climbs above root
        assert_eq!(normalize_path("/"), "/");
    }

    #[test]
    fn resolve_relative_simple() {
        let source = PartName::new("/word/document.xml").unwrap();
        let resolved = source.resolve_relative("media/image1.png").unwrap();
        assert_eq!(resolved.as_str(), "/word/media/image1.png");
    }

    #[test]
    fn resolve_relative_parent() {
        let source = PartName::new("/word/document.xml").unwrap();
        let resolved = source.resolve_relative("../docProps/core.xml").unwrap();
        assert_eq!(resolved.as_str(), "/docProps/core.xml");
    }

    #[test]
    fn resolve_relative_does_not_escape_root() {
        let source = PartName::new("/word/document.xml").unwrap();
        let resolved = source.resolve_relative("../../customXml/item1.xml").unwrap();
        assert_eq!(resolved.as_str(), "/customXml/item1.xml");
    }

    #[test]
    fn resolve_relative_absolute() {
        let source = PartName::new("/word/document.xml").unwrap();
        let resolved = source.resolve_relative("/xl/workbook.xml").unwrap();
        assert_eq!(resolved.as_str(), "/xl/workbook.xml");
    }

    #[test]
    fn opc_round_trip() {
        use std::io::Cursor;

        let buf = Vec::new();
        let cursor = Cursor::new(buf);
        let mut writer = OpcWriter::new(cursor).unwrap();

        // Add a part
        let doc_name = PartName::new("/word/document.xml").unwrap();
        writer
            .add_part(
                &doc_name,
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml",
                b"<document/>",
            )
            .unwrap();
        writer.add_package_rel(rel_types::OFFICE_DOCUMENT, "word/document.xml");

        let result = writer.finish().unwrap();
        let data = result.into_inner();

        // Read it back
        let cursor = Cursor::new(data);
        let mut reader = OpcReader::new(cursor).unwrap();

        // Check content types
        let ct = reader.content_types().resolve(&doc_name);
        assert_eq!(
            ct,
            Some(
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"
            )
        );

        // Check package rels
        let main = reader.main_document_part().unwrap();
        assert_eq!(main.as_str(), "/word/document.xml");

        // Read part content
        let content = reader.read_part(&doc_name).unwrap();
        assert_eq!(content, b"<document/>");

        // Check part list
        let parts = reader.part_names();
        assert!(parts.iter().any(|p| p == &doc_name));
    }
}