axiomsync 1.0.0

use std::collections::HashMap;
use std::path::Path;

use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParsedDocument {
    pub parser: String,
    pub is_text: bool,
    pub title: Option<String>,
    pub text_preview: String,
    #[serde(skip)]
    pub normalized_text: Option<String>,
    pub line_count: usize,
    pub tags: Vec<String>,
}

#[derive(Debug, Clone, Default)]
pub struct ParserRegistry;

impl ParserRegistry {
    #[must_use]
    pub const fn new() -> Self {
        Self
    }

    #[must_use]
    pub fn parse_file(&self, path: &Path, bytes: &[u8]) -> ParsedDocument {
        let ext = path
            .extension()
            .and_then(|s| s.to_str())
            .unwrap_or_default()
            .to_ascii_lowercase();

        match ext.as_str() {
            "md" | "markdown" => parse_markdown(bytes),
            "json" => parse_json(bytes),
            "yaml" | "yml" => parse_yaml(bytes),
            "toml" => parse_toml(bytes),
            "jsonl" => parse_jsonl(bytes),
            "xml" => parse_xml(bytes),
            _ => {
                if let Ok(text) = std::str::from_utf8(bytes) {
                    return parse_plain_text(text);
                }
                parse_binary(bytes)
            }
        }
    }
}

fn parse_markdown(bytes: &[u8]) -> ParsedDocument {
    let text = String::from_utf8_lossy(bytes);
    let normalized = normalize_markdown_for_indexing(&text);
    let mut title = None;
    for line in normalized.lines() {
        let trimmed = line.trim();
        if let Some(rest) = trimmed.strip_prefix("# ") {
            title = Some(rest.trim().to_string());
            break;
        }
        if !trimmed.is_empty() && title.is_none() && !is_markdown_rule_line(trimmed) {
            title = Some(trimmed.to_string());
        }
    }

    let preview = normalized.chars().take(240).collect::<String>();
    ParsedDocument {
        parser: "markdown".to_string(),
        is_text: true,
        title,
        text_preview: preview,
        normalized_text: Some(normalized.clone()),
        line_count: normalized.lines().count(),
        tags: vec!["markdown".to_string()],
    }
}

fn parse_plain_text(text: &str) -> ParsedDocument {
    let preview = text.chars().take(240).collect::<String>();
    let title = text
        .lines()
        .map(str::trim)
        .find(|line| !line.is_empty())
        .map(ToString::to_string);

    ParsedDocument {
        parser: "text".to_string(),
        is_text: true,
        title,
        text_preview: preview,
        normalized_text: Some(text.to_string()),
        line_count: text.lines().count(),
        tags: vec!["text".to_string()],
    }
}

fn parse_json(bytes: &[u8]) -> ParsedDocument {
    let Ok(text) = std::str::from_utf8(bytes) else {
        return parse_binary(bytes);
    };
    let Ok(value) = serde_json::from_str::<serde_json::Value>(text) else {
        return parse_plain_text(text);
    };

    let mut lines = Vec::new();
    flatten_json_value(&value, "$", &mut lines);
    if lines.is_empty() {
        lines.push("$={}".to_string());
    }
    structured_document(
        "json",
        infer_title_from_json_value(&value),
        lines,
        vec!["json".to_string(), "config".to_string()],
    )
}

fn parse_yaml(bytes: &[u8]) -> ParsedDocument {
    let Ok(text) = std::str::from_utf8(bytes) else {
        return parse_binary(bytes);
    };
    let Ok(value) = serde_norway::from_str::<serde_norway::Value>(text) else {
        return parse_plain_text(text);
    };
    let Ok(json_value) = serde_json::to_value(value) else {
        return parse_plain_text(text);
    };

    let mut lines = Vec::new();
    flatten_json_value(&json_value, "$", &mut lines);
    if lines.is_empty() {
        lines.push("$={}".to_string());
    }
    structured_document(
        "yaml",
        infer_title_from_json_value(&json_value),
        lines,
        vec!["yaml".to_string(), "config".to_string()],
    )
}

fn parse_toml(bytes: &[u8]) -> ParsedDocument {
    let Ok(text) = std::str::from_utf8(bytes) else {
        return parse_binary(bytes);
    };
    let Ok(value) = text.parse::<toml::Value>() else {
        return parse_plain_text(text);
    };
    let Ok(json_value) = serde_json::to_value(value) else {
        return parse_plain_text(text);
    };

    let mut lines = Vec::new();
    flatten_json_value(&json_value, "$", &mut lines);
    if lines.is_empty() {
        lines.push("$={}".to_string());
    }
    structured_document(
        "toml",
        infer_title_from_json_value(&json_value),
        lines,
        vec!["toml".to_string(), "config".to_string()],
    )
}

fn parse_jsonl(bytes: &[u8]) -> ParsedDocument {
    let Ok(text) = std::str::from_utf8(bytes) else {
        return parse_binary(bytes);
    };

    let mut lines = Vec::new();
    let mut key_histogram = HashMap::<String, usize>::new();
    let mut parsed_count = 0usize;
    let mut inferred_title = None::<String>;

    for (line_idx, line) in text.lines().enumerate() {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        let Ok(value) = serde_json::from_str::<serde_json::Value>(trimmed) else {
            continue;
        };
        parsed_count = parsed_count.saturating_add(1);
        if inferred_title.is_none() {
            inferred_title = infer_title_from_json_value(&value);
        }
        update_top_level_key_histogram(&value, &mut key_histogram);
        flatten_json_value(
            &value,
            &format!("$[{}]", line_idx.saturating_add(1)),
            &mut lines,
        );
    }

    if parsed_count == 0 {
        return parse_plain_text(text);
    }

    let mut normalized = Vec::new();
    normalized.push(format!("jsonl.records={parsed_count}"));
    let dominant_keys = dominant_keys(&key_histogram, 8);
    if !dominant_keys.is_empty() {
        normalized.push(format!("jsonl.keys={}", dominant_keys.join(",")));
    }
    normalized.extend(lines);

    structured_document(
        "jsonl",
        inferred_title,
        normalized,
        vec!["jsonl".to_string(), "data".to_string()],
    )
}

fn parse_xml(bytes: &[u8]) -> ParsedDocument {
    let Ok(text) = std::str::from_utf8(bytes) else {
        return parse_binary(bytes);
    };
    let lines = flatten_xml(text);
    if lines.is_empty() {
        return parse_plain_text(text);
    }

    let title = lines
        .iter()
        .find_map(|line| {
            let (path, value) = line.split_once('=')?;
            if path.ends_with("/title") || path.ends_with("/name") || path.ends_with("/id") {
                return Some(value.to_string());
            }
            None
        })
        .filter(|value| !value.trim().is_empty());

    structured_document(
        "xml",
        title,
        lines,
        vec!["xml".to_string(), "data".to_string()],
    )
}

fn structured_document(
    parser: &str,
    title: Option<String>,
    lines: Vec<String>,
    tags: Vec<String>,
) -> ParsedDocument {
    let normalized = lines.join("\n");
    ParsedDocument {
        parser: parser.to_string(),
        is_text: true,
        title,
        text_preview: preview_from_lines(&lines),
        normalized_text: Some(normalized),
        line_count: lines.len(),
        tags,
    }
}

fn preview_from_lines(lines: &[String]) -> String {
    let joined = lines
        .iter()
        .take(12)
        .cloned()
        .collect::<Vec<_>>()
        .join("\n");
    joined.chars().take(240).collect::<String>()
}

fn infer_title_from_json_value(value: &serde_json::Value) -> Option<String> {
    let object = value.as_object()?;
    for key in ["title", "name", "id"] {
        if let Some(text) = object.get(key).and_then(serde_json::Value::as_str) {
            let trimmed = text.trim();
            if !trimmed.is_empty() {
                return Some(trimmed.to_string());
            }
        }
    }
    None
}

fn flatten_json_value(value: &serde_json::Value, path: &str, out: &mut Vec<String>) {
    match value {
        serde_json::Value::Object(map) => {
            if map.is_empty() {
                out.push(format!("{path}={{}}"));
                return;
            }
            let mut keys = map.keys().cloned().collect::<Vec<_>>();
            keys.sort();
            for key in keys {
                if let Some(child) = map.get(&key) {
                    let next_path = if path == "$" {
                        format!("$.{key}")
                    } else {
                        format!("{path}.{key}")
                    };
                    flatten_json_value(child, &next_path, out);
                }
            }
        }
        serde_json::Value::Array(list) => {
            if list.is_empty() {
                out.push(format!("{path}=[]"));
                return;
            }
            for (idx, child) in list.iter().enumerate() {
                flatten_json_value(child, &format!("{path}[{idx}]"), out);
            }
        }
        _ => {
            out.push(format!("{path}={}", json_scalar_text(value)));
        }
    }
}

fn json_scalar_text(value: &serde_json::Value) -> String {
    match value {
        serde_json::Value::String(text) => normalize_scalar_text(text),
        serde_json::Value::Bool(v) => v.to_string(),
        serde_json::Value::Number(v) => v.to_string(),
        serde_json::Value::Null => "null".to_string(),
        serde_json::Value::Array(_) | serde_json::Value::Object(_) => "{}".to_string(),
    }
}

fn normalize_scalar_text(raw: &str) -> String {
    let collapsed = raw.split_whitespace().collect::<Vec<_>>().join(" ");
    collapsed.chars().take(240).collect::<String>()
}

fn update_top_level_key_histogram(
    value: &serde_json::Value,
    histogram: &mut HashMap<String, usize>,
) {
    let Some(map) = value.as_object() else {
        return;
    };
    for key in map.keys() {
        *histogram.entry(key.clone()).or_insert(0) += 1;
    }
}

fn dominant_keys(histogram: &HashMap<String, usize>, limit: usize) -> Vec<String> {
    let mut pairs = histogram
        .iter()
        .map(|(key, count)| (key.clone(), *count))
        .collect::<Vec<_>>();
    pairs.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    pairs.into_iter().take(limit).map(|(key, _)| key).collect()
}

fn flatten_xml(text: &str) -> Vec<String> {
    let mut out = Vec::new();
    let mut stack = Vec::<String>::new();
    let mut index = 0usize;
    let bytes = text.as_bytes();

    while index < bytes.len() {
        if bytes[index] == b'<' {
            if text[index..].starts_with("<!--") {
                if let Some(end) = text[index + 4..].find("-->") {
                    index = index + 4 + end + 3;
                } else {
                    break;
                }
                continue;
            }
            let Some(close_offset) = text[index..].find('>') else {
                break;
            };
            let close_index = index + close_offset;
            let token = text[index + 1..close_index].trim();
            if token.is_empty() || token.starts_with('?') || token.starts_with('!') {
                index = close_index.saturating_add(1);
                continue;
            }
            if let Some(rest) = token.strip_prefix('/') {
                let closing = parse_xml_tag_name(rest);
                pop_xml_stack(&mut stack, closing);
                index = close_index.saturating_add(1);
                continue;
            }

            let self_closing = token.ends_with('/');
            let body = if self_closing {
                token[..token.len().saturating_sub(1)].trim()
            } else {
                token
            };
            let (name, attrs) = split_xml_tag(body);
            if !name.is_empty() {
                stack.push(name.to_string());
                let path = xml_path(&stack);
                for (key, value) in parse_xml_attributes(attrs) {
                    out.push(format!("{path}/@{key}={value}"));
                }
                if self_closing {
                    stack.pop();
                }
            }
            index = close_index.saturating_add(1);
        } else {
            let start = index;
            while index < bytes.len() && bytes[index] != b'<' {
                index += 1;
            }
            let text_value = text[start..index].trim();
            if !text_value.is_empty() && !stack.is_empty() {
                out.push(format!(
                    "{}={}",
                    xml_path(&stack),
                    normalize_scalar_text(text_value)
                ));
            }
        }
    }

    out
}

fn parse_xml_tag_name(raw: &str) -> &str {
    raw.split_whitespace().next().unwrap_or_default().trim()
}

fn split_xml_tag(raw: &str) -> (&str, &str) {
    let mut parts = raw.splitn(2, char::is_whitespace);
    let name = parts.next().unwrap_or_default().trim();
    let attrs = parts.next().unwrap_or_default().trim();
    (name, attrs)
}

fn parse_xml_attributes(raw: &str) -> Vec<(String, String)> {
    let mut out = Vec::new();
    let mut index = 0usize;
    let bytes = raw.as_bytes();
    while index < bytes.len() {
        while index < bytes.len() && bytes[index].is_ascii_whitespace() {
            index += 1;
        }
        if index >= bytes.len() {
            break;
        }
        let key_start = index;
        while index < bytes.len() && !bytes[index].is_ascii_whitespace() && bytes[index] != b'=' {
            index += 1;
        }
        let key = raw[key_start..index].trim();
        while index < bytes.len() && bytes[index].is_ascii_whitespace() {
            index += 1;
        }
        if index >= bytes.len() || bytes[index] != b'=' {
            continue;
        }
        index += 1;
        while index < bytes.len() && bytes[index].is_ascii_whitespace() {
            index += 1;
        }
        if index >= bytes.len() {
            break;
        }
        let quote = bytes[index];
        let value = if quote == b'"' || quote == b'\'' {
            index += 1;
            let value_start = index;
            while index < bytes.len() && bytes[index] != quote {
                index += 1;
            }
            let v = raw[value_start..index].to_string();
            if index < bytes.len() {
                index += 1;
            }
            v
        } else {
            let value_start = index;
            while index < bytes.len() && !bytes[index].is_ascii_whitespace() {
                index += 1;
            }
            raw[value_start..index].to_string()
        };

        if !key.is_empty() {
            out.push((key.to_string(), normalize_scalar_text(&value)));
        }
    }
    out
}

fn xml_path(stack: &[String]) -> String {
    format!("/{}", stack.join("/"))
}

fn pop_xml_stack(stack: &mut Vec<String>, closing: &str) {
    if stack.last().is_some_and(|name| name == closing) {
        stack.pop();
        return;
    }
    if let Some(pos) = stack.iter().rposition(|name| name == closing) {
        stack.truncate(pos);
    }
}

fn parse_binary(bytes: &[u8]) -> ParsedDocument {
    ParsedDocument {
        parser: "binary".to_string(),
        is_text: false,
        title: None,
        text_preview: format!("binary file ({} bytes)", bytes.len()),
        normalized_text: None,
        line_count: 0,
        tags: vec!["binary".to_string()],
    }
}

fn normalize_markdown_for_indexing(raw: &str) -> String {
    raw.strip_prefix('\u{feff}').unwrap_or(raw).to_string()
}

fn is_markdown_rule_line(trimmed: &str) -> bool {
    let bytes = trimmed.as_bytes();
    if bytes.len() < 3 {
        return false;
    }
    let first = bytes[0];
    if !matches!(first, b'-' | b'_' | b'*') {
        return false;
    }
    bytes.iter().all(|b| *b == first)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn markdown_parser_extracts_title() {
        let registry = ParserRegistry::new();
        let parsed = registry.parse_file(
            Path::new("readme.md"),
            b"# Hello\n\nThis is a markdown file.",
        );

        assert_eq!(parsed.parser, "markdown");
        assert_eq!(parsed.title.as_deref(), Some("Hello"));
        assert!(parsed.is_text);
        assert!(parsed.normalized_text.is_some());
    }

    #[test]
    fn text_parser_handles_plain_text() {
        let registry = ParserRegistry::new();
        let parsed = registry.parse_file(Path::new("notes.txt"), b"first line\nsecond line");

        assert_eq!(parsed.parser, "text");
        assert_eq!(parsed.line_count, 2);
        assert_eq!(parsed.title.as_deref(), Some("first line"));
        assert_eq!(
            parsed.normalized_text.as_deref(),
            Some("first line\nsecond line")
        );
    }

    #[test]
    fn binary_parser_detects_non_utf8() {
        let registry = ParserRegistry::new();
        let parsed = registry.parse_file(Path::new("image.bin"), &[0xff, 0xfe, 0xfd]);

        assert_eq!(parsed.parser, "binary");
        assert!(!parsed.is_text);
        assert!(parsed.normalized_text.is_none());
    }

    #[test]
    fn markdown_parser_keeps_yaml_frontmatter_content() {
        let registry = ParserRegistry::new();
        let parsed = registry.parse_file(
            Path::new("note.md"),
            b"---\ntype: area\ntags: [rust]\n---\n# Real Title\n\nBody",
        );

        assert_eq!(parsed.title.as_deref(), Some("Real Title"));
        let normalized = parsed.normalized_text.expect("normalized");
        assert!(normalized.contains("type: area"));
        assert!(normalized.contains("# Real Title"));
    }

    #[test]
    fn markdown_parser_keeps_leading_metadata_lines() {
        let registry = ParserRegistry::new();
        let parsed = registry.parse_file(
            Path::new("note.md"),
            "> 작성일: 2026-02-15\n> tags: rust\n# 제목\n본문".as_bytes(),
        );

        assert_eq!(parsed.title.as_deref(), Some("제목"));
        let normalized = parsed.normalized_text.expect("normalized");
        assert!(normalized.contains("tags: rust"));
        assert!(normalized.contains("# 제목"));
    }

    #[test]
    fn markdown_parser_ignores_rule_line_when_guessing_title() {
        let registry = ParserRegistry::new();
        let parsed = registry.parse_file(Path::new("note.md"), b"---\n\nIntro line\nBody");
        assert_eq!(parsed.title.as_deref(), Some("Intro line"));
    }

    #[test]
    fn parser_json_keypath_flatten() {
        let registry = ParserRegistry::new();
        let parsed = registry.parse_file(
            Path::new("config.json"),
            br#"{"name":"oauth","oauth":{"client_id":"abc","enabled":true}}"#,
        );
        assert_eq!(parsed.parser, "json");
        assert!(parsed.tags.contains(&"json".to_string()));
        assert!(parsed.tags.contains(&"config".to_string()));
        let normalized = parsed.normalized_text.expect("normalized");
        assert!(normalized.contains("$.oauth.client_id=abc"));
        assert!(normalized.contains("$.oauth.enabled=true"));
        assert_eq!(parsed.title.as_deref(), Some("oauth"));
    }

    #[test]
    fn parser_yaml_keypath_flatten() {
        let registry = ParserRegistry::new();
        let parsed = registry.parse_file(
            Path::new("settings.yaml"),
            b"name: runtime\nqueue:\n  dead_letter_rate: 0.01\n",
        );
        assert_eq!(parsed.parser, "yaml");
        assert!(parsed.tags.contains(&"yaml".to_string()));
        assert!(parsed.tags.contains(&"config".to_string()));
        let normalized = parsed.normalized_text.expect("normalized");
        assert!(normalized.contains("$.queue.dead_letter_rate=0.01"));
        assert_eq!(parsed.title.as_deref(), Some("runtime"));
    }

    #[test]
    fn parser_jsonl_histogram_preview() {
        let registry = ParserRegistry::new();
        let parsed = registry.parse_file(
            Path::new("events.jsonl"),
            b"{\"id\":\"a1\",\"status\":\"ok\"}\n{\"id\":\"a2\",\"status\":\"fail\",\"worker\":\"w1\"}\n",
        );
        assert_eq!(parsed.parser, "jsonl");
        let normalized = parsed.normalized_text.expect("normalized");
        assert!(normalized.contains("jsonl.records=2"));
        assert!(normalized.contains("jsonl.keys=id,status,worker"));
        assert!(normalized.contains("$[1].id=a1"));
        assert!(parsed.text_preview.contains("jsonl.records=2"));
    }

    #[test]
    fn parser_xml_path_flatten() {
        let registry = ParserRegistry::new();
        let parsed = registry.parse_file(
            Path::new("schema.xml"),
            br#"<root><title>Search Contract</title><item id="x1">alpha</item></root>"#,
        );
        assert_eq!(parsed.parser, "xml");
        let normalized = parsed.normalized_text.expect("normalized");
        assert!(normalized.contains("/root/title=Search Contract"));
        assert!(normalized.contains("/root/item/@id=x1"));
        assert!(normalized.contains("/root/item=alpha"));
        assert_eq!(parsed.title.as_deref(), Some("Search Contract"));
    }
}