pub mod markdown;
pub mod pdf;
pub mod toc;
pub mod types;
pub use types::{DocumentFormat, DocumentMeta, ParseResult, RawNode};
use std::path::Path;
use crate::error::Result;
use crate::index::parse::markdown::MarkdownParser;
use crate::llm::LlmClient;
pub async fn parse_content(
content: &str,
format: DocumentFormat,
_llm_client: Option<LlmClient>,
) -> Result<ParseResult> {
match format {
DocumentFormat::Markdown => {
let parser = MarkdownParser::new();
parser.parse(content).await
}
DocumentFormat::Pdf => Err(crate::Error::Parse(
"PDF requires bytes, not string content".to_string(),
)),
}
}
/// Parses the document at `path` using the parser that matches `format`.
///
/// * Markdown: delegates to [`MarkdownParser`]'s `parse_file`; `llm_client`
///   is not consulted on this path.
/// * PDF: builds a `pdf::PdfParser`, attaching `llm_client` when one is
///   supplied, and delegates to its `parse_file`.
///
/// # Errors
///
/// Passes through whatever error the selected parser returns.
pub async fn parse_file(
    path: &Path,
    format: DocumentFormat,
    llm_client: Option<LlmClient>,
) -> Result<ParseResult> {
    match format {
        DocumentFormat::Pdf => {
            // `None` -> plain parser, `Some(client)` -> parser carrying the client.
            let pdf_parser =
                llm_client.map_or_else(pdf::PdfParser::new, pdf::PdfParser::with_llm_client);
            pdf_parser.parse_file(path).await
        }
        DocumentFormat::Markdown => MarkdownParser::new().parse_file(path).await,
    }
}
pub async fn parse_bytes(
bytes: &[u8],
format: DocumentFormat,
llm_client: Option<LlmClient>,
) -> Result<ParseResult> {
match format {
DocumentFormat::Markdown => {
let content = std::str::from_utf8(bytes)
.map_err(|e| crate::Error::Parse(format!("Invalid UTF-8 content: {}", e)))?;
let parser = MarkdownParser::new();
parser.parse(content).await
}
DocumentFormat::Pdf => {
let parser = match llm_client {
Some(client) => pdf::PdfParser::with_llm_client(client),
None => pdf::PdfParser::new(),
};
parser.parse_bytes_async(bytes, None).await
}
}
}
/// Resolves a file extension to a [`DocumentFormat`].
///
/// This is a module-level convenience that forwards `ext` unchanged to
/// `DocumentFormat::from_extension` and returns its result as-is, so the
/// matching rules (and when `None` is returned) are defined there.
pub fn format_from_extension(ext: &str) -> Option<DocumentFormat> {
    DocumentFormat::from_extension(ext)
}