// graphrag_core/text/layout_parser.rs

//! Layout parser trait and factory for document structure detection
2
3use crate::text::document_structure::DocumentStructure;
4use crate::core::Document;
5
6/// Trait for document layout parsers
7pub trait LayoutParser: Send + Sync {
8    /// Parse document structure from content
9    fn parse(&self, content: &str) -> DocumentStructure;
10
11    /// Check if this parser supports a given format
12    fn supports_format(&self, format: &str) -> bool;
13
14    /// Get parser name
15    fn name(&self) -> &'static str;
16}
17
18/// Factory for creating layout parsers based on document type
19pub struct LayoutParserFactory;
20
21impl LayoutParserFactory {
22    /// Create appropriate parser for a document
23    pub fn create_for_document(document: &Document) -> Box<dyn LayoutParser> {
24        // Detect format from title/extension
25        if document.title.ends_with(".md") || document.title.ends_with(".markdown") {
26            return Box::new(crate::text::parsers::MarkdownLayoutParser::new());
27        }
28
29        if document.title.ends_with(".html") || document.title.ends_with(".htm") {
30            return Box::new(crate::text::parsers::HtmlLayoutParser::new());
31        }
32
33        // Detect from content
34        if document.content.contains("<h1") || document.content.contains("<h2")
35            || document.content.contains("<html") || document.content.contains("<!DOCTYPE") {
36            return Box::new(crate::text::parsers::HtmlLayoutParser::new());
37        }
38
39        // Check for markdown headings
40        if document.content.lines().any(|line| line.trim_start().starts_with('#')) {
41            return Box::new(crate::text::parsers::MarkdownLayoutParser::new());
42        }
43
44        // Default to plain text parser
45        Box::new(crate::text::parsers::PlainTextLayoutParser::new())
46    }
47
48    /// Create parser for specific format
49    pub fn create_for_format(format: &str) -> Box<dyn LayoutParser> {
50        match format.to_lowercase().as_str() {
51            "markdown" | "md" => Box::new(crate::text::parsers::MarkdownLayoutParser::new()),
52            "html" | "htm" => Box::new(crate::text::parsers::HtmlLayoutParser::new()),
53            "text" | "txt" | "plain" => Box::new(crate::text::parsers::PlainTextLayoutParser::new()),
54            _ => Box::new(crate::text::parsers::PlainTextLayoutParser::new()),
55        }
56    }
57}