Skip to main content

graphrag_core/text/
layout_parser.rs

1//! Layout parser trait and factory for document structure detection
2
3use crate::core::Document;
4use crate::text::document_structure::DocumentStructure;
5
6/// Trait for document layout parsers
7pub trait LayoutParser: Send + Sync {
8    /// Parse document structure from content
9    fn parse(&self, content: &str) -> DocumentStructure;
10
11    /// Check if this parser supports a given format
12    fn supports_format(&self, format: &str) -> bool;
13
14    /// Get parser name
15    fn name(&self) -> &'static str;
16}
17
18/// Factory for creating layout parsers based on document type
19pub struct LayoutParserFactory;
20
21impl LayoutParserFactory {
22    /// Create appropriate parser for a document
23    pub fn create_for_document(document: &Document) -> Box<dyn LayoutParser> {
24        // Detect format from title/extension
25        if document.title.ends_with(".md") || document.title.ends_with(".markdown") {
26            return Box::new(crate::text::parsers::MarkdownLayoutParser::new());
27        }
28
29        if document.title.ends_with(".html") || document.title.ends_with(".htm") {
30            return Box::new(crate::text::parsers::HtmlLayoutParser::new());
31        }
32
33        // Detect from content
34        if document.content.contains("<h1")
35            || document.content.contains("<h2")
36            || document.content.contains("<html")
37            || document.content.contains("<!DOCTYPE")
38        {
39            return Box::new(crate::text::parsers::HtmlLayoutParser::new());
40        }
41
42        // Check for markdown headings
43        if document
44            .content
45            .lines()
46            .any(|line| line.trim_start().starts_with('#'))
47        {
48            return Box::new(crate::text::parsers::MarkdownLayoutParser::new());
49        }
50
51        // Default to plain text parser
52        Box::new(crate::text::parsers::PlainTextLayoutParser::new())
53    }
54
55    /// Create parser for specific format
56    pub fn create_for_format(format: &str) -> Box<dyn LayoutParser> {
57        match format.to_lowercase().as_str() {
58            "markdown" | "md" => Box::new(crate::text::parsers::MarkdownLayoutParser::new()),
59            "html" | "htm" => Box::new(crate::text::parsers::HtmlLayoutParser::new()),
60            "text" | "txt" | "plain" => {
61                Box::new(crate::text::parsers::PlainTextLayoutParser::new())
62            },
63            _ => Box::new(crate::text::parsers::PlainTextLayoutParser::new()),
64        }
65    }
66}