graphrag_core/text/
layout_parser.rs1use crate::text::document_structure::DocumentStructure;
4use crate::core::Document;
5
6pub trait LayoutParser: Send + Sync {
8 fn parse(&self, content: &str) -> DocumentStructure;
10
11 fn supports_format(&self, format: &str) -> bool;
13
14 fn name(&self) -> &'static str;
16}
17
/// Stateless factory that selects a layout parser implementation.
///
/// This is a unit struct: it carries no data and only serves as a namespace
/// for its associated functions.
pub struct LayoutParserFactory;
20
21impl LayoutParserFactory {
22 pub fn create_for_document(document: &Document) -> Box<dyn LayoutParser> {
24 if document.title.ends_with(".md") || document.title.ends_with(".markdown") {
26 return Box::new(crate::text::parsers::MarkdownLayoutParser::new());
27 }
28
29 if document.title.ends_with(".html") || document.title.ends_with(".htm") {
30 return Box::new(crate::text::parsers::HtmlLayoutParser::new());
31 }
32
33 if document.content.contains("<h1") || document.content.contains("<h2")
35 || document.content.contains("<html") || document.content.contains("<!DOCTYPE") {
36 return Box::new(crate::text::parsers::HtmlLayoutParser::new());
37 }
38
39 if document.content.lines().any(|line| line.trim_start().starts_with('#')) {
41 return Box::new(crate::text::parsers::MarkdownLayoutParser::new());
42 }
43
44 Box::new(crate::text::parsers::PlainTextLayoutParser::new())
46 }
47
48 pub fn create_for_format(format: &str) -> Box<dyn LayoutParser> {
50 match format.to_lowercase().as_str() {
51 "markdown" | "md" => Box::new(crate::text::parsers::MarkdownLayoutParser::new()),
52 "html" | "htm" => Box::new(crate::text::parsers::HtmlLayoutParser::new()),
53 "text" | "txt" | "plain" => Box::new(crate::text::parsers::PlainTextLayoutParser::new()),
54 _ => Box::new(crate::text::parsers::PlainTextLayoutParser::new()),
55 }
56 }
57}