graphrag_core/text/
layout_parser.rs1use crate::core::Document;
4use crate::text::document_structure::DocumentStructure;
5
6pub trait LayoutParser: Send + Sync {
8 fn parse(&self, content: &str) -> DocumentStructure;
10
11 fn supports_format(&self, format: &str) -> bool;
13
14 fn name(&self) -> &'static str;
16}
17
/// Stateless factory that selects a layout parser implementation.
///
/// The type carries no data; it only namespaces the associated constructor
/// functions. The derived traits make it printable in diagnostics and trivially
/// copyable/constructible like any other zero-sized marker type.
#[derive(Debug, Clone, Copy, Default)]
pub struct LayoutParserFactory;
20
21impl LayoutParserFactory {
22 pub fn create_for_document(document: &Document) -> Box<dyn LayoutParser> {
24 if document.title.ends_with(".md") || document.title.ends_with(".markdown") {
26 return Box::new(crate::text::parsers::MarkdownLayoutParser::new());
27 }
28
29 if document.title.ends_with(".html") || document.title.ends_with(".htm") {
30 return Box::new(crate::text::parsers::HtmlLayoutParser::new());
31 }
32
33 if document.content.contains("<h1")
35 || document.content.contains("<h2")
36 || document.content.contains("<html")
37 || document.content.contains("<!DOCTYPE")
38 {
39 return Box::new(crate::text::parsers::HtmlLayoutParser::new());
40 }
41
42 if document
44 .content
45 .lines()
46 .any(|line| line.trim_start().starts_with('#'))
47 {
48 return Box::new(crate::text::parsers::MarkdownLayoutParser::new());
49 }
50
51 Box::new(crate::text::parsers::PlainTextLayoutParser::new())
53 }
54
55 pub fn create_for_format(format: &str) -> Box<dyn LayoutParser> {
57 match format.to_lowercase().as_str() {
58 "markdown" | "md" => Box::new(crate::text::parsers::MarkdownLayoutParser::new()),
59 "html" | "htm" => Box::new(crate::text::parsers::HtmlLayoutParser::new()),
60 "text" | "txt" | "plain" => {
61 Box::new(crate::text::parsers::PlainTextLayoutParser::new())
62 },
63 _ => Box::new(crate::text::parsers::PlainTextLayoutParser::new()),
64 }
65 }
66}