Skip to main content

sh_layer3/document_loaders/
markdown.rs

//! # Markdown Document Loader
//!
//! Loader for Markdown files, with support for extracting structural information.
4
5use crate::document_loaders::{DocumentLoader, LoadOptions};
6use crate::retriever_engine::Document;
7use crate::types::Layer3Result;
8use async_trait::async_trait;
9use std::path::PathBuf;
10
11/// Markdown Loader 实现
12#[allow(dead_code)]
13pub struct MarkdownLoader {
14    #[allow(dead_code)]
15    options: LoadOptions,
16}
17
18impl MarkdownLoader {
19    pub fn new() -> Self {
20        Self {
21            options: LoadOptions::default(),
22        }
23    }
24}
25
26impl Default for MarkdownLoader {
27    fn default() -> Self {
28        Self::new()
29    }
30}
31
32#[async_trait]
33impl DocumentLoader for MarkdownLoader {
34    async fn load(&self, path: PathBuf) -> Layer3Result<Document> {
35        let content = tokio::fs::read_to_string(&path).await?;
36        Ok(Document::new(content).with_source(path.to_string_lossy().to_string()))
37    }
38
39    async fn load_and_split(&self, path: PathBuf) -> Layer3Result<Vec<Document>> {
40        let content = tokio::fs::read_to_string(&path).await?;
41
42        // 按标题分割
43        let mut documents = Vec::new();
44        let mut current_section = String::new();
45        let mut current_title = String::from("intro");
46
47        for line in content.lines() {
48            if line.starts_with("#") {
49                // 新标题,保存当前节
50                if !current_section.trim().is_empty() {
51                    documents.push(
52                        Document::new(current_section.trim().to_string()).with_source(format!(
53                            "{}#{}",
54                            path.to_string_lossy(),
55                            current_title
56                        )),
57                    );
58                }
59                current_title = line.trim_start_matches('#').trim().to_string();
60                current_section = format!("{}\n\n", line);
61            } else {
62                current_section.push_str(line);
63                current_section.push('\n');
64            }
65        }
66
67        // 保存最后一节
68        if !current_section.trim().is_empty() {
69            documents.push(
70                Document::new(current_section.trim().to_string()).with_source(format!(
71                    "{}#{}",
72                    path.to_string_lossy(),
73                    current_title
74                )),
75            );
76        }
77
78        Ok(documents)
79    }
80
81    fn supports(&self, path: &std::path::Path) -> bool {
82        path.extension()
83            .and_then(|e| e.to_str())
84            .map(|e| e == "md" || e == "markdown")
85            .unwrap_or(false)
86    }
87
88    fn extensions(&self) -> &[&str] {
89        &["md", "markdown"]
90    }
91}
92
#[cfg(test)]
mod tests {
    use super::*;

    /// Both advertised extensions must be reported by `extensions()`.
    #[test]
    fn test_markdown_loader_extensions() {
        let loader = MarkdownLoader::new();
        let extensions = loader.extensions();
        assert!(extensions.contains(&"md"));
        assert!(extensions.contains(&"markdown"));
    }

    /// `supports()` accepts the Markdown extensions and rejects everything else.
    #[test]
    fn test_markdown_loader_supports() {
        let loader = MarkdownLoader::default();
        assert!(loader.supports(std::path::Path::new("notes.md")));
        assert!(loader.supports(std::path::Path::new("notes.markdown")));
        assert!(!loader.supports(std::path::Path::new("notes.txt")));
        assert!(!loader.supports(std::path::Path::new("notes")));
    }
}