swiftide_integrations/scraping/
html_to_markdown_transformer.rs

1use std::sync::Arc;
2
3use anyhow::Result;
4use async_trait::async_trait;
5use htmd::HtmlToMarkdown;
6
7use swiftide_core::{Transformer, indexing::Node};
8
9/// Transforms HTML content into markdown.
10///
11/// Useful for converting scraping results into markdown.
12#[swiftide_macros::indexing_transformer(derive(skip_default, skip_debug))]
13pub struct HtmlToMarkdownTransformer {
14    /// The `HtmlToMarkdown` instance used to convert HTML to markdown.
15    ///
16    /// Sets a sane default, but can be customized.
17    htmd: Arc<HtmlToMarkdown>,
18}
19
20impl Default for HtmlToMarkdownTransformer {
21    fn default() -> Self {
22        Self {
23            htmd: HtmlToMarkdown::builder()
24                .skip_tags(vec!["script", "style"])
25                .build()
26                .into(),
27            concurrency: None,
28            client: None,
29            indexing_defaults: None,
30        }
31    }
32}
33
34impl std::fmt::Debug for HtmlToMarkdownTransformer {
35    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36        f.debug_struct("HtmlToMarkdownTransformer").finish()
37    }
38}
39
40#[async_trait]
41impl Transformer for HtmlToMarkdownTransformer {
42    /// Converts the HTML content in the `Node` to markdown.
43    ///
44    /// Will Err the node if the conversion fails.
45    #[tracing::instrument(skip_all, name = "transformer.html_to_markdown")]
46    async fn transform_node(&self, node: Node) -> Result<Node> {
47        let chunk = self.htmd.convert(&node.chunk)?;
48
49        Node::chunking_from(&node)
50            .chunk(chunk)
51            .parent_id(node.id())
52            .build()
53    }
54
55    fn concurrency(&self) -> Option<usize> {
56        self.concurrency
57    }
58}
59
#[cfg(test)]
mod test {
    use super::*;

    /// A single HTML heading should come out as the equivalent markdown heading.
    #[tokio::test]
    async fn test_html_to_markdown() {
        let transformer = HtmlToMarkdownTransformer::default();
        let input = Node::new("<h1>Hello, World!</h1>");

        let output = transformer.transform_node(input).await.unwrap();

        assert_eq!(output.chunk, "# Hello, World!");
    }
}