swiftide_integrations/scraping/
html_to_markdown_transformer.rs

1use std::sync::Arc;
2
3use anyhow::Result;
4use async_trait::async_trait;
5use htmd::HtmlToMarkdown;
6
7use swiftide_core::{Transformer, indexing::Node};
8
9/// Transforms HTML content into markdown.
10///
11/// Useful for converting scraping results into markdown.
12#[swiftide_macros::indexing_transformer(derive(skip_default, skip_debug))]
13pub struct HtmlToMarkdownTransformer {
14    /// The `HtmlToMarkdown` instance used to convert HTML to markdown.
15    ///
16    /// Sets a sane default, but can be customized.
17    htmd: Arc<HtmlToMarkdown>,
18}
19
20impl Default for HtmlToMarkdownTransformer {
21    fn default() -> Self {
22        Self {
23            htmd: HtmlToMarkdown::builder()
24                .skip_tags(vec!["script", "style"])
25                .build()
26                .into(),
27            concurrency: None,
28            client: None,
29            indexing_defaults: None,
30        }
31    }
32}
33
34impl std::fmt::Debug for HtmlToMarkdownTransformer {
35    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36        f.debug_struct("HtmlToMarkdownTransformer").finish()
37    }
38}
39
40#[async_trait]
41impl Transformer for HtmlToMarkdownTransformer {
42    /// Converts the HTML content in the `Node` to markdown.
43    ///
44    /// Will Err the node if the conversion fails.
45    #[tracing::instrument(skip_all, name = "transformer.html_to_markdown")]
46    async fn transform_node(&self, node: Node) -> Result<Node> {
47        let chunk = self.htmd.convert(&node.chunk)?;
48
49        Node::build_from_other(&node).chunk(chunk).build()
50    }
51
52    fn concurrency(&self) -> Option<usize> {
53        self.concurrency
54    }
55}
56
#[cfg(test)]
mod test {
    use super::*;

    /// A lone `<h1>` element should come out as a `#` markdown heading.
    #[tokio::test]
    async fn test_html_to_markdown() {
        let html = "<h1>Hello, World!</h1>";
        let expected = "# Hello, World!";

        let converted = HtmlToMarkdownTransformer::default()
            .transform_node(Node::new(html))
            .await
            .unwrap();

        assert_eq!(converted.chunk, expected);
    }
}
68}