swiftide_integrations/scraping/
html_to_markdown_transformer.rs

1use std::sync::Arc;
2
3use anyhow::Result;
4use async_trait::async_trait;
5use htmd::HtmlToMarkdown;
6
7use swiftide_core::{Transformer, indexing::TextNode};
8
9/// Transforms HTML content into markdown.
10///
11/// Useful for converting scraping results into markdown.
12#[swiftide_macros::indexing_transformer(derive(skip_default, skip_debug))]
13pub struct HtmlToMarkdownTransformer {
14    /// The `HtmlToMarkdown` instance used to convert HTML to markdown.
15    ///
16    /// Sets a sane default, but can be customized.
17    htmd: Arc<HtmlToMarkdown>,
18}
19
20impl Default for HtmlToMarkdownTransformer {
21    fn default() -> Self {
22        Self {
23            htmd: HtmlToMarkdown::builder()
24                .skip_tags(vec!["script", "style"])
25                .build()
26                .into(),
27            concurrency: None,
28            client: None,
29            indexing_defaults: None,
30        }
31    }
32}
33
34impl std::fmt::Debug for HtmlToMarkdownTransformer {
35    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36        f.debug_struct("HtmlToMarkdownTransformer").finish()
37    }
38}
39
40#[async_trait]
41impl Transformer for HtmlToMarkdownTransformer {
42    type Input = String;
43    type Output = String;
44    /// Converts the HTML content in the `TextNode` to markdown.
45    ///
46    /// Will Err the node if the conversion fails.
47    #[tracing::instrument(skip_all, name = "transformer.html_to_markdown")]
48    async fn transform_node(&self, node: TextNode) -> Result<TextNode> {
49        let chunk = self.htmd.convert(&node.chunk)?;
50
51        TextNode::build_from_other(&node).chunk(chunk).build()
52    }
53
54    fn concurrency(&self) -> Option<usize> {
55        self.concurrency
56    }
57}
58
#[cfg(test)]
mod test {
    use super::*;

    /// A single `<h1>` element should come out as a level-one markdown heading.
    #[tokio::test]
    async fn test_html_to_markdown() {
        let converter = HtmlToMarkdownTransformer::default();
        let html_node = TextNode::new("<h1>Hello, World!</h1>");

        let markdown_node = converter.transform_node(html_node).await.unwrap();

        assert_eq!(markdown_node.chunk, "# Hello, World!");
    }
}