swiftide_integrations/scraping/
html_to_markdown_transformer.rs1use std::sync::Arc;
2
3use anyhow::Result;
4use async_trait::async_trait;
5use htmd::HtmlToMarkdown;
6
7use swiftide_core::{Transformer, indexing::TextNode};
8
9#[swiftide_macros::indexing_transformer(derive(skip_default, skip_debug))]
13pub struct HtmlToMarkdownTransformer {
14 htmd: Arc<HtmlToMarkdown>,
18}
19
20impl Default for HtmlToMarkdownTransformer {
21 fn default() -> Self {
22 Self {
23 htmd: HtmlToMarkdown::builder()
24 .skip_tags(vec!["script", "style"])
25 .build()
26 .into(),
27 concurrency: None,
28 client: None,
29 indexing_defaults: None,
30 }
31 }
32}
33
34impl std::fmt::Debug for HtmlToMarkdownTransformer {
35 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36 f.debug_struct("HtmlToMarkdownTransformer").finish()
37 }
38}
39
40#[async_trait]
41impl Transformer for HtmlToMarkdownTransformer {
42 type Input = String;
43 type Output = String;
44 #[tracing::instrument(skip_all, name = "transformer.html_to_markdown")]
48 async fn transform_node(&self, node: TextNode) -> Result<TextNode> {
49 let chunk = self.htmd.convert(&node.chunk)?;
50
51 TextNode::build_from_other(&node).chunk(chunk).build()
52 }
53
54 fn concurrency(&self) -> Option<usize> {
55 self.concurrency
56 }
57}
58
#[cfg(test)]
mod test {
    use super::*;

    /// A lone `<h1>` element must be rendered as a `#`-prefixed markdown heading.
    #[tokio::test]
    async fn test_html_to_markdown() {
        let transformer = HtmlToMarkdownTransformer::default();
        let html_node = TextNode::new("<h1>Hello, World!</h1>");

        let markdown_node = transformer.transform_node(html_node).await.unwrap();

        assert_eq!(markdown_node.chunk, "# Hello, World!");
    }
}