swiftide_integrations/scraping/
html_to_markdown_transformer.rs1use std::sync::Arc;
2
3use anyhow::Result;
4use async_trait::async_trait;
5use htmd::HtmlToMarkdown;
6
7use swiftide_core::{Transformer, indexing::Node};
8
9#[swiftide_macros::indexing_transformer(derive(skip_default, skip_debug))]
13pub struct HtmlToMarkdownTransformer {
14 htmd: Arc<HtmlToMarkdown>,
18}
19
20impl Default for HtmlToMarkdownTransformer {
21 fn default() -> Self {
22 Self {
23 htmd: HtmlToMarkdown::builder()
24 .skip_tags(vec!["script", "style"])
25 .build()
26 .into(),
27 concurrency: None,
28 client: None,
29 indexing_defaults: None,
30 }
31 }
32}
33
34impl std::fmt::Debug for HtmlToMarkdownTransformer {
35 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36 f.debug_struct("HtmlToMarkdownTransformer").finish()
37 }
38}
39
40#[async_trait]
41impl Transformer for HtmlToMarkdownTransformer {
42 #[tracing::instrument(skip_all, name = "transformer.html_to_markdown")]
46 async fn transform_node(&self, node: Node) -> Result<Node> {
47 let chunk = self.htmd.convert(&node.chunk)?;
48
49 Node::build_from_other(&node).chunk(chunk).build()
50 }
51
52 fn concurrency(&self) -> Option<usize> {
53 self.concurrency
54 }
55}
56
#[cfg(test)]
mod test {
    use super::*;

    /// A lone `<h1>` element should come out as a level-one markdown heading.
    #[tokio::test]
    async fn test_html_to_markdown() {
        let html_node = Node::new("<h1>Hello, World!</h1>");

        let markdown_node = HtmlToMarkdownTransformer::default()
            .transform_node(html_node)
            .await
            .unwrap();

        assert_eq!(markdown_node.chunk, "# Hello, World!");
    }
}