swiftide_integrations/scraping/
html_to_markdown_transformer.rs1use std::sync::Arc;
2
3use anyhow::Result;
4use async_trait::async_trait;
5use htmd::HtmlToMarkdown;
6
7use swiftide_core::{Transformer, indexing::Node};
8
9#[swiftide_macros::indexing_transformer(derive(skip_default, skip_debug))]
13pub struct HtmlToMarkdownTransformer {
14 htmd: Arc<HtmlToMarkdown>,
18}
19
20impl Default for HtmlToMarkdownTransformer {
21 fn default() -> Self {
22 Self {
23 htmd: HtmlToMarkdown::builder()
24 .skip_tags(vec!["script", "style"])
25 .build()
26 .into(),
27 concurrency: None,
28 client: None,
29 indexing_defaults: None,
30 }
31 }
32}
33
34impl std::fmt::Debug for HtmlToMarkdownTransformer {
35 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36 f.debug_struct("HtmlToMarkdownTransformer").finish()
37 }
38}
39
40#[async_trait]
41impl Transformer for HtmlToMarkdownTransformer {
42 #[tracing::instrument(skip_all, name = "transformer.html_to_markdown")]
46 async fn transform_node(&self, node: Node) -> Result<Node> {
47 let chunk = self.htmd.convert(&node.chunk)?;
48
49 Node::chunking_from(&node)
50 .chunk(chunk)
51 .parent_id(node.id())
52 .build()
53 }
54
55 fn concurrency(&self) -> Option<usize> {
56 self.concurrency
57 }
58}
59
#[cfg(test)]
mod test {
    use super::*;

    /// A level-one HTML heading is turned into a `#` markdown heading.
    #[tokio::test]
    async fn test_html_to_markdown() {
        let html_node = Node::new("<h1>Hello, World!</h1>");

        let markdown_node = HtmlToMarkdownTransformer::default()
            .transform_node(html_node)
            .await
            .unwrap();

        assert_eq!(markdown_node.chunk, "# Hello, World!");
    }
}