Skip to main content

stillo_core/
markdown.rs

1use chrono::Utc;
2use crate::document::{ExtractedContent, MarkdownDocument};
3
4#[derive(Debug, Clone)]
5pub struct MarkdownConfig {
6    pub max_line_width: usize,
7    pub include_links: bool,
8    pub include_images: bool,
9    pub heading_style: HeadingStyle,
10}
11
12impl Default for MarkdownConfig {
13    fn default() -> Self {
14        Self {
15            max_line_width: 80,
16            include_links: true,
17            include_images: false,
18            heading_style: HeadingStyle::Atx,
19        }
20    }
21}
22
/// Heading flavour selectable through [`MarkdownConfig`].
#[derive(Clone, Debug)]
pub enum HeadingStyle {
    /// ATX headings, i.e. a `#` prefix.
    Atx,
    /// Setext headings, i.e. an underline below the heading text.
    Setext,
}
28
29pub struct MarkdownSerializer {
30    config: MarkdownConfig,
31}
32
33impl MarkdownSerializer {
34    pub fn new(config: MarkdownConfig) -> Self {
35        Self { config }
36    }
37
38    /// ExtractedContent → MarkdownDocument(純粋関数)
39    pub fn serialize(&self, content: &ExtractedContent) -> MarkdownDocument {
40        let mut out = String::new();
41
42        out.push_str(&format!("# {}\n\n", content.title));
43        if let Some(byline) = &content.byline {
44            out.push_str(&format!("*{}*\n\n", byline));
45        }
46        out.push_str(&format!("> Source: {}\n\n", content.url));
47
48        let body_md = self.html_to_markdown(&content.body_html);
49        out.push_str(&body_md);
50
51        if self.config.include_links && !content.links.is_empty() {
52            out.push_str("\n\n---\n\n## Links\n\n");
53            for (i, link) in content.links.iter().enumerate() {
54                out.push_str(&format!("{}. [{}]({})\n", i + 1, link.text, link.href));
55            }
56        }
57
58        MarkdownDocument {
59            content: out,
60            source_url: content.url.clone(),
61            extracted_at: Utc::now(),
62        }
63    }
64
65    fn html_to_markdown(&self, html: &str) -> String {
66        let mut converter = HtmlToMarkdown::new(self.config.include_links);
67        converter.convert(html);
68        normalize_blank_lines(&converter.output)
69    }
70}
71
72struct HtmlToMarkdown {
73    output: String,
74    include_links: bool,
75    /// リンク処理中: (href, collected_text)
76    link_stack: Vec<(String, String)>,
77    list_depth: usize,
78    ordered_counters: Vec<usize>,
79}
80
81impl HtmlToMarkdown {
82    fn new(include_links: bool) -> Self {
83        Self {
84            output: String::new(),
85            include_links,
86            link_stack: Vec::new(),
87            list_depth: 0,
88            ordered_counters: Vec::new(),
89        }
90    }
91
92    fn convert(&mut self, html: &str) {
93        let mut pos = 0;
94        let bytes = html.as_bytes();
95
96        while pos < html.len() {
97            if bytes[pos] == b'<' {
98                if let Some(close_offset) = html[pos..].find('>') {
99                    let inner = &html[pos + 1..pos + close_offset];
100                    let (tag, attrs_str, is_closing, is_self_closing) = parse_tag(inner);
101                    self.handle_tag(&tag, attrs_str, is_closing, is_self_closing);
102                    pos += close_offset + 1;
103                    continue;
104                }
105            }
106
107            // テキストノード
108            let next = html[pos..].find('<').map(|i| pos + i).unwrap_or(html.len());
109            let text = html_decode(&html[pos..next]);
110            self.push_text(&text);
111            pos = next;
112        }
113    }
114
115    fn handle_tag(&mut self, tag: &str, attrs: &str, is_closing: bool, _is_self_closing: bool) {
116        match (tag, is_closing) {
117            ("h1", false) => self.push_str("\n# "),
118            ("h2", false) => self.push_str("\n## "),
119            ("h3", false) => self.push_str("\n### "),
120            ("h4", false) => self.push_str("\n#### "),
121            ("h5", false) => self.push_str("\n##### "),
122            ("h6", false) => self.push_str("\n###### "),
123            ("h1" | "h2" | "h3" | "h4" | "h5" | "h6", true) => self.push_str("\n\n"),
124
125            ("p", false) => self.push_str("\n"),
126            ("p", true) => self.push_str("\n\n"),
127
128            ("br", _) => self.push_str("\n"),
129            ("hr", _) => self.push_str("\n---\n"),
130
131            ("strong" | "b", false) => self.push_str("**"),
132            ("strong" | "b", true) => self.push_str("**"),
133            ("em" | "i", false) => self.push_str("*"),
134            ("em" | "i", true) => self.push_str("*"),
135            ("code", false) => self.push_str("`"),
136            ("code", true) => self.push_str("`"),
137
138            ("pre", false) => self.push_str("\n```\n"),
139            ("pre", true) => self.push_str("\n```\n\n"),
140
141            ("blockquote", false) => self.push_str("\n> "),
142            ("blockquote", true) => self.push_str("\n"),
143
144            ("ul", false) => {
145                self.list_depth += 1;
146                self.ordered_counters.push(0);
147            }
148            ("ul", true) => {
149                self.list_depth = self.list_depth.saturating_sub(1);
150                self.ordered_counters.pop();
151                self.push_str("\n");
152            }
153            ("ol", false) => {
154                self.list_depth += 1;
155                self.ordered_counters.push(0);
156            }
157            ("ol", true) => {
158                self.list_depth = self.list_depth.saturating_sub(1);
159                self.ordered_counters.pop();
160                self.push_str("\n");
161            }
162            ("li", false) => {
163                let indent = "  ".repeat(self.list_depth.saturating_sub(1));
164                let counter_val = self.ordered_counters.last_mut().map(|c| { *c += 1; *c });
165                match counter_val {
166                    Some(n) => self.push_str(&format!("\n{}{}. ", indent, n)),
167                    None => self.push_str(&format!("\n{}- ", indent)),
168                }
169            }
170            ("li", true) => {}
171
172            ("a", false) if self.include_links => {
173                let href = extract_attr(attrs, "href").unwrap_or_default();
174                self.link_stack.push((href, String::new()));
175            }
176            ("a", true) if self.include_links => {
177                if let Some((href, text)) = self.link_stack.pop() {
178                    let md_link = format!("[{}]({})", text.trim(), href);
179                    self.push_str(&md_link);
180                }
181            }
182
183            // ノイズタグは無視
184            ("script" | "style" | "noscript" | "iframe", _) => {}
185
186            _ => {}
187        }
188    }
189
190    fn push_str(&mut self, s: &str) {
191        if let Some((_, ref mut text)) = self.link_stack.last_mut() {
192            text.push_str(s);
193        } else {
194            self.output.push_str(s);
195        }
196    }
197
198    fn push_text(&mut self, text: &str) {
199        if let Some((_, ref mut link_text)) = self.link_stack.last_mut() {
200            link_text.push_str(text);
201        } else {
202            self.output.push_str(text);
203        }
204    }
205}
206
/// Splits the text between `<` and `>` into its parts.
///
/// Returns `(tag_name, attrs, is_closing, is_self_closing)` where `tag_name`
/// is lower-cased, `attrs` is the trimmed remainder after the tag name,
/// `is_closing` is set for a leading `/` and `is_self_closing` for a
/// trailing `/`.
fn parse_tag(inner: &str) -> (String, &str, bool, bool) {
    // A trailing slash marks a self-closing tag.
    let (without_slash, is_self_closing) = match inner.strip_suffix('/') {
        Some(head) => (head, true),
        None => (inner, false),
    };

    // A leading slash marks an end tag.
    let (content, is_closing) = match without_slash.strip_prefix('/') {
        Some(tail) => (tail, true),
        None => (without_slash, false),
    };
    let content = content.trim();

    // The tag name runs up to the first whitespace; the rest is attributes.
    let (tag_name, attrs) = match content.find(char::is_whitespace) {
        Some(split_at) => (&content[..split_at], content[split_at..].trim()),
        None => (content, ""),
    };

    (tag_name.to_lowercase(), attrs, is_closing, is_self_closing)
}
218
/// Returns the value of attribute `name` from a raw attribute string.
///
/// The string is tokenized attribute by attribute, so `name` only matches a
/// whole attribute name (never the tail of another name such as `data-href`,
/// and never text inside another attribute's value). Matching is ASCII
/// case-insensitive. Supported value forms are `name="v"`, `name='v'` and
/// unquoted `name=v`, with optional whitespace around `=`.
///
/// Returns `None` when the attribute is absent, has no value, or its quoted
/// value is never terminated. The value is returned verbatim (character
/// references are not decoded here).
fn extract_attr(attrs: &str, name: &str) -> Option<String> {
    let mut rest = attrs.trim_start();

    while !rest.is_empty() {
        // Attribute name: everything up to whitespace or '='.
        let name_end = rest
            .find(|c: char| c.is_whitespace() || c == '=')
            .unwrap_or(rest.len());
        let attr_name = &rest[..name_end];
        rest = rest[name_end..].trim_start();

        // Optional value. Each path below consumes at least one byte of
        // `rest` whenever `attr_name` is empty, so the loop always advances.
        let mut value = None;
        if let Some(after_eq) = rest.strip_prefix('=') {
            let after_eq = after_eq.trim_start();
            match after_eq.chars().next() {
                Some(quote @ ('"' | '\'')) => {
                    let quoted = &after_eq[1..];
                    match quoted.find(quote) {
                        Some(end) => {
                            value = Some(&quoted[..end]);
                            rest = &quoted[end + 1..];
                        }
                        // Unterminated quote: nothing after it is reliable.
                        None => rest = "",
                    }
                }
                _ => {
                    let end = after_eq
                        .find(char::is_whitespace)
                        .unwrap_or(after_eq.len());
                    value = Some(&after_eq[..end]);
                    rest = &after_eq[end..];
                }
            }
        }

        if attr_name.eq_ignore_ascii_case(name) {
            if let Some(found) = value {
                return Some(found.to_owned());
            }
        }
        rest = rest.trim_start();
    }

    None
}
232
/// Decodes the HTML character references `&amp;`, `&lt;`, `&gt;`, `&quot;`,
/// `&#39;` and `&nbsp;` (the latter to a plain space).
///
/// The input is decoded in a single left-to-right pass, so each reference is
/// decoded exactly once: `&amp;lt;` becomes the text `&lt;`, not `<`.
/// Any other `&` is kept as-is.
fn html_decode(s: &str) -> String {
    const ENTITIES: [(&str, &str); 6] = [
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&nbsp;", " "),
    ];

    let mut decoded = String::with_capacity(s.len());
    let mut rest = s;

    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        rest = &rest[amp..];

        match ENTITIES.iter().find(|(entity, _)| rest.starts_with(entity)) {
            Some((entity, replacement)) => {
                decoded.push_str(replacement);
                rest = &rest[entity.len()..];
            }
            None => {
                // Bare ampersand or unsupported reference: keep it verbatim.
                decoded.push('&');
                rest = &rest[1..];
            }
        }
    }

    decoded.push_str(rest);
    decoded
}
241
/// Collapses runs of blank lines so that at most two remain in a row.
///
/// Lines consisting only of whitespace count as blank and are emitted as
/// empty lines. Every retained line is terminated with `\n`, including the
/// last one.
fn normalize_blank_lines(s: &str) -> String {
    const MAX_BLANK_RUN: u32 = 2;

    let mut normalized = String::with_capacity(s.len());
    let mut run = 0u32;

    for line in s.lines() {
        let is_blank = line.chars().all(char::is_whitespace);

        if !is_blank {
            // Content line: reset the run and copy it through untouched.
            run = 0;
            normalized.push_str(line);
            normalized.push('\n');
            continue;
        }

        run += 1;
        if run > MAX_BLANK_RUN {
            continue;
        }
        normalized.push('\n');
    }

    normalized
}