1use chrono::Utc;
2use crate::document::{ExtractedContent, MarkdownDocument};
3
4#[derive(Debug, Clone)]
5pub struct MarkdownConfig {
6 pub max_line_width: usize,
7 pub include_links: bool,
8 pub include_images: bool,
9 pub heading_style: HeadingStyle,
10}
11
12impl Default for MarkdownConfig {
13 fn default() -> Self {
14 Self {
15 max_line_width: 80,
16 include_links: true,
17 include_images: false,
18 heading_style: HeadingStyle::Atx,
19 }
20 }
21}
22
/// Markdown heading syntax selectable through [`MarkdownConfig`].
///
/// The type is a plain fieldless enum, so it also derives `Copy`,
/// `PartialEq` and `Eq`: values can be compared with `==` and passed by
/// value without cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX headings: one to six leading `#` characters (`## Title`).
    Atx,
    /// Setext headings: the title underlined with `=` or `-`.
    Setext,
}
28
29pub struct MarkdownSerializer {
30 config: MarkdownConfig,
31}
32
33impl MarkdownSerializer {
34 pub fn new(config: MarkdownConfig) -> Self {
35 Self { config }
36 }
37
38 pub fn serialize(&self, content: &ExtractedContent) -> MarkdownDocument {
40 let mut out = String::new();
41
42 out.push_str(&format!("# {}\n\n", content.title));
43 if let Some(byline) = &content.byline {
44 out.push_str(&format!("*{}*\n\n", byline));
45 }
46 out.push_str(&format!("> Source: {}\n\n", content.url));
47
48 let body_md = self.html_to_markdown(&content.body_html, &content.url);
49 out.push_str(&body_md);
50
51 if self.config.include_links && !content.links.is_empty() {
52 out.push_str("\n\n---\n\n## Links\n\n");
53 for (i, link) in content.links.iter().enumerate() {
54 out.push_str(&format!("{}. [{}]({})\n", i + 1, link.text, link.href));
55 }
56 }
57
58 MarkdownDocument {
59 content: out,
60 source_url: content.url.clone(),
61 extracted_at: Utc::now(),
62 }
63 }
64
65 fn html_to_markdown(&self, html: &str, base_url: &url::Url) -> String {
66 let mut converter = HtmlToMarkdown::new(self.config.include_links, base_url.clone());
67 converter.convert(html);
68 normalize_blank_lines(&converter.output)
69 }
70}
71
72struct HtmlToMarkdown {
73 output: String,
74 include_links: bool,
75 base_url: url::Url,
76 link_stack: Vec<(String, String)>,
78 list_depth: usize,
79 ordered_counters: Vec<usize>,
80}
81
82impl HtmlToMarkdown {
83 fn new(include_links: bool, base_url: url::Url) -> Self {
84 Self {
85 output: String::new(),
86 include_links,
87 base_url,
88 link_stack: Vec::new(),
89 list_depth: 0,
90 ordered_counters: Vec::new(),
91 }
92 }
93
94 fn convert(&mut self, html: &str) {
95 let mut pos = 0;
96 let bytes = html.as_bytes();
97
98 while pos < html.len() {
99 if bytes[pos] == b'<' {
100 if let Some(close_offset) = html[pos..].find('>') {
101 let inner = &html[pos + 1..pos + close_offset];
102 let (tag, attrs_str, is_closing, is_self_closing) = parse_tag(inner);
103 self.handle_tag(&tag, attrs_str, is_closing, is_self_closing);
104 pos += close_offset + 1;
105 continue;
106 }
107 }
108
109 let next = html[pos..].find('<').map(|i| pos + i).unwrap_or(html.len());
111 let text = html_decode(&html[pos..next]);
112 self.push_text(&text);
113 pos = next;
114 }
115 }
116
117 fn handle_tag(&mut self, tag: &str, attrs: &str, is_closing: bool, _is_self_closing: bool) {
118 match (tag, is_closing) {
119 ("h1", false) => self.push_str("\n# "),
120 ("h2", false) => self.push_str("\n## "),
121 ("h3", false) => self.push_str("\n### "),
122 ("h4", false) => self.push_str("\n#### "),
123 ("h5", false) => self.push_str("\n##### "),
124 ("h6", false) => self.push_str("\n###### "),
125 ("h1" | "h2" | "h3" | "h4" | "h5" | "h6", true) => self.push_str("\n\n"),
126
127 ("p", false) => self.push_str("\n"),
128 ("p", true) => self.push_str("\n\n"),
129
130 ("br", _) => self.push_str("\n"),
131 ("hr", _) => self.push_str("\n---\n"),
132
133 ("strong" | "b", false) => self.push_str("**"),
134 ("strong" | "b", true) => self.push_str("**"),
135 ("em" | "i", false) => self.push_str("*"),
136 ("em" | "i", true) => self.push_str("*"),
137 ("code", false) => self.push_str("`"),
138 ("code", true) => self.push_str("`"),
139
140 ("pre", false) => self.push_str("\n```\n"),
141 ("pre", true) => self.push_str("\n```\n\n"),
142
143 ("blockquote", false) => self.push_str("\n> "),
144 ("blockquote", true) => self.push_str("\n"),
145
146 ("ul", false) => {
147 self.list_depth += 1;
148 self.ordered_counters.push(0);
149 }
150 ("ul", true) => {
151 self.list_depth = self.list_depth.saturating_sub(1);
152 self.ordered_counters.pop();
153 self.push_str("\n");
154 }
155 ("ol", false) => {
156 self.list_depth += 1;
157 self.ordered_counters.push(0);
158 }
159 ("ol", true) => {
160 self.list_depth = self.list_depth.saturating_sub(1);
161 self.ordered_counters.pop();
162 self.push_str("\n");
163 }
164 ("li", false) => {
165 let indent = " ".repeat(self.list_depth.saturating_sub(1));
166 let counter_val = self.ordered_counters.last_mut().map(|c| { *c += 1; *c });
167 match counter_val {
168 Some(n) => self.push_str(&format!("\n{}{}. ", indent, n)),
169 None => self.push_str(&format!("\n{}- ", indent)),
170 }
171 }
172 ("li", true) => {}
173
174 ("a", false) if self.include_links => {
175 let raw_href = extract_attr(attrs, "href").unwrap_or_default();
176 let href = if raw_href.is_empty() {
178 raw_href
179 } else {
180 self.base_url.join(&raw_href)
181 .map(|u| u.to_string())
182 .unwrap_or(raw_href)
183 };
184 self.link_stack.push((href, String::new()));
185 }
186 ("a", true) if self.include_links => {
187 if let Some((href, text)) = self.link_stack.pop() {
188 let md_link = format!("[{}]({})", text.trim(), href);
189 self.push_str(&md_link);
190 }
191 }
192
193 ("script" | "style" | "noscript" | "iframe", _) => {}
195
196 _ => {}
197 }
198 }
199
200 fn push_str(&mut self, s: &str) {
201 if let Some((_, ref mut text)) = self.link_stack.last_mut() {
202 text.push_str(s);
203 } else {
204 self.output.push_str(s);
205 }
206 }
207
208 fn push_text(&mut self, text: &str) {
209 if let Some((_, ref mut link_text)) = self.link_stack.last_mut() {
210 link_text.push_str(text);
211 } else {
212 self.output.push_str(text);
213 }
214 }
215}
216
/// Splits the text between `<` and `>` into its parts.
///
/// Returns `(name, attrs, is_closing, is_self_closing)` where `name` is the
/// lower-cased tag name, `attrs` is the trimmed remainder after the name,
/// `is_closing` is set for a leading `/` and `is_self_closing` for a
/// trailing `/`.
fn parse_tag(inner: &str) -> (String, &str, bool, bool) {
    // Peel one trailing '/', then one leading '/', remembering which existed.
    let (rest, is_self_closing) = match inner.strip_suffix('/') {
        Some(stripped) => (stripped, true),
        None => (inner, false),
    };
    let (rest, is_closing) = match rest.strip_prefix('/') {
        Some(stripped) => (stripped, true),
        None => (rest, false),
    };
    let rest = rest.trim();

    // The name ends at the first whitespace character; the rest is attributes.
    let (name, attrs) = match rest.find(char::is_whitespace) {
        Some(boundary) => {
            let (head, tail) = rest.split_at(boundary);
            (head, tail.trim())
        }
        None => (rest, ""),
    };

    (name.to_lowercase(), attrs, is_closing, is_self_closing)
}
228
/// Returns the value of attribute `name` from a tag's raw attribute text.
///
/// The attribute list is walked one attribute at a time, so `name` only
/// matches a whole attribute name (asking for `href` never matches
/// `data-href`). Names are compared ASCII case-insensitively. Values may be
/// double-quoted, single-quoted or unquoted, with optional whitespace
/// around `=`. The first attribute carrying a value wins.
///
/// Returns `None` when the attribute is absent, has no value, or its
/// opening quote is never closed. The value is returned verbatim; character
/// references in it are not decoded.
fn extract_attr(attrs: &str, name: &str) -> Option<String> {
    let mut rest = attrs.trim_start();

    while !rest.is_empty() {
        // Attribute name: everything up to whitespace or '='.
        let name_len = rest
            .find(|c: char| c.is_whitespace() || c == '=')
            .unwrap_or(rest.len());
        let (attr_name, after_name) = rest.split_at(name_len);
        let after_name = after_name.trim_start();

        let (value, remainder) = match after_name.strip_prefix('=') {
            // Bare attribute such as `disabled`.
            None => (None, after_name),
            Some(after_eq) => {
                let after_eq = after_eq.trim_start();
                match after_eq.chars().next() {
                    Some(quote @ ('"' | '\'')) => {
                        let body = &after_eq[1..];
                        match body.find(quote) {
                            Some(end) => (Some(&body[..end]), &body[end + 1..]),
                            // Unterminated quote: nothing reliable follows.
                            None => (None, ""),
                        }
                    }
                    _ => {
                        // Unquoted value runs to the next whitespace.
                        let end = after_eq
                            .find(char::is_whitespace)
                            .unwrap_or(after_eq.len());
                        (Some(&after_eq[..end]), &after_eq[end..])
                    }
                }
            }
        };

        if attr_name.eq_ignore_ascii_case(name) {
            if let Some(found) = value {
                return Some(found.to_owned());
            }
        }

        // Every iteration consumes a name, an '=' or a value, so this
        // always shrinks `rest`.
        rest = remainder.trim_start();
    }

    None
}
242
/// Decodes the common HTML character references in `s`.
///
/// Recognised references: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`,
/// `&apos;` and `&nbsp;` (decoded to a plain space). Anything else,
/// including a bare `&`, is copied through unchanged.
///
/// The input is decoded in a single left-to-right pass, so each reference
/// is decoded exactly once: `&amp;lt;` yields the text `&lt;`, not `<`.
fn html_decode(s: &str) -> String {
    const ENTITIES: [(&str, &str); 7] = [
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&apos;", "'"),
        ("&nbsp;", " "),
    ];

    let mut decoded = String::with_capacity(s.len());
    let mut rest = s;

    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        rest = &rest[amp..];

        match ENTITIES.iter().find(|entry| rest.starts_with(entry.0)) {
            Some(entry) => {
                decoded.push_str(entry.1);
                rest = &rest[entry.0.len()..];
            }
            None => {
                // Not a reference we know: keep the ampersand literally.
                decoded.push('&');
                rest = &rest[1..];
            }
        }
    }

    decoded.push_str(rest);
    decoded
}
251
/// Limits every run of blank lines in `s` to at most two.
///
/// A line counts as blank when it is empty or whitespace-only; kept blank
/// lines are written as empty lines. Every emitted line, including the
/// last, is terminated with `\n`.
fn normalize_blank_lines(s: &str) -> String {
    const MAX_BLANK_RUN: u32 = 2;

    let mut normalized = String::with_capacity(s.len());
    let mut run = 0u32;

    for line in s.lines() {
        let is_blank = line.trim().is_empty();
        run = if is_blank { run + 1 } else { 0 };

        // Drop blank lines beyond the allowed run length.
        if is_blank && run > MAX_BLANK_RUN {
            continue;
        }
        if !is_blank {
            normalized.push_str(line);
        }
        normalized.push('\n');
    }

    normalized
}