1use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11use thiserror::Error;
12
/// Errors reported by the parsing layer.
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute.
/// `UrlError` and `IoError` carry `#[from]`, so the corresponding source errors
/// convert into `ParserError` automatically (for example through `?`).
#[derive(Debug, Error)]
pub enum ParserError {
    /// The HTML could not be parsed; the payload is a free-form description.
    #[error("Failed to parse HTML: {0}")]
    ParseError(String),

    /// A CSS selector was rejected; the payload describes the problem.
    #[error("Invalid CSS selector: {0}")]
    SelectorError(String),

    /// A URL failed to parse.
    #[error("URL error: {0}")]
    UrlError(#[from] url::ParseError),

    /// An underlying I/O operation failed.
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),

    /// A text-encoding problem; the payload is a free-form description.
    #[error("Encoding error: {0}")]
    EncodingError(String),

    /// The parser configuration was invalid; the payload explains why.
    #[error("Configuration error: {0}")]
    ConfigError(String),
}
44
/// Shorthand for a `Result` whose error type is [`ParserError`].
pub type ParserResult<T> = Result<T, ParserError>;
47
/// Text extracted from a page together with simple statistics about it.
///
/// `TextContent::from_raw` fills every field except `language` and
/// `readability_score`, which it leaves as `None`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct TextContent {
    /// The text exactly as it was supplied.
    pub raw_text: String,

    /// The text after whitespace normalization.
    pub cleaned_text: String,

    /// Number of whitespace-separated words in `cleaned_text`.
    pub word_count: usize,

    /// Number of `char`s (not bytes) in `cleaned_text`.
    pub char_count: usize,

    /// Language of the text, if known. The expected format (e.g. a language
    /// tag) is not established by this file — confirm with the producer.
    pub language: Option<String>,

    /// Readability score, if one was computed. The scale is not defined here.
    pub readability_score: Option<f64>,

    /// Estimated reading time in minutes; `None` when there are no words.
    pub reading_time_minutes: Option<f64>,
}
76
77impl TextContent {
78 pub fn from_raw(raw: &str) -> Self {
80 let cleaned = normalize_whitespace(raw);
81 let word_count = cleaned.split_whitespace().count();
82 let char_count = cleaned.chars().count();
83
84 let reading_time = if word_count > 0 {
86 Some(word_count as f64 / 225.0)
87 } else {
88 None
89 };
90
91 Self {
92 raw_text: raw.to_string(),
93 cleaned_text: cleaned,
94 word_count,
95 char_count,
96 language: None,
97 readability_score: None,
98 reading_time_minutes: reading_time,
99 }
100 }
101
102 pub fn is_empty(&self) -> bool {
104 self.word_count == 0
105 }
106
107 pub fn is_substantial(&self) -> bool {
109 self.word_count >= 50
110 }
111}
112
/// A heading found in the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Heading {
    /// Heading level; `Heading::new` clamps it to `1..=6`.
    pub level: u8,

    /// Heading text.
    pub text: String,

    /// Optional identifier, set through `Heading::with_id`
    /// (presumably the element's `id` attribute — confirm with the parser).
    pub id: Option<String>,

    /// Class names attached to the heading; empty after `Heading::new`.
    pub classes: Vec<String>,
}
132
133impl Heading {
134 pub fn new(level: u8, text: impl Into<String>) -> Self {
136 Self {
137 level: level.clamp(1, 6),
138 text: text.into(),
139 id: None,
140 classes: Vec::new(),
141 }
142 }
143
144 pub fn with_id(mut self, id: impl Into<String>) -> Self {
146 self.id = Some(id.into());
147 self
148 }
149}
150
/// Relationship markers attached to a link.
///
/// Variants serialize under snake_case names (e.g. `NoFollow` -> `no_follow`).
/// The variant names suggest tokens of the HTML `rel` attribute; the mapping
/// from attribute text to variants lives outside this view — confirm there.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LinkRel {
    /// Explicit "follow" marker.
    Follow,
    /// "nofollow" marker.
    NoFollow,
    /// "ugc" marker; `Link::should_follow` treats it as blocking.
    Ugc,
    /// "sponsored" marker; `Link::should_follow` treats it as blocking.
    Sponsored,
    /// "external" marker.
    External,
    /// "noopener" marker.
    NoOpener,
    /// "noreferrer" marker.
    NoReferrer,
    /// Any marker not covered by the other variants.
    Other,
}
176
/// Classification of a link relative to the page it was found on.
///
/// Serialized with snake_case names. How links are classified is decided by
/// code outside this view; `Link::new` starts every link as `Unknown`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LinkType {
    /// Link classified as internal.
    Internal,
    /// Link classified as external.
    External,
    /// Link that has not been (or could not be) classified.
    Unknown,
}
188
/// A hyperlink extracted from the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Link {
    /// The link target as given to `Link::new`.
    pub href: String,

    /// `None` after `Link::new`; presumably the resolved absolute URL once a
    /// base URL has been applied — confirm with the parser.
    pub url: Option<String>,

    /// Link text.
    pub text: String,

    /// Optional title (presumably the `title` attribute).
    pub title: Option<String>,

    /// Relationship markers attached to the link.
    pub rel: Vec<LinkRel>,

    /// Internal/external classification; `Unknown` after `Link::new`.
    pub link_type: LinkType,

    /// Nofollow flag consulted by `Link::should_follow`.
    pub is_nofollow: bool,

    /// Optional target; `Link::opens_new_tab` checks it against `"_blank"`.
    pub target: Option<String>,

    /// Optional language hint (presumably the `hreflang` attribute).
    pub hreflang: Option<String>,
}
219
220impl Link {
221 pub fn new(href: impl Into<String>, text: impl Into<String>) -> Self {
223 Self {
224 href: href.into(),
225 url: None,
226 text: text.into(),
227 title: None,
228 rel: Vec::new(),
229 link_type: LinkType::Unknown,
230 is_nofollow: false,
231 target: None,
232 hreflang: None,
233 }
234 }
235
236 pub fn should_follow(&self) -> bool {
238 !self.is_nofollow && !self.rel.contains(&LinkRel::Sponsored) && !self.rel.contains(&LinkRel::Ugc)
239 }
240
241 pub fn opens_new_tab(&self) -> bool {
243 self.target.as_deref() == Some("_blank")
244 }
245}
246
247#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
253#[serde(rename_all = "snake_case")]
254#[derive(Default)]
255pub enum ImageLoading {
256 #[default]
258 Eager,
259 Lazy,
261}
262
263
/// An image extracted from the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Image {
    /// The image source as given to `Image::new`.
    pub src: String,

    /// `None` after `Image::new`; presumably the resolved absolute URL —
    /// confirm with the parser.
    pub url: Option<String>,

    /// Alternative text; may be empty.
    pub alt: String,

    /// Optional title.
    pub title: Option<String>,

    /// Optional width (unit not established here; presumably pixels).
    pub width: Option<u32>,

    /// Optional height (unit not established here; presumably pixels).
    pub height: Option<u32>,

    /// Optional `srcset` value; its presence drives `Image::is_responsive`.
    pub srcset: Option<String>,

    /// Optional `sizes` value.
    pub sizes: Option<String>,

    /// Loading strategy.
    pub loading: ImageLoading,

    /// Set by `Image::new` when the alternative text is empty.
    pub is_decorative: bool,
}
297
298impl Image {
299 pub fn new(src: impl Into<String>, alt: impl Into<String>) -> Self {
301 let alt_str = alt.into();
302 let is_decorative = alt_str.is_empty();
303 Self {
304 src: src.into(),
305 url: None,
306 alt: alt_str,
307 title: None,
308 width: None,
309 height: None,
310 srcset: None,
311 sizes: None,
312 loading: ImageLoading::default(),
313 is_decorative,
314 }
315 }
316
317 pub fn is_responsive(&self) -> bool {
319 self.srcset.is_some()
320 }
321}
322
/// Kind of list a `ListContent` represents.
///
/// Serialized with snake_case names. The variant names suggest the HTML
/// `<ol>`, `<ul>` and `<dl>` elements; that mapping is not shown in this view.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ListType {
    /// An ordered list.
    Ordered,
    /// An unordered list.
    Unordered,
    /// A definition list.
    Definition,
}
338
/// A single list entry, optionally carrying a nested list.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ListItem {
    /// Text of the entry.
    pub text: String,

    /// Nested list under this entry, if any. Boxed because `ListContent`
    /// itself contains `ListItem`s, making the type recursive.
    pub nested: Option<Box<ListContent>>,
}
348
349impl ListItem {
350 pub fn new(text: impl Into<String>) -> Self {
352 Self {
353 text: text.into(),
354 nested: None,
355 }
356 }
357
358 pub fn with_nested(text: impl Into<String>, nested: ListContent) -> Self {
360 Self {
361 text: text.into(),
362 nested: Some(Box::new(nested)),
363 }
364 }
365}
366
/// A list and its entries.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ListContent {
    /// Kind of list.
    pub list_type: ListType,

    /// Top-level entries, in insertion order.
    pub items: Vec<ListItem>,

    /// Running count maintained by `ListContent::add_item`: each added entry
    /// contributes one plus the `total_items` of its nested list, if any.
    /// Pushing directly onto `items` does not update it.
    pub total_items: usize,
}
379
380impl ListContent {
381 pub fn new(list_type: ListType) -> Self {
383 Self {
384 list_type,
385 items: Vec::new(),
386 total_items: 0,
387 }
388 }
389
390 pub fn add_item(&mut self, item: ListItem) {
392 self.total_items += 1;
393 if let Some(ref nested) = item.nested {
394 self.total_items += nested.total_items;
395 }
396 self.items.push(item);
397 }
398
399 pub fn is_empty(&self) -> bool {
401 self.items.is_empty()
402 }
403}
404
/// A single table cell.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableCell {
    /// Cell text.
    pub content: String,

    /// `true` for cells built with `TableCell::header`, `false` for
    /// `TableCell::data`.
    pub is_header: bool,

    /// Column span; both constructors set it to 1.
    pub colspan: u32,

    /// Row span; both constructors set it to 1.
    pub rowspan: u32,
}
424
425impl TableCell {
426 pub fn data(content: impl Into<String>) -> Self {
428 Self {
429 content: content.into(),
430 is_header: false,
431 colspan: 1,
432 rowspan: 1,
433 }
434 }
435
436 pub fn header(content: impl Into<String>) -> Self {
438 Self {
439 content: content.into(),
440 is_header: true,
441 colspan: 1,
442 rowspan: 1,
443 }
444 }
445}
446
/// A row of table cells.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableRow {
    /// Cells of the row, in order.
    pub cells: Vec<TableCell>,

    /// Header-row flag computed by `TableRow::new` from the cells' `is_header`
    /// flags.
    pub is_header_row: bool,
}
456
457impl TableRow {
458 pub fn new(cells: Vec<TableCell>) -> Self {
460 let is_header = cells.iter().all(|c| c.is_header);
461 Self {
462 cells,
463 is_header_row: is_header,
464 }
465 }
466}
467
/// A table extracted from the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableContent {
    /// Optional caption.
    pub caption: Option<String>,

    /// Header rows.
    pub headers: Vec<TableRow>,

    /// Body rows.
    pub rows: Vec<TableRow>,

    /// Number of columns. `TableContent::new` sets it to 0 and no method in
    /// this view updates it, so the producer is responsible for keeping it
    /// accurate.
    pub column_count: usize,

    /// Optional summary (presumably the `summary` attribute — confirm).
    pub summary: Option<String>,
}
486
487impl TableContent {
488 pub fn new() -> Self {
490 Self {
491 caption: None,
492 headers: Vec::new(),
493 rows: Vec::new(),
494 column_count: 0,
495 summary: None,
496 }
497 }
498
499 pub fn is_empty(&self) -> bool {
501 self.headers.is_empty() && self.rows.is_empty()
502 }
503
504 pub fn row_count(&self) -> usize {
506 self.headers.len() + self.rows.len()
507 }
508}
509
510impl Default for TableContent {
511 fn default() -> Self {
512 Self::new()
513 }
514}
515
/// A piece of source code found in the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeBlock {
    /// The code text.
    pub code: String,

    /// Optional language label, set through `CodeBlock::with_language`.
    pub language: Option<String>,

    /// Number of lines in `code` as counted by `str::lines` when the block was
    /// created with `CodeBlock::new`.
    pub line_count: usize,

    /// Inline flag; `false` after `CodeBlock::new`, set by `CodeBlock::inline`.
    pub is_inline: bool,

    /// Optional file name associated with the code.
    pub filename: Option<String>,
}
538
539impl CodeBlock {
540 pub fn new(code: impl Into<String>) -> Self {
542 let code_str = code.into();
543 let line_count = code_str.lines().count();
544 Self {
545 code: code_str,
546 language: None,
547 line_count,
548 is_inline: false,
549 filename: None,
550 }
551 }
552
553 pub fn with_language(mut self, lang: impl Into<String>) -> Self {
555 self.language = Some(lang.into());
556 self
557 }
558
559 pub fn inline(mut self) -> Self {
561 self.is_inline = true;
562 self
563 }
564}
565
/// A quotation found in the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Quote {
    /// Quoted text.
    pub text: String,

    /// Optional attribution, set through `Quote::with_cite`.
    pub cite: Option<String>,

    /// Optional URL for the attribution; `None` after `Quote::new`.
    pub cite_url: Option<String>,
}
582
583impl Quote {
584 pub fn new(text: impl Into<String>) -> Self {
586 Self {
587 text: text.into(),
588 cite: None,
589 cite_url: None,
590 }
591 }
592
593 pub fn with_cite(mut self, cite: impl Into<String>) -> Self {
595 self.cite = Some(cite.into());
596 self
597 }
598}
599
/// Open Graph metadata collected for a page.
///
/// Field names suggest the corresponding `og:*` properties; the extraction
/// code is outside this view, so treat the per-field mapping as an assumption.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct OpenGraph {
    /// Title (presumably `og:title`).
    pub title: Option<String>,

    /// Object type (presumably `og:type`).
    pub og_type: Option<String>,

    /// URL (presumably `og:url`).
    pub url: Option<String>,

    /// Image (presumably `og:image`).
    pub image: Option<String>,

    /// Description (presumably `og:description`).
    pub description: Option<String>,

    /// Site name (presumably `og:site_name`).
    pub site_name: Option<String>,

    /// Locale (presumably `og:locale`).
    pub locale: Option<String>,

    /// Video (presumably `og:video`).
    pub video: Option<String>,

    /// Audio (presumably `og:audio`).
    pub audio: Option<String>,

    /// Additional key/value pairs not covered by the named fields.
    pub extra: HashMap<String, String>,
}
637
638impl OpenGraph {
639 pub fn is_present(&self) -> bool {
641 self.title.is_some() || self.og_type.is_some() || self.url.is_some()
642 }
643}
644
/// Twitter Card metadata collected for a page.
///
/// Field names suggest the corresponding `twitter:*` meta tags; the extraction
/// code is outside this view, so treat the per-field mapping as an assumption.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TwitterCard {
    /// Card type (presumably `twitter:card`).
    pub card: Option<String>,

    /// Site handle (presumably `twitter:site`).
    pub site: Option<String>,

    /// Creator handle (presumably `twitter:creator`).
    pub creator: Option<String>,

    /// Title (presumably `twitter:title`).
    pub title: Option<String>,

    /// Description (presumably `twitter:description`).
    pub description: Option<String>,

    /// Image (presumably `twitter:image`).
    pub image: Option<String>,

    /// Additional key/value pairs not covered by the named fields.
    pub extra: HashMap<String, String>,
}
669
670impl TwitterCard {
671 pub fn is_present(&self) -> bool {
673 self.card.is_some() || self.site.is_some()
674 }
675}
676
677#[derive(Debug, Clone, Default, Serialize, Deserialize)]
679pub struct RobotsMeta {
680 pub index: bool,
682
683 pub follow: bool,
685
686 pub archive: bool,
688
689 pub cache: bool,
691
692 pub snippet: bool,
694
695 pub max_snippet: i32,
697
698 pub max_image_preview: Option<String>,
700
701 pub max_video_preview: i32,
703
704 pub raw: Option<String>,
706}
707
708impl RobotsMeta {
709 pub fn allowed() -> Self {
711 Self {
712 index: true,
713 follow: true,
714 archive: true,
715 cache: true,
716 snippet: true,
717 max_snippet: -1,
718 max_image_preview: Some("large".to_string()),
719 max_video_preview: -1,
720 raw: None,
721 }
722 }
723
724 pub fn noindex_nofollow() -> Self {
726 Self {
727 index: false,
728 follow: false,
729 ..Self::allowed()
730 }
731 }
732}
733
/// An alternate version of the page (presumably from
/// `<link rel="alternate" hreflang=...>` — the extraction is not in this view).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlternateLink {
    /// Language/region label of the alternate version.
    pub hreflang: String,

    /// Target of the alternate version.
    pub href: String,
}
743
/// Page-level metadata.
///
/// The fields are populated by code outside this view; where a description
/// says "presumably", the meaning is inferred from the field name only.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PageMetadata {
    /// Page title; last fallback in `effective_title`.
    pub title: Option<String>,

    /// Page description; last fallback in `effective_description`.
    pub description: Option<String>,

    /// Keywords (presumably from the `keywords` meta tag).
    pub keywords: Vec<String>,

    /// Author (presumably from the `author` meta tag).
    pub author: Option<String>,

    /// Generator (presumably from the `generator` meta tag).
    pub generator: Option<String>,

    /// Canonical URL (presumably from `<link rel="canonical">`).
    pub canonical: Option<String>,

    /// Base URL (presumably from the `<base>` element).
    pub base_url: Option<String>,

    /// Document language (format not established here).
    pub language: Option<String>,

    /// Declared character set.
    pub charset: Option<String>,

    /// Viewport declaration (presumably from the `viewport` meta tag).
    pub viewport: Option<String>,

    /// Crawler directives; read by `should_index` and `should_follow`.
    pub robots: RobotsMeta,

    /// Open Graph metadata; preferred source for the `effective_*` getters.
    pub opengraph: OpenGraph,

    /// Twitter Card metadata; second choice for the `effective_*` getters.
    pub twitter: TwitterCard,

    /// Alternate versions of the page.
    pub alternates: Vec<AlternateLink>,

    /// Favicon reference.
    pub favicon: Option<String>,

    /// Apple touch icon reference.
    pub apple_touch_icon: Option<String>,

    /// Theme color (presumably from the `theme-color` meta tag).
    pub theme_color: Option<String>,

    /// Publication date, kept as a string (format not established here).
    pub published_date: Option<String>,

    /// Last-modified date, kept as a string (format not established here).
    pub modified_date: Option<String>,

    /// Schema type (presumably a schema.org type name — confirm).
    pub schema_type: Option<String>,

    /// Additional key/value metadata not covered by the named fields.
    pub custom: HashMap<String, String>,
}
810
811impl PageMetadata {
812 pub fn effective_title(&self) -> Option<&str> {
814 self.opengraph.title.as_deref()
815 .or(self.twitter.title.as_deref())
816 .or(self.title.as_deref())
817 }
818
819 pub fn effective_description(&self) -> Option<&str> {
821 self.opengraph.description.as_deref()
822 .or(self.twitter.description.as_deref())
823 .or(self.description.as_deref())
824 }
825
826 pub fn effective_image(&self) -> Option<&str> {
828 self.opengraph.image.as_deref()
829 .or(self.twitter.image.as_deref())
830 }
831
832 pub fn should_index(&self) -> bool {
834 self.robots.index
835 }
836
837 pub fn should_follow(&self) -> bool {
839 self.robots.follow
840 }
841}
842
/// Format in which a piece of structured data was expressed.
///
/// Serialized with snake_case names (`json_ld`, `microdata`, `rdfa`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum StructuredDataFormat {
    /// JSON-LD.
    JsonLd,
    /// Microdata.
    Microdata,
    /// RDFa.
    Rdfa,
}
858
/// A structured-data item found in the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StructuredData {
    /// Format the item was expressed in.
    pub format: StructuredDataFormat,

    /// Schema type, when known; set by `StructuredData::microdata`.
    pub schema_type: Option<String>,

    /// Raw source text, when kept; set by `StructuredData::json_ld`.
    pub raw_json: Option<String>,

    /// Parsed properties; empty after either constructor in this file.
    pub properties: HashMap<String, serde_json::Value>,
}
874
875impl StructuredData {
876 pub fn json_ld(raw: impl Into<String>) -> Self {
878 Self {
879 format: StructuredDataFormat::JsonLd,
880 schema_type: None,
881 raw_json: Some(raw.into()),
882 properties: HashMap::new(),
883 }
884 }
885
886 pub fn microdata(schema_type: impl Into<String>) -> Self {
888 Self {
889 format: StructuredDataFormat::Microdata,
890 schema_type: Some(schema_type.into()),
891 raw_json: None,
892 properties: HashMap::new(),
893 }
894 }
895}
896
/// Everything extracted from one document.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ParsedContent {
    /// Page-level metadata.
    pub metadata: PageMetadata,

    /// Extracted text with its statistics.
    pub text: TextContent,

    /// Headings; `outline` returns them in this order.
    pub headings: Vec<Heading>,

    /// Paragraph texts.
    pub paragraphs: Vec<String>,

    /// Links found in the document.
    pub links: Vec<Link>,

    /// Images found in the document.
    pub images: Vec<Image>,

    /// Lists found in the document.
    pub lists: Vec<ListContent>,

    /// Tables found in the document.
    pub tables: Vec<TableContent>,

    /// Code blocks found in the document.
    pub code_blocks: Vec<CodeBlock>,

    /// Quotes found in the document.
    pub quotes: Vec<Quote>,

    /// Structured-data items found in the document.
    pub structured_data: Vec<StructuredData>,

    /// Statistics and diagnostics from the parse.
    pub stats: ParseStats,
}
940
941impl ParsedContent {
942 pub fn internal_links(&self) -> Vec<&Link> {
944 self.links.iter().filter(|l| l.link_type == LinkType::Internal).collect()
945 }
946
947 pub fn external_links(&self) -> Vec<&Link> {
949 self.links.iter().filter(|l| l.link_type == LinkType::External).collect()
950 }
951
952 pub fn followable_links(&self) -> Vec<&Link> {
954 self.links.iter().filter(|l| l.should_follow()).collect()
955 }
956
957 pub fn outline(&self) -> Vec<&Heading> {
959 self.headings.iter().collect()
960 }
961
962 pub fn has_structured_data(&self) -> bool {
964 !self.structured_data.is_empty()
965 }
966}
967
/// Statistics and diagnostics recorded for one parse.
///
/// The counters are filled in by code outside this view.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ParseStats {
    /// Size of the HTML input (presumably in bytes — confirm).
    pub html_size: usize,

    /// Parse duration; the `_us` suffix suggests microseconds.
    pub parse_time_us: u64,

    /// Total number of nodes.
    pub node_count: usize,

    /// Number of element nodes.
    pub element_count: usize,

    /// Number of text nodes.
    pub text_node_count: usize,

    /// Number of comment nodes.
    pub comment_count: usize,

    /// Error messages collected during the parse.
    pub errors: Vec<String>,

    /// Warning messages collected during the parse.
    pub warnings: Vec<String>,
}
995
996impl ParseStats {
997 pub fn has_errors(&self) -> bool {
999 !self.errors.is_empty()
1000 }
1001
1002 pub fn has_warnings(&self) -> bool {
1004 !self.warnings.is_empty()
1005 }
1006}
1007
/// Options controlling what the parser extracts.
///
/// This file only defines the options and their defaults; how each option is
/// honored is decided by the parser implementation, which is not in this view.
#[derive(Debug, Clone)]
pub struct ParserConfig {
    /// Base URL, if any (presumably used to resolve relative links).
    pub base_url: Option<url::Url>,

    /// Upper bound on extracted text length; defaults to 1,000,000
    /// (unit — bytes or chars — is not established here).
    pub max_text_length: usize,

    /// Whether to extract images; defaults to `true`.
    pub extract_images: bool,

    /// Whether to extract links; defaults to `true`.
    pub extract_links: bool,

    /// Whether to extract tables; defaults to `true`.
    pub extract_tables: bool,

    /// Whether to extract code blocks; defaults to `true`.
    pub extract_code_blocks: bool,

    /// Whether to extract structured data; defaults to `true`.
    pub extract_structured_data: bool,

    /// Whether to compute a readability score; defaults to `false`.
    pub compute_readability: bool,

    /// Minimum paragraph length; defaults to 20 (unit not established here).
    pub min_paragraph_length: usize,

    /// CSS selectors identifying main-content containers.
    pub content_selectors: Vec<String>,

    /// CSS selectors for elements to discard.
    pub remove_selectors: Vec<String>,

    /// Whether to keep whitespace as-is; defaults to `false`.
    pub preserve_whitespace: bool,
}
1051
1052impl Default for ParserConfig {
1053 fn default() -> Self {
1054 Self {
1055 base_url: None,
1056 max_text_length: 1_000_000, extract_images: true,
1058 extract_links: true,
1059 extract_tables: true,
1060 extract_code_blocks: true,
1061 extract_structured_data: true,
1062 compute_readability: false,
1063 min_paragraph_length: 20,
1064 content_selectors: vec![
1065 "article".to_string(),
1066 "main".to_string(),
1067 "[role=main]".to_string(),
1068 ".content".to_string(),
1069 ".post-content".to_string(),
1070 ".entry-content".to_string(),
1071 ],
1072 remove_selectors: vec![
1073 "script".to_string(),
1074 "style".to_string(),
1075 "noscript".to_string(),
1076 "nav".to_string(),
1077 "header".to_string(),
1078 "footer".to_string(),
1079 "aside".to_string(),
1080 ".sidebar".to_string(),
1081 ".advertisement".to_string(),
1082 ".ad".to_string(),
1083 ".ads".to_string(),
1084 "[role=navigation]".to_string(),
1085 "[role=banner]".to_string(),
1086 "[role=contentinfo]".to_string(),
1087 ],
1088 preserve_whitespace: false,
1089 }
1090 }
1091}
1092
1093impl ParserConfig {
1094 pub fn with_base_url(url: impl AsRef<str>) -> Result<Self, url::ParseError> {
1096 Ok(Self {
1097 base_url: Some(url::Url::parse(url.as_ref())?),
1098 ..Default::default()
1099 })
1100 }
1101
1102 pub fn minimal() -> Self {
1104 Self {
1105 extract_images: false,
1106 extract_tables: false,
1107 extract_code_blocks: false,
1108 extract_structured_data: false,
1109 compute_readability: false,
1110 ..Default::default()
1111 }
1112 }
1113
1114 pub fn full() -> Self {
1116 Self {
1117 compute_readability: true,
1118 ..Default::default()
1119 }
1120 }
1121
1122 pub fn base_url(mut self, url: url::Url) -> Self {
1124 self.base_url = Some(url);
1125 self
1126 }
1127
1128 pub fn add_content_selector(mut self, selector: impl Into<String>) -> Self {
1130 self.content_selectors.push(selector.into());
1131 self
1132 }
1133
1134 pub fn add_remove_selector(mut self, selector: impl Into<String>) -> Self {
1136 self.remove_selectors.push(selector.into());
1137 self
1138 }
1139}
1140
/// Collapses every run of whitespace into a single ASCII space and strips
/// leading and trailing whitespace.
///
/// Whitespace is whatever `char::is_whitespace` accepts, so newlines, tabs and
/// Unicode spaces are all treated alike. Input consisting only of whitespace
/// yields an empty string.
pub fn normalize_whitespace(text: &str) -> String {
    // Splitting on whitespace drops empty pieces, which removes both the
    // repeated separators and any leading/trailing whitespace in one pass.
    let words: Vec<&str> = text.split_whitespace().collect();
    words.join(" ")
}
1165
/// Removes control characters from `text`, keeping those that are also
/// whitespace (such as `\n`, `\t` and `\r`). All other characters pass through
/// unchanged.
pub fn clean_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for ch in text.chars() {
        // Drop only control characters that are not whitespace.
        if ch.is_control() && !ch.is_whitespace() {
            continue;
        }
        cleaned.push(ch);
    }
    cleaned
}
1172
/// Shortens `text` to at most `max_len` characters.
///
/// Lengths are measured in `char`s throughout, so multi-byte text that already
/// fits is returned unchanged. When truncation is needed and `max_len` is at
/// least 3, the result is the first `max_len - 3` characters followed by
/// `"..."`. When `max_len` is below 3 there is no room for the ellipsis, so
/// the result is simply the first `max_len` characters (this also avoids the
/// integer underflow `max_len - 3` would cause).
pub fn truncate_text(text: &str, max_len: usize) -> String {
    const ELLIPSIS: &str = "...";

    // The byte length bounds the character count, so the cheap comparison
    // settles most calls; otherwise look for a character at index `max_len`.
    if text.len() <= max_len || text.chars().nth(max_len).is_none() {
        return text.to_string();
    }

    match max_len.checked_sub(ELLIPSIS.len()) {
        Some(keep) => {
            let mut truncated: String = text.chars().take(keep).collect();
            truncated.push_str(ELLIPSIS);
            truncated
        }
        // Not enough room for the ellipsis: hard cut.
        None => text.chars().take(max_len).collect(),
    }
}
1183
#[cfg(test)]
mod tests {
    use super::*;

    /// Already-clean text is kept as-is and its words are counted.
    #[test]
    fn test_text_content_creation() {
        let content = TextContent::from_raw("Hello world, this is a test.");
        assert_eq!(content.cleaned_text, "Hello world, this is a test.");
        assert_eq!(content.word_count, 6);
        assert!(!content.is_empty());
    }

    /// `new` + `with_id` set the level and the id.
    #[test]
    fn test_heading_creation() {
        let heading = Heading::new(1, "Main Title").with_id("main");
        assert_eq!(heading.level, 1);
        assert_eq!(heading.id.as_deref(), Some("main"));
    }

    /// Levels above 6 are clamped down to 6.
    #[test]
    fn test_heading_level_clamping() {
        let heading = Heading::new(10, "Test");
        assert_eq!(heading.level, 6);
    }

    /// A freshly created link is followable.
    #[test]
    fn test_link_creation() {
        let link = Link::new("https://example.com", "Example");
        assert!(!link.is_nofollow);
        assert!(link.should_follow());
    }

    /// The nofollow flag blocks following.
    #[test]
    fn test_link_nofollow() {
        let link = Link {
            is_nofollow: true,
            ..Link::new("/page", "Page")
        };
        assert!(!link.should_follow());
    }

    /// Empty alt text marks an image as decorative.
    #[test]
    fn test_image_creation() {
        let described = Image::new("/img/photo.jpg", "A photo");
        assert!(!described.is_decorative);

        let spacer = Image::new("/img/spacer.gif", "");
        assert!(spacer.is_decorative);
    }

    /// `add_item` keeps `total_items` in step with the entries.
    #[test]
    fn test_list_content() {
        let mut list = ListContent::new(ListType::Unordered);
        for label in ["Item 1", "Item 2"].iter() {
            list.add_item(ListItem::new(*label));
        }
        assert_eq!(list.total_items, 2);
        assert!(!list.is_empty());
    }

    /// A new table is empty and has no rows.
    #[test]
    fn test_table_content() {
        let table = TableContent::new();
        assert!(table.is_empty());
        assert_eq!(table.row_count(), 0);
    }

    /// Language, line count and inline flag of a three-line block.
    #[test]
    fn test_code_block() {
        let block =
            CodeBlock::new("fn main() {\n println!(\"Hello\");\n}").with_language("rust");
        assert_eq!(block.language.as_deref(), Some("rust"));
        assert_eq!(block.line_count, 3);
        assert!(!block.is_inline);
    }

    /// Open Graph data counts as present once a title is set.
    #[test]
    fn test_opengraph() {
        let empty = OpenGraph::default();
        assert!(!empty.is_present());

        let titled = OpenGraph {
            title: Some(String::from("Test")),
            ..Default::default()
        };
        assert!(titled.is_present());
    }

    /// The two robots constructors set `index` / `follow` as named.
    #[test]
    fn test_robots_meta() {
        let permissive = RobotsMeta::allowed();
        assert!(permissive.index);
        assert!(permissive.follow);

        let restrictive = RobotsMeta::noindex_nofollow();
        assert!(!restrictive.index);
        assert!(!restrictive.follow);
    }

    /// The Open Graph title wins over the page title.
    #[test]
    fn test_page_metadata_effective() {
        let meta = PageMetadata {
            title: Some(String::from("Page Title")),
            opengraph: OpenGraph {
                title: Some(String::from("OG Title")),
                ..Default::default()
            },
            ..Default::default()
        };

        assert_eq!(meta.effective_title(), Some("OG Title"));
    }

    /// Defaults enable extraction; `minimal` disables image extraction.
    #[test]
    fn test_parser_config() {
        let defaults = ParserConfig::default();
        assert!(defaults.extract_images);
        assert!(defaults.extract_links);

        let minimal = ParserConfig::minimal();
        assert!(!minimal.extract_images);
    }

    /// Whitespace runs collapse and the ends are trimmed.
    #[test]
    fn test_normalize_whitespace() {
        let cases = [(" hello world ", "hello world"), ("a\n\n\nb", "a b"), (" ", "")];
        for (input, expected) in cases.iter() {
            assert_eq!(normalize_whitespace(input), *expected);
        }
    }

    /// Non-whitespace control characters are removed.
    #[test]
    fn test_clean_text() {
        let dirty = "Hello\x00World\x01Test";
        assert_eq!(clean_text(dirty), "HelloWorldTest");
    }

    /// Short text is untouched; long text gets an ellipsis.
    #[test]
    fn test_truncate_text() {
        assert_eq!(truncate_text("Hello", 10), "Hello");
        assert_eq!(truncate_text("Hello World", 8), "Hello...");
    }

    /// Links are split by their classification.
    #[test]
    fn test_parsed_content_links() {
        let internal = Link {
            link_type: LinkType::Internal,
            ..Link::new("/page", "Page")
        };
        let external = Link {
            link_type: LinkType::External,
            ..Link::new("https://ext.com", "Ext")
        };
        let content = ParsedContent {
            links: vec![internal, external],
            ..Default::default()
        };

        assert_eq!(content.internal_links().len(), 1);
        assert_eq!(content.external_links().len(), 1);
    }

    /// 450 words come out at roughly two minutes of reading time.
    #[test]
    fn test_reading_time() {
        let content = TextContent::from_raw(&"word ".repeat(450));
        assert!(content.reading_time_minutes.is_some());
        let minutes = content.reading_time_minutes.unwrap();
        assert!((minutes - 2.0).abs() < 0.1);
    }
}