halldyll_parser/
types.rs

1//! Type definitions for halldyll-parser
2//!
3//! This module contains all public types used throughout the parser:
4//! - Error types
5//! - Content types (text, headings, lists, tables, etc.)
6//! - Metadata types (OpenGraph, Twitter Cards, etc.)
7//! - Structured data types (JSON-LD, Microdata, RDFa)
8
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11use thiserror::Error;
12
13// ============================================================================
14// ERROR TYPES
15// ============================================================================
16
17/// Errors that can occur during HTML parsing
18#[derive(Debug, Error)]
19pub enum ParserError {
20    /// HTML parsing failed
21    #[error("Failed to parse HTML: {0}")]
22    ParseError(String),
23
24    /// Invalid selector syntax
25    #[error("Invalid CSS selector: {0}")]
26    SelectorError(String),
27
28    /// URL parsing/resolution error
29    #[error("URL error: {0}")]
30    UrlError(#[from] url::ParseError),
31
32    /// IO error (reading files, etc.)
33    #[error("IO error: {0}")]
34    IoError(#[from] std::io::Error),
35
36    /// Encoding error
37    #[error("Encoding error: {0}")]
38    EncodingError(String),
39
40    /// Configuration error
41    #[error("Configuration error: {0}")]
42    ConfigError(String),
43}
44
45/// Result type for parser operations
46pub type ParserResult<T> = Result<T, ParserError>;
47
48// ============================================================================
49// TEXT CONTENT
50// ============================================================================
51
52/// Extracted text content with metadata
53#[derive(Debug, Clone, Serialize, Deserialize, Default)]
54pub struct TextContent {
55    /// Raw extracted text
56    pub raw_text: String,
57
58    /// Cleaned text (whitespace normalized)
59    pub cleaned_text: String,
60
61    /// Word count
62    pub word_count: usize,
63
64    /// Character count
65    pub char_count: usize,
66
67    /// Detected language (ISO 639-1 code)
68    pub language: Option<String>,
69
70    /// Readability score (Flesch-Kincaid or similar)
71    pub readability_score: Option<f64>,
72
73    /// Estimated reading time in minutes
74    pub reading_time_minutes: Option<f64>,
75}
76
77impl TextContent {
78    /// Create new text content from raw text
79    pub fn from_raw(raw: &str) -> Self {
80        let cleaned = normalize_whitespace(raw);
81        let word_count = cleaned.split_whitespace().count();
82        let char_count = cleaned.chars().count();
83        
84        // Average reading speed: 200-250 WPM, we use 225
85        let reading_time = if word_count > 0 {
86            Some(word_count as f64 / 225.0)
87        } else {
88            None
89        };
90
91        Self {
92            raw_text: raw.to_string(),
93            cleaned_text: cleaned,
94            word_count,
95            char_count,
96            language: None,
97            readability_score: None,
98            reading_time_minutes: reading_time,
99        }
100    }
101
102    /// Check if content is empty or minimal
103    pub fn is_empty(&self) -> bool {
104        self.word_count == 0
105    }
106
107    /// Check if content is substantial (more than just a few words)
108    pub fn is_substantial(&self) -> bool {
109        self.word_count >= 50
110    }
111}
112
113// ============================================================================
114// HEADINGS
115// ============================================================================
116
117/// A heading element (h1-h6)
118#[derive(Debug, Clone, Serialize, Deserialize)]
119pub struct Heading {
120    /// Heading level (1-6)
121    pub level: u8,
122
123    /// Heading text content
124    pub text: String,
125
126    /// ID attribute if present
127    pub id: Option<String>,
128
129    /// Class names if present
130    pub classes: Vec<String>,
131}
132
133impl Heading {
134    /// Create a new heading
135    pub fn new(level: u8, text: impl Into<String>) -> Self {
136        Self {
137            level: level.clamp(1, 6),
138            text: text.into(),
139            id: None,
140            classes: Vec::new(),
141        }
142    }
143
144    /// Create heading with ID
145    pub fn with_id(mut self, id: impl Into<String>) -> Self {
146        self.id = Some(id.into());
147        self
148    }
149}
150
151// ============================================================================
152// LINKS
153// ============================================================================
154
155/// Relationship types for links
156#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
157#[serde(rename_all = "snake_case")]
158pub enum LinkRel {
159    /// Standard follow link
160    Follow,
161    /// nofollow link
162    NoFollow,
163    /// ugc (user generated content)
164    Ugc,
165    /// sponsored link
166    Sponsored,
167    /// external link
168    External,
169    /// noopener
170    NoOpener,
171    /// noreferrer
172    NoReferrer,
173    /// Other rel value
174    Other,
175}
176
177/// Type of link (internal vs external)
178#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
179#[serde(rename_all = "snake_case")]
180pub enum LinkType {
181    /// Link to same domain
182    Internal,
183    /// Link to different domain
184    External,
185    /// Cannot determine (no base URL)
186    Unknown,
187}
188
189/// An extracted link
190#[derive(Debug, Clone, Serialize, Deserialize)]
191pub struct Link {
192    /// Original href value
193    pub href: String,
194
195    /// Resolved absolute URL (if possible)
196    pub url: Option<String>,
197
198    /// Anchor text
199    pub text: String,
200
201    /// Title attribute
202    pub title: Option<String>,
203
204    /// Relationship attributes
205    pub rel: Vec<LinkRel>,
206
207    /// Link type (internal/external)
208    pub link_type: LinkType,
209
210    /// Whether link is nofollow
211    pub is_nofollow: bool,
212
213    /// Target attribute (_blank, _self, etc.)
214    pub target: Option<String>,
215
216    /// hreflang attribute
217    pub hreflang: Option<String>,
218}
219
220impl Link {
221    /// Create a new link
222    pub fn new(href: impl Into<String>, text: impl Into<String>) -> Self {
223        Self {
224            href: href.into(),
225            url: None,
226            text: text.into(),
227            title: None,
228            rel: Vec::new(),
229            link_type: LinkType::Unknown,
230            is_nofollow: false,
231            target: None,
232            hreflang: None,
233        }
234    }
235
236    /// Check if link should be followed by crawlers
237    pub fn should_follow(&self) -> bool {
238        !self.is_nofollow && !self.rel.contains(&LinkRel::Sponsored) && !self.rel.contains(&LinkRel::Ugc)
239    }
240
241    /// Check if link opens in new tab
242    pub fn opens_new_tab(&self) -> bool {
243        self.target.as_deref() == Some("_blank")
244    }
245}
246
247// ============================================================================
248// IMAGES
249// ============================================================================
250
251/// Image loading strategy
252#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
253#[serde(rename_all = "snake_case")]
254#[derive(Default)]
255pub enum ImageLoading {
256    /// Eager loading (default)
257    #[default]
258    Eager,
259    /// Lazy loading
260    Lazy,
261}
262
263
264/// An extracted image
265#[derive(Debug, Clone, Serialize, Deserialize)]
266pub struct Image {
267    /// Original src attribute
268    pub src: String,
269
270    /// Resolved absolute URL
271    pub url: Option<String>,
272
273    /// Alt text
274    pub alt: String,
275
276    /// Title attribute
277    pub title: Option<String>,
278
279    /// Width if specified
280    pub width: Option<u32>,
281
282    /// Height if specified
283    pub height: Option<u32>,
284
285    /// srcset for responsive images
286    pub srcset: Option<String>,
287
288    /// sizes attribute
289    pub sizes: Option<String>,
290
291    /// Loading strategy (lazy/eager)
292    pub loading: ImageLoading,
293
294    /// Whether image is decorative (empty alt)
295    pub is_decorative: bool,
296}
297
298impl Image {
299    /// Create a new image
300    pub fn new(src: impl Into<String>, alt: impl Into<String>) -> Self {
301        let alt_str = alt.into();
302        let is_decorative = alt_str.is_empty();
303        Self {
304            src: src.into(),
305            url: None,
306            alt: alt_str,
307            title: None,
308            width: None,
309            height: None,
310            srcset: None,
311            sizes: None,
312            loading: ImageLoading::default(),
313            is_decorative,
314        }
315    }
316
317    /// Check if image has responsive srcset
318    pub fn is_responsive(&self) -> bool {
319        self.srcset.is_some()
320    }
321}
322
323// ============================================================================
324// LISTS
325// ============================================================================
326
327/// Type of list
328#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
329#[serde(rename_all = "snake_case")]
330pub enum ListType {
331    /// Ordered list (ol)
332    Ordered,
333    /// Unordered list (ul)
334    Unordered,
335    /// Definition list (dl)
336    Definition,
337}
338
339/// A list item (may contain nested content)
340#[derive(Debug, Clone, Serialize, Deserialize)]
341pub struct ListItem {
342    /// Item text content
343    pub text: String,
344
345    /// Nested list if any
346    pub nested: Option<Box<ListContent>>,
347}
348
349impl ListItem {
350    /// Create a simple list item
351    pub fn new(text: impl Into<String>) -> Self {
352        Self {
353            text: text.into(),
354            nested: None,
355        }
356    }
357
358    /// Create item with nested list
359    pub fn with_nested(text: impl Into<String>, nested: ListContent) -> Self {
360        Self {
361            text: text.into(),
362            nested: Some(Box::new(nested)),
363        }
364    }
365}
366
367/// An extracted list
368#[derive(Debug, Clone, Serialize, Deserialize)]
369pub struct ListContent {
370    /// Type of list
371    pub list_type: ListType,
372
373    /// List items
374    pub items: Vec<ListItem>,
375
376    /// Total item count (including nested)
377    pub total_items: usize,
378}
379
380impl ListContent {
381    /// Create a new list
382    pub fn new(list_type: ListType) -> Self {
383        Self {
384            list_type,
385            items: Vec::new(),
386            total_items: 0,
387        }
388    }
389
390    /// Add an item
391    pub fn add_item(&mut self, item: ListItem) {
392        self.total_items += 1;
393        if let Some(ref nested) = item.nested {
394            self.total_items += nested.total_items;
395        }
396        self.items.push(item);
397    }
398
399    /// Check if list is empty
400    pub fn is_empty(&self) -> bool {
401        self.items.is_empty()
402    }
403}
404
405// ============================================================================
406// TABLES
407// ============================================================================
408
409/// A table cell
410#[derive(Debug, Clone, Serialize, Deserialize)]
411pub struct TableCell {
412    /// Cell content
413    pub content: String,
414
415    /// Is header cell (th)
416    pub is_header: bool,
417
418    /// Column span
419    pub colspan: u32,
420
421    /// Row span
422    pub rowspan: u32,
423}
424
425impl TableCell {
426    /// Create a data cell
427    pub fn data(content: impl Into<String>) -> Self {
428        Self {
429            content: content.into(),
430            is_header: false,
431            colspan: 1,
432            rowspan: 1,
433        }
434    }
435
436    /// Create a header cell
437    pub fn header(content: impl Into<String>) -> Self {
438        Self {
439            content: content.into(),
440            is_header: true,
441            colspan: 1,
442            rowspan: 1,
443        }
444    }
445}
446
447/// A table row
448#[derive(Debug, Clone, Serialize, Deserialize)]
449pub struct TableRow {
450    /// Cells in this row
451    pub cells: Vec<TableCell>,
452
453    /// Is this a header row
454    pub is_header_row: bool,
455}
456
457impl TableRow {
458    /// Create a new row
459    pub fn new(cells: Vec<TableCell>) -> Self {
460        let is_header = cells.iter().all(|c| c.is_header);
461        Self {
462            cells,
463            is_header_row: is_header,
464        }
465    }
466}
467
468/// An extracted table
469#[derive(Debug, Clone, Serialize, Deserialize)]
470pub struct TableContent {
471    /// Table caption
472    pub caption: Option<String>,
473
474    /// Header rows
475    pub headers: Vec<TableRow>,
476
477    /// Body rows
478    pub rows: Vec<TableRow>,
479
480    /// Number of columns
481    pub column_count: usize,
482
483    /// Table summary (if provided)
484    pub summary: Option<String>,
485}
486
487impl TableContent {
488    /// Create a new empty table
489    pub fn new() -> Self {
490        Self {
491            caption: None,
492            headers: Vec::new(),
493            rows: Vec::new(),
494            column_count: 0,
495            summary: None,
496        }
497    }
498
499    /// Check if table is empty
500    pub fn is_empty(&self) -> bool {
501        self.headers.is_empty() && self.rows.is_empty()
502    }
503
504    /// Get total row count
505    pub fn row_count(&self) -> usize {
506        self.headers.len() + self.rows.len()
507    }
508}
509
510impl Default for TableContent {
511    fn default() -> Self {
512        Self::new()
513    }
514}
515
516// ============================================================================
517// CODE BLOCKS
518// ============================================================================
519
520/// An extracted code block
521#[derive(Debug, Clone, Serialize, Deserialize)]
522pub struct CodeBlock {
523    /// The code content
524    pub code: String,
525
526    /// Programming language (if detected)
527    pub language: Option<String>,
528
529    /// Line count
530    pub line_count: usize,
531
532    /// Whether it's inline code
533    pub is_inline: bool,
534
535    /// Filename if specified (e.g., in markdown)
536    pub filename: Option<String>,
537}
538
539impl CodeBlock {
540    /// Create a new code block
541    pub fn new(code: impl Into<String>) -> Self {
542        let code_str = code.into();
543        let line_count = code_str.lines().count();
544        Self {
545            code: code_str,
546            language: None,
547            line_count,
548            is_inline: false,
549            filename: None,
550        }
551    }
552
553    /// Create with language
554    pub fn with_language(mut self, lang: impl Into<String>) -> Self {
555        self.language = Some(lang.into());
556        self
557    }
558
559    /// Mark as inline
560    pub fn inline(mut self) -> Self {
561        self.is_inline = true;
562        self
563    }
564}
565
566// ============================================================================
567// QUOTES
568// ============================================================================
569
570/// An extracted blockquote
571#[derive(Debug, Clone, Serialize, Deserialize)]
572pub struct Quote {
573    /// Quote text content
574    pub text: String,
575
576    /// Citation/source
577    pub cite: Option<String>,
578
579    /// Citation URL
580    pub cite_url: Option<String>,
581}
582
583impl Quote {
584    /// Create a new quote
585    pub fn new(text: impl Into<String>) -> Self {
586        Self {
587            text: text.into(),
588            cite: None,
589            cite_url: None,
590        }
591    }
592
593    /// Add citation
594    pub fn with_cite(mut self, cite: impl Into<String>) -> Self {
595        self.cite = Some(cite.into());
596        self
597    }
598}
599
600// ============================================================================
601// METADATA TYPES
602// ============================================================================
603
604/// OpenGraph metadata
605#[derive(Debug, Clone, Default, Serialize, Deserialize)]
606pub struct OpenGraph {
607    /// og:title
608    pub title: Option<String>,
609
610    /// og:type
611    pub og_type: Option<String>,
612
613    /// og:url
614    pub url: Option<String>,
615
616    /// og:image
617    pub image: Option<String>,
618
619    /// og:description
620    pub description: Option<String>,
621
622    /// og:site_name
623    pub site_name: Option<String>,
624
625    /// og:locale
626    pub locale: Option<String>,
627
628    /// og:video
629    pub video: Option<String>,
630
631    /// og:audio
632    pub audio: Option<String>,
633
634    /// Additional properties
635    pub extra: HashMap<String, String>,
636}
637
638impl OpenGraph {
639    /// Check if OG data is present
640    pub fn is_present(&self) -> bool {
641        self.title.is_some() || self.og_type.is_some() || self.url.is_some()
642    }
643}
644
645/// Twitter Card metadata
646#[derive(Debug, Clone, Default, Serialize, Deserialize)]
647pub struct TwitterCard {
648    /// twitter:card
649    pub card: Option<String>,
650
651    /// twitter:site
652    pub site: Option<String>,
653
654    /// twitter:creator
655    pub creator: Option<String>,
656
657    /// twitter:title
658    pub title: Option<String>,
659
660    /// twitter:description
661    pub description: Option<String>,
662
663    /// twitter:image
664    pub image: Option<String>,
665
666    /// Additional properties
667    pub extra: HashMap<String, String>,
668}
669
670impl TwitterCard {
671    /// Check if Twitter Card data is present
672    pub fn is_present(&self) -> bool {
673        self.card.is_some() || self.site.is_some()
674    }
675}
676
677/// Robots meta directives
678#[derive(Debug, Clone, Default, Serialize, Deserialize)]
679pub struct RobotsMeta {
680    /// Can be indexed
681    pub index: bool,
682
683    /// Links can be followed
684    pub follow: bool,
685
686    /// Can be archived
687    pub archive: bool,
688
689    /// Can be cached
690    pub cache: bool,
691
692    /// Can show snippet
693    pub snippet: bool,
694
695    /// Max snippet length (-1 = unlimited)
696    pub max_snippet: i32,
697
698    /// Max image preview (none, standard, large)
699    pub max_image_preview: Option<String>,
700
701    /// Max video preview seconds
702    pub max_video_preview: i32,
703
704    /// Raw robots content
705    pub raw: Option<String>,
706}
707
708impl RobotsMeta {
709    /// Create default (all allowed)
710    pub fn allowed() -> Self {
711        Self {
712            index: true,
713            follow: true,
714            archive: true,
715            cache: true,
716            snippet: true,
717            max_snippet: -1,
718            max_image_preview: Some("large".to_string()),
719            max_video_preview: -1,
720            raw: None,
721        }
722    }
723
724    /// Create noindex nofollow
725    pub fn noindex_nofollow() -> Self {
726        Self {
727            index: false,
728            follow: false,
729            ..Self::allowed()
730        }
731    }
732}
733
734/// Alternate language version (hreflang)
735#[derive(Debug, Clone, Serialize, Deserialize)]
736pub struct AlternateLink {
737    /// Language code (e.g., "en", "fr", "x-default")
738    pub hreflang: String,
739
740    /// URL of alternate version
741    pub href: String,
742}
743
744/// Complete page metadata
745#[derive(Debug, Clone, Default, Serialize, Deserialize)]
746pub struct PageMetadata {
747    /// Page title
748    pub title: Option<String>,
749
750    /// Meta description
751    pub description: Option<String>,
752
753    /// Meta keywords
754    pub keywords: Vec<String>,
755
756    /// Author
757    pub author: Option<String>,
758
759    /// Generator (CMS/framework)
760    pub generator: Option<String>,
761
762    /// Canonical URL
763    pub canonical: Option<String>,
764
765    /// Base URL from <base> tag
766    pub base_url: Option<String>,
767
768    /// Language (html lang attribute)
769    pub language: Option<String>,
770
771    /// Character encoding
772    pub charset: Option<String>,
773
774    /// Viewport
775    pub viewport: Option<String>,
776
777    /// Robots directives
778    pub robots: RobotsMeta,
779
780    /// OpenGraph data
781    pub opengraph: OpenGraph,
782
783    /// Twitter Card data
784    pub twitter: TwitterCard,
785
786    /// Alternate language versions
787    pub alternates: Vec<AlternateLink>,
788
789    /// Favicon URL
790    pub favicon: Option<String>,
791
792    /// Apple touch icon
793    pub apple_touch_icon: Option<String>,
794
795    /// Theme color
796    pub theme_color: Option<String>,
797
798    /// Published date
799    pub published_date: Option<String>,
800
801    /// Modified date
802    pub modified_date: Option<String>,
803
804    /// Schema.org type (if detected)
805    pub schema_type: Option<String>,
806
807    /// Custom meta tags (name -> content)
808    pub custom: HashMap<String, String>,
809}
810
811impl PageMetadata {
812    /// Get effective title (OG > Twitter > title tag)
813    pub fn effective_title(&self) -> Option<&str> {
814        self.opengraph.title.as_deref()
815            .or(self.twitter.title.as_deref())
816            .or(self.title.as_deref())
817    }
818
819    /// Get effective description
820    pub fn effective_description(&self) -> Option<&str> {
821        self.opengraph.description.as_deref()
822            .or(self.twitter.description.as_deref())
823            .or(self.description.as_deref())
824    }
825
826    /// Get effective image
827    pub fn effective_image(&self) -> Option<&str> {
828        self.opengraph.image.as_deref()
829            .or(self.twitter.image.as_deref())
830    }
831
832    /// Check if page should be indexed
833    pub fn should_index(&self) -> bool {
834        self.robots.index
835    }
836
837    /// Check if links should be followed
838    pub fn should_follow(&self) -> bool {
839        self.robots.follow
840    }
841}
842
843// ============================================================================
844// STRUCTURED DATA
845// ============================================================================
846
847/// Type of structured data
848#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
849#[serde(rename_all = "snake_case")]
850pub enum StructuredDataFormat {
851    /// JSON-LD (recommended)
852    JsonLd,
853    /// Microdata
854    Microdata,
855    /// RDFa
856    Rdfa,
857}
858
859/// Extracted structured data item
860#[derive(Debug, Clone, Serialize, Deserialize)]
861pub struct StructuredData {
862    /// Format (JSON-LD, Microdata, RDFa)
863    pub format: StructuredDataFormat,
864
865    /// Schema.org type (e.g., "Article", "Product", "Organization")
866    pub schema_type: Option<String>,
867
868    /// Raw JSON content (for JSON-LD)
869    pub raw_json: Option<String>,
870
871    /// Parsed properties
872    pub properties: HashMap<String, serde_json::Value>,
873}
874
875impl StructuredData {
876    /// Create JSON-LD data
877    pub fn json_ld(raw: impl Into<String>) -> Self {
878        Self {
879            format: StructuredDataFormat::JsonLd,
880            schema_type: None,
881            raw_json: Some(raw.into()),
882            properties: HashMap::new(),
883        }
884    }
885
886    /// Create Microdata
887    pub fn microdata(schema_type: impl Into<String>) -> Self {
888        Self {
889            format: StructuredDataFormat::Microdata,
890            schema_type: Some(schema_type.into()),
891            raw_json: None,
892            properties: HashMap::new(),
893        }
894    }
895}
896
897// ============================================================================
898// PARSED CONTENT (COMPLETE RESULT)
899// ============================================================================
900
901/// Complete parsed content from an HTML document
902#[derive(Debug, Clone, Default, Serialize, Deserialize)]
903pub struct ParsedContent {
904    /// Page metadata
905    pub metadata: PageMetadata,
906
907    /// Extracted text content
908    pub text: TextContent,
909
910    /// All headings
911    pub headings: Vec<Heading>,
912
913    /// All paragraphs
914    pub paragraphs: Vec<String>,
915
916    /// All links
917    pub links: Vec<Link>,
918
919    /// All images
920    pub images: Vec<Image>,
921
922    /// All lists
923    pub lists: Vec<ListContent>,
924
925    /// All tables
926    pub tables: Vec<TableContent>,
927
928    /// All code blocks
929    pub code_blocks: Vec<CodeBlock>,
930
931    /// All quotes
932    pub quotes: Vec<Quote>,
933
934    /// Structured data (JSON-LD, Microdata, RDFa)
935    pub structured_data: Vec<StructuredData>,
936
937    /// Parsing statistics
938    pub stats: ParseStats,
939}
940
941impl ParsedContent {
942    /// Get internal links only
943    pub fn internal_links(&self) -> Vec<&Link> {
944        self.links.iter().filter(|l| l.link_type == LinkType::Internal).collect()
945    }
946
947    /// Get external links only
948    pub fn external_links(&self) -> Vec<&Link> {
949        self.links.iter().filter(|l| l.link_type == LinkType::External).collect()
950    }
951
952    /// Get followable links only
953    pub fn followable_links(&self) -> Vec<&Link> {
954        self.links.iter().filter(|l| l.should_follow()).collect()
955    }
956
957    /// Get the document outline (headings hierarchy)
958    pub fn outline(&self) -> Vec<&Heading> {
959        self.headings.iter().collect()
960    }
961
962    /// Check if page has structured data
963    pub fn has_structured_data(&self) -> bool {
964        !self.structured_data.is_empty()
965    }
966}
967
968/// Parsing statistics
969#[derive(Debug, Clone, Default, Serialize, Deserialize)]
970pub struct ParseStats {
971    /// HTML size in bytes
972    pub html_size: usize,
973
974    /// Parse time in microseconds
975    pub parse_time_us: u64,
976
977    /// Number of DOM nodes
978    pub node_count: usize,
979
980    /// Number of elements
981    pub element_count: usize,
982
983    /// Number of text nodes
984    pub text_node_count: usize,
985
986    /// Number of comments
987    pub comment_count: usize,
988
989    /// Errors encountered during parsing
990    pub errors: Vec<String>,
991
992    /// Warnings
993    pub warnings: Vec<String>,
994}
995
996impl ParseStats {
997    /// Check if parsing had errors
998    pub fn has_errors(&self) -> bool {
999        !self.errors.is_empty()
1000    }
1001
1002    /// Check if parsing had warnings
1003    pub fn has_warnings(&self) -> bool {
1004        !self.warnings.is_empty()
1005    }
1006}
1007
1008// ============================================================================
1009// PARSER CONFIGURATION
1010// ============================================================================
1011
1012/// Configuration for the HTML parser
1013#[derive(Debug, Clone)]
1014pub struct ParserConfig {
1015    /// Base URL for resolving relative URLs
1016    pub base_url: Option<url::Url>,
1017
1018    /// Maximum text length to extract
1019    pub max_text_length: usize,
1020
1021    /// Whether to extract images
1022    pub extract_images: bool,
1023
1024    /// Whether to extract links
1025    pub extract_links: bool,
1026
1027    /// Whether to extract tables
1028    pub extract_tables: bool,
1029
1030    /// Whether to extract code blocks
1031    pub extract_code_blocks: bool,
1032
1033    /// Whether to extract structured data
1034    pub extract_structured_data: bool,
1035
1036    /// Whether to compute readability scores
1037    pub compute_readability: bool,
1038
1039    /// Minimum paragraph length to include
1040    pub min_paragraph_length: usize,
1041
1042    /// Content selectors (CSS selectors for main content)
1043    pub content_selectors: Vec<String>,
1044
1045    /// Selectors for elements to remove (ads, nav, footer, etc.)
1046    pub remove_selectors: Vec<String>,
1047
1048    /// Whether to preserve whitespace
1049    pub preserve_whitespace: bool,
1050}
1051
1052impl Default for ParserConfig {
1053    fn default() -> Self {
1054        Self {
1055            base_url: None,
1056            max_text_length: 1_000_000, // 1MB
1057            extract_images: true,
1058            extract_links: true,
1059            extract_tables: true,
1060            extract_code_blocks: true,
1061            extract_structured_data: true,
1062            compute_readability: false,
1063            min_paragraph_length: 20,
1064            content_selectors: vec![
1065                "article".to_string(),
1066                "main".to_string(),
1067                "[role=main]".to_string(),
1068                ".content".to_string(),
1069                ".post-content".to_string(),
1070                ".entry-content".to_string(),
1071            ],
1072            remove_selectors: vec![
1073                "script".to_string(),
1074                "style".to_string(),
1075                "noscript".to_string(),
1076                "nav".to_string(),
1077                "header".to_string(),
1078                "footer".to_string(),
1079                "aside".to_string(),
1080                ".sidebar".to_string(),
1081                ".advertisement".to_string(),
1082                ".ad".to_string(),
1083                ".ads".to_string(),
1084                "[role=navigation]".to_string(),
1085                "[role=banner]".to_string(),
1086                "[role=contentinfo]".to_string(),
1087            ],
1088            preserve_whitespace: false,
1089        }
1090    }
1091}
1092
1093impl ParserConfig {
1094    /// Create a new config with base URL
1095    pub fn with_base_url(url: impl AsRef<str>) -> Result<Self, url::ParseError> {
1096        Ok(Self {
1097            base_url: Some(url::Url::parse(url.as_ref())?),
1098            ..Default::default()
1099        })
1100    }
1101
1102    /// Create minimal config (faster, less extraction)
1103    pub fn minimal() -> Self {
1104        Self {
1105            extract_images: false,
1106            extract_tables: false,
1107            extract_code_blocks: false,
1108            extract_structured_data: false,
1109            compute_readability: false,
1110            ..Default::default()
1111        }
1112    }
1113
1114    /// Create config for full extraction
1115    pub fn full() -> Self {
1116        Self {
1117            compute_readability: true,
1118            ..Default::default()
1119        }
1120    }
1121
1122    /// Set base URL
1123    pub fn base_url(mut self, url: url::Url) -> Self {
1124        self.base_url = Some(url);
1125        self
1126    }
1127
1128    /// Add content selector
1129    pub fn add_content_selector(mut self, selector: impl Into<String>) -> Self {
1130        self.content_selectors.push(selector.into());
1131        self
1132    }
1133
1134    /// Add remove selector
1135    pub fn add_remove_selector(mut self, selector: impl Into<String>) -> Self {
1136        self.remove_selectors.push(selector.into());
1137        self
1138    }
1139}
1140
1141// ============================================================================
1142// HELPER FUNCTIONS
1143// ============================================================================
1144
/// Collapse every run of whitespace into a single space and trim both ends.
///
/// Whitespace is whatever `char::is_whitespace` accepts, so tabs, newlines
/// and non-breaking spaces all become plain spaces. Input made up only of
/// whitespace, or empty input, yields an empty string.
pub fn normalize_whitespace(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());

    // `split_whitespace` yields the non-whitespace runs; joining them with a
    // single space gives the collapsed, trimmed text.
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }

    normalized
}
1165
/// Strip control characters from `text`, keeping whitespace intact.
///
/// A character is removed only when it is a control character that is not
/// also whitespace, so tabs, newlines and carriage returns survive.
pub fn clean_text(text: &str) -> String {
    let mut cleaned = text.to_owned();
    cleaned.retain(|ch| ch.is_whitespace() || !ch.is_control());
    cleaned
}
1172
/// Truncate `text` to at most `max_len` characters, marking the cut with `...`.
///
/// Lengths are measured in Unicode scalar values (`char`s), not bytes, so
/// multi-byte text is never shortened while it still fits and is never split
/// inside a character.
///
/// * If `text` has `max_len` characters or fewer, it is returned unchanged.
/// * Otherwise the first `max_len - 3` characters are kept and `"..."` is
///   appended, giving a result of exactly `max_len` characters.
/// * If `max_len` is smaller than 3 there is no room for the ellipsis; the
///   text is hard-cut to `max_len` characters instead of underflowing.
pub fn truncate_text(text: &str, max_len: usize) -> String {
    const ELLIPSIS: &str = "...";

    // If there is no character at index `max_len`, the text has at most
    // `max_len` characters and already fits.
    if text.char_indices().nth(max_len).is_none() {
        return text.to_string();
    }

    // Reserve room for the ellipsis only when the limit can accommodate it.
    let (keep, suffix) = if max_len >= ELLIPSIS.len() {
        (max_len - ELLIPSIS.len(), ELLIPSIS)
    } else {
        (max_len, "")
    };

    // Byte offset of the first character to drop; always a char boundary.
    let cut = text
        .char_indices()
        .nth(keep)
        .map_or(text.len(), |(idx, _)| idx);

    let mut truncated = String::with_capacity(cut + suffix.len());
    truncated.push_str(&text[..cut]);
    truncated.push_str(suffix);
    truncated
}
1183
1184// ============================================================================
1185// TESTS
1186// ============================================================================
1187
// Unit tests for the public types and helper functions of this module.
// Each test exercises one type or helper through its public constructors
// and accessors and pins the values observed through assertions.
#[cfg(test)]
mod tests {
    use super::*;

    // Repeated spaces in the raw input must be collapsed in `cleaned_text`,
    // and the word count must reflect the six words of the sentence.
    #[test]
    fn test_text_content_creation() {
        let text = TextContent::from_raw("Hello   world,   this is   a test.");
        assert_eq!(text.cleaned_text, "Hello world, this is a test.");
        assert_eq!(text.word_count, 6);
        assert!(!text.is_empty());
    }

    // A heading keeps the level it was built with, and `with_id` stores the id.
    #[test]
    fn test_heading_creation() {
        let h1 = Heading::new(1, "Main Title").with_id("main");
        assert_eq!(h1.level, 1);
        assert_eq!(h1.id, Some("main".to_string()));
    }

    // An out-of-range heading level is expected to be clamped rather than kept.
    #[test]
    fn test_heading_level_clamping() {
        let h = Heading::new(10, "Test");
        assert_eq!(h.level, 6); // Clamped to max
    }

    // A freshly created link is not nofollow and may be followed.
    #[test]
    fn test_link_creation() {
        let link = Link::new("https://example.com", "Example");
        assert!(!link.is_nofollow);
        assert!(link.should_follow());
    }

    // Setting `is_nofollow` must make `should_follow` report false.
    #[test]
    fn test_link_nofollow() {
        let mut link = Link::new("/page", "Page");
        link.is_nofollow = true;
        assert!(!link.should_follow());
    }

    // An image with alt text is not decorative; one with empty alt text is.
    #[test]
    fn test_image_creation() {
        let img = Image::new("/img/photo.jpg", "A photo");
        assert!(!img.is_decorative);

        let decorative = Image::new("/img/spacer.gif", "");
        assert!(decorative.is_decorative);
    }

    // Adding items must be reflected in `total_items` and in `is_empty`.
    #[test]
    fn test_list_content() {
        let mut list = ListContent::new(ListType::Unordered);
        list.add_item(ListItem::new("Item 1"));
        list.add_item(ListItem::new("Item 2"));
        assert_eq!(list.total_items, 2);
        assert!(!list.is_empty());
    }

    // A new table starts out empty with zero rows.
    #[test]
    fn test_table_content() {
        let table = TableContent::new();
        assert!(table.is_empty());
        assert_eq!(table.row_count(), 0);
    }

    // A three-line snippet must report three lines, keep the language set via
    // `with_language`, and not be marked inline.
    #[test]
    fn test_code_block() {
        let code = CodeBlock::new("fn main() {\n    println!(\"Hello\");\n}").with_language("rust");
        assert_eq!(code.language, Some("rust".to_string()));
        assert_eq!(code.line_count, 3);
        assert!(!code.is_inline);
    }

    // Default OpenGraph data is reported as absent; setting a title makes it present.
    #[test]
    fn test_opengraph() {
        let og = OpenGraph::default();
        assert!(!og.is_present());

        let og2 = OpenGraph {
            title: Some("Test".to_string()),
            ..Default::default()
        };
        assert!(og2.is_present());
    }

    // The two named constructors must set both `index` and `follow` consistently.
    #[test]
    fn test_robots_meta() {
        let allowed = RobotsMeta::allowed();
        assert!(allowed.index);
        assert!(allowed.follow);

        let noindex = RobotsMeta::noindex_nofollow();
        assert!(!noindex.index);
        assert!(!noindex.follow);
    }

    // When both a page title and an OpenGraph title exist, the effective
    // title is expected to be the OpenGraph one.
    #[test]
    fn test_page_metadata_effective() {
        let mut meta = PageMetadata::default();
        meta.title = Some("Page Title".to_string());
        meta.opengraph.title = Some("OG Title".to_string());

        // OG takes precedence
        assert_eq!(meta.effective_title(), Some("OG Title"));
    }

    // The default config extracts images and links; `minimal` turns images off.
    #[test]
    fn test_parser_config() {
        let config = ParserConfig::default();
        assert!(config.extract_images);
        assert!(config.extract_links);

        let minimal = ParserConfig::minimal();
        assert!(!minimal.extract_images);
    }

    // Runs of spaces and newlines collapse to one space; edges are trimmed,
    // and whitespace-only input becomes the empty string.
    #[test]
    fn test_normalize_whitespace() {
        assert_eq!(normalize_whitespace("  hello   world  "), "hello world");
        assert_eq!(normalize_whitespace("a\n\n\nb"), "a b");
        assert_eq!(normalize_whitespace("  "), "");
    }

    // NUL and SOH control characters must be removed from the text.
    #[test]
    fn test_clean_text() {
        let text = "Hello\x00World\x01Test";
        let cleaned = clean_text(text);
        assert_eq!(cleaned, "HelloWorldTest");
    }

    // Text within the limit is returned unchanged; longer text is cut and
    // suffixed with "..." so that the result is exactly `max_len` long.
    #[test]
    fn test_truncate_text() {
        assert_eq!(truncate_text("Hello", 10), "Hello");
        assert_eq!(truncate_text("Hello World", 8), "Hello...");
    }

    // `internal_links` and `external_links` must each return only the links
    // of the matching `LinkType`.
    #[test]
    fn test_parsed_content_links() {
        let mut content = ParsedContent::default();
        content.links.push(Link {
            link_type: LinkType::Internal,
            ..Link::new("/page", "Page")
        });
        content.links.push(Link {
            link_type: LinkType::External,
            ..Link::new("https://ext.com", "Ext")
        });

        assert_eq!(content.internal_links().len(), 1);
        assert_eq!(content.external_links().len(), 1);
    }

    // 450 words are expected to yield a reading time of roughly two minutes.
    #[test]
    fn test_reading_time() {
        // 225 WPM average
        let text = TextContent::from_raw(&"word ".repeat(450));
        assert!(text.reading_time_minutes.is_some());
        let time = text.reading_time_minutes.unwrap();
        assert!((time - 2.0).abs() < 0.1); // ~2 minutes
    }
}
halldyll_parser/types.rs

halldyll_parser/
types.rs