Skip to main content

punch_types/
link.rs

1//! # Link Understanding — scouting enemy territory by extracting intel from URLs.
2//!
3//! This module provides types and traits for fetching and extracting structured
4//! content from URLs, turning raw links into actionable battlefield intelligence.
5
6use async_trait::async_trait;
7use chrono::{DateTime, Utc};
8use serde::{Deserialize, Serialize};
9
10use crate::error::PunchResult;
11
12/// Classification of the content behind a link — what kind of territory we're scouting.
13#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
14#[serde(rename_all = "snake_case")]
15pub enum LinkContentType {
16    /// A written article or blog post.
17    Article,
18    /// Technical documentation.
19    Documentation,
20    /// Code repository (GitHub, GitLab, etc.).
21    Repository,
22    /// Social media post or thread.
23    SocialMedia,
24    /// Video content page.
25    Video,
26    /// Image content page.
27    Image,
28    /// Unclassified content.
29    Other,
30}
31
32/// Metadata extracted from a link — the dossier on the target.
33#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct LinkMetadata {
35    /// Author of the content.
36    pub author: Option<String>,
37    /// When the content was published.
38    pub published_at: Option<DateTime<Utc>>,
39    /// Approximate word count of the main content.
40    pub word_count: usize,
41    /// Detected language of the content.
42    pub language: Option<String>,
43    /// Short description or summary.
44    pub description: Option<String>,
45}
46
47impl LinkMetadata {
48    /// Create empty metadata with zero word count.
49    pub fn empty() -> Self {
50        Self {
51            author: None,
52            published_at: None,
53            word_count: 0,
54            language: None,
55            description: None,
56        }
57    }
58}
59
/// Extracted content from a URL — the full intelligence report.
///
/// Bundles the source URL, an optional title, the extracted text, its
/// classification, and the accompanying [`LinkMetadata`]. All fields are
/// public, so a value can be built directly or through `LinkContent::new`
/// and the `with_*` builder methods.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LinkContent {
    /// The original URL that was extracted.
    pub url: String,
    /// Title of the page/content, if one is known.
    pub title: Option<String>,
    /// The extracted main text content.
    pub content: String,
    /// Classification of the content type.
    pub content_type: LinkContentType,
    /// Metadata about the content.
    pub metadata: LinkMetadata,
}
74
75impl LinkContent {
76    /// Create a new link content result.
77    pub fn new(
78        url: impl Into<String>,
79        content: impl Into<String>,
80        content_type: LinkContentType,
81    ) -> Self {
82        let content = content.into();
83        let word_count = content.split_whitespace().count();
84        Self {
85            url: url.into(),
86            title: None,
87            content,
88            content_type,
89            metadata: LinkMetadata {
90                author: None,
91                published_at: None,
92                word_count,
93                language: None,
94                description: None,
95            },
96        }
97    }
98
99    /// Set the title.
100    pub fn with_title(mut self, title: impl Into<String>) -> Self {
101        self.title = Some(title.into());
102        self
103    }
104
105    /// Set metadata.
106    pub fn with_metadata(mut self, metadata: LinkMetadata) -> Self {
107        self.metadata = metadata;
108        self
109    }
110}
111
/// Trait for link extraction backends — the scout unit that infiltrates URLs.
///
/// Implementors must be `Send + Sync`. The async method is provided through
/// the `async_trait` attribute macro.
#[async_trait]
pub trait LinkExtractor: Send + Sync {
    /// Extract content from the given URL.
    ///
    /// Returns the extracted [`LinkContent`] on success. The failure cases are
    /// whatever the implementation reports through `PunchResult`; this trait
    /// does not constrain them.
    async fn extract(&self, url: &str) -> PunchResult<LinkContent>;

    /// Check if this extractor supports the given URL.
    ///
    /// NOTE(review): presumably callers consult this before calling `extract`;
    /// the trait itself does not enforce that — confirm against call sites.
    fn supports_url(&self, url: &str) -> bool;
}
121
#[cfg(test)]
mod tests {
    use super::*;

    /// `LinkContent::new` stores its arguments verbatim, derives the word count
    /// from the body, and leaves the title unset until `with_title` is called.
    #[test]
    fn test_content_construction() {
        let body = "This is a test article about fighting techniques.";

        let untitled = LinkContent::new(
            "https://example.com/article",
            body,
            LinkContentType::Article,
        );
        assert!(untitled.title.is_none());

        let content = untitled.with_title("Fighting Techniques");

        assert_eq!(content.url, "https://example.com/article");
        assert_eq!(content.title.as_deref(), Some("Fighting Techniques"));
        assert_eq!(content.content_type, LinkContentType::Article);
        assert_eq!(content.content, body);
        assert_eq!(content.metadata.word_count, 8);
    }

    /// `with_metadata` swaps in the supplied metadata wholesale, including the
    /// word count that `new` computed.
    #[test]
    fn test_with_metadata_replaces_existing() {
        let metadata = LinkMetadata {
            author: Some("The Champion".to_string()),
            ..LinkMetadata::empty()
        };

        let content = LinkContent::new(
            "https://example.com",
            "one two three",
            LinkContentType::Article,
        )
        .with_metadata(metadata);

        assert_eq!(content.metadata.author.as_deref(), Some("The Champion"));
        assert_eq!(content.metadata.word_count, 0);
    }

    /// Every variant serializes to its snake_case name and round-trips through JSON.
    #[test]
    fn test_content_type_classification() {
        let cases = [
            (LinkContentType::Article, "\"article\""),
            (LinkContentType::Documentation, "\"documentation\""),
            (LinkContentType::Repository, "\"repository\""),
            (LinkContentType::SocialMedia, "\"social_media\""),
            (LinkContentType::Video, "\"video\""),
            (LinkContentType::Image, "\"image\""),
            (LinkContentType::Other, "\"other\""),
        ];

        for (variant, expected_json) in &cases {
            let json = serde_json::to_string(variant).expect("serialize content type");
            assert_eq!(json, *expected_json);

            let deser: LinkContentType =
                serde_json::from_str(&json).expect("deserialize content type");
            assert_eq!(&deser, variant);
        }
    }

    /// `LinkMetadata::empty` leaves every optional field unset with a zero word count.
    #[test]
    fn test_empty_metadata() {
        let metadata = LinkMetadata::empty();

        assert!(metadata.author.is_none());
        assert!(metadata.published_at.is_none());
        assert_eq!(metadata.word_count, 0);
        assert!(metadata.language.is_none());
        assert!(metadata.description.is_none());
    }

    /// All metadata fields survive a JSON round-trip.
    #[test]
    fn test_metadata() {
        let published_at = Utc::now();
        let metadata = LinkMetadata {
            author: Some("The Champion".to_string()),
            published_at: Some(published_at),
            word_count: 1500,
            language: Some("en".to_string()),
            description: Some("A guide to winning".to_string()),
        };

        let json = serde_json::to_string(&metadata).expect("serialize metadata");
        let deser: LinkMetadata = serde_json::from_str(&json).expect("deserialize metadata");

        assert_eq!(deser.author, Some("The Champion".to_string()));
        assert_eq!(deser.published_at, Some(published_at));
        assert_eq!(deser.word_count, 1500);
        assert_eq!(deser.language, Some("en".to_string()));
        assert_eq!(deser.description, Some("A guide to winning".to_string()));
    }

    /// Exercises the kind of substring matching a `supports_url` implementation
    /// could be built on. No `LinkExtractor` implementation is involved here.
    #[test]
    fn test_url_support_check() {
        let github_url = "https://github.com/humancto/punch";
        let docs_url = "https://docs.rs/serde/latest";
        let random_url = "https://example.com/page";

        // Simple URL classification logic for testing.
        fn classify_url(url: &str) -> LinkContentType {
            if url.contains("github.com") {
                LinkContentType::Repository
            } else if url.contains("docs.") {
                // "docs." also matches "docs.rs", so no separate check is needed.
                LinkContentType::Documentation
            } else {
                LinkContentType::Other
            }
        }

        assert_eq!(classify_url(github_url), LinkContentType::Repository);
        assert_eq!(classify_url(docs_url), LinkContentType::Documentation);
        assert_eq!(classify_url(random_url), LinkContentType::Other);
    }

    /// The word count is the number of whitespace-separated tokens in the body.
    #[test]
    fn test_word_count() {
        let content = LinkContent::new(
            "https://example.com",
            "one two three four five six seven eight nine ten",
            LinkContentType::Article,
        );
        assert_eq!(content.metadata.word_count, 10);

        // Leading, trailing, and mixed whitespace must not create phantom words.
        let ragged = LinkContent::new(
            "https://example.com",
            "  spaced\tout\n words  ",
            LinkContentType::Article,
        );
        assert_eq!(ragged.metadata.word_count, 3);

        let empty_content = LinkContent::new("https://example.com", "", LinkContentType::Other);
        assert_eq!(empty_content.metadata.word_count, 0);
    }
}