//! twars_url2md/cli.rs — command-line interface for the URL-to-Markdown tool.

1use anyhow::Result;
2use clap::Parser;
3use std::fs;
4use std::io::{self, Read};
5use std::path::PathBuf;
6use tokio;
7
8use crate::url::extract_urls_from_text;
9
/// Command-line interface for URL processing
// NOTE(review): the `///` comments on each field below double as clap's
// --help text, so they are user-visible strings as well as documentation.
#[derive(Parser)]
#[command(
    name = "twars-url2md",
    author = "Adam Twardoch",
    version = env!("CARGO_PKG_VERSION"),
    about = "Convert web pages to clean Markdown format while preserving content structure",
    long_about = "\
A powerful CLI tool that fetches web pages and converts them to clean Markdown format \
using Monolith for content extraction and htmd for conversion"
)]
pub struct Cli {
    /// Input file to process
    // Mutually exclusive with --stdin; the conflict is checked at runtime in run().
    #[arg(short, long)]
    input: Option<PathBuf>,

    /// Output directory for markdown files
    // When absent, extracted URLs are printed to stdout instead of written to files.
    #[arg(short, long)]
    output: Option<PathBuf>,

    /// Read from stdin
    // At least one of --stdin / --input must be supplied (validated in parse_args).
    #[arg(long)]
    stdin: bool,

    /// Base URL for resolving relative URLs
    // Forwarded to extract_urls_from_text as the resolution base.
    #[arg(long)]
    base_url: Option<String>,

    /// Output file to pack all markdown files together
    // Explicit short flag 'p' because '-o' is already taken by --output.
    #[arg(short = 'p', long)]
    pack: Option<PathBuf>,

    /// Enable verbose output
    #[arg(short, long)]
    verbose: bool,
}
46
47impl Cli {
48    /// Parse command-line arguments with custom error handling
49    pub fn parse_args() -> Result<Self> {
50        let args: Vec<_> = std::env::args().collect();
51        let cli = if args.iter().any(|arg| arg == "-v" || arg == "--verbose") {
52            Self::parse()
53        } else {
54            match Self::try_parse() {
55                Ok(cli) => {
56                    // Add validation for input arguments
57                    if !cli.stdin && cli.input.is_none() {
58                        eprintln!("Error: Either --stdin or --input must be specified");
59                        eprintln!("Run with --help for usage information");
60                        std::process::exit(1);
61                    }
62                    cli
63                }
64                Err(err) => {
65                    if err.kind() == clap::error::ErrorKind::DisplayHelp
66                        || err.kind() == clap::error::ErrorKind::DisplayVersion
67                    {
68                        println!("{}", err);
69                        std::process::exit(0);
70                    }
71                    eprintln!(
72                        "Error: {}",
73                        err.render()
74                            .to_string()
75                            .lines()
76                            .next()
77                            .unwrap_or("Invalid usage")
78                    );
79                    std::process::exit(1);
80                }
81            }
82        };
83
84        Ok(cli)
85    }
86
87    /// Collect URLs from all input sources
88    pub fn collect_urls(&self) -> io::Result<Vec<String>> {
89        // Get content from stdin or file
90        let content = if self.stdin {
91            let mut buffer = String::new();
92            io::stdin().read_to_string(&mut buffer)?;
93            buffer
94        } else if let Some(input_path) = &self.input {
95            fs::read_to_string(input_path)?
96        } else {
97            // Replace unreachable!() with a proper error
98            return Err(io::Error::new(
99                io::ErrorKind::InvalidInput,
100                "Neither stdin nor input file specified",
101            ));
102        };
103
104        // Extract URLs from content
105        Ok(extract_urls_from_text(&content, self.base_url.as_deref()))
106    }
107
108    /// Create configuration from CLI arguments
109    pub fn create_config(&self) -> crate::Config {
110        crate::Config {
111            verbose: self.verbose,
112            max_retries: 2,
113            output_base: self.output.clone().unwrap_or_else(|| PathBuf::from(".")),
114            single_file: self.input.is_none(),
115            has_output: self.output.is_some(),
116            pack_file: self.pack.clone(),
117        }
118    }
119}
120
121pub async fn run() -> io::Result<()> {
122    // Use unwrap() instead of ? because parse_args returns anyhow::Result
123    // which is not compatible with io::Result
124    let cli = match Cli::parse_args() {
125        Ok(cli) => cli,
126        Err(e) => {
127            eprintln!("Error parsing arguments: {}", e);
128            std::process::exit(1);
129        }
130    };
131
132    // Validate input options
133    if cli.stdin && cli.input.is_some() {
134        eprintln!("Error: Cannot use both --stdin and --input");
135        std::process::exit(1);
136    }
137
138    // Extract URLs from content
139    let urls = cli.collect_urls()?;
140
141    // Process output
142    if let Some(output_dir) = cli.output.clone() {
143        fs::create_dir_all(&output_dir)?;
144        for url in urls {
145            // Create markdown file for each URL
146            let mut file_path = output_dir.clone();
147            file_path.push(format!("{}.md", url_to_filename(&url)));
148            tokio::fs::write(file_path, format!("# {}\n\n{}\n", url, url)).await?;
149        }
150    } else {
151        // Print URLs to stdout if no output directory specified
152        for url in urls {
153            println!("{}", url);
154        }
155    }
156
157    Ok(())
158}
159
/// Convert a URL into a filesystem-safe filename.
///
/// Reserved URL punctuation is replaced with `_`, whitespace is stripped,
/// and the result is capped at 200 bytes. The cap is applied on a UTF-8
/// character boundary so multi-byte URLs cannot trigger a panic.
fn url_to_filename(url: &str) -> String {
    let mut filename = url
        .replace(
            [
                ':', '/', '?', '#', '[', ']', '@', '!', '$', '&', '\'', '(', ')', '*', '+', ',',
                ';', '=',
            ],
            "_",
        )
        .replace([' ', '\t', '\n', '\r'], "");

    // BUGFIX: String::truncate panics when the cut position falls inside a
    // multi-byte character, so back up to the nearest char boundary first.
    const MAX_LEN: usize = 200;
    if filename.len() > MAX_LEN {
        let mut cut = MAX_LEN;
        while !filename.is_char_boundary(cut) {
            cut -= 1;
        }
        filename.truncate(cut);
    }

    filename
}
179
180#[cfg(test)]
181mod tests {
182    use super::*;
183    use tempfile::tempdir;
184
185    #[test]
186    fn test_collect_urls_from_text_file() -> Result<()> {
187        let temp_dir = tempdir()?;
188        let test_file = temp_dir.path().join("sample_urls.txt");
189        let test_content = "\
190            https://example.com/\n\
191            http://test.org/\n\
192            https://rust-lang.org/\n\
193            https://github.com/example/repo\n\
194            http://blog.example.com/post/123\n\
195            https://docs.example.com/guide#section\n\
196            ftp://invalid.com\n\
197            not-a-url.com\n\
198            www.example.com";
199
200        // Create test file with sample URLs
201        fs::write(&test_file, test_content)?;
202
203        // Test file input
204        let cli = Cli {
205            input: Some(test_file),
206            output: None,
207            stdin: false,
208            base_url: None,
209            pack: None,
210            verbose: false,
211        };
212
213        let urls = cli.collect_urls()?;
214        println!("Found URLs: {:?}", urls);
215        verify_urls(&urls);
216
217        Ok(())
218    }
219
220    fn verify_urls(urls: &[String]) {
221        println!("Found URLs: {:?}", urls);
222
223        // Test for basic URLs (with trailing slashes)
224        assert!(urls.iter().any(|u| u == "https://example.com/"));
225        assert!(urls.iter().any(|u| u == "http://test.org/"));
226        assert!(urls.iter().any(|u| u == "https://rust-lang.org/"));
227
228        // Test for URLs with paths and fragments
229        assert!(urls.iter().any(|u| u == "https://github.com/example/repo"));
230        assert!(urls.iter().any(|u| u == "http://blog.example.com/post/123"));
231        assert!(urls
232            .iter()
233            .any(|u| u == "https://docs.example.com/guide#section"));
234
235        // Make sure invalid URLs are not included
236        assert!(!urls.iter().any(|u| u.starts_with("ftp://")));
237        assert!(!urls.iter().any(|u| u == "not-a-url.com"));
238        assert!(!urls.iter().any(|u| u == "www.example.com"));
239
240        assert_eq!(urls.len(), 6, "Expected exactly 6 valid URLs");
241    }
242}