1use anyhow::Result;
2use clap::Parser;
3use std::fs;
4use std::io::{self, Read};
5use std::path::PathBuf;
6use tokio;
7
8use crate::url::extract_urls_from_text;
9
// Command-line interface definition.
//
// NOTE: comments here are deliberately `//` rather than `///` — clap's
// derive turns doc comments on fields into `--help` text, and adding
// them would change the program's user-visible help output.
#[derive(Parser)]
#[command(
    name = "twars-url2md",
    author = "Adam Twardoch",
    version = env!("CARGO_PKG_VERSION"),
    about = "Convert web pages to clean Markdown format while preserving content structure",
    long_about = "\
A powerful CLI tool that fetches web pages and converts them to clean Markdown format \
using Monolith for content extraction and htmd for conversion"
)]
pub struct Cli {
    // Path to a text file containing URLs to process (-i/--input).
    // Mutually exclusive with --stdin (enforced in `run`, not by clap).
    #[arg(short, long)]
    input: Option<PathBuf>,

    // Directory where generated .md files are written (-o/--output).
    // When absent, URLs are printed to stdout instead.
    #[arg(short, long)]
    output: Option<PathBuf>,

    // Read URLs from standard input instead of a file.
    #[arg(long)]
    stdin: bool,

    // Base URL used to resolve relative URLs found in the input text.
    #[arg(long)]
    base_url: Option<String>,

    // Optional path for a packed single-file output (-p/--pack);
    // forwarded to Config::pack_file.
    #[arg(short = 'p', long)]
    pack: Option<PathBuf>,

    // Enable verbose output (-v/--verbose); also switches argument
    // parsing to clap's default (full) error reporting.
    #[arg(short, long)]
    verbose: bool,
}
46
47impl Cli {
48 pub fn parse_args() -> Result<Self> {
50 let args: Vec<_> = std::env::args().collect();
51 let cli = if args.iter().any(|arg| arg == "-v" || arg == "--verbose") {
52 Self::parse()
53 } else {
54 match Self::try_parse() {
55 Ok(cli) => {
56 if !cli.stdin && cli.input.is_none() {
58 eprintln!("Error: Either --stdin or --input must be specified");
59 eprintln!("Run with --help for usage information");
60 std::process::exit(1);
61 }
62 cli
63 }
64 Err(err) => {
65 if err.kind() == clap::error::ErrorKind::DisplayHelp
66 || err.kind() == clap::error::ErrorKind::DisplayVersion
67 {
68 println!("{}", err);
69 std::process::exit(0);
70 }
71 eprintln!(
72 "Error: {}",
73 err.render()
74 .to_string()
75 .lines()
76 .next()
77 .unwrap_or("Invalid usage")
78 );
79 std::process::exit(1);
80 }
81 }
82 };
83
84 Ok(cli)
85 }
86
87 pub fn collect_urls(&self) -> io::Result<Vec<String>> {
89 let content = if self.stdin {
91 let mut buffer = String::new();
92 io::stdin().read_to_string(&mut buffer)?;
93 buffer
94 } else if let Some(input_path) = &self.input {
95 fs::read_to_string(input_path)?
96 } else {
97 return Err(io::Error::new(
99 io::ErrorKind::InvalidInput,
100 "Neither stdin nor input file specified",
101 ));
102 };
103
104 Ok(extract_urls_from_text(&content, self.base_url.as_deref()))
106 }
107
108 pub fn create_config(&self) -> crate::Config {
110 crate::Config {
111 verbose: self.verbose,
112 max_retries: 2,
113 output_base: self.output.clone().unwrap_or_else(|| PathBuf::from(".")),
114 single_file: self.input.is_none(),
115 has_output: self.output.is_some(),
116 pack_file: self.pack.clone(),
117 }
118 }
119}
120
121pub async fn run() -> io::Result<()> {
122 let cli = match Cli::parse_args() {
125 Ok(cli) => cli,
126 Err(e) => {
127 eprintln!("Error parsing arguments: {}", e);
128 std::process::exit(1);
129 }
130 };
131
132 if cli.stdin && cli.input.is_some() {
134 eprintln!("Error: Cannot use both --stdin and --input");
135 std::process::exit(1);
136 }
137
138 let urls = cli.collect_urls()?;
140
141 if let Some(output_dir) = cli.output.clone() {
143 fs::create_dir_all(&output_dir)?;
144 for url in urls {
145 let mut file_path = output_dir.clone();
147 file_path.push(format!("{}.md", url_to_filename(&url)));
148 tokio::fs::write(file_path, format!("# {}\n\n{}\n", url, url)).await?;
149 }
150 } else {
151 for url in urls {
153 println!("{}", url);
154 }
155 }
156
157 Ok(())
158}
159
/// Convert a URL into a filesystem-safe file name.
///
/// URL-reserved punctuation is replaced with `_`, whitespace is removed,
/// and the result is capped at 200 bytes. The cap is applied on a UTF-8
/// character boundary: `String::truncate` panics when the cut index falls
/// inside a multi-byte character, which the previous unconditional
/// `truncate(200)` could hit on non-ASCII URLs.
fn url_to_filename(url: &str) -> String {
    let mut filename = url
        .replace(
            [
                ':', '/', '?', '#', '[', ']', '@', '!', '$', '&', '\'', '(', ')', '*', '+', ',',
                ';', '=',
            ],
            "_",
        )
        .replace([' ', '\t', '\n', '\r'], "");

    if filename.len() > 200 {
        // Back off to the nearest char boundary at or below 200 bytes so
        // truncate cannot panic on multi-byte UTF-8 sequences.
        let mut cut = 200;
        while !filename.is_char_boundary(cut) {
            cut -= 1;
        }
        filename.truncate(cut);
    }

    filename
}
179
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// End-to-end check: URLs written to a file are picked up by
    /// `collect_urls`; non-http(s) schemes and scheme-less entries are
    /// ignored.
    #[test]
    fn test_collect_urls_from_text_file() -> Result<()> {
        let temp_dir = tempdir()?;
        let test_file = temp_dir.path().join("sample_urls.txt");
        let test_content = [
            "https://example.com/",
            "http://test.org/",
            "https://rust-lang.org/",
            "https://github.com/example/repo",
            "http://blog.example.com/post/123",
            "https://docs.example.com/guide#section",
            "ftp://invalid.com",
            "not-a-url.com",
            "www.example.com",
        ]
        .join("\n");

        fs::write(&test_file, &test_content)?;

        let cli = Cli {
            input: Some(test_file),
            output: None,
            stdin: false,
            base_url: None,
            pack: None,
            verbose: false,
        };

        let urls = cli.collect_urls()?;
        println!("Found URLs: {:?}", urls);
        verify_urls(&urls);

        Ok(())
    }

    /// Assert the exact set of URLs extracted from the fixture above.
    fn verify_urls(urls: &[String]) {
        println!("Found URLs: {:?}", urls);

        let expected = [
            "https://example.com/",
            "http://test.org/",
            "https://rust-lang.org/",
            "https://github.com/example/repo",
            "http://blog.example.com/post/123",
            "https://docs.example.com/guide#section",
        ];
        for want in expected {
            assert!(urls.iter().any(|u| u == want));
        }

        // Rejected inputs must not appear at all.
        assert!(!urls.iter().any(|u| u.starts_with("ftp://")));
        assert!(!urls.iter().any(|u| u == "not-a-url.com"));
        assert!(!urls.iter().any(|u| u == "www.example.com"));

        assert_eq!(urls.len(), 6, "Expected exactly 6 valid URLs");
    }
}