// twars_url2md/lib.rs — crate root

1use crate::url::Url;
2use anyhow::Result;
3use futures::stream::{self, StreamExt};
4use std::path::PathBuf;
5use std::sync::Arc;
6use std::thread;
7
8pub mod cli;
9mod error;
10mod html;
11mod markdown;
12pub mod url;
13
14pub use cli::Cli;
15pub use error::Error;
16
17include!(concat!(env!("OUT_DIR"), "/built.rs"));
18
19/// Version information with build details
20pub fn version() -> String {
21    format!(
22        "{}\nBuild Time: {}\nTarget: {}\nProfile: {}",
23        env!("CARGO_PKG_VERSION"),
24        BUILT_TIME_UTC,
25        TARGET,
26        PROFILE
27    )
28}
29
/// Default user agent string for HTTP requests.
///
/// Impersonates Firefox 123 on macOS; presumably chosen so servers
/// that reject unknown clients still serve normal HTML — confirm
/// against the fetch logic in the `url` module.
pub(crate) const USER_AGENT_STRING: &str =
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0";
33
/// Configuration for URL processing
#[derive(Debug, Clone)]
pub struct Config {
    // Emit per-URL progress/diagnostic messages to stderr.
    pub verbose: bool,
    // Maximum retry attempts forwarded to the URL fetchers.
    pub max_retries: u32,
    // Base path for output: used directly as the output file in
    // single-file mode, otherwise per-URL paths are derived beneath it.
    pub output_base: PathBuf,
    // Treat `output_base` as one output file rather than a directory tree
    // (only takes effect together with `has_output`).
    pub single_file: bool,
    // Whether an explicit output location was supplied — presumably set
    // by the CLI layer; confirm against `cli` module.
    pub has_output: bool,
    // When `Some`, converted Markdown for all URLs is also collected and
    // written, in input order, to this single "packed" file.
    pub pack_file: Option<PathBuf>,
}
44
45/// Process a list of URLs with the given configuration
46pub async fn process_urls(
47    urls: Vec<String>,
48    config: Config,
49) -> Result<Vec<(String, anyhow::Error)>> {
50    use indicatif::{ProgressBar, ProgressStyle};
51    use tokio::io::AsyncWriteExt;
52
53    let pb = if urls.len() > 1 {
54        let pb = ProgressBar::new(urls.len() as u64);
55        pb.set_style(
56            ProgressStyle::default_bar()
57                .template(
58                    "{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} ({eta})",
59                )
60                .unwrap()
61                .progress_chars("#>-"),
62        );
63        Some(pb)
64    } else {
65        None
66    };
67
68    let pb = Arc::new(pb);
69    // Adaptive concurrency based on CPU cores
70    let concurrency_limit = thread::available_parallelism()
71        .map(|n| n.get() * 2) // 2 tasks per CPU core
72        .unwrap_or(10);
73
74    // If pack_file is specified, collect the markdown content
75    let should_pack = config.pack_file.is_some();
76    let pack_path = config.pack_file.clone();
77    let packed_content = if should_pack {
78        Arc::new(tokio::sync::Mutex::new(Vec::with_capacity(urls.len())))
79    } else {
80        Arc::new(tokio::sync::Mutex::new(Vec::new()))
81    };
82
83    // Clone the URLs vector before moving it into the stream
84    let urls_for_ordering = urls.clone();
85
86    let results = stream::iter(urls.into_iter().map(|url| {
87        let pb = Arc::clone(&pb);
88        let config = config.clone();
89        let packed_content = Arc::clone(&packed_content);
90        async move {
91            if config.verbose {
92                eprintln!("Processing: {}", url);
93            }
94            match Url::parse(&url) {
95                Ok(url_parsed) => {
96                    let out_path = if config.single_file
97                        && config.has_output
98                        && !config.output_base.is_dir()
99                    {
100                        Some(config.output_base)
101                    } else {
102                        url::create_output_path(&url_parsed, &config.output_base).ok()
103                    };
104
105                    let result = if should_pack {
106                        // Process URL and collect content for packing
107                        match url::process_url_with_content(
108                            &url,
109                            out_path,
110                            config.verbose,
111                            config.max_retries,
112                        )
113                        .await
114                        {
115                            Ok(content) => {
116                                if let Some(md_content) = content {
117                                    let mut content_vec = packed_content.lock().await;
118                                    content_vec.push((url.clone(), md_content));
119                                }
120                                Ok(())
121                            }
122                            Err(e) => Err(e),
123                        }
124                    } else {
125                        // Process URL normally
126                        url::process_url_with_retry(
127                            &url,
128                            out_path,
129                            config.verbose,
130                            config.max_retries,
131                        )
132                        .await
133                    };
134
135                    if let Some(pb) = &*pb {
136                        pb.inc(1);
137                    }
138                    result
139                }
140                Err(e) => {
141                    if let Some(pb) = &*pb {
142                        pb.inc(1);
143                    }
144                    Err((url, e.into()))
145                }
146            }
147        }
148    }))
149    .buffer_unordered(concurrency_limit)
150    .collect::<Vec<_>>()
151    .await;
152
153    if let Some(pb) = &*pb {
154        pb.finish_with_message("Done!");
155    }
156
157    // Write the packed content to the specified file
158    if let Some(pack_path) = pack_path {
159        if config.verbose {
160            eprintln!("Writing packed content to {}", pack_path.display());
161        }
162
163        if let Some(parent) = pack_path.parent() {
164            if let Err(e) = tokio::fs::create_dir_all(parent).await {
165                eprintln!(
166                    "Warning: Failed to create directory {}: {}",
167                    parent.display(),
168                    e
169                );
170            }
171        }
172
173        let mut packed_file = match tokio::fs::File::create(&pack_path).await {
174            Ok(file) => file,
175            Err(e) => {
176                eprintln!("Error creating packed file: {}", e);
177                return Ok(results.into_iter().filter_map(|r| r.err()).collect());
178            }
179        };
180
181        // Get the locked packed_content
182        let mut content_to_write = packed_content.lock().await;
183
184        // Reorder packed_content to match the original URL order
185        let mut url_to_index = std::collections::HashMap::new();
186        for (i, url) in urls_for_ordering.iter().enumerate() {
187            url_to_index.insert(url.clone(), i);
188        }
189
190        content_to_write.sort_by(|a, b| {
191            let a_idx = url_to_index.get(&a.0).unwrap_or(&usize::MAX);
192            let b_idx = url_to_index.get(&b.0).unwrap_or(&usize::MAX);
193            a_idx.cmp(b_idx)
194        });
195
196        for (url, content) in content_to_write.iter() {
197            if let Err(e) = packed_file
198                .write_all(format!("# {}\n\n{}\n\n---\n\n", url, content).as_bytes())
199                .await
200            {
201                eprintln!("Error writing to packed file: {}", e);
202            }
203        }
204    }
205
206    // Process results as before
207    let mut errors = Vec::new();
208    for r in results {
209        match r {
210            Ok(()) => {}
211            Err(e) => {
212                eprintln!("Warning: Failed to process {}: {}", e.0, e.1);
213                errors.push(e);
214            }
215        }
216    }
217
218    Ok(errors)
219}