duckduckgo-search-cli 0.6.3

CLI in Rust to search DuckDuckGo via pure HTTP, with structured output for LLM consumption.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
//! Shared data types used across the application.
//!
//! All output structs (`SaidaBusca`, `SaidaBuscaMultipla`, `ResultadoBusca`,
//! `MetadadosBusca`) serialize with field names in Brazilian Portuguese
//! (snake_case), as per the INVIOLABLE invariant of blueprint v2: "Logs and field
//! names in Brazilian Portuguese". Rust field names and external JSON names
//! coincide — no active `serde(rename)`.

use crate::http::PerfilBrowser;
use serde::{Deserialize, Serialize};

/// Represents a single DuckDuckGo search result.
///
/// Every `Option` field below carries `skip_serializing_if = "Option::is_none"`,
/// so a `None` value is omitted from the serialized output rather than being
/// emitted as `null`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResultadoBusca {
    /// Result position on the page (1-indexed, already after ad filtering).
    pub posicao: u32,

    /// Result title, extracted from the `.result__a` element.
    pub titulo: String,

    /// Result URL, extracted from the `href` attribute of `.result__a`.
    pub url: String,

    /// Display URL (more user-friendly), extracted from `.result__url`.
    /// Omitted from the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub url_exibicao: Option<String>,

    /// Descriptive snippet for the result, extracted from `.result__snippet`.
    /// Omitted from the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub snippet: Option<String>,

    /// Literal title text as rendered by DuckDuckGo, preserved for auditing
    /// when substitution heuristics are applied (e.g., DDG returns "Official site"
    /// for verified domains — we replace it with `url_exibicao` and keep the
    /// original here). Absent when the title was not modified.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub titulo_original: Option<String>,

    /// Full text content of the page (only with `--fetch-content`; not implemented in the MVP).
    ///
    /// NOTE(review): the "not implemented in the MVP" remark may be outdated —
    /// `Configuracoes::buscar_conteudo` exists in this module; confirm.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub conteudo: Option<String>,

    /// Size in characters of the extracted content (only with `--fetch-content`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tamanho_conteudo: Option<u32>,

    /// Method used to extract content: `"http"` or `"chrome"` (only with `--fetch-content`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metodo_extracao_conteudo: Option<String>,
}

/// Search execution metadata, useful for diagnostics and LLM integration.
///
/// All fields are always serialized (none of them is optional).
///
/// NOTE(review): several field docs below describe "MVP" behaviour (no retry,
/// no proxy, no Chrome, no fallback). `Configuracoes` in this same module
/// already carries `retries`, `proxy`, `caminho_chrome` and `endpoint`, so
/// those remarks may be stale — confirm against the pipeline before relying
/// on them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetadadosBusca {
    /// Total execution time in milliseconds.
    pub tempo_execucao_ms: u64,

    /// Blake3 hash (hex, first 16 characters) of the selector configuration used.
    pub hash_seletores: String,

    /// Number of retries performed (0 in MVP — retry not yet implemented).
    pub retentativas: u32,

    /// Indicates whether the Lite endpoint was used as fallback (always `false` in MVP).
    pub usou_endpoint_fallback: bool,

    /// Number of parallel content fetches started (0 in MVP).
    pub fetches_simultaneos: u32,

    /// Successful content fetches (0 in MVP).
    pub sucessos_fetch: u32,

    /// Failed content fetches (0 in MVP).
    pub falhas_fetch: u32,

    /// Indicates whether Chrome was used (always `false` in MVP).
    pub usou_chrome: bool,

    /// User-Agent used during execution.
    pub user_agent: String,

    /// Indicates whether a proxy was configured (always `false` in MVP).
    pub usou_proxy: bool,
}

/// Complete output for a single-query search (serialized as JSON in the MVP).
///
/// `erro` and `mensagem` are omitted from the serialized output when `None`;
/// every other field is always present. The same struct is reused as the
/// per-query element of `SaidaBuscaMultipla::buscas`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SaidaBusca {
    /// Original search query submitted by the user.
    pub query: String,

    /// Search engine used — always `"duckduckgo"`.
    pub motor: String,

    /// Endpoint used — `"html"` or `"lite"` (always `"html"` in MVP).
    pub endpoint: String,

    /// ISO-8601 (RFC 3339) timestamp of when the search was executed.
    pub timestamp: String,

    /// `kl` region code used (e.g., `"br-pt"`).
    pub regiao: String,

    /// Count of results returned after ad filtering.
    pub quantidade_resultados: u32,

    /// List of organic results.
    pub resultados: Vec<ResultadoBusca>,

    /// Number of pages fetched (always 1 in MVP).
    ///
    /// NOTE(review): `Configuracoes::paginas` allows 1..=5, so "always 1" may
    /// be outdated — confirm.
    pub paginas_buscadas: u32,

    /// Structured error code if the search partially failed (None on full success).
    /// Omitted from the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub erro: Option<String>,

    /// Additional human-readable message (used for non-fatal warnings).
    /// Omitted from the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub mensagem: Option<String>,

    /// Execution metadata.
    pub metadados: MetadadosBusca,
}

/// Complete output for a multi-query execution (serialized as JSON).
///
/// Per section 14.1 of the specification. Each inner `SaidaBusca` retains the
/// single-query format (including the per-query `erro` field), and the
/// root-level fields aggregate metadata from the parallel execution.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SaidaBuscaMultipla {
    /// Total number of queries executed (success + failure).
    pub quantidade_queries: u32,

    /// ISO-8601 (RFC 3339) timestamp of the start of the parallel execution.
    pub timestamp: String,

    /// Effective `--parallel` value used during execution (after validation/clamp).
    pub paralelismo: u32,

    /// Result of each individual query, in the same order as the input queries.
    pub buscas: Vec<SaidaBusca>,
}

/// CSS selector configuration (loaded from selectors.toml or hardcoded defaults).
///
/// Retains the original `html_endpoint` group for backward compatibility with
/// tests and selector hashing. Starting from iteration 6, it also carries
/// separate groups for the Lite endpoint, pagination, and related searches,
/// enabling full externalization via an external TOML file.
///
/// The container-level `#[serde(default)]` means any group missing from the
/// deserialized input is filled in from `Default`, so a partial configuration
/// file is accepted.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(default)]
pub struct ConfiguracaoSeletores {
    /// Legacy group — retained for compatibility with existing serialization and tests.
    pub html_endpoint: SeletoresHtml,

    /// Selector group for the Lite endpoint.
    // The field-level `serde(default)` is redundant with the container-level
    // attribute above; kept as-is.
    #[serde(default)]
    pub lite_endpoint: SeletoresLite,

    /// Selectors used to extract pagination data (form `s`).
    #[serde(default)]
    pub pagination: SeletoresPaginacao,

    /// Selectors used to extract "related searches".
    #[serde(default)]
    pub related_searches: SeletoresRelacionadas,
}

/// CSS selectors for the HTML endpoint's result page.
///
/// `#[serde(default)]` lets a configuration omit any field; missing fields
/// take the values from this type's `Default` impl (quoted per field below).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct SeletoresHtml {
    /// Selector for the results container (built-in default: `#links`).
    pub results_container: String,
    /// Selector for each individual result (built-in default excludes `.result--ad`).
    pub result_item: String,
    /// Selector for the element carrying both title and URL
    /// (built-in default: `.result__a, a.result__a, .result__title a`).
    pub title_and_url: String,
    /// Selector for the snippet (built-in default: `.result__snippet, a.result__snippet`).
    pub snippet: String,
    /// Selector for the display URL (built-in default: `.result__url, span.result__url`).
    pub display_url: String,
    /// Rules used to recognise advertisements.
    pub ads_filter: FiltroAnuncios,
}

/// Rules for recognising advertisement entries among the results.
///
/// `#[serde(default)]` lets a configuration omit any field; missing fields
/// take the values from this type's `Default` impl.
// NOTE(review): how each list is matched (exact, substring, selector) is not
// visible in this module — confirm against the parser before documenting further.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct FiltroAnuncios {
    /// Ad-marking CSS classes (built-in default: `.result--ad`, `.badge--ad`).
    pub ad_classes: Vec<String>,
    /// Ad-marking attributes (built-in default: `data-nrn=ad`).
    pub ad_attributes: Vec<String>,
    /// URL fragments that identify ad links (built-in default: `duckduckgo.com/y.js`).
    pub ad_url_patterns: Vec<String>,
}

/// CSS selectors for the Lite endpoint's result page.
///
/// `#[serde(default)]` lets a configuration omit any field; missing fields
/// take the values from this type's `Default` impl.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct SeletoresLite {
    /// Selector for the results table (built-in default: `table, body table`).
    pub results_table: String,
    /// Selector for a result link (built-in default: `a.result-link, td a[href]`).
    pub result_link: String,
    /// Selector for a result snippet
    /// (built-in default: `td.result-snippet, tr.result-snippet td`).
    pub result_snippet: String,
}

/// CSS selectors used to read pagination data from the result page.
///
/// `#[serde(default)]` lets a configuration omit any field; missing fields
/// take the values from this type's `Default` impl.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct SeletoresPaginacao {
    /// Selector for the `vqd` input (built-in default targets `input[name='vqd']`).
    pub vqd_input: String,
    /// Selector for the `s` input (built-in default: `input[name='s']`).
    pub s_input: String,
    /// Selector for the `dc` input (built-in default: `input[name='dc']`).
    pub dc_input: String,
    /// Selector for the "next page" form
    /// (built-in default: `form.result--more__btn, form[action='/html/']`).
    pub next_form: String,
}

/// CSS selectors used to extract "related searches".
///
/// `#[serde(default)]` lets a configuration omit any field; missing fields
/// take the values from this type's `Default` impl.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct SeletoresRelacionadas {
    /// Selector for the enclosing element
    /// (built-in default: `.result--more__btn, .result--sep`).
    pub container: String,
    /// Selector for the links inside the container (built-in default: `a`).
    pub links: String,
}

impl Default for SeletoresHtml {
    /// Built-in selectors for the HTML endpoint, used when no external
    /// configuration overrides them.
    fn default() -> Self {
        let results_container = String::from("#links");
        let result_item = String::from(
            "#links .result:not(.result--ad), #links .results_links, div.result:not(.result--ad)",
        );
        let title_and_url = String::from(".result__a, a.result__a, .result__title a");
        // v0.3.0: `.result__body` was removed — it matched the parent container
        // and pulled title+url+snippet concatenated into the snippet field.
        let snippet = String::from(".result__snippet, a.result__snippet");
        let display_url = String::from(".result__url, span.result__url");
        let ads_filter = FiltroAnuncios::default();

        Self {
            results_container,
            result_item,
            title_and_url,
            snippet,
            display_url,
            ads_filter,
        }
    }
}

impl Default for FiltroAnuncios {
    /// Built-in ad-detection rules: two CSS classes, one attribute and one
    /// URL pattern.
    fn default() -> Self {
        // Small helper: turn a list of literals into owned strings.
        let para_owned =
            |itens: &[&str]| -> Vec<String> { itens.iter().map(|item| String::from(*item)).collect() };

        Self {
            ad_classes: para_owned(&[".result--ad", ".badge--ad"]),
            ad_attributes: para_owned(&["data-nrn=ad"]),
            ad_url_patterns: para_owned(&["duckduckgo.com/y.js"]),
        }
    }
}

impl Default for SeletoresLite {
    /// Built-in selectors for the table-based Lite endpoint.
    fn default() -> Self {
        let results_table = "table, body table".to_owned();
        let result_link = "a.result-link, td a[href]".to_owned();
        let result_snippet = "td.result-snippet, tr.result-snippet td".to_owned();

        Self {
            results_table,
            result_link,
            result_snippet,
        }
    }
}

impl Default for SeletoresPaginacao {
    /// Built-in selectors for the hidden pagination inputs and the
    /// "next page" form.
    fn default() -> Self {
        Self {
            vqd_input: String::from("input[name='vqd'], input[type='hidden'][name='vqd']"),
            s_input: String::from("input[name='s']"),
            dc_input: String::from("input[name='dc']"),
            next_form: String::from("form.result--more__btn, form[action='/html/']"),
        }
    }
}

impl Default for SeletoresRelacionadas {
    /// Built-in selectors for the "related searches" container and its links.
    fn default() -> Self {
        let container = ".result--more__btn, .result--sep".to_owned();
        let links = "a".to_owned();
        Self { container, links }
    }
}

/// Which DuckDuckGo front-end the CLI talks to, selected via `--endpoint`.
///
/// - `Html` (default): `https://html.duckduckgo.com/html/` with `.result` in the DOM.
/// - `Lite`: `https://lite.duckduckgo.com/lite/` with tabular layout (no JavaScript).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Endpoint {
    /// The HTML endpoint.
    Html,
    /// The Lite endpoint.
    Lite,
}

impl Endpoint {
    /// Returns the lowercase name of the endpoint: `"html"` or `"lite"`.
    pub fn como_str(&self) -> &'static str {
        match *self {
            Self::Html => "html",
            Self::Lite => "lite",
        }
    }
}

/// Time window for the DuckDuckGo `df` filter.
///
/// Values accepted by the API: `d` (day), `w` (week), `m` (month), `y` (year).
/// Leaving the parameter out means "no time filter", which is why callers
/// hold this type inside an `Option`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FiltroTemporal {
    /// Past day (`d`).
    Dia,
    /// Past week (`w`).
    Semana,
    /// Past month (`m`).
    Mes,
    /// Past year (`y`).
    Ano,
}

impl FiltroTemporal {
    /// Returns the one-letter code to send as the URL's `df` parameter.
    pub fn como_parametro(&self) -> &'static str {
        match *self {
            Self::Dia => "d",
            Self::Semana => "w",
            Self::Mes => "m",
            Self::Ano => "y",
        }
    }
}

/// DuckDuckGo safe-search level, sent through the `kp` URL parameter.
///
/// DuckDuckGo's documented `kp` values are `1` (strict / on), `-1` (moderate)
/// and `-2` (off). Moderate is DuckDuckGo's own default, so it is expressed by
/// omitting the parameter altogether.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SafeSearch {
    /// Safe-search disabled (`kp=-2`).
    Off,
    /// Moderate filtering — DuckDuckGo's default; no `kp` parameter is sent.
    Moderate,
    /// Strict filtering of adult content (`kp=1`).
    Strict,
}

impl SafeSearch {
    /// Value for the `kp` parameter. `None` means "do not add the parameter"
    /// (equivalent to DDG's moderate default).
    pub fn como_parametro(&self) -> Option<&'static str> {
        match self {
            // `-2` is "off". `-1` would request *moderate* filtering, which is
            // what the server already applies when the parameter is absent.
            SafeSearch::Off => Some("-2"),
            SafeSearch::Moderate => None,
            SafeSearch::Strict => Some("1"),
        }
    }
}

/// Global settings derived from the CLI, passed through the pipeline.
///
/// The `query` field remains as the "active query" in single-query executions
/// (useful for the legacy flow in `pipeline::executar`). In multi-query mode, the
/// pipeline iterates over `queries` and clones this struct for each task,
/// overwriting `query` with the current iteration item.
///
/// The struct derives `Clone`; the selector configuration is held behind an
/// `Arc` so that per-task clones do not copy it.
#[derive(Debug, Clone)]
pub struct Configuracoes {
    /// "Active" query — populated before calling the single-query flow.
    /// In multi-query mode starts equal to the first query and is overwritten per task.
    pub query: String,
    /// Full list of queries to execute. Always contains at least 1 item.
    pub queries: Vec<String>,
    /// Requested number of results.
    /// NOTE(review): the meaning of `None` is not visible here — presumably
    /// "no explicit limit"; confirm against the pipeline.
    pub num_resultados: Option<u32>,
    /// Output format selected for this run.
    pub formato: FormatoSaida,
    /// Timeout in seconds.
    /// NOTE(review): presumably per HTTP request, as opposed to
    /// `timeout_global_segundos` — confirm.
    pub timeout_segundos: u64,
    /// Language code.
    /// NOTE(review): presumably combined with `pais` to build the `kl` region
    /// reported in `SaidaBusca::regiao` — confirm.
    pub idioma: String,
    /// Country code (see the note on `idioma`).
    pub pais: String,
    /// Verbose-mode flag.
    pub modo_verboso: bool,
    /// Quiet-mode flag.
    pub modo_silencioso: bool,
    /// User-Agent string for this run.
    pub user_agent: String,
    /// Full browser profile — family, version and platform derived from `user_agent`.
    /// Kept alongside the `user_agent` field (used in MetadadosBusca and JSON output).
    pub perfil_browser: PerfilBrowser,
    /// Effective parallelism degree (1..=20). Informational only in single-query mode.
    pub paralelismo: u32,
    /// Number of pages to fetch per query (1..=5).
    pub paginas: u32,
    /// Number of retry attempts (0..=10). 0 = no retry; 2 is the default.
    pub retries: u32,
    /// Preferred endpoint (html by default; lite forces the no-JavaScript endpoint).
    pub endpoint: Endpoint,
    /// Optional time filter (`df`).
    pub filtro_temporal: Option<FiltroTemporal>,
    /// Safe-search (`kp`).
    pub safe_search: SafeSearch,
    /// `--stream` flag (placeholder — not implemented in this iteration).
    pub modo_stream: bool,
    /// Optional path for writing output (instead of stdout).
    pub arquivo_saida: Option<std::path::PathBuf>,
    /// `--fetch-content` flag — enables text content extraction from result pages.
    pub buscar_conteudo: bool,
    /// Value of `--max-content-length` — maximum content size in characters (1..=100000).
    pub max_tamanho_conteudo: usize,
    /// HTTP/HTTPS/SOCKS5 proxy URL via `--proxy`. When `Some`, takes precedence over env vars.
    pub proxy: Option<String>,
    /// `--no-proxy` flag — disables any proxy (including env vars). Mutually exclusive with `proxy`.
    pub sem_proxy: bool,
    /// Value of `--global-timeout` in seconds (global timeout for the entire execution).
    pub timeout_global_segundos: u64,
    /// `--match-platform-ua` flag — restricts UAs from the external config to the current platform.
    pub corresponde_plataforma_ua: bool,
    /// Per-host concurrent fetch limit in `--fetch-content` mode (1..=10, default 2).
    pub limite_por_host: usize,
    /// Optional manual path to Chrome/Chromium (`--chrome-path` flag, `chrome` feature).
    /// Without the `chrome` feature or `--fetch-content`, this value is ignored with a warning.
    pub caminho_chrome: Option<std::path::PathBuf>,
    /// CSS selector configuration (loaded from selectors.toml or built-in defaults).
    /// Wrapped in `Arc` for cheap cloning across concurrent tasks.
    pub seletores: std::sync::Arc<ConfiguracaoSeletores>,
}

/// Output formats selectable on the CLI (per the original note, only `Json`
/// is supported in the MVP).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FormatoSaida {
    /// JSON output.
    Json,
    /// Plain-text output.
    Text,
    /// Markdown output (also reachable through the `md` alias).
    Markdown,
    /// Automatic format selection.
    Auto,
}

impl FormatoSaida {
    /// Parses a format name, ignoring ASCII case.
    ///
    /// Accepts `"json"`, `"text"`, `"markdown"` (alias `"md"`) and `"auto"`.
    /// Returns `None` for anything else; the input is not trimmed, so
    /// surrounding whitespace makes the value unrecognised.
    pub fn a_partir_de_str(valor: &str) -> Option<Self> {
        // Lookup table of every accepted spelling and the variant it selects.
        const NOMES_ACEITOS: [(&str, FormatoSaida); 5] = [
            ("json", FormatoSaida::Json),
            ("text", FormatoSaida::Text),
            ("markdown", FormatoSaida::Markdown),
            ("md", FormatoSaida::Markdown),
            ("auto", FormatoSaida::Auto),
        ];

        NOMES_ACEITOS
            .iter()
            .find(|(nome, _)| valor.eq_ignore_ascii_case(nome))
            .map(|&(_, formato)| formato)
    }
}

#[cfg(test)]
mod testes {
    use super::*;

    /// The built-in selector set must keep the `#links` container and the
    /// `duckduckgo.com/y.js` ad URL pattern.
    #[test]
    fn configuracao_seletores_default_contem_result_container() {
        let html = ConfiguracaoSeletores::default().html_endpoint;
        assert_eq!(html.results_container, "#links");

        let padroes_anuncio = &html.ads_filter.ad_url_patterns;
        assert!(padroes_anuncio
            .iter()
            .any(|padrao| padrao == "duckduckgo.com/y.js"));
    }

    /// Every accepted spelling parses (case-insensitively); unknown names do not.
    #[test]
    fn formato_saida_parseia_variantes_validas() {
        let parsear = FormatoSaida::a_partir_de_str;

        assert_eq!(parsear("json"), Some(FormatoSaida::Json));
        assert_eq!(parsear("TEXT"), Some(FormatoSaida::Text));
        assert_eq!(parsear("markdown"), Some(FormatoSaida::Markdown));
        assert_eq!(parsear("md"), Some(FormatoSaida::Markdown));
        assert_eq!(parsear("Auto"), Some(FormatoSaida::Auto));
        assert_eq!(parsear("xml"), None);
    }

    /// The single-query output must expose the Portuguese field names and none
    /// of the English equivalents.
    #[test]
    fn saida_busca_serializa_campos_em_portugues_no_json() {
        let metadados = MetadadosBusca {
            tempo_execucao_ms: 0,
            hash_seletores: "abc123".to_string(),
            retentativas: 0,
            usou_endpoint_fallback: false,
            fetches_simultaneos: 0,
            sucessos_fetch: 0,
            falhas_fetch: 0,
            usou_chrome: false,
            user_agent: "Mozilla/5.0".to_string(),
            usou_proxy: false,
        };
        let saida = SaidaBusca {
            query: "teste".to_string(),
            motor: "duckduckgo".to_string(),
            endpoint: "html".to_string(),
            timestamp: "2026-04-14T00:00:00Z".to_string(),
            regiao: "br-pt".to_string(),
            quantidade_resultados: 0,
            resultados: vec![],
            paginas_buscadas: 1,
            erro: None,
            mensagem: None,
            metadados,
        };

        let json = serde_json::to_string(&saida).expect("serialização deve funcionar");
        let contem = |trecho: &str| json.contains(trecho);

        // Portuguese field names must appear in the JSON (INVIOLABLE invariant of blueprint v2).
        assert!(contem("\"query\""));
        assert!(contem("\"quantidade_resultados\""));
        assert!(contem("\"tempo_execucao_ms\""));
        assert!(contem("\"resultados\""));
        assert!(contem("\"metadados\""));
        // v0.3.0 BREAKING: the `buscas_relacionadas` field was removed from the schema.
        assert!(!contem("\"buscas_relacionadas\""));
        // English names must NOT appear.
        assert!(!contem("\"results_count\""));
        assert!(!contem("\"results\":"));
        assert!(!contem("\"metadata\""));
        assert!(!contem("\"related_searches\""));
    }

    /// The multi-query output must expose the Portuguese field names and none
    /// of the English equivalents.
    #[test]
    fn saida_busca_multipla_serializa_campos_em_portugues() {
        let saida = SaidaBuscaMultipla {
            quantidade_queries: 2,
            timestamp: "2026-04-14T00:00:00Z".to_string(),
            paralelismo: 5,
            buscas: vec![],
        };

        let json = serde_json::to_string(&saida).expect("serialização deve funcionar");
        let contem = |trecho: &str| json.contains(trecho);

        // Portuguese field names must appear in the JSON.
        assert!(contem("\"quantidade_queries\":2"));
        assert!(contem("\"paralelismo\":5"));
        assert!(contem("\"buscas\":[]"));
        // English names must NOT appear.
        assert!(!contem("\"queries_count\""));
        assert!(!contem("\"parallel\""));
        assert!(!contem("\"searches\""));
    }
}