edgarkit/
search.rs

1//! Search SEC EDGAR filings using flexible criteria and filters.
2//!
3//! This module provides a powerful search interface to the SEC's EDGAR full-text search system.
4//! You can search by form type, date ranges, company names, CIKs, keywords, and more. The search
5//! API supports both single-page queries and automatic pagination through all matching results.
6//!
7//! Search results include comprehensive metadata about each filing such as company names, CIKs,
8//! filing dates, form types, and accession numbers. Results are returned in reverse chronological
9//! order by default (newest first).
10//!
11//! # Search Capabilities
12//!
13//! - Full-text search with keyword queries
14//! - Filter by form types (10-K, 8-K, S-1, etc.)
15//! - Date range filtering
16//! - Company name or CIK filtering
17//! - SIC code and location-based filtering
18//! - Pagination with configurable page sizes
19//!
20//! # Performance
21//!
22//! The `search_all()` method fetches all results across multiple pages using parallel requests
23//! (up to 7 concurrent) while respecting rate limits. This provides significantly better
24//! performance than sequential pagination for large result sets.
25//!
26//! # Example
27//!
28//! ```ignore
29//! use edgarkit::{Edgar, SearchOperations, SearchOptions};
30//!
31//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
32//! let edgar = Edgar::new("your_app_name contact@example.com")?;
33//!
34//! let options = SearchOptions::new()
35//!     .with_forms(vec!["10-K".to_string()])
36//!     .with_date_range("2024-01-01".to_string(), "2024-12-31".to_string())
37//!     .with_count(100);
38//!
39//! let results = edgar.search_all(options).await?;
40//! # Ok(())
41//! # }
42//! ```
43
44use super::Edgar;
45use super::error::{EdgarError, Result};
46use super::traits::SearchOperations;
47use async_trait::async_trait;
48use serde::{Deserialize, Deserializer, de};
49
/// Top-level response returned by the EDGAR full-text search endpoint.
///
/// `search()` deserializes the JSON body of a search request directly into this type.
/// It wraps the complete search response including timing information, shard
/// statistics, and the actual search hits. The search system uses Elasticsearch under
/// the hood, which is why you'll see fields like `_shards` and `_score` that are
/// specific to that search engine.
///
/// All four fields are required: deserialization fails if any of them is missing
/// from the response body.
#[derive(Debug, Clone, Deserialize)]
pub struct SearchResponse {
    /// Time taken to execute the search (ms), as reported by the server.
    pub took: u32,

    /// Whether the search timed out on the server side.
    pub timed_out: bool,

    /// Shard statistics for the query. The field keeps the underscore-prefixed
    /// name so that it matches the JSON key without a serde rename.
    pub _shards: Shards,

    /// Search results: total match count plus the hits for the requested page.
    pub hits: Hits,
}
70
/// Information about Elasticsearch shards that processed the search query.
///
/// The EDGAR search system uses Elasticsearch which distributes data across multiple
/// shards for performance. This struct provides diagnostic information about how many
/// shards were involved and whether all completed successfully.
///
/// NOTE(review): the per-field descriptions below follow the usual Elasticsearch
/// meaning of the `_shards` object; this file only establishes that each value is a
/// required unsigned integer. Confirm against the live API if the exact semantics matter.
#[derive(Debug, Clone, Deserialize)]
pub struct Shards {
    /// Number of shards the query was addressed to (presumed meaning).
    pub total: u32,
    /// Number of shards that answered successfully (presumed meaning).
    pub successful: u32,
    /// Number of shards that were skipped (presumed meaning).
    pub skipped: u32,
    /// Number of shards that failed (presumed meaning).
    pub failed: u32,
}
83
/// Container for search results including total count and individual hit documents.
///
/// This structure holds the array of matching filings along with metadata about the
/// total number of matches and relevance scoring. The `total` field indicates how
/// many documents matched your search criteria, while `hits` contains the actual
/// results for the current page only.
///
/// `search_all()` reads `total.value` from the first page to decide how many further
/// pages to request, and concatenates the `hits` vectors of every page.
#[derive(Debug, Clone, Deserialize)]
pub struct Hits {
    /// Total hits information (count plus whether that count is exact).
    pub total: TotalHits,

    /// Maximum relevance score among the hits. Optional: a missing or `null`
    /// value deserializes to `None`.
    #[serde(default)]
    pub max_score: Option<f64>,

    /// Hit documents for the current page. Required; an empty array is valid.
    pub hits: Vec<Hit>,
}
102
/// Total count of matching documents and the relationship type.
///
/// The `relation` field indicates whether the count is exact ("eq") or a lower bound
/// ("gte"). For very large result sets, Elasticsearch may provide an approximate count.
///
/// Nothing in this module inspects `relation`; `search_all()` paginates purely on
/// `value`, so with a lower-bound count it fetches at most `value` hits.
#[derive(Debug, Clone, Deserialize)]
pub struct TotalHits {
    /// Number of matching documents reported by the server.
    pub value: u32,
    /// How `value` relates to the true match count, kept as the raw string
    /// sent by the server (e.g. "eq" or "gte").
    pub relation: String,
}
112
/// A single search result representing a matching SEC filing.
///
/// Each hit contains metadata about the search match (index name, document ID, relevance
/// score) and the actual filing data in the `_source` field. The underscore-prefixed
/// fields are Elasticsearch conventions for system metadata; the Rust field names keep
/// the underscore so they match the JSON keys without serde renames.
#[derive(Debug, Clone, Deserialize)]
pub struct Hit {
    /// Name of the search index the document came from.
    pub _index: String,

    /// Document ID within the index.
    pub _id: String,

    /// Relevance score. Optional: a missing or `null` value deserializes to `None`.
    #[serde(default)]
    pub _score: Option<f64>,

    /// Filing information for this hit.
    pub _source: Source,
}
133
/// Filing information and metadata extracted from the EDGAR search index.
///
/// This structure contains all the details about a specific SEC filing including company
/// identifiers, filing metadata, form type, and business information. This is the primary
/// data payload for each search result and includes everything you need to identify and
/// retrieve the actual filing documents.
///
/// Many fields are arrays because a single filing can be associated with multiple entities,
/// locations, or classification codes. For example, merger filings may list multiple CIKs.
///
/// # Deserialization notes
///
/// Fields typed `Option<_>` tolerate a missing or `null` value. Every other field
/// (the plain `String` and `Vec<String>` fields and `sequence`) is required, so a
/// response that omits one of them fails to deserialize as a whole.
#[derive(Debug, Clone, Deserialize)]
pub struct Source {
    /// Company CIK numbers (required; may contain several entries).
    pub ciks: Vec<String>,

    /// Period ending date (if applicable)
    #[serde(default)]
    pub period_ending: Option<String>,

    /// File numbers (optional)
    pub file_num: Option<Vec<String>>,

    /// Company display names (required). The array may be empty, so prefer
    /// `.first()` over indexing with `[0]`.
    pub display_names: Vec<String>,

    /// XSL stylesheet reference (optional)
    #[serde(default)]
    pub xsl: Option<String>,

    /// Sequence number. Parsed by `deserialize_sequence`, which accepts either a
    /// JSON number or a string containing a number.
    #[serde(deserialize_with = "deserialize_sequence")]
    pub sequence: u32,

    /// Root form types
    pub root_forms: Vec<String>,

    /// Filing date (YYYY-MM-DD), kept as the raw string from the response.
    pub file_date: String,

    /// Business states
    pub biz_states: Vec<String>,

    /// SIC codes
    pub sics: Vec<String>,

    /// Form type (e.g., "10-K", "8-K")
    pub form: String,

    /// Accession number
    pub adsh: String,

    /// Film numbers
    pub film_num: Vec<String>,

    /// Business locations
    pub biz_locations: Vec<String>,

    /// File type
    pub file_type: String,

    /// File description (optional)
    #[serde(default)]
    pub file_description: Option<String>,

    /// Incorporation states
    pub inc_states: Vec<String>,

    /// Item numbers (for 8-K); optional.
    pub items: Option<Vec<String>>,
}
203
/// Configurable options for searching SEC EDGAR filings.
///
/// This builder-style struct allows you to construct complex search queries using a fluent
/// interface. Combine multiple filters to narrow down results: form types, date ranges,
/// company identifiers, keywords, and more. All options are optional - you can construct
/// as simple or complex a query as needed. A field left as `None` is simply omitted
/// from the request by `to_query_params()`.
///
/// The search system supports advanced query syntax including Boolean operators, phrase
/// searches with quotes, and wildcards. See the SEC's EDGAR full-text search FAQ for
/// details on query syntax and special operators.
///
/// # Builder Pattern
///
/// Options are set using builder methods that return `self`, allowing you to chain
/// multiple calls together. For example:
///
/// ```rust
/// # use edgarkit::SearchOptions;
/// let options = SearchOptions::new()
///     .with_query("acquisition merger")
///     .with_forms(vec!["8-K".to_string()])
///     .with_date_range("2024-01-01".to_string(), "2024-12-31".to_string())
///     .with_count(100);
/// ```
///
/// # Pagination
///
/// Control pagination using `with_page()`, `with_from()`, and `with_count()`. The maximum
/// results per page is 100; the builder does not enforce that limit, it stores whatever
/// value it is given. For retrieving all results across multiple pages, use the
/// `search_all()` method instead of manually paginating; it sets its own `page`,
/// `count` and `reverse_order` values.
///
/// # Common Patterns
///
/// - **Recent filings**: Use `with_forms()` and `with_count()` without date filters
/// - **Company-specific**: Use `with_ciks()` to filter by one or more company CIKs
/// - **Date-bounded**: Use `with_date_range()` to limit results to a specific time period
/// - **Form type filtering**: Use `with_forms()` to search specific filing types
#[derive(Debug, Clone, Default)]
pub struct SearchOptions {
    /// Typeahead keys. Sent as the `keysTyped` query parameter.
    pub keys_typed: Option<String>,

    /// Search query (supports special operators, see SEC FAQ). Sent as `q`.
    pub query: Option<String>,

    /// Filing category. Sent as `category`.
    pub category: Option<String>,

    /// Filter by company location. Sent as `locationCode`.
    pub location_code: Option<String>,

    /// Company or individual name (cannot combine with cik or sic). Sent as
    /// `entityName`. The restriction is not checked by this type.
    pub entity_name: Option<String>,

    /// Form types to search (e.g., ["10-K", "10-Q"]). Sent as `forms`, with the
    /// entries joined by commas.
    pub forms: Option<Vec<String>>,

    /// Filter by multiple location codes. Sent as `locationCodes`, comma-joined.
    pub location_codes: Option<Vec<String>>,

    /// Page number for pagination. Sent as `page`.
    pub page: Option<u32>,

    /// Number of results to skip. Sent as `from`.
    pub from: Option<u32>,

    /// Number of results to return (max 100). Sent as `count`.
    pub count: Option<u32>,

    /// Order by oldest first instead of newest. Sent as `reverse_order` with the
    /// value `TRUE` or `FALSE` (upper case).
    pub reverse_order: Option<bool>,

    /// Start date (YYYY-MM-DD, requires end_date). Sent as `startdt`.
    pub start_date: Option<String>,

    /// End date (YYYY-MM-DD, requires start_date). Sent as `enddt`.
    pub end_date: Option<String>,

    /// Search by base words (default) or exactly as entered. Sent as `stemming`.
    pub stemming: Option<String>,

    /// CIK codes to search (cannot combine with name or sic). Sent as `ciks`,
    /// comma-joined.
    pub ciks: Option<Vec<String>>,

    /// Standard Industrial Classification code. Sent as `sic`.
    pub sic: Option<String>,

    /// Use incorporation location instead of HQ location. Sent as
    /// `incorporated_location` with the value `true` or `false` (lower case).
    pub incorporated_location: Option<bool>,
}
294
295/// Custom deserializer for sequence field that can be either u32 or string
296fn deserialize_sequence<'de, D>(deserializer: D) -> std::result::Result<u32, D::Error>
297where
298    D: Deserializer<'de>,
299{
300    struct SequenceVisitor;
301
302    impl<'de> de::Visitor<'de> for SequenceVisitor {
303        type Value = u32;
304
305        fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
306            formatter.write_str("an integer or a string containing an integer")
307        }
308
309        fn visit_u64<E>(self, value: u64) -> std::result::Result<Self::Value, E>
310        where
311            E: de::Error,
312        {
313            Ok(value as u32)
314        }
315
316        fn visit_str<E>(self, value: &str) -> std::result::Result<Self::Value, E>
317        where
318            E: de::Error,
319        {
320            value.parse().map_err(de::Error::custom)
321        }
322    }
323
324    deserializer.deserialize_any(SequenceVisitor)
325}
326
327impl SearchOptions {
328    /// Creates a new instance of SearchOptions with default values
329    pub fn new() -> Self {
330        Self::default()
331    }
332
333    /// Sets the search query text.
334    ///
335    /// # Example
336    ///
337    /// ```rust
338    /// # use edgarkit::SearchOptions;
339    /// let options = SearchOptions::new()
340    ///     .with_query("quarterly report");
341    /// ```
342    pub fn with_query(mut self, query: impl Into<String>) -> Self {
343        self.query = Some(query.into());
344        self
345    }
346
347    /// Sets the keys typed for typeahead search functionality
348    pub fn with_keys_typed(mut self, keys: impl Into<String>) -> Self {
349        self.keys_typed = Some(keys.into());
350        self
351    }
352
353    /// Sets the category filter for the search
354    pub fn with_category(mut self, category: impl Into<String>) -> Self {
355        self.category = Some(category.into());
356        self
357    }
358
359    /// Sets the location code filter
360    pub fn with_location_code(mut self, code: impl Into<String>) -> Self {
361        self.location_code = Some(code.into());
362        self
363    }
364
365    /// Sets the entity name filter
366    pub fn with_entity_name(mut self, name: impl Into<String>) -> Self {
367        self.entity_name = Some(name.into());
368        self
369    }
370
371    /// Sets the form types to filter by (e.g., ["10-K", "10-Q"])
372    pub fn with_forms(mut self, forms: Vec<String>) -> Self {
373        self.forms = Some(forms);
374        self
375    }
376
377    /// Sets the location codes to filter by
378    pub fn with_location_codes(mut self, codes: Vec<String>) -> Self {
379        self.location_codes = Some(codes);
380        self
381    }
382
383    /// Sets the page number for pagination (starting from 1)
384    pub fn with_page(mut self, page: u32) -> Self {
385        self.page = Some(page);
386        self
387    }
388
389    /// Sets the starting index for results
390    pub fn with_from(mut self, from: u32) -> Self {
391        self.from = Some(from);
392        self
393    }
394
395    /// Sets the maximum number of results to return
396    pub fn with_count(mut self, count: u32) -> Self {
397        self.count = Some(count);
398        self
399    }
400
401    /// Sets whether to return results in reverse order
402    pub fn with_reverse_order(mut self, reverse: bool) -> Self {
403        self.reverse_order = Some(reverse);
404        self
405    }
406
407    /// Sets the date range for the search
408    ///
409    /// # Arguments
410    /// * `start_date` - Start date in YYYY-MM-DD format
411    /// * `end_date` - End date in YYYY-MM-DD format
412    pub fn with_date_range(mut self, start_date: String, end_date: String) -> Self {
413        self.start_date = Some(start_date);
414        self.end_date = Some(end_date);
415        self
416    }
417
418    /// Sets stemming option for search
419    pub fn with_stemming(mut self, stemming: impl Into<String>) -> Self {
420        self.stemming = Some(stemming.into());
421        self
422    }
423
424    /// Sets company CIK filter(s).
425    ///
426    /// # Examples
427    ///
428    /// ```rust
429    /// # use edgarkit::SearchOptions;
430    /// // Single CIK
431    /// let options = SearchOptions::new().with_ciks(vec!["0001234567".to_string()]);
432    ///
433    /// // Multiple CIKs
434    /// let options = SearchOptions::new().with_ciks(vec!["0001234567".to_string(), "0007654321".to_string()]);
435    /// ```
436    pub fn with_ciks<T>(mut self, ciks: T) -> Self
437    where
438        T: Into<Vec<String>>,
439    {
440        self.ciks = Some(ciks.into());
441        self
442    }
443
444    /// Sets a single company CIK filter
445    ///
446    /// This is a convenience method for backwards compatibility
447    ///
448    /// # Arguments
449    /// * `cik` - A single CIK
450    pub fn with_cik(self, cik: impl Into<String>) -> Self {
451        self.with_ciks(vec![cik.into()])
452    }
453
454    /// Sets SIC code filter
455    pub fn with_sic(mut self, sic: impl Into<String>) -> Self {
456        self.sic = Some(sic.into());
457        self
458    }
459
460    /// Sets whether to use incorporation location instead of HQ
461    pub fn with_incorporated_location(mut self, incorporated: bool) -> Self {
462        self.incorporated_location = Some(incorporated);
463        self
464    }
465
466    pub fn to_query_params(&self) -> Vec<(String, String)> {
467        let mut params = Vec::new();
468
469        if let Some(ref query) = self.query {
470            params.push(("q".to_string(), query.clone()));
471        }
472
473        if let Some(ref keys) = self.keys_typed {
474            params.push(("keysTyped".to_string(), keys.clone()));
475        }
476
477        if let Some(ref category) = self.category {
478            params.push(("category".to_string(), category.clone()));
479        }
480
481        if let Some(ref code) = self.location_code {
482            params.push(("locationCode".to_string(), code.clone()));
483        }
484
485        if let Some(ref name) = self.entity_name {
486            params.push(("entityName".to_string(), name.clone()));
487        }
488
489        if let Some(ref forms) = self.forms {
490            params.push(("forms".to_string(), forms.join(",")));
491        }
492
493        if let Some(ref codes) = self.location_codes {
494            params.push(("locationCodes".to_string(), codes.join(",")));
495        }
496
497        if let Some(page) = self.page {
498            params.push(("page".to_string(), page.to_string()));
499        }
500
501        if let Some(from) = self.from {
502            params.push(("from".to_string(), from.to_string()));
503        }
504
505        if let Some(count) = self.count {
506            params.push(("count".to_string(), count.to_string()));
507        }
508
509        if let Some(reverse) = self.reverse_order {
510            params.push((
511                "reverse_order".to_string(),
512                if reverse { "TRUE" } else { "FALSE" }.to_string(),
513            ));
514        }
515
516        if let Some(ref start) = self.start_date {
517            params.push(("startdt".to_string(), start.clone()));
518        }
519
520        if let Some(ref end) = self.end_date {
521            params.push(("enddt".to_string(), end.clone()));
522        }
523
524        if let Some(ref stemming) = self.stemming {
525            params.push(("stemming".to_string(), stemming.clone()));
526        }
527
528        if let Some(ref ciks) = self.ciks {
529            params.push(("ciks".to_string(), ciks.join(",")));
530        }
531
532        if let Some(ref sic) = self.sic {
533            params.push(("sic".to_string(), sic.clone()));
534        }
535
536        if let Some(incorporated) = self.incorporated_location {
537            params.push((
538                "incorporated_location".to_string(),
539                incorporated.to_string(),
540            ));
541        }
542
543        params
544    }
545}
546
547/// Search operations for querying SEC EDGAR filings with flexible filters and criteria.
548///
549/// This trait provides two main search methods: `search()` for single-page queries and
550/// `search_all()` for comprehensive multi-page retrieval. Both methods use the same
551/// `SearchOptions` for filtering, but `search_all()` automatically handles pagination
552/// and fetches all matching results in parallel batches.
553///
554/// The search system is powered by SEC's EDGAR full-text search, which indexes filing
555/// content, company names, form types, and metadata. Results are ranked by relevance
556/// when using keyword queries, or sorted by filing date when searching by form type
557/// or date range.
558///
559/// # Performance Considerations
560///
561/// For large result sets (>100 documents), `search_all()` is significantly faster than
562/// manually paginating because it fetches multiple pages concurrently. However, it will
563/// retrieve ALL matching results, which could be thousands of documents. Consider using
564/// date ranges or other filters to limit scope when appropriate.
565///
566/// # Example
567///
568/// ```ignore
569/// use edgarkit::{Edgar, SearchOperations, SearchOptions};
570///
571/// async fn example() -> Result<(), Box<dyn std::error::Error>> {
572///     let edgar = Edgar::new("your_app_name contact@example.com")?;
573///     
574///     let options = SearchOptions::new()
575///         .with_forms(vec!["10-K".to_string()])
576///         .with_count(10);
577///     
578///     // Single page
579///     let first_page = edgar.search(options.clone()).await?;
580///     println!("First page: {} results", first_page.hits.hits.len());
581///     
582///     // All results across pages
583///     let all_results = edgar.search_all(options).await?;
584///     println!("Total results: {}", all_results.len());
585///     Ok(())
586/// }
587/// ```
588#[async_trait]
589impl SearchOperations for Edgar {
590    /// Executes a search query and returns a single page of results.
591    ///
592    /// This method performs one search request and returns the results for the specified
593    /// page. Use this when you only need a small number of results or want to implement
594    /// custom pagination logic. For retrieving all matching results, use `search_all()`
595    /// which handles pagination automatically.
596    ///
597    /// The returned `SearchResponse` includes metadata about the search (execution time,
598    /// total hits) along with the actual results for the current page. By default, results
599    /// are sorted by filing date (newest first) unless a keyword query is provided, in
600    /// which case they're ranked by relevance.
601    ///
602    /// # Arguments
603    ///
604    /// * `options` - Search filters and pagination settings
605    ///
606    /// # Returns
607    ///
608    /// Returns a `SearchResponse` containing search metadata and results for one page.
609    ///
610    /// # Example
611    ///
612    /// ```ignore
613    /// let options = SearchOptions::new()
614    ///     .with_forms(vec!["10-Q".to_string()])
615    ///     .with_page(1)
616    ///     .with_count(50);
617    ///
618    /// let response = edgar.search(options).await?;
619    /// println!("Found {} total matches", response.hits.total.value);
620    /// println!("This page has {} results", response.hits.hits.len());
621    /// ```
622    async fn search(&self, options: SearchOptions) -> Result<SearchResponse> {
623        let params = options.to_query_params();
624        let query_string = serde_urlencoded::to_string(&params)
625            .map_err(|e| EdgarError::InvalidResponse(e.to_string()))?;
626
627        let url = format!("{}?{}", self.search_url(), query_string);
628        let response = self.get(&url).await?;
629
630        Ok(serde_json::from_str(&response)?)
631    }
632
633    /// Fetches all matching results across multiple pages with automatic pagination.
634    ///
635    /// This method is designed for comprehensive data retrieval where you need all filings
636    /// matching your search criteria. It automatically handles pagination by first querying
637    /// for total count, then fetching all pages in parallel batches of up to 7 concurrent
638    /// requests. This provides excellent performance while respecting SEC rate limits.
639    ///
640    /// The method aggregates all results into a single vector of `Hit` objects, making it
641    /// easy to process the complete result set. Progress and errors are logged using the
642    /// `tracing` crate, so you can monitor long-running searches.
643    ///
644    /// # Performance Notes
645    ///
646    /// - Uses parallel requests (batch size: 7) to fetch multiple pages simultaneously
647    /// - Respects rate limiting between batches
648    /// - For 1000+ results, this is significantly faster than sequential pagination
649    /// - Memory usage scales with result set size - consider filtering for very large queries
650    ///
651    /// # Arguments
652    ///
653    /// * `options` - Search filters and criteria (pagination options are overridden)
654    ///
655    /// # Returns
656    ///
657    /// Returns a vector containing all matching `Hit` objects across all pages.
658    ///
659    /// # Example
660    ///
661    /// ```ignore
662    /// let options = SearchOptions::new()
663    ///     .with_query("quarterly earnings")
664    ///     .with_forms(vec!["10-Q".to_string()])
665    ///     .with_date_range("2024-01-01".to_string(), "2024-03-31".to_string());
666    ///
667    /// let all_results = edgar.search_all(options).await?;
668    /// println!("Retrieved {} quarterly reports", all_results.len());
669    ///
670    /// for hit in all_results {
671    ///     println!("{}: {} filed on {}",
672    ///         hit._source.display_names[0],
673    ///         hit._source.form,
674    ///         hit._source.file_date);
675    /// }
676    /// ```
677    async fn search_all(&self, mut options: SearchOptions) -> Result<Vec<Hit>> {
678        const BATCH_SIZE: u32 = 7; // Maximum number of concurrent requests
679        const PAGE_SIZE: u32 = 100; // Results per page
680
681        // Set defaults
682        options.count = Some(PAGE_SIZE);
683        options.page = Some(1);
684        options.reverse_order = Some(false);
685
686        let initial_response = self.search(options.clone()).await?;
687        let total_hits = initial_response.hits.total.value;
688
689        tracing::info!("Found {} total hits", total_hits);
690
691        let mut all_hits = Vec::with_capacity(total_hits as usize);
692        all_hits.extend(initial_response.hits.hits);
693
694        let total_pages = (total_hits + PAGE_SIZE - 1) / PAGE_SIZE;
695        let mut current_page = 1;
696
697        while current_page < total_pages {
698            let end_page = (current_page + BATCH_SIZE).min(total_pages);
699            let mut batch_futures = Vec::with_capacity((end_page - current_page) as usize);
700
701            for page in (current_page + 1)..=end_page {
702                let skip = (page - 1) * PAGE_SIZE;
703
704                // Stop if we've gone past the total hits
705                if skip >= total_hits {
706                    break;
707                }
708
709                let mut page_options = options.clone();
710                page_options.page = Some(page);
711                page_options.from = Some(skip);
712                page_options.count = Some(PAGE_SIZE.min(total_hits - skip));
713                page_options.reverse_order = Some(false);
714
715                batch_futures.push(self.search(page_options));
716            }
717
718            if batch_futures.is_empty() {
719                break;
720            }
721
722            let results = futures_util::future::join_all(batch_futures).await;
723
724            for result in results {
725                match result {
726                    Ok(response) => {
727                        all_hits.extend(response.hits.hits);
728                    }
729                    Err(e) => {
730                        tracing::error!("Error fetching page: {}", e);
731                        return Err(e);
732                    }
733                }
734            }
735
736            current_page += BATCH_SIZE;
737        }
738
739        Ok(all_hits)
740    }
741}
742
#[cfg(test)]
mod tests {
    use super::*;

    /// Chained builder calls must show up as the matching query parameters.
    #[test]
    fn test_search_options_builder() {
        let params = SearchOptions::new()
            .with_query("test")
            .with_forms(vec!["10-K".to_string(), "10-Q".to_string()])
            .with_count(10)
            .with_reverse_order(true)
            .to_query_params();

        // Each expected pair has to be present; position in the list is not checked.
        let expected = [
            ("q", "test"),
            ("forms", "10-K,10-Q"),
            ("count", "10"),
            ("reverse_order", "TRUE"),
        ];

        for &(name, value) in expected.iter() {
            let found = params
                .iter()
                .any(|(k, v)| k.as_str() == name && v.as_str() == value);
            assert!(found);
        }
    }
}
762}