Skip to main content

sec_fetcher_shared/
lib.rs

1//! Workspace-internal shared types and utilities.
2//!
3//! Contains fiscal-period parsing, sort-rank helpers, and the canonical set
4//! of CSV column names written by the data pipeline.  This crate has no
5//! external dependencies and is not published to crates.io.
6//!
7//! # Fiscal-period sorting
8//!
9//! The SEC's `fp` field on EDGAR observations uses a variety of tokens to
10//! label fiscal periods: `"Q1"`, `"FY"`, `"H1"`, `"SA2"`, `"6M"`, etc.
11//! [`parse_period_slot_token`] normalises all of them into a comparable
12//! integer rank so that rows can be sorted newest-first without a
13//! hardcoded lookup table.
14
/// The small, stable set of fiscal-period slots that raw period labels are
/// normalised into.
///
/// The enum does not care which upstream format produced a label; several
/// spellings collapse into the same variant so that filtering and trend
/// ranking only have to deal with these eleven cases.
///
/// Recognised aliases:
/// - Quarters:      `Q1`..`Q4`
/// - Annual:        `FY`, `ANNUAL`
/// - Semi-annual:   `H1`/`H2`, `HY1`/`HY2`, `SA1`/`SA2`, `S1`/`S2`
/// - Month windows: `3M`, `6M`, `9M`, `12M`
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PeriodSlot {
    Q1,
    Q2,
    Q3,
    Q4,
    FY,
    H1,
    H2,
    M3,
    M6,
    M9,
    M12,
}

impl PeriodSlot {
    /// Collapses the slot into a quarter-space rank in `1..=4`.
    ///
    /// A larger rank means the slot ends later in the fiscal year, so sorting
    /// descending on this value yields newest-first ordering.
    ///
    /// | Rank | Slots |
    /// |------|-------|
    /// | 4    | `Q4`, `FY`, `H2`, `12M` |
    /// | 3    | `Q3`, `9M` |
    /// | 2    | `Q2`, `H1`, `6M` |
    /// | 1    | `Q1`, `3M` |
    pub fn normalized_quarter(self) -> i64 {
        // Listed from year-end backwards; the match stays exhaustive so a new
        // variant forces a decision about its rank.
        match self {
            Self::Q4 | Self::FY | Self::H2 | Self::M12 => 4,
            Self::Q3 | Self::M9 => 3,
            Self::Q2 | Self::H1 | Self::M6 => 2,
            Self::Q1 | Self::M3 => 1,
        }
    }
}
61
/// Pulls a quarter number (1–4) out of a token such as `"Q1"` or `"q3"`.
///
/// Only the first `Q`/`q` in the input is considered, and only the single
/// character immediately after it is inspected.  Returns `None` when there is
/// no `Q`, when nothing follows it, or when the following character is not one
/// of the digits `1`..=`4`.
pub fn parse_quarter_token(s: &str) -> Option<i64> {
    let bytes = s.as_bytes();
    // `Q`/`q` are single-byte ASCII, so a byte scan locates the same position
    // a case-folded character search would.
    let q_at = bytes.iter().position(|b| b.eq_ignore_ascii_case(&b'Q'))?;
    match bytes.get(q_at + 1).copied()? {
        digit @ b'1'..=b'4' => Some(i64::from(digit - b'0')),
        _ => None,
    }
}
74
75/// Parses a raw fiscal-period token into a canonical [`PeriodSlot`].
76///
77/// ```
78/// use sec_fetcher_shared::{PeriodSlot, parse_period_slot};
79/// assert_eq!(parse_period_slot("SA1"), Some(PeriodSlot::H1));
80/// assert_eq!(parse_period_slot("FY"),  Some(PeriodSlot::FY));
81/// assert_eq!(parse_period_slot("Q4"),  Some(PeriodSlot::Q4));
82/// ```
83pub fn parse_period_slot(s: &str) -> Option<PeriodSlot> {
84    let upper = s.trim().to_ascii_uppercase();
85
86    if let Some(q) = parse_quarter_token(&upper) {
87        return Some(match q {
88            1 => PeriodSlot::Q1,
89            2 => PeriodSlot::Q2,
90            3 => PeriodSlot::Q3,
91            4 => PeriodSlot::Q4,
92            _ => return None,
93        });
94    }
95
96    if upper.contains("FY") || upper.contains("ANNUAL") {
97        return Some(PeriodSlot::FY);
98    }
99
100    // Semi-annual H1 aliases (must check before H2 to avoid prefix collision)
101    if upper.contains("HY1")
102        || upper.contains("H1")
103        || upper.contains("SA1")
104        || upper.contains("S1")
105    {
106        return Some(PeriodSlot::H1);
107    }
108    if upper.contains("HY2")
109        || upper.contains("H2")
110        || upper.contains("SA2")
111        || upper.contains("S2")
112    {
113        return Some(PeriodSlot::H2);
114    }
115
116    // Month-window aliases (12M first to avoid prefix match on 2M)
117    if upper.contains("12M") {
118        return Some(PeriodSlot::M12);
119    }
120    if upper.contains("9M") {
121        return Some(PeriodSlot::M9);
122    }
123    if upper.contains("6M") {
124        return Some(PeriodSlot::M6);
125    }
126    if upper.contains("3M") {
127        return Some(PeriodSlot::M3);
128    }
129
130    None
131}
132
133/// Returns the normalised quarter rank (1–4) for a raw fiscal-period token,
134/// or `None` if the token is not recognised.
135///
136/// Equivalent to `parse_period_slot(s).map(PeriodSlot::normalized_quarter)`.
137///
138/// This is the primary entry-point for sort-key computation.
139///
140/// ```
141/// use sec_fetcher_shared::parse_period_slot_token;
142/// assert_eq!(parse_period_slot_token("FY"),  Some(4));
143/// assert_eq!(parse_period_slot_token("Q3"),  Some(3));
144/// assert_eq!(parse_period_slot_token("SA1"), Some(2));
145/// assert_eq!(parse_period_slot_token("Q4"),  Some(4));
146/// assert_eq!(parse_period_slot_token(""),    None);
147/// ```
148pub fn parse_period_slot_token(s: &str) -> Option<i64> {
149    parse_period_slot(s).map(PeriodSlot::normalized_quarter)
150}
151
/// Maps a raw SEC `fp` token to the label written to CSV output.
///
/// Exactly one rewrite is performed: **`Q4` becomes `FY`**.  The comparison
/// ignores ASCII case and surrounding whitespace.  The SEC tags some year-end
/// filings as `"Q4"` (typically via a 10-Q/A) instead of the `"FY"` used by
/// annual 10-K filings; both share period-slot rank 4, so `"FY"` is used as
/// the single canonical spelling for downstream CSV consumers.
///
/// Every other input is returned exactly as given — no trimming and no case
/// change (e.g. `"Q1"`, `"H1"`, `"SA2"`).
///
/// ```
/// use sec_fetcher_shared::normalize_fp_label;
/// assert_eq!(normalize_fp_label("Q4"), "FY");
/// assert_eq!(normalize_fp_label("q4"), "FY");
/// assert_eq!(normalize_fp_label("FY"),  "FY");
/// assert_eq!(normalize_fp_label("Q3"),  "Q3");
/// assert_eq!(normalize_fp_label("H1"),  "H1");
/// ```
pub fn normalize_fp_label(fp: &str) -> String {
    let is_q4 = fp.trim().eq_ignore_ascii_case("Q4");
    let label = if is_q4 { "FY" } else { fp };
    label.to_owned()
}
178
/// Builds the list of candidate ticker spellings for a raw symbol.
///
/// EDGAR and data providers disagree on whether class-share tickers use `.`
/// or `-` as the separator (e.g. `BRK.B` vs `BRK-B`).  The input is
/// upper-cased (ASCII) and three spellings are produced: as given, with every
/// `.` turned into `-`, and with every `-` turned into `.`.  Duplicates are
/// removed and the result is sorted, so a symbol without separators yields a
/// single entry.
///
/// ```
/// use sec_fetcher_shared::normalize_symbol;
/// let c = normalize_symbol("brk.b");
/// assert!(c.contains(&"BRK.B".to_string()));
/// assert!(c.contains(&"BRK-B".to_string()));
/// ```
pub fn normalize_symbol(symbol: &str) -> Vec<String> {
    let upper = symbol.to_ascii_uppercase();
    let mut variants = vec![upper.replace('.', "-"), upper.replace('-', "."), upper];
    // Sorting first makes equal spellings adjacent so `dedup` removes them all.
    variants.sort();
    variants.dedup();
    variants
}
206
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks each `(token, expected_rank)` pair against
    /// `parse_period_slot_token`.
    fn check_ranks(cases: &[(&str, Option<i64>)]) {
        for &(token, expected) in cases {
            assert_eq!(
                parse_period_slot_token(token),
                expected,
                "token {:?}",
                token
            );
        }
    }

    /// Checks each `(raw, expected_label)` pair against `normalize_fp_label`.
    fn check_labels(cases: &[(&str, &str)]) {
        for &(raw, expected) in cases {
            assert_eq!(normalize_fp_label(raw), expected, "label {:?}", raw);
        }
    }

    /// True when `variants` contains exactly the spelling `wanted`.
    fn has_variant(variants: &[String], wanted: &str) -> bool {
        variants.iter().any(|v| v == wanted)
    }

    /// Shorthand for the successful year+quarter parse result.
    fn year_quarter(year: i64, quarter: i64) -> Result<Period, String> {
        Ok(Period::YearQuarter { year, quarter })
    }

    /// Asserts that parsing `input` fails with a "missing year" message.
    fn check_missing_year(input: &str) {
        let message = parse_period(input).expect_err("parse should fail without a year");
        assert!(message.contains("missing year"));
    }

    // -- parse_period_slot_token --

    #[test]
    fn quarters_rank_correctly() {
        check_ranks(&[
            ("Q1", Some(1)),
            ("Q2", Some(2)),
            ("Q3", Some(3)),
            ("Q4", Some(4)),
        ]);
    }

    #[test]
    fn fy_ranks_same_as_q4() {
        check_ranks(&[("FY", Some(4))]);
    }

    #[test]
    fn semi_annual_aliases() {
        check_ranks(&[
            ("H1", Some(2)),
            ("H2", Some(4)),
            ("HY1", Some(2)),
            ("HY2", Some(4)),
            ("SA1", Some(2)),
            ("SA2", Some(4)),
            ("S1", Some(2)),
            ("S2", Some(4)),
        ]);
    }

    #[test]
    fn month_window_aliases() {
        check_ranks(&[
            ("3M", Some(1)),
            ("6M", Some(2)),
            ("9M", Some(3)),
            ("12M", Some(4)),
        ]);
    }

    #[test]
    fn unrecognised_returns_none() {
        check_ranks(&[("", None), ("SA", None), ("Q5", None)]);
    }

    #[test]
    fn case_insensitive() {
        check_ranks(&[("fy", Some(4)), ("q2", Some(2)), ("sa2", Some(4))]);
    }

    #[test]
    fn normalize_fp_label_maps_q4_to_fy() {
        check_labels(&[("Q4", "FY"), ("q4", "FY")]);
    }

    #[test]
    fn normalize_fp_label_leaves_other_tokens_unchanged() {
        check_labels(&[
            ("FY", "FY"),
            ("Q3", "Q3"),
            ("H1", "H1"),
            ("SA2", "SA2"),
            ("", ""),
        ]);
    }

    #[test]
    fn normalize_symbol_generates_dot_and_dash_variants() {
        let variants = normalize_symbol("brk.b");
        assert!(has_variant(&variants, "BRK.B"));
        assert!(has_variant(&variants, "BRK-B"));
    }

    #[test]
    fn normalize_symbol_upcases_plain_ticker() {
        assert_eq!(normalize_symbol("aapl"), ["AAPL"]);
    }

    // -- extract_first_year --

    #[test]
    fn test_extract_first_year_from_combined() {
        assert_eq!(extract_first_year("2024Q3"), Some(2024));
    }

    #[test]
    fn test_extract_first_year_from_plain_year() {
        assert_eq!(extract_first_year("2024"), Some(2024));
    }

    #[test]
    fn test_extract_first_year_from_longer_string() {
        assert_eq!(extract_first_year("FY ended 2024-12-31"), Some(2024));
    }

    #[test]
    fn test_extract_first_year_out_of_range_low() {
        assert_eq!(extract_first_year("1899"), None);
    }

    #[test]
    fn test_extract_first_year_out_of_range_high() {
        assert_eq!(extract_first_year("2101"), None);
    }

    #[test]
    fn test_extract_first_year_no_digits() {
        assert_eq!(extract_first_year("hello world"), None);
    }

    #[test]
    fn test_extract_first_year_short_string() {
        assert_eq!(extract_first_year("23"), None);
    }

    #[test]
    fn test_extract_first_year_empty_string() {
        assert_eq!(extract_first_year(""), None);
    }

    // -- parse_period --

    #[test]
    fn test_parse_period_year_only() {
        assert_eq!(parse_period("2024"), Ok(Period::Year { year: 2024 }));
    }

    #[test]
    fn test_parse_period_year_quarter() {
        assert_eq!(parse_period("2024Q3"), year_quarter(2024, 3));
    }

    #[test]
    fn test_parse_period_year_fy() {
        assert_eq!(parse_period("2024FY"), year_quarter(2024, 4));
    }

    #[test]
    fn test_parse_period_year_h1() {
        assert_eq!(parse_period("2024H1"), year_quarter(2024, 2));
    }

    #[test]
    fn test_parse_period_year_9m() {
        assert_eq!(parse_period("2024 9M"), year_quarter(2024, 3));
    }

    #[test]
    fn test_parse_period_whitespace_trimmed() {
        assert_eq!(parse_period("  2024Q2  "), year_quarter(2024, 2));
    }

    #[test]
    fn test_parse_period_missing_year() {
        check_missing_year("Q3");
    }

    #[test]
    fn test_parse_period_empty_string() {
        check_missing_year("");
    }

    // -- normalize_symbol --

    #[test]
    fn test_normalize_symbol_dash_to_dot() {
        let variants = normalize_symbol("BRK-B");
        assert!(has_variant(&variants, "BRK.B"));
        assert!(has_variant(&variants, "BRK-B"));
    }

    #[test]
    fn test_normalize_symbol_no_change_needed() {
        assert_eq!(normalize_symbol("AAPL"), ["AAPL"]);
    }

    #[test]
    fn test_normalize_symbol_dot_and_dash_both_present() {
        let variants = normalize_symbol("brk.b");
        assert!(has_variant(&variants, "BRK.B"));
        assert!(has_variant(&variants, "BRK-B"));
        // Exactly 2 variants (no duplicates)
        assert_eq!(variants.len(), 2);
    }

    // -- normalize_fp_label --

    #[test]
    fn test_normalize_fp_label_q4_case_insensitive() {
        check_labels(&[("Q4", "FY"), ("q4", "FY"), ("  Q4  ", "FY")]);
    }
}
435
/// The result of parsing a fiscal-period string: a year on its own, or a year
/// together with a quarter rank.
///
/// `quarter` carries the same 1–4 rank as [`PeriodSlot::normalized_quarter`]:
/// Q1 = 1, Q2/H1/6M = 2, Q3/9M = 3, Q4/FY/H2/12M = 4.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Period {
    /// A fiscal year plus a normalised quarter rank.
    YearQuarter {
        year: i64,
        quarter: i64,
    },
    /// A bare fiscal year with no period slot attached.
    Year {
        year: i64,
    },
}
445
/// Finds the first run of four consecutive ASCII digits whose value lies in
/// `1900..=2100` and returns it as a year.
///
/// Windows are examined left to right, one position at a time, so a year may
/// sit inside a longer digit run (`"20241231"` gives 2024).  Returns `None`
/// when no window qualifies.
///
/// Used by [`parse_period`] to pull the fiscal year out of tokens like
/// `"2024Q3"`, `"2024FY"`, or `"2024"`.
pub fn extract_first_year(s: &str) -> Option<i64> {
    // ASCII digits are single bytes, so sliding over bytes visits the same
    // digit runs as sliding over characters.
    s.as_bytes()
        .windows(4)
        .filter(|window| window.iter().all(u8::is_ascii_digit))
        .map(|window| {
            window
                .iter()
                .fold(0i64, |acc, digit| acc * 10 + i64::from(digit - b'0'))
        })
        .find(|year| (1900..=2100).contains(year))
}
468
469/// Parses a raw period string into a [`Period`].
470///
471/// Recognises year-only strings (`"2024"`) and combined year+period tokens
472/// (`"2024Q3"`, `"2024FY"`, `"2024H1"`, `"2024 9M"`, etc.).
473///
474/// Returns `Err` if no 4-digit year can be found.
475///
476/// ```
477/// use sec_fetcher_shared::{Period, parse_period};
478/// assert_eq!(parse_period("2024Q3").unwrap(), Period::YearQuarter { year: 2024, quarter: 3 });
479/// assert_eq!(parse_period("2024FY").unwrap(), Period::YearQuarter { year: 2024, quarter: 4 });
480/// assert_eq!(parse_period("2024").unwrap(),   Period::Year { year: 2024 });
481/// ```
482pub fn parse_period(period: &str) -> Result<Period, String> {
483    let raw = period.trim();
484    let upper = raw.to_ascii_uppercase();
485    let year = extract_first_year(&upper).ok_or_else(|| {
486        format!(
487            "Period `{}` is missing year; expected values like 2024Q3, 2024H1, or 2024FY",
488            raw
489        )
490    })?;
491    if let Some(slot) = parse_period_slot(&upper) {
492        let q = slot.normalized_quarter();
493        return Ok(Period::YearQuarter { year, quarter: q });
494    }
495    Ok(Period::Year { year })
496}
497
/// Metadata columns, in canonical order, that lead every per-symbol US-GAAP
/// CSV file.
///
/// The data pipeline writes these columns and the normaliser and trend
/// analysis layers read them.  They always precede the XBRL fact columns
/// (which differ per company) and are bookkeeping data, not facts.
///
/// Both the writer and the reader depend on the order given here.
///
/// ## Row ordering convention
///
/// Rows are written **newest-first** (reverse chronological).  The primary
/// sort key is `(fy DESC, fp_rank DESC)`, where `fp_rank` is the integer that
/// [`parse_period_slot_token`] returns for the `fp` field.
///
/// ## Column notes
///
/// `canonical_order` is a **0-based physical row index**: the first row has
/// `canonical_order = 0`, the second `1`, and so on.  It records the
/// newest-first position assigned at write time and links the on-disk row
/// position to the runtime `local_idx` used for global-ID lookups.  If the
/// stored value disagrees with the actual row position, the file has been
/// rewritten or reordered without updating this column.
///
/// `is_amendment` (index 6) is `true` when the winning row for the period came
/// from an amendment filing, i.e. the original `form` value ended with `/A`
/// (e.g. `"10-Q/A"`).  The `form` column itself is normalised to the base type
/// (`"10-Q"`).
pub const US_GAAP_CSV_META_COLUMNS: &[&str] = &[
    "canonical_order", // 0
    "fy",              // 1
    "fp",              // 2
    "period_end",      // 3
    "filed",           // 4
    "form",            // 5
    "is_amendment",    // 6
    "accn",            // 7
    "filing_url",      // 8
];