Skip to main content

chordsketch_core/
heuristic.rs

1//! Heuristic plain-text chord+lyrics importer.
2//!
3//! This module detects and converts plain-text chord sheets — where chord names
4//! appear on their own lines above the corresponding lyric lines — into the
5//! ChordPro [`Song`] AST.
6//!
7//! # Format
8//!
9//! Plain-text chord sheets look like:
10//!
11//! ```text
12//! [Verse]
13//! Am        F         C         G
14//! There's a lady who's sure all that glitters is gold
15//! ```
16//!
17//! Each "chord line" contains only chord names (whitespace-separated), and the
18//! following "lyric line" contains the sung text. The column position of each
19//! chord in the chord line is preserved as an inline annotation over the
20//! corresponding text in the lyric line.
21//!
22//! # Detection
23//!
24//! Use [`detect_format`] to auto-classify an input string, or
25//! [`PlainTextImporter::detect_format`] to use custom thresholds.
26//!
27//! # Conversion
28//!
29//! Use [`convert_plain_text`] to convert a plain-text chord sheet into a
30//! [`Song`], or [`PlainTextImporter::convert`] to use custom thresholds.
31
32use crate::ast::{Chord, Directive, Line, LyricsLine, LyricsSegment, Song};
33
34// ---------------------------------------------------------------------------
35// InputFormat
36// ---------------------------------------------------------------------------
37
/// The input formats that format detection can report.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum InputFormat {
    /// ChordPro source: the text contains `{directive}` lines or inline
    /// `[chord]` annotations.
    ChordPro,
    /// A plain-text sheet with chord lines placed above lyric lines.
    PlainChordLyrics,
    /// ABC music notation.
    Abc,
    /// None of the known formats matched.
    Unknown,
}
50
51// ---------------------------------------------------------------------------
52// PlainTextImporter
53// ---------------------------------------------------------------------------
54
/// Tunable thresholds for the plain-text heuristic importer.
///
/// # Examples
///
/// ```
/// use chordsketch_core::heuristic::{PlainTextImporter, InputFormat};
///
/// let importer = PlainTextImporter::new();
/// let format = importer.detect_format("Am  G  C\nHello world here\n");
/// assert_eq!(format, InputFormat::PlainChordLyrics);
/// ```
#[derive(Clone, Debug)]
pub struct PlainTextImporter {
    /// Minimum ratio of chord tokens to all whitespace-separated tokens for
    /// a line to count as a chord line. Default: `0.5`.
    ///
    /// **Valid range: `[0.0, 1.0]`.**
    /// - `0.0` — the ratio test always passes, so only
    ///   [`min_chord_tokens`][Self::min_chord_tokens] and the punctuation
    ///   check decide whether a line is a chord line.
    /// - `1.0` — every token on the line must be a valid chord name.
    /// - Values above `1.0` switch chord-line detection off, because the
    ///   ratio can never exceed `1.0`.
    /// - Negative values behave like `0.0`.
    ///
    /// Prefer [`PlainTextImporter::with_thresholds`], which validates the
    /// value, over writing this field directly.
    pub chord_threshold: f64,
    /// Minimum number of chord tokens a line must contain to count as a
    /// chord line. Default: `2`.
    ///
    /// **Valid range: `>= 1`.**
    /// A value of `0` removes the minimum-count guard: any non-empty line
    /// without sentence punctuation that satisfies
    /// [`chord_threshold`][Self::chord_threshold] is then treated as a chord
    /// line.
    ///
    /// Prefer [`PlainTextImporter::with_thresholds`], which validates the
    /// value, over writing this field directly.
    pub min_chord_tokens: usize,
}
95
96impl Default for PlainTextImporter {
97    fn default() -> Self {
98        Self {
99            chord_threshold: 0.5,
100            min_chord_tokens: 2,
101        }
102    }
103}
104
105impl PlainTextImporter {
106    /// Creates a new importer with default threshold settings.
107    #[must_use]
108    pub fn new() -> Self {
109        Self::default()
110    }
111
112    /// Creates a new importer with explicit threshold values, returning an
113    /// error string if any value is out of its valid range.
114    ///
115    /// # Errors
116    ///
117    /// Returns `Err` if:
118    /// - `chord_threshold` is not in `[0.0, 1.0]`
119    /// - `min_chord_tokens` is `0`
120    ///
121    /// # Examples
122    ///
123    /// ```
124    /// use chordsketch_core::heuristic::PlainTextImporter;
125    ///
126    /// // Valid mid-range value.
127    /// let importer = PlainTextImporter::with_thresholds(0.75, 3).unwrap();
128    /// assert_eq!(importer.chord_threshold, 0.75);
129    /// assert_eq!(importer.min_chord_tokens, 3);
130    ///
131    /// // Boundary values are valid.
132    /// assert!(PlainTextImporter::with_thresholds(0.0, 1).is_ok());
133    /// assert!(PlainTextImporter::with_thresholds(1.0, 1).is_ok());
134    ///
135    /// // Out-of-range values are rejected.
136    /// assert!(PlainTextImporter::with_thresholds(1.5, 2).is_err());
137    /// assert!(PlainTextImporter::with_thresholds(-0.1, 2).is_err());
138    /// assert!(PlainTextImporter::with_thresholds(f64::NAN, 2).is_err());
139    /// assert!(PlainTextImporter::with_thresholds(0.5, 0).is_err());
140    /// ```
141    #[must_use = "this `Result` should be handled; use `.unwrap()` or `?` to obtain the configured importer"]
142    pub fn with_thresholds(chord_threshold: f64, min_chord_tokens: usize) -> Result<Self, String> {
143        if !(0.0..=1.0).contains(&chord_threshold) {
144            return Err(format!(
145                "chord_threshold must be in [0.0, 1.0], got {chord_threshold}"
146            ));
147        }
148        if min_chord_tokens == 0 {
149            return Err("min_chord_tokens must be >= 1".to_string());
150        }
151        Ok(Self {
152            chord_threshold,
153            min_chord_tokens,
154        })
155    }
156
157    /// Returns `true` if `line` appears to be a chord line.
158    ///
159    /// A chord line satisfies all of the following conditions:
160    /// - Contains at least [`min_chord_tokens`][Self::min_chord_tokens] tokens
161    ///   that parse as valid chord names.
162    /// - The fraction of valid chord tokens is ≥
163    ///   [`chord_threshold`][Self::chord_threshold].
164    /// - Does not contain sentence-ending punctuation (`.`, `?`, `!`), which
165    ///   is a strong indicator that the line is lyrics.
166    fn is_chord_line(&self, line: &str) -> bool {
167        // Sentence-ending punctuation strongly indicates lyrics.
168        if line.contains('.') || line.contains('?') || line.contains('!') {
169            return false;
170        }
171        let tokens: Vec<&str> = line.split_whitespace().collect();
172        if tokens.is_empty() {
173            return false;
174        }
175        let chord_count = tokens.iter().filter(|t| is_chord_token(t)).count();
176        chord_count >= self.min_chord_tokens
177            && chord_count as f64 / tokens.len() as f64 >= self.chord_threshold
178    }
179
180    /// Detects the input format using heuristics.
181    ///
182    /// Returns [`InputFormat::ChordPro`] if the input appears to be ChordPro
183    /// (directive braces or inline `[chord]` notation).
184    /// Returns [`InputFormat::PlainChordLyrics`] if the input contains at least
185    /// two chord lines.
186    /// Returns [`InputFormat::Unknown`] otherwise.
187    #[must_use]
188    pub fn detect_format(&self, input: &str) -> InputFormat {
189        let lines: Vec<&str> = input.lines().collect();
190
191        // ChordPro directive syntax: a line whose first non-space character is
192        // `{` and last non-space character is `}`.
193        let has_directives = lines.iter().any(|l| {
194            let t = l.trim();
195            t.starts_with('{') && t.ends_with('}')
196        });
197        if has_directives {
198            return InputFormat::ChordPro;
199        }
200
201        // ChordPro inline chord notation: `[Am]`, `[G7]`, etc.
202        // Distinguish from plain-text section labels like `[Verse]` or `[Chorus 2]`
203        // by checking whether the content inside `[...]` is a valid chord name.
204        //
205        // A whole-line bracket (the trimmed line is exactly `[content]`) is
206        // treated as a section header — not inline chord notation — because
207        // `parse_section_header` already classifies it that way during
208        // conversion. This prevents single-letter key indicators like `[C]`
209        // or `[Am]` from triggering a false ChordPro classification.
210        //
211        // Known limitation (issue #1304): a directive-free ChordPro file that
212        // uses *only* whole-line bracket chords (e.g. `[Am]` alone on its own
213        // line immediately before a lyric) will be classified as `Unknown`
214        // rather than `ChordPro`, because such lines are indistinguishable
215        // from plain-text key/section labels without lookahead context that
216        // would risk introducing new false positives. Files with at least one
217        // `{directive}` or one mid-line inline chord (e.g., `Hello [Am]world`)
218        // are not affected.
219        let has_inline_chords = lines.iter().any(|l| {
220            let trimmed = l.trim();
221            // Skip whole-line brackets: `[content]` where content has no nested `[`.
222            if trimmed.starts_with('[')
223                && trimmed.ends_with(']')
224                && trimmed.len() >= 3
225                && !trimmed[1..trimmed.len() - 1].contains('[')
226            {
227                return false;
228            }
229            let mut rest: &str = l;
230            while let Some(open) = rest.find('[') {
231                let after = &rest[open + 1..];
232                let Some(close) = after.find(']') else { break };
233                let content = &after[..close];
234                if is_chord_token(content) {
235                    return true;
236                }
237                rest = &after[close + 1..];
238            }
239            false
240        });
241        if has_inline_chords {
242            return InputFormat::ChordPro;
243        }
244
245        // ABC notation: at least one `X:` reference-number field followed by
246        // digits (the mandatory field that begins every ABC tune).
247        let has_abc_header = lines.iter().any(|l| {
248            let t = l.trim_start();
249            if let Some(rest) = t.strip_prefix("X:") {
250                rest.trim_start()
251                    .chars()
252                    .next()
253                    .is_some_and(|c| c.is_ascii_digit())
254            } else {
255                false
256            }
257        });
258        if has_abc_header {
259            return InputFormat::Abc;
260        }
261
262        // Plain chord+lyrics: at least two chord lines.
263        let chord_line_count = lines.iter().filter(|l| self.is_chord_line(l)).count();
264        if chord_line_count >= 2 {
265            InputFormat::PlainChordLyrics
266        } else if chord_line_count == 1 && lines.len() <= 5 {
267            // Very short input with one chord line is still treated as plain
268            // chord+lyrics (e.g., a two-line snippet passed for testing).
269            InputFormat::PlainChordLyrics
270        } else {
271            InputFormat::Unknown
272        }
273    }
274
275    /// Converts a plain chord+lyrics text into a [`Song`] AST.
276    ///
277    /// # Algorithm
278    ///
279    /// 1. Classify each line as a chord line, section header, lyric, or blank.
280    /// 2. Pair each chord line with the immediately following lyric line.
281    ///    For each such pair, compute chord column offsets and produce inline
282    ///    chord annotations.
283    /// 3. Section headers are converted to `{start_of_*}` / `{end_of_*}`
284    ///    directive pairs.
285    /// 4. Lines that are neither chord lines nor section headers are emitted as
286    ///    plain lyric lines.
287    #[must_use]
288    pub fn convert(&self, input: &str) -> Song {
289        let raw_lines: Vec<&str> = input.lines().collect();
290        let classes: Vec<LineKind<'_>> = raw_lines
291            .iter()
292            .map(|l| classify_line(l, |line| self.is_chord_line(line)))
293            .collect();
294
295        let mut song = Song::new();
296        let mut i = 0;
297        let mut current_section: Option<String> = None;
298
299        while i < classes.len() {
300            match &classes[i] {
301                LineKind::Blank => {
302                    song.lines.push(Line::Empty);
303                    i += 1;
304                }
305                LineKind::SectionHeader(label) => {
306                    let label = label.clone();
307                    // Close any open section.
308                    if let Some(ref sec) = current_section {
309                        song.lines
310                            .push(Line::Directive(end_directive_for_section(sec)));
311                    }
312                    // Open the new section.
313                    let (start_dir, canonical) = start_directive_for_section(&label);
314                    song.lines.push(Line::Directive(start_dir));
315                    current_section = Some(canonical);
316                    i += 1;
317                }
318                LineKind::ChordLine(positions) => {
319                    // Peek at the next non-blank line: if it is a lyric, pair them.
320                    let j = i + 1;
321                    if j < classes.len() {
322                        if let LineKind::Lyric(lyric) = &classes[j] {
323                            let paired = pair_chords_with_lyric(positions, lyric);
324                            song.lines.push(Line::Lyrics(paired));
325                            i += 2;
326                            continue;
327                        }
328                    }
329                    // No following lyric — emit the chords as a chord-only line.
330                    let paired = pair_chords_with_lyric(positions, "");
331                    song.lines.push(Line::Lyrics(paired));
332                    i += 1;
333                }
334                LineKind::Lyric(text) => {
335                    song.lines.push(Line::Lyrics(LyricsLine {
336                        segments: vec![LyricsSegment {
337                            chord: None,
338                            text: (*text).to_string(),
339                            spans: vec![],
340                        }],
341                    }));
342                    i += 1;
343                }
344            }
345        }
346
347        // Close the last open section.
348        if let Some(ref sec) = current_section {
349            song.lines
350                .push(Line::Directive(end_directive_for_section(sec)));
351        }
352
353        song
354    }
355}
356
357// ---------------------------------------------------------------------------
358// Module-level convenience functions
359// ---------------------------------------------------------------------------
360
361/// Detects the format of `input` using default [`PlainTextImporter`] settings.
362///
363/// # Examples
364///
365/// ```
366/// use chordsketch_core::heuristic::{detect_format, InputFormat};
367///
368/// assert_eq!(
369///     detect_format("{title: My Song}\n[Am]Hello"),
370///     InputFormat::ChordPro
371/// );
372/// assert_eq!(
373///     detect_format("Am  G  C\nHello world\n"),
374///     InputFormat::PlainChordLyrics
375/// );
376/// ```
377#[must_use]
378pub fn detect_format(input: &str) -> InputFormat {
379    PlainTextImporter::default().detect_format(input)
380}
381
382/// Converts a plain chord+lyrics text into a [`Song`] AST using default
383/// [`PlainTextImporter`] settings.
384///
385/// # Examples
386///
387/// ```
388/// use chordsketch_core::heuristic::convert_plain_text;
389///
390/// let song = convert_plain_text("Am  G\nHello world\n");
391/// assert!(!song.lines.is_empty());
392/// ```
393#[must_use]
394pub fn convert_plain_text(input: &str) -> Song {
395    PlainTextImporter::default().convert(input)
396}
397
398// ---------------------------------------------------------------------------
399// Internal helpers
400// ---------------------------------------------------------------------------
401
/// What a single input line was recognised as.
#[derive(Debug)]
enum LineKind<'a> {
    /// The line is empty or contains only whitespace.
    Blank,
    /// The line is a section header such as `[Verse]` or `CHORUS:`.
    /// Holds the label text without its decoration.
    SectionHeader(String),
    /// The line is a chord line. Holds one `(byte_offset, chord_name)` pair
    /// per chord token, in left-to-right order.
    ChordLine(Vec<(usize, String)>),
    /// Anything else; treated as lyric text. Borrows the original line.
    Lyric(&'a str),
}
416
417/// Classifies a single line using the supplied chord-line predicate.
418fn classify_line<'a, F>(line: &'a str, is_chord_line: F) -> LineKind<'a>
419where
420    F: Fn(&str) -> bool,
421{
422    if line.trim().is_empty() {
423        return LineKind::Blank;
424    }
425    if let Some(label) = parse_section_header(line) {
426        return LineKind::SectionHeader(label);
427    }
428    if is_chord_line(line) {
429        return LineKind::ChordLine(chord_positions(line));
430    }
431    LineKind::Lyric(line)
432}
433
434/// Returns `true` if `token` is a well-formed chord name.
435///
436/// This is a stricter check than [`parse_chord`]: it rejects tokens whose
437/// extension part contains unexpected alphabetic characters (e.g., `"Chorus"`
438/// would be parsed as `C + "horus"` by `parse_chord`, but is rejected here).
439///
440/// Accepted patterns:
441/// - Root `A–G` with optional `#` / `b`
442/// - Zero or more quality+extension atoms: a quality keyword (`m`, `maj`,
443///   `min`, `dim`, `aug`, `sus`, `add`, `+`, `°`) optionally followed by a
444///   numeric extension (e.g. `m7add11`, `maj7sus4`, `7b5`)
445/// - Optional bass note: `/A–G[#b]`
446fn is_chord_token(token: &str) -> bool {
447    // Reject obviously non-chord tokens.  16 characters covers multi-component
448    // jazz chords like Dbmaj7#11sus4b9 (15 chars) while still catching long words.
449    if token.is_empty() || token.len() > 16 {
450        return false;
451    }
452    let bytes = token.as_bytes();
453
454    // Root note: must be A–G (uppercase).
455    if !matches!(bytes[0], b'A'..=b'G') {
456        return false;
457    }
458
459    // Split off optional bass note (/X[#b]).
460    let (body, bass) = match token.find('/') {
461        Some(i) => (&token[..i], Some(&token[i + 1..])),
462        None => (token, None),
463    };
464
465    // Validate bass note.
466    if let Some(bass) = bass {
467        if bass.is_empty() {
468            return false;
469        }
470        let b = bass.as_bytes();
471        if !matches!(b[0], b'A'..=b'G') {
472            return false;
473        }
474        if b.len() > 1 && b[1] != b'#' && b[1] != b'b' {
475            return false;
476        }
477        if b.len() > 2 {
478            return false;
479        }
480    }
481
482    // Validate body: root [accidental] [quality] [extension]
483    let body_bytes = body.as_bytes();
484    let mut pos = 1usize; // skip root letter
485
486    // Optional accidental.
487    if pos < body_bytes.len() && (body_bytes[pos] == b'#' || body_bytes[pos] == b'b') {
488        pos += 1;
489    }
490
491    let quality_ext = &body[pos..];
492    is_valid_quality_ext(quality_ext)
493}
494
/// Strips a numeric extension from the front of `s` and returns what is left.
///
/// A numeric extension is an optional accidental (`#` or `b`) followed by a
/// run of ASCII digits. The accidental is only consumed when a digit comes
/// directly after it; otherwise `s` is returned unchanged apart from any
/// leading digits.
fn consume_numeric(s: &str) -> &str {
    let after_accidental = match s.as_bytes() {
        [b'#' | b'b', next, ..] if next.is_ascii_digit() => &s[1..],
        _ => s,
    };
    after_accidental.trim_start_matches(|c: char| c.is_ascii_digit())
}
510
511/// Returns `true` if `s` is a valid chord quality+extension suffix.
512///
513/// The suffix is consumed iteratively: each step strips one quality keyword
514/// (`maj`, `min`, `dim`, `aug`, `sus`, `add`, `m`, `+`, `°`) optionally
515/// followed by a numeric extension (optional `#`/`b` accidental then digits).
516/// This allows compound forms such as `m7add11`, `maj7sus4`, or `7b5`.
517///
518/// Acceptable atoms (zero or more, in any order):
519/// - Quality keyword: `maj`, `min`, `dim`, `aug`, `sus`, `add`, `m`, `+`, `°`
520/// - Numeric extension: optional accidental (`#`/`b`, only if a digit follows)
521///   then one or more digits
522fn is_valid_quality_ext(s: &str) -> bool {
523    let mut rest = s;
524    loop {
525        if rest.is_empty() {
526            return true;
527        }
528
529        // Try to strip a quality keyword.
530        let after_kw: Option<&str> = None
531            .or_else(|| rest.strip_prefix("maj"))
532            .or_else(|| rest.strip_prefix("min"))
533            .or_else(|| rest.strip_prefix("dim"))
534            .or_else(|| rest.strip_prefix("aug"))
535            .or_else(|| rest.strip_prefix("sus"))
536            .or_else(|| rest.strip_prefix("add"))
537            .or_else(|| rest.strip_prefix('m'))
538            .or_else(|| rest.strip_prefix('+'))
539            .or_else(|| rest.strip_prefix('°'));
540
541        if let Some(after) = after_kw {
542            // Keyword consumed; optionally consume a following numeric part.
543            rest = consume_numeric(after);
544        } else {
545            // No keyword — try a bare numeric extension.
546            let next = consume_numeric(rest);
547            if next.len() == rest.len() {
548                // Nothing consumed: unrecognized character.
549                return false;
550            }
551            rest = next;
552        }
553    }
554}
555
556/// Extracts `(byte_offset, chord_name)` pairs from a chord line, preserving
557/// the column position of each chord token.
558fn chord_positions(line: &str) -> Vec<(usize, String)> {
559    let mut result = Vec::new();
560    let mut search_start = 0usize;
561
562    for token in line.split_whitespace() {
563        // Locate this token inside the remaining slice.
564        if let Some(rel) = line[search_start..].find(token) {
565            let abs = search_start + rel;
566            if is_chord_token(token) {
567                result.push((abs, token.to_string()));
568            }
569            // Advance past this token.
570            search_start = abs + token.len();
571        }
572    }
573    result
574}
575
/// Attempts to parse `line` as a section header.
///
/// Recognised patterns (checked in this order, after trimming the line):
/// - `[Verse]`, `[Chorus 2]` — square brackets, no nested `[`
/// - `(Bridge)` — parentheses, no nested `(`
/// - `VERSE:`, `Chorus:` — a label made of letters, spaces and hyphens,
///   containing at least one letter, followed by a single colon
/// - `-- Chorus --`, `== Verse ==`, `** Intro **`, `## Outro ##` — a label
///   framed by a run of two or more `-`, `=`, `*` or `#` on both sides
///
/// The whole decoration run is stripped, so `--- Chorus ---` yields
/// `"Chorus"`. A line that consists of decoration only (for example a
/// horizontal rule such as `----------`) is *not* a section header.
///
/// Returns the inner label on success, or `None` if the line is not a section
/// header or its label would be empty or unsafe to embed in a directive.
fn parse_section_header(line: &str) -> Option<String> {
    /// Returns `true` if `label` is safe to embed in a ChordPro directive
    /// value.  Rejects empty labels and labels containing `{` or `}`, because
    /// ChordPro has no escape mechanism inside directive values and those
    /// characters would produce malformed output.
    fn is_safe_label(label: &str) -> bool {
        !label.is_empty() && !label.contains('{') && !label.contains('}')
    }

    let trimmed = line.trim();

    // [Label] form
    if trimmed.starts_with('[') && trimmed.ends_with(']') && trimmed.len() >= 3 {
        let inner = trimmed[1..trimmed.len() - 1].trim();
        if !inner.contains('[') && is_safe_label(inner) {
            return Some(inner.to_string());
        }
    }

    // (Label) form
    if trimmed.starts_with('(') && trimmed.ends_with(')') && trimmed.len() >= 3 {
        let inner = trimmed[1..trimmed.len() - 1].trim();
        if !inner.contains('(') && is_safe_label(inner) {
            return Some(inner.to_string());
        }
    }

    // LABEL: form — letters, spaces and hyphens followed by exactly one colon.
    // At least one letter is required so that punctuation-only input such as
    // `-:` is not mistaken for a header.
    if let Some(label) = trimmed.strip_suffix(':') {
        let only_label_chars = label
            .chars()
            .all(|c| c.is_alphabetic() || c == ' ' || c == '-');
        let has_letter = label.chars().any(char::is_alphabetic);
        let label = label.trim();
        if only_label_chars && has_letter && is_safe_label(label) {
            return Some(label.to_string());
        }
    }

    // -- Label --, == Label ==, ** Label ** and ## Label ## forms.
    for &(delim, fill) in &[("--", '-'), ("==", '='), ("**", '*'), ("##", '#')] {
        if trimmed.starts_with(delim) && trimmed.ends_with(delim) {
            // Strip the entire decoration run (and the padding around the
            // label) from both ends. Decoration-only lines collapse to the
            // empty string and are rejected by `is_safe_label`.
            let inner = trimmed.trim_matches(|c: char| c == fill || c.is_whitespace());
            if is_safe_label(inner) {
                return Some(inner.to_string());
            }
        }
    }

    None
}
637
/// Resolves a section label to the base name of its ChordPro environment.
///
/// Matching is case-insensitive and by prefix on the trimmed label:
/// `"chorus…"` and `"refrain…"` give `"chorus"`, `"bridge…"` gives
/// `"bridge"`. Every other label — verse, intro, outro, pre-chorus, or
/// anything unknown — gives `"verse"`.
fn canonical_section(label: &str) -> &'static str {
    // (label prefix, canonical environment name)
    const PREFIXES: [(&str, &str); 3] = [
        ("chorus", "chorus"),
        ("refrain", "chorus"),
        ("bridge", "bridge"),
    ];

    let lowered = label.to_lowercase();
    let key = lowered.trim();
    PREFIXES
        .iter()
        .find(|(prefix, _)| key.starts_with(*prefix))
        .map_or("verse", |(_, canonical)| *canonical)
}
654
655/// Returns the `{start_of_*}` directive for the given label, plus the
656/// canonical section name used to match the closing directive.
657fn start_directive_for_section(label: &str) -> (Directive, String) {
658    let canonical = canonical_section(label);
659    let dir_name = format!("start_of_{canonical}");
660    // Include the label as a `label` attribute when the label text differs from
661    // the canonical section name (e.g., "Verse 2", "Intro").
662    let lower_label = label.trim().to_lowercase();
663    let dir = if lower_label == canonical {
664        Directive::name_only(dir_name)
665    } else {
666        Directive::with_value(dir_name, label.trim().to_string())
667    };
668    (dir, canonical.to_string())
669}
670
671/// Returns the `{end_of_*}` directive for the given canonical section name.
672fn end_directive_for_section(canonical: &str) -> Directive {
673    let dir_name = format!("end_of_{canonical}");
674    Directive::name_only(dir_name)
675}
676
677/// Builds a [`LyricsLine`] by pairing chord column positions with the
678/// corresponding lyric text.
679///
680/// Each chord annotates the lyric text starting at its column position in the
681/// chord line. Text that precedes the first chord is emitted in a leading
682/// chord-free segment.
683///
684/// If `positions` is empty the function returns a single chord-free segment
685/// containing the full `lyric` string (rather than an empty `LyricsLine`).
686fn pair_chords_with_lyric(positions: &[(usize, String)], lyric: &str) -> LyricsLine {
687    // Fast path: no chord positions — return the lyric as a single plain segment.
688    if positions.is_empty() {
689        return LyricsLine {
690            segments: vec![LyricsSegment {
691                chord: None,
692                text: lyric.to_string(),
693                spans: vec![],
694            }],
695        };
696    }
697
698    // Work in terms of char indices for correctness with multi-byte text, but
699    // chord positions from `chord_positions` are byte offsets into the ASCII
700    // chord line. Because chord lines are expected to be ASCII (chord names),
701    // byte == char for the chord line. We must, however, map those byte offsets
702    // to char offsets in the lyric line for correct slicing.
703    let lyric_char_offsets: Vec<usize> = lyric.char_indices().map(|(b, _)| b).collect();
704    let lyric_len = lyric.len();
705
706    // Maps a byte offset from the (ASCII) chord line to a valid byte offset in
707    // the lyric string, clamped to lyric_len.  When col is beyond the end of
708    // the lyric we return lyric_len (no text to annotate).  For shorter cols
709    // we snap to the nearest char boundary so we never slice in the middle of
710    // a multi-byte codepoint.
711    let clamp_to_lyric = |col: usize| -> usize {
712        if col >= lyric_len {
713            return lyric_len;
714        }
715        // Snap down to the nearest char-boundary offset that does not exceed col.
716        lyric_char_offsets
717            .iter()
718            .copied()
719            .rfind(|&b| b <= col)
720            .unwrap_or(0)
721    };
722
723    let mut segments: Vec<LyricsSegment> = Vec::new();
724    let mut cursor = 0usize; // byte position in lyric
725
726    for (i, (col, chord_name)) in positions.iter().enumerate() {
727        let text_start = clamp_to_lyric(*col);
728        let text_end = if let Some((next_col, _)) = positions.get(i + 1) {
729            clamp_to_lyric(*next_col)
730        } else {
731            lyric_len // last chord gets all remaining lyric text
732        };
733
734        // Any lyric text before this chord position (after the cursor) without
735        // a chord annotation.
736        if text_start > cursor {
737            segments.push(LyricsSegment {
738                chord: None,
739                text: lyric[cursor..text_start].to_string(),
740                spans: vec![],
741            });
742        }
743
744        // text_start is always <= lyric_len by construction of clamp_to_lyric.
745        let text = lyric[text_start..text_end.min(lyric_len)].to_string();
746
747        segments.push(LyricsSegment {
748            chord: Some(Chord::new(chord_name.as_str())),
749            text,
750            spans: vec![],
751        });
752        cursor = text_end.min(lyric_len);
753    }
754
755    // positions was non-empty, so segments is non-empty and cursor == lyric_len.
756    LyricsLine { segments }
757}
758
759// ---------------------------------------------------------------------------
760// ChordPro serializer for plain-text-imported songs
761// ---------------------------------------------------------------------------
762
/// Removes every `{` and `}` from `s` so that the result can be embedded as
/// a ChordPro directive name or value.
///
/// ChordPro offers no escape mechanism inside directive names or values, so
/// a brace left in place would produce malformed output. Input without
/// braces is returned borrowed, without allocating.
fn sanitize_directive_token(s: &str) -> std::borrow::Cow<'_, str> {
    match s.find(['{', '}']) {
        None => std::borrow::Cow::Borrowed(s),
        Some(_) => std::borrow::Cow::Owned(
            s.chars()
                .filter(|c| !matches!(c, '{' | '}'))
                .collect(),
        ),
    }
}
773
774/// Serializes a [`Song`] to ChordPro format.
775///
776/// This serializer is intended for songs produced by [`convert_plain_text`].
777/// It handles the subset of [`Line`] and [`Directive`] variants that the
778/// heuristic importer emits. Complex AST features (image directives, delegate
779/// environments, etc.) are rendered as a best-effort comment.
780///
781/// # Examples
782///
783/// ```
784/// use chordsketch_core::heuristic::{convert_plain_text, song_to_chordpro};
785///
786/// let song = convert_plain_text("[Verse]\nAm  G\nHello world\n");
787/// let chordpro = song_to_chordpro(&song);
788/// assert!(chordpro.contains("{start_of_verse}"));
789/// assert!(chordpro.contains("[Am]"));
790/// ```
791#[must_use]
792pub fn song_to_chordpro(song: &Song) -> String {
793    use crate::ast::{CommentStyle, Line};
794
795    let mut out = String::new();
796
797    // Emit metadata directives first if populated.
798    if let Some(ref title) = song.metadata.title {
799        out.push_str(&format!("{{title: {}}}\n", sanitize_directive_token(title)));
800    }
801    if let Some(artist) = song.metadata.artists.first() {
802        out.push_str(&format!(
803            "{{artist: {}}}\n",
804            sanitize_directive_token(artist)
805        ));
806    }
807
808    for line in &song.lines {
809        match line {
810            Line::Empty => out.push('\n'),
811            Line::Comment(style, text) => {
812                let t = sanitize_directive_token(text);
813                match style {
814                    CommentStyle::Normal => out.push_str(&format!("{{comment: {t}}}\n")),
815                    CommentStyle::Italic => out.push_str(&format!("{{comment_italic: {t}}}\n")),
816                    CommentStyle::Boxed => out.push_str(&format!("{{comment_box: {t}}}\n")),
817                }
818            }
819            Line::Directive(dir) => {
820                let name = sanitize_directive_token(&dir.name);
821                if let Some(ref value) = dir.value {
822                    out.push_str(&format!(
823                        "{{{}: {}}}\n",
824                        name,
825                        sanitize_directive_token(value)
826                    ));
827                } else {
828                    out.push_str(&format!("{{{}}}\n", name));
829                }
830            }
831            Line::Lyrics(lyrics) => {
832                for seg in &lyrics.segments {
833                    if let Some(ref chord) = seg.chord {
834                        out.push('[');
835                        out.push_str(&chord.name);
836                        out.push(']');
837                    }
838                    out.push_str(&seg.text);
839                }
840                out.push('\n');
841            }
842        }
843    }
844
845    out
846}
847
848// ---------------------------------------------------------------------------
849// Tests
850// ---------------------------------------------------------------------------
851
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ast::{CommentStyle, Directive, DirectiveKind, Line};

    /// Collects the chord name (if any) of every segment in `line`, in order.
    fn chord_names(line: &LyricsLine) -> Vec<Option<String>> {
        line.segments
            .iter()
            .map(|s| s.chord.as_ref().map(|c| c.name.clone()))
            .collect()
    }

    /// Collects the kind of every directive line in `song`, in order.
    fn directive_kinds(song: &Song) -> Vec<&DirectiveKind> {
        song.lines
            .iter()
            .filter_map(|l| match l {
                Line::Directive(d) => Some(&d.kind),
                _ => None,
            })
            .collect()
    }

    // --- detect_format ---

    #[test]
    fn detects_chordpro_from_directives() {
        assert_eq!(
            detect_format("{title: Hello}\n{soc}\n[Am]Hello\n{eoc}"),
            InputFormat::ChordPro
        );
    }

    #[test]
    fn detects_chordpro_from_inline_chords() {
        assert_eq!(detect_format("[Am]Hello [G]world"), InputFormat::ChordPro);
    }

    #[test]
    fn detects_plain_chord_lyrics() {
        let input = "Am  G  C  Em\nHello beautiful world tonight\nG  C\nOnce more";
        assert_eq!(detect_format(input), InputFormat::PlainChordLyrics);
    }

    #[test]
    fn detects_plain_chord_lyrics_with_section_labels() {
        // Section labels like [Verse] and [Chorus] should NOT be misidentified
        // as ChordPro inline chord notation.
        let input = "[Verse]\nG  D\nHere I am\nEm  C\nWondering\n\n[Chorus]\nC  G\nLala\n";
        assert_eq!(detect_format(input), InputFormat::PlainChordLyrics);
    }

    #[test]
    fn detects_single_letter_section_label_not_chordpro() {
        // A whole-line `[C]` is a key/section label in plain-text chord
        // sheets, not ChordPro inline chord notation. Issue #1278.
        let input = "[C]\nG  D\nHere I am\nEm  C\nWondering\n";
        assert_eq!(detect_format(input), InputFormat::PlainChordLyrics);

        // The same holds for other whole-line labels that are valid chord
        // names, such as `[Am]`.
        let input_am = "[Am]\nG  D  Em\nHello world again now\n";
        assert_eq!(detect_format(input_am), InputFormat::PlainChordLyrics);
    }

    #[test]
    fn detects_inline_chord_in_line_still_chordpro() {
        // `[C]` or `[Am]` sharing a line with lyric text (not standing alone
        // as the whole line) is inline chord notation.
        assert_eq!(detect_format("[C]Hello world"), InputFormat::ChordPro);
        assert_eq!(detect_format("Hello [Am]world"), InputFormat::ChordPro);
    }

    #[test]
    fn detects_multi_bracket_line_as_chordpro() {
        // `[Am][G]` on a single line has inner content `Am][G` which contains `[`,
        // so the whole-line guard does NOT trigger. The scan finds `[Am]` → ChordPro.
        assert_eq!(detect_format("[Am][G]"), InputFormat::ChordPro);
        // Three bracketed chords on one line are detected the same way.
        assert_eq!(detect_format("[C][G][Am]"), InputFormat::ChordPro);
    }

    #[test]
    fn detects_mixed_section_label_and_inline_chord_as_chordpro() {
        // A whole-line section label `[Verse]` does NOT trigger ChordPro detection
        // on its own, but a mid-line inline chord on another line does.
        let input = "[Verse]\nHello [Am]world\n";
        assert_eq!(detect_format(input), InputFormat::ChordPro);
    }

    #[test]
    fn known_limitation_whole_line_chord_only_chordpro_returns_unknown() {
        // A directive-free ChordPro file that uses ONLY whole-line bracket chords
        // (each chord on its own line before a lyric) is indistinguishable from
        // a plain-text file with key/section labels, so detect_format returns
        // Unknown rather than ChordPro. This is a documented trade-off — see
        // issue #1304 and the comment in detect_format above `has_inline_chords`.
        //
        // Files with at least one directive or one mid-line inline chord are
        // correctly identified as ChordPro (see `detects_chordpro_from_directives`
        // and `detects_inline_chord_in_line_still_chordpro`).
        let input = "[Am]\nThis is a lyric line\n[G]\nAnother lyric line\n";
        assert_eq!(detect_format(input), InputFormat::Unknown);
    }

    #[test]
    fn detects_unknown_for_pure_lyrics() {
        let input = "Hello beautiful world\nOnce upon a time\nSomething happened here";
        assert_eq!(detect_format(input), InputFormat::Unknown);
    }

    #[test]
    fn detects_unknown_for_empty() {
        assert_eq!(detect_format(""), InputFormat::Unknown);
    }

    // --- is_chord_token ---

    #[test]
    fn chord_token_rejects_section_labels() {
        // Words that start with A-G but are not chords must NOT be recognized
        // as chord tokens.
        assert!(!is_chord_token("Chorus"));
        assert!(!is_chord_token("Bridge"));
        assert!(!is_chord_token("Em7add9sus2extended"));
    }

    #[test]
    fn chord_token_accepts_valid_chords() {
        assert!(is_chord_token("Am"));
        assert!(is_chord_token("C"));
        assert!(is_chord_token("G7"));
        assert!(is_chord_token("Cmaj7"));
        assert!(is_chord_token("D/F#"));
        assert!(is_chord_token("Bb"));
        assert!(is_chord_token("F#m7"));
        assert!(is_chord_token("Gsus4"));
        assert!(is_chord_token("Em"));
    }

    #[test]
    fn chord_token_accepts_multicomponent_extensions() {
        // Compound quality+extension sequences (issue #1279).
        assert!(is_chord_token("Am7add11"));
        assert!(is_chord_token("Cmaj7sus4"));
        assert!(is_chord_token("G7b5"));
        assert!(is_chord_token("Fmaj7add9"));
        assert!(is_chord_token("Dm7add11"));
        assert!(is_chord_token("G7#9"));
        assert!(is_chord_token("Cmaj9"));
        // Words that happen to start with A-G must still be rejected.
        assert!(!is_chord_token("Chorus"));
        assert!(!is_chord_token("Bridge"));
        // Cmaj7e: maj consumed → 7e, numeric 7 consumed → e; 'e' is not a
        // keyword or numeric character so the algorithm returns false.
        assert!(!is_chord_token("Cmaj7e"));
        // Multi-component chords up to 16 characters must be accepted.
        assert!(is_chord_token("Cmaj7sus4add9")); // 13 chars
    }

    // --- is_chord_line ---

    #[test]
    fn chord_line_typical() {
        let imp = PlainTextImporter::new();
        assert!(imp.is_chord_line("Am  F  C  G"));
    }

    #[test]
    fn chord_line_with_slash_chords() {
        let imp = PlainTextImporter::new();
        assert!(imp.is_chord_line("G  D/F#  Em  C"));
    }

    #[test]
    fn not_chord_line_all_lyrics() {
        let imp = PlainTextImporter::new();
        assert!(!imp.is_chord_line("There's a lady who's sure."));
    }

    #[test]
    fn not_chord_line_sentence_punctuation() {
        let imp = PlainTextImporter::new();
        // Even though "A" and "G" are valid chords, the period disqualifies it.
        assert!(!imp.is_chord_line("A song for G."));
    }

    #[test]
    fn not_chord_line_too_few_chords() {
        let imp = PlainTextImporter::new();
        // Only one chord token — below min_chord_tokens=2.
        assert!(!imp.is_chord_line("Am something else here now"));
    }

    // --- parse_section_header ---

    #[test]
    fn section_square_brackets() {
        assert_eq!(parse_section_header("[Verse]"), Some("Verse".to_string()));
        assert_eq!(
            parse_section_header("[Chorus 2]"),
            Some("Chorus 2".to_string())
        );
    }

    #[test]
    fn section_parens() {
        assert_eq!(parse_section_header("(Bridge)"), Some("Bridge".to_string()));
    }

    #[test]
    fn section_colon() {
        assert_eq!(parse_section_header("VERSE:"), Some("VERSE".to_string()));
        assert_eq!(parse_section_header("Chorus:"), Some("Chorus".to_string()));
    }

    #[test]
    fn section_dash_decorated() {
        assert_eq!(
            parse_section_header("-- Chorus --"),
            Some("Chorus".to_string())
        );
    }

    #[test]
    fn section_not_matched() {
        assert_eq!(parse_section_header("Hello world"), None);
        assert_eq!(parse_section_header("Am G C Em"), None);
    }

    #[test]
    fn section_rejects_brace_in_label() {
        // Labels containing { or } must be rejected to prevent emitting
        // malformed ChordPro directive values (M-1 from delta review).
        assert_eq!(parse_section_header("[Verse}]"), None);
        assert_eq!(parse_section_header("[{Chorus}]"), None);
        assert_eq!(parse_section_header("Verse}:"), None);
    }

    // --- pair_chords_with_lyric ---

    #[test]
    fn pair_basic() {
        // "Am  F" at cols 0 and 4 over "Hello world"
        let positions = vec![(0, "Am".to_string()), (4, "F".to_string())];
        let line = pair_chords_with_lyric(&positions, "Hello world");
        let chords = chord_names(&line);
        assert_eq!(chords, vec![Some("Am".to_string()), Some("F".to_string())]);
        assert_eq!(line.segments[0].text, "Hell");
        assert_eq!(line.segments[1].text, "o world");
    }

    #[test]
    fn pair_lyric_shorter_than_chord_line() {
        // Chord at column 10 but the lyric "Hi" is only 2 chars long.
        let positions = vec![(0, "Am".to_string()), (10, "G".to_string())];
        let line = pair_chords_with_lyric(&positions, "Hi");
        // Am gets "Hi", G gets ""
        assert_eq!(
            line.segments[0].chord.as_ref().map(|c| c.name.as_str()),
            Some("Am")
        );
        assert_eq!(line.segments[0].text, "Hi");
        assert_eq!(
            line.segments[1].chord.as_ref().map(|c| c.name.as_str()),
            Some("G")
        );
        assert_eq!(line.segments[1].text, "");
    }

    #[test]
    fn pair_no_chords_returns_plain_lyric() {
        let positions: Vec<(usize, String)> = vec![];
        let line = pair_chords_with_lyric(&positions, "Hello world");
        assert_eq!(line.segments.len(), 1);
        assert!(line.segments[0].chord.is_none());
        assert_eq!(line.segments[0].text, "Hello world");
    }

    // --- convert ---

    #[test]
    fn convert_simple_verse() {
        let input = "[Verse]\nAm  G  C\nHello world today\n";
        let song = convert_plain_text(input);
        // Smoke test: only the presence of at least one directive line and at
        // least one lyrics line is asserted, not the exact sequence.
        assert!(song.lines.iter().any(|l| matches!(l, Line::Directive(_))));
        assert!(song.lines.iter().any(|l| matches!(l, Line::Lyrics(_))));
    }

    #[test]
    fn convert_chordless_lyric_passthrough() {
        let input = "Am  G  C  Em\nThere is a song\nThis line has no preceding chord line\n";
        let song = convert_plain_text(input);
        // The first line pair produces a lyrics line with chords.
        // "This line has no preceding chord line" is a lyric after the pair and
        // must come through as a single chordless segment.
        let has_plain_lyric = song.lines.iter().any(|l| {
            matches!(
                l,
                Line::Lyrics(ll)
                    if ll.segments.len() == 1
                        && ll.segments[0].chord.is_none()
                        && ll.segments[0].text.contains("no preceding")
            )
        });
        assert!(has_plain_lyric);
    }

    #[test]
    fn convert_section_labels_to_directives() {
        let input = "[Chorus]\nG  C  G  D\nLala lala lala\n[Verse]\nAm  F  C  G\nHello world\n";
        let song = convert_plain_text(input);
        // At least one StartOfChorus and one StartOfVerse directive.
        let kinds = directive_kinds(&song);
        assert!(kinds.iter().any(|k| **k == DirectiveKind::StartOfChorus));
        assert!(kinds.iter().any(|k| **k == DirectiveKind::StartOfVerse));
    }

    #[test]
    fn convert_multiple_sections_close_properly() {
        let input = "[Verse]\nAm  G\nHello world\n[Chorus]\nC  G\nYeah yeah\n";
        let song = convert_plain_text(input);
        // Should have both end_of_verse and end_of_chorus.
        let kinds = directive_kinds(&song);
        assert!(kinds.iter().any(|k| **k == DirectiveKind::EndOfVerse));
        assert!(kinds.iter().any(|k| **k == DirectiveKind::EndOfChorus));
    }

    // --- song_to_chordpro ---

    #[test]
    fn song_to_chordpro_strips_braces_in_title() {
        // song_to_chordpro must not emit malformed ChordPro when metadata
        // contains brace characters (issue #1282).
        let mut song = Song::default();
        song.metadata.title = Some("Hello {World}".to_string());
        let out = song_to_chordpro(&song);
        // Braces inside the value must be stripped; the directive itself is still well-formed.
        assert_eq!(out, "{title: Hello World}\n");
    }

    #[test]
    fn song_to_chordpro_strips_braces_in_artist() {
        let mut song = Song::default();
        song.metadata.artists.push("{Dodgy} Artist".to_string());
        let out = song_to_chordpro(&song);
        assert_eq!(out, "{artist: Dodgy Artist}\n");
    }

    #[test]
    fn song_to_chordpro_strips_braces_in_comment() {
        let mut song = Song::default();
        song.lines.push(Line::Comment(
            CommentStyle::Normal,
            "See {note}".to_string(),
        ));
        let out = song_to_chordpro(&song);
        assert_eq!(out, "{comment: See note}\n");
    }

    #[test]
    fn song_to_chordpro_strips_braces_in_directive_name_and_value() {
        // A manually-constructed Song with braces in directive name/value
        // must still produce well-formed ChordPro (issue #1291).
        let mut dir = Directive::name_only("start_of_{section}".to_string());
        dir.value = Some("{custom}".to_string());
        let mut song = Song::default();
        song.lines.push(Line::Directive(dir));
        let out = song_to_chordpro(&song);
        assert_eq!(out, "{start_of_section: custom}\n");
    }
}