chordsketch_core/heuristic.rs
1//! Heuristic plain-text chord+lyrics importer.
2//!
3//! This module detects and converts plain-text chord sheets — where chord names
4//! appear on their own lines above the corresponding lyric lines — into the
5//! ChordPro [`Song`] AST.
6//!
7//! # Format
8//!
9//! Plain-text chord sheets look like:
10//!
11//! ```text
12//! [Verse]
13//! Am F C G
14//! There's a lady who's sure all that glitters is gold
15//! ```
16//!
17//! Each "chord line" contains only chord names (whitespace-separated), and the
18//! following "lyric line" contains the sung text. The column position of each
19//! chord in the chord line is preserved as an inline annotation over the
20//! corresponding text in the lyric line.
21//!
22//! # Detection
23//!
24//! Use [`detect_format`] to auto-classify an input string, or
25//! [`PlainTextImporter::detect_format`] to use custom thresholds.
26//!
27//! # Conversion
28//!
29//! Use [`convert_plain_text`] to convert a plain-text chord sheet into a
30//! [`Song`], or [`PlainTextImporter::convert`] to use custom thresholds.
31
32use crate::ast::{Chord, Directive, Line, LyricsLine, LyricsSegment, Song};
33
34// ---------------------------------------------------------------------------
35// InputFormat
36// ---------------------------------------------------------------------------
37
/// Classification of an input text format.
///
/// Produced by [`PlainTextImporter::detect_format`] (and the module-level
/// [`detect_format`] wrapper). The detectors run in a fixed order — ChordPro
/// first, then ABC, then plain chord+lyrics — and the first match wins.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InputFormat {
    /// The input is ChordPro format: at least one line is a `{directive}`, or
    /// at least one line carries inline `[chord]` notation.
    ChordPro,
    /// The input is a plain chord+lyrics sheet, i.e. chord names on their own
    /// lines above the lyric lines they annotate.
    PlainChordLyrics,
    /// The input is ABC notation (a line starting with an `X:` reference
    /// field followed by a digit was found).
    Abc,
    /// The format could not be determined by any of the heuristics.
    Unknown,
}
50
51// ---------------------------------------------------------------------------
52// PlainTextImporter
53// ---------------------------------------------------------------------------
54
/// Configuration for the plain-text heuristic importer.
///
/// Both fields tune the "is this a chord line?" test: a line qualifies only
/// when it contains enough chord tokens in absolute terms
/// ([`min_chord_tokens`][Self::min_chord_tokens]) *and* as a fraction of all
/// whitespace-separated tokens ([`chord_threshold`][Self::chord_threshold]).
/// Lines containing `.`, `?` or `!` are never chord lines.
///
/// # Examples
///
/// ```
/// use chordsketch_core::heuristic::{PlainTextImporter, InputFormat};
///
/// let importer = PlainTextImporter::new();
/// let format = importer.detect_format("Am G C\nHello world here\n");
/// assert_eq!(format, InputFormat::PlainChordLyrics);
/// ```
#[derive(Debug, Clone)]
pub struct PlainTextImporter {
    /// Minimum fraction of whitespace-separated tokens that must be valid chord
    /// names for a line to be classified as a chord line. Default: `0.5`.
    ///
    /// **Valid range: `[0.0, 1.0]`.**
    /// - `0.0` — the ratio test always passes; a line is then a chord line as
    ///   soon as it contains at least
    ///   [`min_chord_tokens`][Self::min_chord_tokens] chord tokens (and no
    ///   sentence punctuation).
    /// - `1.0` — all tokens must be valid chord names for the line to qualify.
    /// - Values above `1.0` disable chord-line detection entirely (the ratio of
    ///   chord tokens can never exceed `1.0`); `NaN` has the same effect
    ///   because every comparison against it is false.
    /// - Negative values behave like `0.0`.
    ///
    /// Prefer [`PlainTextImporter::with_thresholds`] to construct an importer
    /// with validated values.
    pub chord_threshold: f64,
    /// Minimum number of chord tokens required to classify a line as a chord
    /// line. Default: `2`.
    ///
    /// **Valid range: `>= 1`.**
    /// Setting this to `0` disables the minimum-count guard: any non-empty,
    /// non-punctuated line that meets [`chord_threshold`][Self::chord_threshold]
    /// will be classified as a chord line, even if it contains only a single
    /// token.
    ///
    /// Prefer [`PlainTextImporter::with_thresholds`] to construct an importer
    /// with validated values.
    pub min_chord_tokens: usize,
}
95
96impl Default for PlainTextImporter {
97 fn default() -> Self {
98 Self {
99 chord_threshold: 0.5,
100 min_chord_tokens: 2,
101 }
102 }
103}
104
105impl PlainTextImporter {
106 /// Creates a new importer with default threshold settings.
107 #[must_use]
108 pub fn new() -> Self {
109 Self::default()
110 }
111
112 /// Creates a new importer with explicit threshold values, returning an
113 /// error string if any value is out of its valid range.
114 ///
115 /// # Errors
116 ///
117 /// Returns `Err` if:
118 /// - `chord_threshold` is not in `[0.0, 1.0]`
119 /// - `min_chord_tokens` is `0`
120 ///
121 /// # Examples
122 ///
123 /// ```
124 /// use chordsketch_core::heuristic::PlainTextImporter;
125 ///
126 /// // Valid mid-range value.
127 /// let importer = PlainTextImporter::with_thresholds(0.75, 3).unwrap();
128 /// assert_eq!(importer.chord_threshold, 0.75);
129 /// assert_eq!(importer.min_chord_tokens, 3);
130 ///
131 /// // Boundary values are valid.
132 /// assert!(PlainTextImporter::with_thresholds(0.0, 1).is_ok());
133 /// assert!(PlainTextImporter::with_thresholds(1.0, 1).is_ok());
134 ///
135 /// // Out-of-range values are rejected.
136 /// assert!(PlainTextImporter::with_thresholds(1.5, 2).is_err());
137 /// assert!(PlainTextImporter::with_thresholds(-0.1, 2).is_err());
138 /// assert!(PlainTextImporter::with_thresholds(f64::NAN, 2).is_err());
139 /// assert!(PlainTextImporter::with_thresholds(0.5, 0).is_err());
140 /// ```
141 #[must_use = "this `Result` should be handled; use `.unwrap()` or `?` to obtain the configured importer"]
142 pub fn with_thresholds(chord_threshold: f64, min_chord_tokens: usize) -> Result<Self, String> {
143 if !(0.0..=1.0).contains(&chord_threshold) {
144 return Err(format!(
145 "chord_threshold must be in [0.0, 1.0], got {chord_threshold}"
146 ));
147 }
148 if min_chord_tokens == 0 {
149 return Err("min_chord_tokens must be >= 1".to_string());
150 }
151 Ok(Self {
152 chord_threshold,
153 min_chord_tokens,
154 })
155 }
156
157 /// Returns `true` if `line` appears to be a chord line.
158 ///
159 /// A chord line satisfies all of the following conditions:
160 /// - Contains at least [`min_chord_tokens`][Self::min_chord_tokens] tokens
161 /// that parse as valid chord names.
162 /// - The fraction of valid chord tokens is ≥
163 /// [`chord_threshold`][Self::chord_threshold].
164 /// - Does not contain sentence-ending punctuation (`.`, `?`, `!`), which
165 /// is a strong indicator that the line is lyrics.
166 fn is_chord_line(&self, line: &str) -> bool {
167 // Sentence-ending punctuation strongly indicates lyrics.
168 if line.contains('.') || line.contains('?') || line.contains('!') {
169 return false;
170 }
171 let tokens: Vec<&str> = line.split_whitespace().collect();
172 if tokens.is_empty() {
173 return false;
174 }
175 let chord_count = tokens.iter().filter(|t| is_chord_token(t)).count();
176 chord_count >= self.min_chord_tokens
177 && chord_count as f64 / tokens.len() as f64 >= self.chord_threshold
178 }
179
180 /// Detects the input format using heuristics.
181 ///
182 /// Returns [`InputFormat::ChordPro`] if the input appears to be ChordPro
183 /// (directive braces or inline `[chord]` notation).
184 /// Returns [`InputFormat::PlainChordLyrics`] if the input contains at least
185 /// two chord lines.
186 /// Returns [`InputFormat::Unknown`] otherwise.
187 #[must_use]
188 pub fn detect_format(&self, input: &str) -> InputFormat {
189 let lines: Vec<&str> = input.lines().collect();
190
191 // ChordPro directive syntax: a line whose first non-space character is
192 // `{` and last non-space character is `}`.
193 let has_directives = lines.iter().any(|l| {
194 let t = l.trim();
195 t.starts_with('{') && t.ends_with('}')
196 });
197 if has_directives {
198 return InputFormat::ChordPro;
199 }
200
201 // ChordPro inline chord notation: `[Am]`, `[G7]`, etc.
202 // Distinguish from plain-text section labels like `[Verse]` or `[Chorus 2]`
203 // by checking whether the content inside `[...]` is a valid chord name.
204 //
205 // A whole-line bracket (the trimmed line is exactly `[content]`) is
206 // treated as a section header — not inline chord notation — because
207 // `parse_section_header` already classifies it that way during
208 // conversion. This prevents single-letter key indicators like `[C]`
209 // or `[Am]` from triggering a false ChordPro classification.
210 //
211 // Known limitation (issue #1304): a directive-free ChordPro file that
212 // uses *only* whole-line bracket chords (e.g. `[Am]` alone on its own
213 // line immediately before a lyric) will be classified as `Unknown`
214 // rather than `ChordPro`, because such lines are indistinguishable
215 // from plain-text key/section labels without lookahead context that
216 // would risk introducing new false positives. Files with at least one
217 // `{directive}` or one mid-line inline chord (e.g., `Hello [Am]world`)
218 // are not affected.
219 let has_inline_chords = lines.iter().any(|l| {
220 let trimmed = l.trim();
221 // Skip whole-line brackets: `[content]` where content has no nested `[`.
222 if trimmed.starts_with('[')
223 && trimmed.ends_with(']')
224 && trimmed.len() >= 3
225 && !trimmed[1..trimmed.len() - 1].contains('[')
226 {
227 return false;
228 }
229 let mut rest: &str = l;
230 while let Some(open) = rest.find('[') {
231 let after = &rest[open + 1..];
232 let Some(close) = after.find(']') else { break };
233 let content = &after[..close];
234 if is_chord_token(content) {
235 return true;
236 }
237 rest = &after[close + 1..];
238 }
239 false
240 });
241 if has_inline_chords {
242 return InputFormat::ChordPro;
243 }
244
245 // ABC notation: at least one `X:` reference-number field followed by
246 // digits (the mandatory field that begins every ABC tune).
247 let has_abc_header = lines.iter().any(|l| {
248 let t = l.trim_start();
249 if let Some(rest) = t.strip_prefix("X:") {
250 rest.trim_start()
251 .chars()
252 .next()
253 .is_some_and(|c| c.is_ascii_digit())
254 } else {
255 false
256 }
257 });
258 if has_abc_header {
259 return InputFormat::Abc;
260 }
261
262 // Plain chord+lyrics: at least two chord lines.
263 let chord_line_count = lines.iter().filter(|l| self.is_chord_line(l)).count();
264 if chord_line_count >= 2 {
265 InputFormat::PlainChordLyrics
266 } else if chord_line_count == 1 && lines.len() <= 5 {
267 // Very short input with one chord line is still treated as plain
268 // chord+lyrics (e.g., a two-line snippet passed for testing).
269 InputFormat::PlainChordLyrics
270 } else {
271 InputFormat::Unknown
272 }
273 }
274
275 /// Converts a plain chord+lyrics text into a [`Song`] AST.
276 ///
277 /// # Algorithm
278 ///
279 /// 1. Classify each line as a chord line, section header, lyric, or blank.
280 /// 2. Pair each chord line with the immediately following lyric line.
281 /// For each such pair, compute chord column offsets and produce inline
282 /// chord annotations.
283 /// 3. Section headers are converted to `{start_of_*}` / `{end_of_*}`
284 /// directive pairs.
285 /// 4. Lines that are neither chord lines nor section headers are emitted as
286 /// plain lyric lines.
287 #[must_use]
288 pub fn convert(&self, input: &str) -> Song {
289 let raw_lines: Vec<&str> = input.lines().collect();
290 let classes: Vec<LineKind<'_>> = raw_lines
291 .iter()
292 .map(|l| classify_line(l, |line| self.is_chord_line(line)))
293 .collect();
294
295 let mut song = Song::new();
296 let mut i = 0;
297 let mut current_section: Option<String> = None;
298
299 while i < classes.len() {
300 match &classes[i] {
301 LineKind::Blank => {
302 song.lines.push(Line::Empty);
303 i += 1;
304 }
305 LineKind::SectionHeader(label) => {
306 let label = label.clone();
307 // Close any open section.
308 if let Some(ref sec) = current_section {
309 song.lines
310 .push(Line::Directive(end_directive_for_section(sec)));
311 }
312 // Open the new section.
313 let (start_dir, canonical) = start_directive_for_section(&label);
314 song.lines.push(Line::Directive(start_dir));
315 current_section = Some(canonical);
316 i += 1;
317 }
318 LineKind::ChordLine(positions) => {
319 // Peek at the next non-blank line: if it is a lyric, pair them.
320 let j = i + 1;
321 if j < classes.len() {
322 if let LineKind::Lyric(lyric) = &classes[j] {
323 let paired = pair_chords_with_lyric(positions, lyric);
324 song.lines.push(Line::Lyrics(paired));
325 i += 2;
326 continue;
327 }
328 }
329 // No following lyric — emit the chords as a chord-only line.
330 let paired = pair_chords_with_lyric(positions, "");
331 song.lines.push(Line::Lyrics(paired));
332 i += 1;
333 }
334 LineKind::Lyric(text) => {
335 song.lines.push(Line::Lyrics(LyricsLine {
336 segments: vec![LyricsSegment {
337 chord: None,
338 text: (*text).to_string(),
339 spans: vec![],
340 }],
341 }));
342 i += 1;
343 }
344 }
345 }
346
347 // Close the last open section.
348 if let Some(ref sec) = current_section {
349 song.lines
350 .push(Line::Directive(end_directive_for_section(sec)));
351 }
352
353 song
354 }
355}
356
357// ---------------------------------------------------------------------------
358// Module-level convenience functions
359// ---------------------------------------------------------------------------
360
361/// Detects the format of `input` using default [`PlainTextImporter`] settings.
362///
363/// # Examples
364///
365/// ```
366/// use chordsketch_core::heuristic::{detect_format, InputFormat};
367///
368/// assert_eq!(
369/// detect_format("{title: My Song}\n[Am]Hello"),
370/// InputFormat::ChordPro
371/// );
372/// assert_eq!(
373/// detect_format("Am G C\nHello world\n"),
374/// InputFormat::PlainChordLyrics
375/// );
376/// ```
377#[must_use]
378pub fn detect_format(input: &str) -> InputFormat {
379 PlainTextImporter::default().detect_format(input)
380}
381
382/// Converts a plain chord+lyrics text into a [`Song`] AST using default
383/// [`PlainTextImporter`] settings.
384///
385/// # Examples
386///
387/// ```
388/// use chordsketch_core::heuristic::convert_plain_text;
389///
390/// let song = convert_plain_text("Am G\nHello world\n");
391/// assert!(!song.lines.is_empty());
392/// ```
393#[must_use]
394pub fn convert_plain_text(input: &str) -> Song {
395 PlainTextImporter::default().convert(input)
396}
397
398// ---------------------------------------------------------------------------
399// Internal helpers
400// ---------------------------------------------------------------------------
401
/// Classification of a single input line.
///
/// Produced by `classify_line`, which tests the variants in declaration
/// order: blank, section header, chord line, and finally lyric as the
/// fallback.
#[derive(Debug)]
enum LineKind<'a> {
    /// A blank (empty or all-whitespace) line.
    Blank,
    /// A section header like `[Verse]` or `CHORUS:`.
    /// Contains the inner label text (decoration stripped, owned).
    SectionHeader(String),
    /// A chord line. Contains `(byte_offset, chord_name)` pairs, sorted by
    /// byte offset; only tokens that are valid chord names are included.
    ChordLine(Vec<(usize, String)>),
    /// A lyric (or unrecognised) line, borrowed unchanged from the input.
    Lyric(&'a str),
}
416
417/// Classifies a single line using the supplied chord-line predicate.
418fn classify_line<'a, F>(line: &'a str, is_chord_line: F) -> LineKind<'a>
419where
420 F: Fn(&str) -> bool,
421{
422 if line.trim().is_empty() {
423 return LineKind::Blank;
424 }
425 if let Some(label) = parse_section_header(line) {
426 return LineKind::SectionHeader(label);
427 }
428 if is_chord_line(line) {
429 return LineKind::ChordLine(chord_positions(line));
430 }
431 LineKind::Lyric(line)
432}
433
434/// Returns `true` if `token` is a well-formed chord name.
435///
436/// This is a stricter check than [`parse_chord`]: it rejects tokens whose
437/// extension part contains unexpected alphabetic characters (e.g., `"Chorus"`
438/// would be parsed as `C + "horus"` by `parse_chord`, but is rejected here).
439///
440/// Accepted patterns:
441/// - Root `A–G` with optional `#` / `b`
442/// - Zero or more quality+extension atoms: a quality keyword (`m`, `maj`,
443/// `min`, `dim`, `aug`, `sus`, `add`, `+`, `°`) optionally followed by a
444/// numeric extension (e.g. `m7add11`, `maj7sus4`, `7b5`)
445/// - Optional bass note: `/A–G[#b]`
446fn is_chord_token(token: &str) -> bool {
447 // Reject obviously non-chord tokens. 16 characters covers multi-component
448 // jazz chords like Dbmaj7#11sus4b9 (15 chars) while still catching long words.
449 if token.is_empty() || token.len() > 16 {
450 return false;
451 }
452 let bytes = token.as_bytes();
453
454 // Root note: must be A–G (uppercase).
455 if !matches!(bytes[0], b'A'..=b'G') {
456 return false;
457 }
458
459 // Split off optional bass note (/X[#b]).
460 let (body, bass) = match token.find('/') {
461 Some(i) => (&token[..i], Some(&token[i + 1..])),
462 None => (token, None),
463 };
464
465 // Validate bass note.
466 if let Some(bass) = bass {
467 if bass.is_empty() {
468 return false;
469 }
470 let b = bass.as_bytes();
471 if !matches!(b[0], b'A'..=b'G') {
472 return false;
473 }
474 if b.len() > 1 && b[1] != b'#' && b[1] != b'b' {
475 return false;
476 }
477 if b.len() > 2 {
478 return false;
479 }
480 }
481
482 // Validate body: root [accidental] [quality] [extension]
483 let body_bytes = body.as_bytes();
484 let mut pos = 1usize; // skip root letter
485
486 // Optional accidental.
487 if pos < body_bytes.len() && (body_bytes[pos] == b'#' || body_bytes[pos] == b'b') {
488 pos += 1;
489 }
490
491 let quality_ext = &body[pos..];
492 is_valid_quality_ext(quality_ext)
493}
494
/// Strips a numeric extension from the front of `s` and returns what is left.
///
/// A numeric extension is an optional accidental (`#` or `b`, honoured only
/// when a digit comes directly after it) followed by any run of ASCII digits.
/// If `s` does not start with one, `s` is returned unchanged.
fn consume_numeric(s: &str) -> &str {
    // Drop the accidental only when it actually prefixes a number.
    let after_accidental = match s.as_bytes() {
        [b'#' | b'b', next, ..] if next.is_ascii_digit() => &s[1..],
        _ => s,
    };
    after_accidental.trim_start_matches(|c: char| c.is_ascii_digit())
}
510
511/// Returns `true` if `s` is a valid chord quality+extension suffix.
512///
513/// The suffix is consumed iteratively: each step strips one quality keyword
514/// (`maj`, `min`, `dim`, `aug`, `sus`, `add`, `m`, `+`, `°`) optionally
515/// followed by a numeric extension (optional `#`/`b` accidental then digits).
516/// This allows compound forms such as `m7add11`, `maj7sus4`, or `7b5`.
517///
518/// Acceptable atoms (zero or more, in any order):
519/// - Quality keyword: `maj`, `min`, `dim`, `aug`, `sus`, `add`, `m`, `+`, `°`
520/// - Numeric extension: optional accidental (`#`/`b`, only if a digit follows)
521/// then one or more digits
522fn is_valid_quality_ext(s: &str) -> bool {
523 let mut rest = s;
524 loop {
525 if rest.is_empty() {
526 return true;
527 }
528
529 // Try to strip a quality keyword.
530 let after_kw: Option<&str> = None
531 .or_else(|| rest.strip_prefix("maj"))
532 .or_else(|| rest.strip_prefix("min"))
533 .or_else(|| rest.strip_prefix("dim"))
534 .or_else(|| rest.strip_prefix("aug"))
535 .or_else(|| rest.strip_prefix("sus"))
536 .or_else(|| rest.strip_prefix("add"))
537 .or_else(|| rest.strip_prefix('m'))
538 .or_else(|| rest.strip_prefix('+'))
539 .or_else(|| rest.strip_prefix('°'));
540
541 if let Some(after) = after_kw {
542 // Keyword consumed; optionally consume a following numeric part.
543 rest = consume_numeric(after);
544 } else {
545 // No keyword — try a bare numeric extension.
546 let next = consume_numeric(rest);
547 if next.len() == rest.len() {
548 // Nothing consumed: unrecognized character.
549 return false;
550 }
551 rest = next;
552 }
553 }
554}
555
556/// Extracts `(byte_offset, chord_name)` pairs from a chord line, preserving
557/// the column position of each chord token.
558fn chord_positions(line: &str) -> Vec<(usize, String)> {
559 let mut result = Vec::new();
560 let mut search_start = 0usize;
561
562 for token in line.split_whitespace() {
563 // Locate this token inside the remaining slice.
564 if let Some(rel) = line[search_start..].find(token) {
565 let abs = search_start + rel;
566 if is_chord_token(token) {
567 result.push((abs, token.to_string()));
568 }
569 // Advance past this token.
570 search_start = abs + token.len();
571 }
572 }
573 result
574}
575
/// Attempts to parse `line` as a section header.
///
/// Recognised patterns (checked in this order, on the trimmed line):
/// - `[Verse]`, `[Chorus 2]` — square brackets, no nested `[`
/// - `(Bridge)` — parentheses, no nested `(`
/// - `VERSE:`, `Chorus:` — a label made only of letters, spaces and `-`,
///   followed by a single trailing colon
/// - `-- Chorus --`, `==== Verse ====`, `** Intro **`, `## Outro ##` — a
///   label framed by at least two `-`, `=`, `*` or `#` on each side. The
///   whole run of decoration characters is removed, so `---- Chorus ----`
///   yields `Chorus`.
///
/// Returns the inner label on success, or `None` if the line is not a section
/// header. Labels that are empty or contain `{` / `}` are rejected, and so
/// are pure divider lines such as `--------` or `=====`, which carry no
/// label at all.
fn parse_section_header(line: &str) -> Option<String> {
    /// Returns `true` if `label` is safe to embed in a ChordPro directive
    /// value. Rejects labels containing `{` or `}` because ChordPro has no
    /// escape mechanism inside directive values and those characters would
    /// produce malformed output.
    fn is_safe_label(label: &str) -> bool {
        !label.is_empty() && !label.contains('{') && !label.contains('}')
    }

    let trimmed = line.trim();

    // [Label] and (Label) forms. A nested opening delimiter means the line is
    // something else (e.g. `[Am][G]`), so it is left for the later checks.
    for (open, close) in [('[', ']'), ('(', ')')] {
        let enclosed = trimmed
            .strip_prefix(open)
            .and_then(|rest| rest.strip_suffix(close));
        if let Some(enclosed) = enclosed {
            let inner = enclosed.trim();
            if !inner.contains(open) && is_safe_label(inner) {
                return Some(inner.to_string());
            }
        }
    }

    // LABEL: form — the `all` check also rules out a second colon.
    if let Some(label) = trimmed.strip_suffix(':') {
        let plain_words = label
            .chars()
            .all(|c| c.is_alphabetic() || c == ' ' || c == '-');
        if plain_words && is_safe_label(label.trim()) {
            return Some(label.trim().to_string());
        }
    }

    // Decorated forms: `-- Label --`, `== Label ==`, `** Label **`,
    // `## Label ##`. Require two marker characters on each side without
    // overlap, then drop every remaining marker/whitespace character at both
    // ends so longer rules (`---- Label ----`) leave a clean label and pure
    // dividers (`-----`) leave nothing.
    for marker in ['-', '=', '*', '#'] {
        let framed = trimmed
            .strip_prefix(marker)
            .and_then(|rest| rest.strip_prefix(marker))
            .and_then(|rest| rest.strip_suffix(marker))
            .and_then(|rest| rest.strip_suffix(marker));
        if let Some(framed) = framed {
            let inner = framed.trim_matches(|c: char| c == marker || c.is_whitespace());
            if is_safe_label(inner) {
                return Some(inner.to_string());
            }
        }
    }

    None
}
637
/// Resolves a section label to the base name of its ChordPro environment.
///
/// Matching is case-insensitive and prefix-based on the trimmed label:
/// labels starting with `chorus` or `refrain` map to `"chorus"`, labels
/// starting with `bridge` map to `"bridge"`, and everything else — verse,
/// intro, outro, pre-chorus, unknown labels — maps to `"verse"`.
///
/// For example, `"verse"` → `"verse"`, `"chorus 2"` → `"chorus"`.
fn canonical_section(label: &str) -> &'static str {
    // (label prefix, canonical section) in lookup order.
    const PREFIXES: [(&str, &str); 3] = [
        ("chorus", "chorus"),
        ("refrain", "chorus"),
        ("bridge", "bridge"),
    ];

    let lowered = label.to_lowercase();
    let key = lowered.trim();

    for (prefix, canonical) in PREFIXES {
        if key.starts_with(prefix) {
            return canonical;
        }
    }
    "verse"
}
654
655/// Returns the `{start_of_*}` directive for the given label, plus the
656/// canonical section name used to match the closing directive.
657fn start_directive_for_section(label: &str) -> (Directive, String) {
658 let canonical = canonical_section(label);
659 let dir_name = format!("start_of_{canonical}");
660 // Include the label as a `label` attribute when the label text differs from
661 // the canonical section name (e.g., "Verse 2", "Intro").
662 let lower_label = label.trim().to_lowercase();
663 let dir = if lower_label == canonical {
664 Directive::name_only(dir_name)
665 } else {
666 Directive::with_value(dir_name, label.trim().to_string())
667 };
668 (dir, canonical.to_string())
669}
670
671/// Returns the `{end_of_*}` directive for the given canonical section name.
672fn end_directive_for_section(canonical: &str) -> Directive {
673 let dir_name = format!("end_of_{canonical}");
674 Directive::name_only(dir_name)
675}
676
677/// Builds a [`LyricsLine`] by pairing chord column positions with the
678/// corresponding lyric text.
679///
680/// Each chord annotates the lyric text starting at its column position in the
681/// chord line. Text that precedes the first chord is emitted in a leading
682/// chord-free segment.
683///
684/// If `positions` is empty the function returns a single chord-free segment
685/// containing the full `lyric` string (rather than an empty `LyricsLine`).
686fn pair_chords_with_lyric(positions: &[(usize, String)], lyric: &str) -> LyricsLine {
687 // Fast path: no chord positions — return the lyric as a single plain segment.
688 if positions.is_empty() {
689 return LyricsLine {
690 segments: vec![LyricsSegment {
691 chord: None,
692 text: lyric.to_string(),
693 spans: vec![],
694 }],
695 };
696 }
697
698 // Work in terms of char indices for correctness with multi-byte text, but
699 // chord positions from `chord_positions` are byte offsets into the ASCII
700 // chord line. Because chord lines are expected to be ASCII (chord names),
701 // byte == char for the chord line. We must, however, map those byte offsets
702 // to char offsets in the lyric line for correct slicing.
703 let lyric_char_offsets: Vec<usize> = lyric.char_indices().map(|(b, _)| b).collect();
704 let lyric_len = lyric.len();
705
706 // Maps a byte offset from the (ASCII) chord line to a valid byte offset in
707 // the lyric string, clamped to lyric_len. When col is beyond the end of
708 // the lyric we return lyric_len (no text to annotate). For shorter cols
709 // we snap to the nearest char boundary so we never slice in the middle of
710 // a multi-byte codepoint.
711 let clamp_to_lyric = |col: usize| -> usize {
712 if col >= lyric_len {
713 return lyric_len;
714 }
715 // Snap down to the nearest char-boundary offset that does not exceed col.
716 lyric_char_offsets
717 .iter()
718 .copied()
719 .rfind(|&b| b <= col)
720 .unwrap_or(0)
721 };
722
723 let mut segments: Vec<LyricsSegment> = Vec::new();
724 let mut cursor = 0usize; // byte position in lyric
725
726 for (i, (col, chord_name)) in positions.iter().enumerate() {
727 let text_start = clamp_to_lyric(*col);
728 let text_end = if let Some((next_col, _)) = positions.get(i + 1) {
729 clamp_to_lyric(*next_col)
730 } else {
731 lyric_len // last chord gets all remaining lyric text
732 };
733
734 // Any lyric text before this chord position (after the cursor) without
735 // a chord annotation.
736 if text_start > cursor {
737 segments.push(LyricsSegment {
738 chord: None,
739 text: lyric[cursor..text_start].to_string(),
740 spans: vec![],
741 });
742 }
743
744 // text_start is always <= lyric_len by construction of clamp_to_lyric.
745 let text = lyric[text_start..text_end.min(lyric_len)].to_string();
746
747 segments.push(LyricsSegment {
748 chord: Some(Chord::new(chord_name.as_str())),
749 text,
750 spans: vec![],
751 });
752 cursor = text_end.min(lyric_len);
753 }
754
755 // positions was non-empty, so segments is non-empty and cursor == lyric_len.
756 LyricsLine { segments }
757}
758
759// ---------------------------------------------------------------------------
760// ChordPro serializer for plain-text-imported songs
761// ---------------------------------------------------------------------------
762
/// Removes every `{` and `}` from `s` so the result can be embedded as a
/// ChordPro directive name or value.
///
/// ChordPro has no escape mechanism inside directive names or values, so
/// brace characters would produce malformed output. Input without braces is
/// returned borrowed; a new string is allocated only when something has to be
/// removed.
fn sanitize_directive_token(s: &str) -> std::borrow::Cow<'_, str> {
    fn is_brace(c: char) -> bool {
        matches!(c, '{' | '}')
    }

    match s.find(is_brace) {
        None => std::borrow::Cow::Borrowed(s),
        Some(_) => std::borrow::Cow::Owned(s.chars().filter(|&c| !is_brace(c)).collect()),
    }
}
773
774/// Serializes a [`Song`] to ChordPro format.
775///
776/// This serializer is intended for songs produced by [`convert_plain_text`].
777/// It handles the subset of [`Line`] and [`Directive`] variants that the
778/// heuristic importer emits. Complex AST features (image directives, delegate
779/// environments, etc.) are rendered as a best-effort comment.
780///
781/// # Examples
782///
783/// ```
784/// use chordsketch_core::heuristic::{convert_plain_text, song_to_chordpro};
785///
786/// let song = convert_plain_text("[Verse]\nAm G\nHello world\n");
787/// let chordpro = song_to_chordpro(&song);
788/// assert!(chordpro.contains("{start_of_verse}"));
789/// assert!(chordpro.contains("[Am]"));
790/// ```
791#[must_use]
792pub fn song_to_chordpro(song: &Song) -> String {
793 use crate::ast::{CommentStyle, Line};
794
795 let mut out = String::new();
796
797 // Emit metadata directives first if populated.
798 if let Some(ref title) = song.metadata.title {
799 out.push_str(&format!("{{title: {}}}\n", sanitize_directive_token(title)));
800 }
801 if let Some(artist) = song.metadata.artists.first() {
802 out.push_str(&format!(
803 "{{artist: {}}}\n",
804 sanitize_directive_token(artist)
805 ));
806 }
807
808 for line in &song.lines {
809 match line {
810 Line::Empty => out.push('\n'),
811 Line::Comment(style, text) => {
812 let t = sanitize_directive_token(text);
813 match style {
814 CommentStyle::Normal => out.push_str(&format!("{{comment: {t}}}\n")),
815 CommentStyle::Italic => out.push_str(&format!("{{comment_italic: {t}}}\n")),
816 CommentStyle::Boxed => out.push_str(&format!("{{comment_box: {t}}}\n")),
817 }
818 }
819 Line::Directive(dir) => {
820 let name = sanitize_directive_token(&dir.name);
821 if let Some(ref value) = dir.value {
822 out.push_str(&format!(
823 "{{{}: {}}}\n",
824 name,
825 sanitize_directive_token(value)
826 ));
827 } else {
828 out.push_str(&format!("{{{}}}\n", name));
829 }
830 }
831 Line::Lyrics(lyrics) => {
832 for seg in &lyrics.segments {
833 if let Some(ref chord) = seg.chord {
834 out.push('[');
835 out.push_str(&chord.name);
836 out.push(']');
837 }
838 out.push_str(&seg.text);
839 }
840 out.push('\n');
841 }
842 }
843 }
844
845 out
846}
847
848// ---------------------------------------------------------------------------
849// Tests
850// ---------------------------------------------------------------------------
851
852#[cfg(test)]
853mod tests {
854 use super::*;
855
856 fn chord_names(line: &LyricsLine) -> Vec<Option<String>> {
857 line.segments
858 .iter()
859 .map(|s| s.chord.as_ref().map(|c| c.name.clone()))
860 .collect()
861 }
862
863 // --- detect_format ---
864
865 #[test]
866 fn detects_chordpro_from_directives() {
867 assert_eq!(
868 detect_format("{title: Hello}\n{soc}\n[Am]Hello\n{eoc}"),
869 InputFormat::ChordPro
870 );
871 }
872
873 #[test]
874 fn detects_chordpro_from_inline_chords() {
875 assert_eq!(detect_format("[Am]Hello [G]world"), InputFormat::ChordPro);
876 }
877
878 #[test]
879 fn detects_plain_chord_lyrics() {
880 let input = "Am G C Em\nHello beautiful world tonight\nG C\nOnce more";
881 assert_eq!(detect_format(input), InputFormat::PlainChordLyrics);
882 }
883
884 #[test]
885 fn detects_plain_chord_lyrics_with_section_labels() {
886 // Section labels like [Verse] and [Chorus] should NOT be misidentified
887 // as ChordPro inline chord notation.
888 let input = "[Verse]\nG D\nHere I am\nEm C\nWondering\n\n[Chorus]\nC G\nLala\n";
889 assert_eq!(detect_format(input), InputFormat::PlainChordLyrics);
890 }
891
892 #[test]
893 fn detects_single_letter_section_label_not_chordpro() {
894 // A whole-line `[C]` is a key/section label in plain-text chord
895 // sheets, not ChordPro inline chord notation. Issue #1278.
896 let input = "[C]\nG D\nHere I am\nEm C\nWondering\n";
897 assert_eq!(detect_format(input), InputFormat::PlainChordLyrics);
898
899 // Same for other single-letter keys that are valid chord names.
900 let input_am = "[Am]\nG D Em\nHello world again now\n";
901 assert_eq!(detect_format(input_am), InputFormat::PlainChordLyrics);
902 }
903
904 #[test]
905 fn detects_inline_chord_in_line_still_chordpro() {
906 // `[C]` embedded mid-line (not the whole line) is inline chord notation.
907 assert_eq!(detect_format("[C]Hello world"), InputFormat::ChordPro);
908 assert_eq!(detect_format("Hello [Am]world"), InputFormat::ChordPro);
909 }
910
911 #[test]
912 fn detects_multi_bracket_line_as_chordpro() {
913 // `[Am][G]` on a single line has inner content `Am][G` which contains `[`,
914 // so the whole-line guard does NOT trigger. The scan finds `[Am]` → ChordPro.
915 assert_eq!(detect_format("[Am][G]"), InputFormat::ChordPro);
916 // Three brackets also triggers correctly.
917 assert_eq!(detect_format("[C][G][Am]"), InputFormat::ChordPro);
918 }
919
#[test]
fn detects_mixed_section_label_and_inline_chord_as_chordpro() {
    // The whole-line `[Verse]` label alone would not trigger ChordPro
    // detection; the mid-line `[Am]` on the following line does.
    let detected = detect_format("[Verse]\nHello [Am]world\n");
    assert_eq!(detected, InputFormat::ChordPro);
}
927
#[test]
fn known_limitation_whole_line_chord_only_chordpro_returns_unknown() {
    // Documented trade-off (issue #1304; see also the comment above
    // `has_inline_chords` in detect_format): a ChordPro file that has no
    // directives and writes every chord as a whole-line bracket before its
    // lyric cannot be told apart from a plain-text sheet that uses
    // key/section labels. detect_format therefore answers Unknown instead
    // of ChordPro for such input.
    //
    // As soon as the file contains a directive or a mid-line inline chord
    // it is recognised as ChordPro; that is covered by
    // `detects_chordpro_from_directives` and
    // `detects_inline_chord_in_line_still_chordpro`.
    let whole_line_chords_only = "[Am]\nThis is a lyric line\n[G]\nAnother lyric line\n";
    let detected = detect_format(whole_line_chords_only);
    assert_eq!(detected, InputFormat::Unknown);
}
942
#[test]
fn detects_unknown_for_pure_lyrics() {
    // Lyrics with no chord lines, brackets or directives give the detector
    // nothing to classify on.
    let detected = detect_format("Hello beautiful world\nOnce upon a time\nSomething happened here");
    assert_eq!(detected, InputFormat::Unknown);
}
948
#[test]
fn detects_unknown_for_empty() {
    // The empty string must classify as Unknown.
    let empty_input = "";
    assert_eq!(detect_format(empty_input), InputFormat::Unknown);
}
953
954 // --- is_chord_token ---
955
#[test]
fn chord_token_rejects_section_labels() {
    // Words that begin with a note letter (A-G) but are not chords, such as
    // section labels, must be rejected.
    for &word in &["Chorus", "Bridge", "Em7add9sus2extended"] {
        assert!(!is_chord_token(word), "expected {:?} to be rejected", word);
    }
}
963
#[test]
fn chord_token_accepts_valid_chords() {
    // A spread of common chord spellings: bare roots, minors, sevenths,
    // accidentals, suspended chords and a slash chord.
    let chords = [
        "Am", "C", "G7", "Cmaj7", "D/F#", "Bb", "F#m7", "Gsus4", "Em",
    ];
    for &chord in &chords {
        assert!(is_chord_token(chord), "expected {:?} to be accepted", chord);
    }
}
976
#[test]
fn chord_token_accepts_multicomponent_extensions() {
    // Issue #1279: a quality followed by one or more extensions is still a
    // single chord token.
    let compound = [
        "Am7add11",
        "Cmaj7sus4",
        "G7b5",
        "Fmaj7add9",
        "Dm7add11",
        "G7#9",
        "Cmaj9",
    ];
    for &token in &compound {
        assert!(is_chord_token(token), "expected {:?} to be accepted", token);
    }

    // Ordinary words that merely begin with A-G must still be rejected.
    // "Cmaj7e" is the edge case: once `maj` and the numeric `7` have been
    // consumed, the leftover `e` is neither a keyword nor a numeric
    // character, so the token is refused.
    for &token in &["Chorus", "Bridge", "Cmaj7e"] {
        assert!(!is_chord_token(token), "expected {:?} to be rejected", token);
    }

    // Multi-component chords up to 16 characters must be accepted; this
    // one is 13 characters long.
    assert!(is_chord_token("Cmaj7sus4add9"));
}
996
997 // --- is_chord_line ---
998
#[test]
fn chord_line_typical() {
    // Four plain chord names separated by whitespace.
    let importer = PlainTextImporter::new();
    let line = "Am F C G";
    assert!(importer.is_chord_line(line));
}
1004
#[test]
fn chord_line_with_slash_chords() {
    // A slash chord (D/F#) among ordinary chords keeps the line a chord line.
    let importer = PlainTextImporter::new();
    let line = "G D/F# Em C";
    assert!(importer.is_chord_line(line));
}
1010
#[test]
fn not_chord_line_all_lyrics() {
    // An ordinary sentence of lyrics is not a chord line.
    let importer = PlainTextImporter::new();
    let line = "There's a lady who's sure.";
    assert!(!importer.is_chord_line(line));
}
1016
#[test]
fn not_chord_line_sentence_punctuation() {
    // "A" and "G" are valid chord names, yet the trailing period marks this
    // as a sentence, which disqualifies the line.
    let importer = PlainTextImporter::new();
    let line = "A song for G.";
    assert!(!importer.is_chord_line(line));
}
1023
#[test]
fn not_chord_line_too_few_chords() {
    // Only "Am" is a chord token here, which is below min_chord_tokens=2.
    let importer = PlainTextImporter::new();
    let line = "Am something else here now";
    assert!(!importer.is_chord_line(line));
}
1030
1031 // --- parse_section_header ---
1032
#[test]
fn section_square_brackets() {
    // `[Label]` yields the text between the brackets, spaces included.
    assert_eq!(parse_section_header("[Verse]").as_deref(), Some("Verse"));
    assert_eq!(
        parse_section_header("[Chorus 2]").as_deref(),
        Some("Chorus 2")
    );
}
1041
#[test]
fn section_parens() {
    // `(Label)` is accepted as a section header.
    let label = parse_section_header("(Bridge)");
    assert_eq!(label.as_deref(), Some("Bridge"));
}
1046
#[test]
fn section_colon() {
    // `Label:` is accepted; the colon is dropped and the case is kept.
    assert_eq!(parse_section_header("VERSE:").as_deref(), Some("VERSE"));
    assert_eq!(parse_section_header("Chorus:").as_deref(), Some("Chorus"));
}
1052
#[test]
fn section_dash_decorated() {
    // Dash decoration around the label is stripped along with the padding.
    let label = parse_section_header("-- Chorus --");
    assert_eq!(label.as_deref(), Some("Chorus"));
}
1060
#[test]
fn section_not_matched() {
    // Neither a plain lyric line nor a chord line is a section header.
    for &line in &["Hello world", "Am G C Em"] {
        assert!(
            parse_section_header(line).is_none(),
            "expected no section header for {:?}",
            line
        );
    }
}
1066
#[test]
fn section_rejects_brace_in_label() {
    // M-1 from delta review: a label containing `{` or `}` would produce a
    // malformed ChordPro directive value, so such headers are refused.
    for &candidate in &["[Verse}]", "[{Chorus}]", "Verse}:"] {
        assert_eq!(
            parse_section_header(candidate),
            None,
            "candidate: {:?}",
            candidate
        );
    }
}
1075
1076 // --- pair_chords_with_lyric ---
1077
#[test]
fn pair_basic() {
    // Am sits at column 0 and F at column 4 above "Hello world", so the
    // lyric is split after its fourth character.
    let chord_cols = vec![(0, String::from("Am")), (4, String::from("F"))];
    let paired = pair_chords_with_lyric(&chord_cols, "Hello world");

    assert_eq!(
        chord_names(&paired),
        vec![Some(String::from("Am")), Some(String::from("F"))]
    );

    let head = &paired.segments[0];
    assert_eq!(head.text, "Hell");
    let tail = &paired.segments[1];
    assert_eq!(tail.text, "o world");
}
1088
#[test]
fn pair_lyric_shorter_than_chord_line() {
    // The second chord is at column 10, but the lyric "Hi" is only two
    // characters long: Am takes the whole lyric and G is left with "".
    let chord_cols = vec![(0, String::from("Am")), (10, String::from("G"))];
    let paired = pair_chords_with_lyric(&chord_cols, "Hi");

    let head = &paired.segments[0];
    assert_eq!(head.chord.as_ref().map(|c| c.name.as_str()), Some("Am"));
    assert_eq!(head.text, "Hi");

    let tail = &paired.segments[1];
    assert_eq!(tail.chord.as_ref().map(|c| c.name.as_str()), Some("G"));
    assert_eq!(tail.text, "");
}
1106
#[test]
fn pair_no_chords_returns_plain_lyric() {
    // With no chord positions the lyric comes back as one chordless segment.
    let no_chords: Vec<(usize, String)> = Vec::new();
    let paired = pair_chords_with_lyric(&no_chords, "Hello world");

    assert_eq!(paired.segments.len(), 1);
    let only = &paired.segments[0];
    assert!(only.chord.is_none());
    assert_eq!(only.text, "Hello world");
}
1115
1116 // --- convert ---
1117
#[test]
fn convert_simple_verse() {
    // The expected output is start_of_verse, a lyrics line, end_of_verse.
    // This test only checks that at least one directive line and at least
    // one lyrics line are present.
    let song = convert_plain_text("[Verse]\nAm G C\nHello world today\n");

    let has_directive = song
        .lines
        .iter()
        .any(|line| matches!(line, Line::Directive(_)));
    assert!(has_directive);

    let has_lyrics = song
        .lines
        .iter()
        .any(|line| matches!(line, Line::Lyrics(_)));
    assert!(has_lyrics);
}
1126
#[test]
fn convert_chordless_lyric_passthrough() {
    // The first two lines form a chord/lyric pair. The third line has no
    // chord line above it and must come through as a single segment with
    // no chord attached.
    let song = convert_plain_text(
        "Am G C Em\nThere is a song\nThis line has no preceding chord line\n",
    );

    let passthrough_found = song.lines.iter().any(|line| match line {
        Line::Lyrics(lyrics) => {
            lyrics.segments.len() == 1
                && lyrics.segments[0].chord.is_none()
                && lyrics.segments[0].text.contains("no preceding")
        }
        _ => false,
    });
    assert!(passthrough_found);
}
1144
#[test]
fn convert_section_labels_to_directives() {
    use crate::ast::DirectiveKind;

    // [Chorus] and [Verse] labels must each produce their start directive.
    let song = convert_plain_text(
        "[Chorus]\nG C G D\nLala lala lala\n[Verse]\nAm F C G\nHello world\n",
    );

    // True when any directive line in the converted song has the wanted kind.
    let emitted = |wanted: DirectiveKind| {
        song.lines
            .iter()
            .any(|line| matches!(line, Line::Directive(d) if d.kind == wanted))
    };

    assert!(emitted(DirectiveKind::StartOfChorus));
    assert!(emitted(DirectiveKind::StartOfVerse));
}
1165
#[test]
fn convert_multiple_sections_close_properly() {
    use crate::ast::DirectiveKind;

    // A verse followed by a chorus: both sections must be closed, so both
    // end_of_verse and end_of_chorus have to appear in the output.
    let song = convert_plain_text("[Verse]\nAm G\nHello world\n[Chorus]\nC G\nYeah yeah\n");

    // True when any directive line in the converted song has the wanted kind.
    let emitted = |wanted: DirectiveKind| {
        song.lines
            .iter()
            .any(|line| matches!(line, Line::Directive(d) if d.kind == wanted))
    };

    assert!(emitted(DirectiveKind::EndOfVerse));
    assert!(emitted(DirectiveKind::EndOfChorus));
}
1186
1187 // --- song_to_chordpro ---
1188
#[test]
fn song_to_chordpro_strips_braces_in_title() {
    // Issue #1282: brace characters inside metadata must be stripped from
    // the value so that the emitted directive stays well-formed.
    let song = {
        let mut s = Song::default();
        s.metadata.title = Some(String::from("Hello {World}"));
        s
    };
    let rendered = song_to_chordpro(&song);
    assert_eq!(rendered, "{title: Hello World}\n");
}
1199
#[test]
fn song_to_chordpro_strips_braces_in_artist() {
    // Braces in an artist name are removed from the emitted directive value.
    let song = {
        let mut s = Song::default();
        s.metadata.artists.push(String::from("{Dodgy} Artist"));
        s
    };
    let rendered = song_to_chordpro(&song);
    assert_eq!(rendered, "{artist: Dodgy Artist}\n");
}
1207
#[test]
fn song_to_chordpro_strips_braces_in_comment() {
    use crate::ast::{CommentStyle, Line};

    // Braces in comment text are removed from the emitted directive value.
    let comment = Line::Comment(CommentStyle::Normal, String::from("See {note}"));
    let mut song = Song::default();
    song.lines.push(comment);

    let rendered = song_to_chordpro(&song);
    assert_eq!(rendered, "{comment: See note}\n");
}
1219
#[test]
fn song_to_chordpro_strips_braces_in_directive_name_and_value() {
    use crate::ast::{Directive, Line};

    // Issue #1291: a hand-built Song whose directive carries braces in both
    // its name and its value must still serialize to well-formed ChordPro.
    let mut directive = Directive::name_only(String::from("start_of_{section}"));
    directive.value = Some(String::from("{custom}"));

    let mut song = Song::default();
    song.lines.push(Line::Directive(directive));

    let rendered = song_to_chordpro(&song);
    assert_eq!(rendered, "{start_of_section: custom}\n");
}
1232}