Skip to main content

lex_core/lex/
migrate.rs

1//! Source-level migration for legacy bare labels.
2//!
3//! Phase 5 of the label-semantics refactor tracked in
4//! [#570](https://github.com/lex-fmt/lex/issues/570). Parses a `.lex`
5//! source string, identifies every legacy bare label that
6//! [`NormalizeLabels`](crate::lex::assembling::stages::NormalizeLabels)
7//! would rewrite at parse time, and produces a rewritten source string
//! with the labels migrated to their blessed form (see [`LEGACY_TO_BLESSED`]).
9//!
10//! This is what powers `lexd migrate-labels`: an explicit, source-level
11//! pass users can run to migrate their `.lex` files once and stop
12//! relying on the silent parse-time rewrite.
13//!
14//! # Why source-level, not AST-level
15//!
16//! [`NormalizeLabels`] already migrates the AST in memory — it's
17//! invoked unconditionally by `STRING_TO_AST` since #570 Phase 3b.
18//! Source-level migration is different: it produces a rewritten `.lex`
19//! file that no longer carries the legacy form, so future parses don't
20//! need the in-flight rewrite at all. This is the user-facing
21//! deliverable for the "two minor versions to migrate" deprecation
22//! window the issue called out.
23//!
//! The key trick: the source is parsed with [`NormalizeLabels`] in
//! permissive mode, so legacy spellings survive into the AST and each
//! `Label.location.span` points at the **original** source bytes.
//! So we walk the parsed AST collecting the spans of legacy labels
//! and rewrite the source at exactly those byte ranges. No re-parsing,
//! no regex heuristics, no ambiguity.
30
31use crate::lex::assembling::stages::{
32    ApplyTableConfig, AttachAnnotations, AttachRoot, NormalizeLabels,
33};
34use crate::lex::ast::elements::annotation::Annotation;
35use crate::lex::ast::elements::content_item::ContentItem;
36use crate::lex::ast::elements::label::Label;
37use crate::lex::ast::elements::verbatim::Verbatim;
38use crate::lex::ast::Document;
39use crate::lex::transforms::stages::ParseInlines;
40use crate::lex::transforms::standard::LEXING;
41use crate::lex::transforms::Runnable;
42
/// Mapping of legacy label spellings to the blessed spelling each one
/// migrates to.
///
/// Local to the migration tool — the parse-time `NormalizeLabels` no
/// longer carries any "legacy" concept since PR 2 of #584: it only
/// resolves accepted forms. Anything in this table is what the
/// migration tool recognizes as needing a source-level rewrite.
///
/// Each entry is `(legacy, blessed)`. Lookups compare the whole label
/// text for exact equality, so only labels spelled exactly like the
/// left-hand side are rewritten.
///
/// `doc.*` entries map to the prefix-stripped form of the corresponding
/// canonical (so `:: doc.table ::` rewrites to `:: table ::`, the
/// blessed shortcut). The non-shortcut metadata labels (`category`,
/// `template`, etc.) rewrite to their prefix-stripped form (e.g.
/// `:: category ::` → `:: metadata.category ::`).
pub const LEGACY_TO_BLESSED: &[(&str, &str)] = &[
    // Bare metadata labels that have no blessed shortcut.
    ("category", "metadata.category"),
    ("template", "metadata.template"),
    ("publishing-date", "metadata.publishing-date"),
    ("front-matter", "metadata.front-matter"),
    // Legacy `doc.*` labels, rewritten to the blessed shortcut.
    ("doc.table", "table"),
    ("doc.image", "image"),
    ("doc.video", "video"),
    ("doc.audio", "audio"),
];
64
65/// Lookup helper for the legacy→blessed map. Used by the LSP's
66/// `forbidden-label-prefix` quickfix in `lex-lsp-core::available_actions`
67/// — PR 4 of #584 wired up the code action surface.
68pub fn blessed_for_legacy(legacy: &str) -> Option<&'static str> {
69    LEGACY_TO_BLESSED
70        .iter()
71        .find(|(l, _)| *l == legacy)
72        .map(|(_, b)| *b)
73}
74
/// One legacy-label rewrite site.
///
/// `from` and `to` are `'static` because they are taken straight from
/// an entry of [`LEGACY_TO_BLESSED`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LabelMigration {
    /// Byte range in the original source that holds the legacy label,
    /// with any surrounding whitespace captured by the parser's span
    /// already trimmed off.
    pub byte_range: std::ops::Range<usize>,
    /// Legacy label as it appears in the source.
    pub from: &'static str,
    /// Blessed replacement that is written over `byte_range`.
    pub to: &'static str,
}
85
/// The result of a migration pass.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MigrationOutcome {
    /// The rewritten source, with every legacy label replaced by its
    /// blessed form. Equals the input verbatim when
    /// [`migrations`](Self::migrations) is empty.
    pub rewritten: String,
    /// One entry per legacy label site found in the input. Empty when
    /// the source has no legacy labels. Byte ranges refer to the
    /// *input* source, not to `rewritten`.
    pub migrations: Vec<LabelMigration>,
}
97
impl MigrationOutcome {
    /// True when the migration pass found any legacy labels to
    /// rewrite, i.e. when `migrations` is non-empty. `lexd
    /// migrate-labels --check` exits non-zero when this is true.
    pub fn is_modified(&self) -> bool {
        // Derived from the site list rather than by comparing
        // `rewritten` against the input.
        !self.migrations.is_empty()
    }
}
106
107/// Walk `src`'s parsed AST and migrate every legacy bare label found
108/// to its canonical `lex.*` form. Returns the rewritten source plus a
109/// per-site list of what changed.
110///
111/// Returns `Err` only when the source fails to parse — the migration
112/// pass needs a clean parse to locate label spans. Soft diagnostics
113/// from the parser are ignored; only hard parse errors abort.
114pub fn migrate_labels_in_source(src: &str) -> Result<MigrationOutcome, MigrationError> {
115    // Strict-mode parse rejects legacy `doc.*` and bare non-shortcuts —
116    // exactly the inputs the migration tool needs to rewrite. Run a
117    // permissive pipeline so legacy spellings survive into the AST,
118    // then walk it to map source spans onto the rewrite table.
119    let doc = parse_permissive(src).map_err(|e| MigrationError::ParseFailed {
120        message: e.to_string(),
121    })?;
122
123    let mut sites = Vec::new();
124    collect_sites(&doc, src, &mut sites);
125
126    let rewritten = apply_migrations(src, &sites);
127    Ok(MigrationOutcome {
128        rewritten,
129        migrations: sites,
130    })
131}
132
133/// Run the parse + assembly stages with NormalizeLabels in permissive
134/// mode so legacy label spellings (`doc.*`, bare non-shortcut metadata)
135/// flow through unchanged. Mirrors `STRING_TO_AST` exactly except for
136/// the NormalizeLabels constructor.
137fn parse_permissive(src: &str) -> Result<Document, crate::lex::transforms::TransformError> {
138    let source = if !src.is_empty() && !src.ends_with('\n') {
139        format!("{src}\n")
140    } else {
141        src.to_string()
142    };
143    let tokens = LEXING.run(source.clone())?;
144    let mut output =
145        crate::lex::parsing::engine::parse_from_flat_tokens(tokens, &source).map_err(|e| {
146            crate::lex::transforms::TransformError::StageFailed {
147                stage: "Parser".to_string(),
148                message: e.to_string(),
149            }
150        })?;
151    output.root = ParseInlines::new().run(output.root)?;
152    if let Some(ref mut title) = output.title {
153        title.content.ensure_inline_parsed();
154    }
155    let mut doc = AttachRoot::new().run(output)?;
156    doc = AttachAnnotations::new().run(doc)?;
157    doc = NormalizeLabels::permissive().run(doc)?;
158    doc = ApplyTableConfig::new().run(doc)?;
159    Ok(doc)
160}
161
/// Errors surfaced by [`migrate_labels_in_source`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MigrationError {
    /// The parser rejected the source. The migration pass needs a
    /// clean parse to locate label spans, so it cannot proceed.
    /// `message` carries the underlying error rendered as text.
    ParseFailed { message: String },
}

impl std::fmt::Display for MigrationError {
    /// Renders as `parse failed: <message>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::ParseFailed { message } => {
                f.write_str("parse failed: ")?;
                f.write_str(message)
            }
        }
    }
}

impl std::error::Error for MigrationError {}
179
180fn collect_sites(doc: &Document, src: &str, sites: &mut Vec<LabelMigration>) {
181    for ann in &doc.annotations {
182        check_label(&ann.data.label, src, sites);
183        for child in ann.children.iter() {
184            collect_in_item(child, src, sites);
185        }
186    }
187    for ann in &doc.root.annotations {
188        check_label(&ann.data.label, src, sites);
189        for child in ann.children.iter() {
190            collect_in_item(child, src, sites);
191        }
192    }
193    for item in doc.root.children.iter() {
194        collect_in_item(item, src, sites);
195    }
196}
197
198fn collect_in_item(item: &ContentItem, src: &str, sites: &mut Vec<LabelMigration>) {
199    match item {
200        ContentItem::Annotation(a) => check_annotation(a, src, sites),
201        ContentItem::VerbatimBlock(v) => check_verbatim(v, src, sites),
202        ContentItem::Table(t) => collect_in_table(t, src, sites),
203        _ => {}
204    }
205    if let Some(attached) = attached_annotations(item) {
206        for ann in attached.iter() {
207            check_annotation(ann, src, sites);
208        }
209    }
210    if let Some(children) = item.children() {
211        for child in children.iter() {
212            collect_in_item(child, src, sites);
213        }
214    }
215}
216
217fn collect_in_table(table: &crate::lex::ast::Table, src: &str, sites: &mut Vec<LabelMigration>) {
218    // `ContentItem::children()` returns `None` for tables (their
219    // structure lives in rows/cells), so the generic walker doesn't
220    // reach legacy labels nested inside cell block content or
221    // footnotes. Mirror the explicit table walk that
222    // `assembling::stages::normalize_labels` uses so the source-level
223    // migration discovers everything the AST-level normalize pass
224    // would have rewritten.
225    for row in table.header_rows.iter().chain(table.body_rows.iter()) {
226        for cell in row.cells.iter() {
227            for child in cell.children.iter() {
228                collect_in_item(child, src, sites);
229            }
230        }
231    }
232    if let Some(footnotes) = table.footnotes.as_ref() {
233        for ann in footnotes.annotations.iter() {
234            check_annotation(ann, src, sites);
235        }
236        for item in footnotes.items.iter() {
237            collect_in_item(item, src, sites);
238        }
239    }
240}
241
242fn check_annotation(annotation: &Annotation, src: &str, sites: &mut Vec<LabelMigration>) {
243    check_label(&annotation.data.label, src, sites);
244    for child in annotation.children.iter() {
245        collect_in_item(child, src, sites);
246    }
247}
248
249fn check_verbatim(verbatim: &Verbatim, src: &str, sites: &mut Vec<LabelMigration>) {
250    check_label(&verbatim.closing_data.label, src, sites);
251}
252
253fn attached_annotations(item: &ContentItem) -> Option<&Vec<Annotation>> {
254    match item {
255        ContentItem::Session(s) => Some(&s.annotations),
256        ContentItem::Paragraph(p) => Some(&p.annotations),
257        ContentItem::Definition(d) => Some(&d.annotations),
258        ContentItem::List(l) => Some(&l.annotations),
259        ContentItem::ListItem(li) => Some(&li.annotations),
260        ContentItem::VerbatimBlock(v) => Some(&v.annotations),
261        ContentItem::Table(t) => Some(&t.annotations),
262        _ => None,
263    }
264}
265
266fn check_label(label: &Label, src: &str, sites: &mut Vec<LabelMigration>) {
267    // After NormalizeLabels runs (which happens in STRING_TO_AST since
268    // Phase 3b), `label.value` is canonical. But the label's span
269    // still points at the original source bytes — so the source slice
270    // is the *legacy* form when one was used.
271    //
272    // The parser's label span typically captures one trailing
273    // whitespace byte (separator between the label and either the
274    // next param or the closing `::`). Trim the slice to the
275    // actual label characters and adjust the byte range we report so
276    // the rewrite drops in cleanly without disturbing the surrounding
277    // whitespace.
278    let span = &label.location.span;
279    let start = span.start;
280    let end = span.end;
281    if start > end || end > src.len() {
282        // Defensive: parser should always emit valid spans, but if a
283        // synthetic label slipped through we don't want to panic.
284        return;
285    }
286    let raw = &src[start..end];
287    let leading_ws = raw.bytes().take_while(|b| b.is_ascii_whitespace()).count();
288    let trailing_ws = raw
289        .bytes()
290        .rev()
291        .take_while(|b| b.is_ascii_whitespace())
292        .count();
293    let trim_start = start + leading_ws;
294    let trim_end = end.saturating_sub(trailing_ws);
295    if trim_start >= trim_end {
296        return;
297    }
298    let slice = &src[trim_start..trim_end];
299    if let Some((from, to)) = LEGACY_TO_BLESSED
300        .iter()
301        .find(|(legacy, _)| *legacy == slice)
302    {
303        // Permissive parse keeps the legacy spelling on the AST too;
304        // the source slice and label.value should agree.
305        debug_assert_eq!(
306            label.value, *from,
307            "permissive parse must preserve legacy spelling; got {} for source {slice}",
308            label.value
309        );
310        sites.push(LabelMigration {
311            byte_range: trim_start..trim_end,
312            from,
313            to,
314        });
315    }
316}
317
318fn apply_migrations(src: &str, sites: &[LabelMigration]) -> String {
319    if sites.is_empty() {
320        return src.to_string();
321    }
322    // Apply in reverse byte order so earlier replacements don't shift
323    // later offsets. The walker visits in document order; reverse the
324    // collected list to apply from end to start.
325    let mut result = src.to_string();
326    let mut sorted: Vec<&LabelMigration> = sites.iter().collect();
327    sorted.sort_by(|a, b| b.byte_range.start.cmp(&a.byte_range.start));
328    for site in sorted {
329        result.replace_range(site.byte_range.clone(), site.to);
330    }
331    result
332}
333
// Tests for the migration pass. Most drive the public entry point
// `migrate_labels_in_source` end to end; the table-cell regression
// test builds an AST by hand and calls `collect_sites` directly.
#[cfg(test)]
mod tests {
    use super::*;

    // A source without legacy labels comes back byte-for-byte and
    // reports no migrations.
    #[test]
    fn no_legacy_labels_returns_input_unchanged() {
        let src = "Hello world.\n\n:: lex.metadata.title :: My Doc\n";
        let out = migrate_labels_in_source(src).expect("migrate ok");
        assert_eq!(out.rewritten, src);
        assert!(out.migrations.is_empty());
        assert!(!out.is_modified());
    }

    #[test]
    fn blessed_shortcuts_are_not_migrated() {
        // Under #584, `:: title ::` and `:: author ::` are the blessed
        // forms — no migration needed.
        for shortcut in ["title", "author", "date", "tags"] {
            let src = format!(":: {shortcut} :: value\n\nBody.\n");
            let out = migrate_labels_in_source(&src).expect("migrate ok");
            assert!(
                !out.is_modified(),
                "shortcut :: {shortcut} :: is the blessed form; must not migrate"
            );
            assert_eq!(out.rewritten, src);
        }
    }

    #[test]
    fn non_shortcut_bare_metadata_migrates_to_stripped_form() {
        // The four metadata labels with no shortcut alias migrate to
        // their prefix-stripped form (`metadata.<name>`), which is the
        // shortest accepted form for them.
        for (legacy, blessed) in [
            ("category", "metadata.category"),
            ("template", "metadata.template"),
            ("publishing-date", "metadata.publishing-date"),
            ("front-matter", "metadata.front-matter"),
        ] {
            let src = format!(":: {legacy} :: value\n\nBody.\n");
            let out = migrate_labels_in_source(&src).unwrap_or_else(|e| {
                panic!("migrate failed for {legacy}: {e}");
            });
            assert!(out.is_modified(), "{legacy} must trigger migration");
            assert_eq!(out.migrations[0].from, legacy);
            assert_eq!(out.migrations[0].to, blessed);
            assert!(
                out.rewritten.contains(&format!(":: {blessed} ::")),
                "rewritten must contain :: {blessed} ::, got: {}",
                out.rewritten
            );
        }
    }

    // `doc.table` on a table's closing line rewrites to the blessed
    // `table` shortcut, exactly once.
    #[test]
    fn doc_table_migrates_to_blessed_table_shortcut() {
        let src = "Table:\n\n    | a | b |\n    |---|---|\n    | 1 | 2 |\n:: doc.table ::\n";
        let out = migrate_labels_in_source(src).expect("migrate ok");
        assert!(out.is_modified());
        assert_eq!(out.migrations.len(), 1);
        assert_eq!(out.migrations[0].from, "doc.table");
        assert_eq!(out.migrations[0].to, "table");
        assert!(out.rewritten.contains(":: table ::"));
        assert!(!out.rewritten.contains(":: doc.table ::"));
    }

    // The remaining `doc.*` media labels, each followed by a
    // parameter, rewrite to their blessed shortcuts.
    #[test]
    fn doc_image_video_audio_migrate_to_blessed_shortcuts() {
        for (legacy, blessed) in [
            ("doc.image", "image"),
            ("doc.video", "video"),
            ("doc.audio", "audio"),
        ] {
            let src = format!("Media:\n    caption\n:: {legacy} src=file ::\n");
            let out = migrate_labels_in_source(&src).expect("migrate ok");
            assert!(out.is_modified(), ":: {legacy} :: must trigger migration");
            assert_eq!(out.migrations[0].from, legacy);
            assert_eq!(out.migrations[0].to, blessed);
            assert!(
                out.rewritten.contains(&format!(":: {blessed} ")),
                "expected blessed :: {blessed} :: in {}",
                out.rewritten
            );
        }
    }

    // Two sites in one source: both must be rewritten, which only
    // works if the first rewrite doesn't invalidate the second site's
    // offsets.
    #[test]
    fn multiple_legacy_labels_all_rewrite_with_correct_offsets() {
        let src = ":: category :: tech\n:: template :: x\n\nBody.\n";
        let out = migrate_labels_in_source(src).expect("migrate ok");
        assert_eq!(
            out.migrations.len(),
            2,
            "two legacy labels must produce two migrations: {:?}",
            out.migrations
        );
        assert!(out.rewritten.contains(":: metadata.category ::"));
        assert!(out.rewritten.contains(":: metadata.template ::"));
        assert!(!out.rewritten.contains(":: category ::"));
        assert!(!out.rewritten.contains(":: template ::"));
    }

    // Labels that are not in the legacy table are never touched.
    #[test]
    fn non_legacy_labels_are_left_alone() {
        let src = ":: acme.custom param=value :: body\n\nBody.\n";
        let out = migrate_labels_in_source(src).expect("migrate ok");
        assert!(!out.is_modified());
        assert_eq!(out.rewritten, src);
    }

    // Fully qualified `lex.*` labels are already canonical.
    #[test]
    fn already_canonical_labels_are_left_alone() {
        let src = ":: lex.metadata.title :: My Doc\n:: lex.media.image src=x ::\n";
        let out = migrate_labels_in_source(src).expect("migrate ok");
        assert!(!out.is_modified(), "canonical labels must not be migrated");
        assert_eq!(out.rewritten, src);
    }

    #[test]
    fn body_text_containing_legacy_words_is_not_rewritten() {
        // Important: "category" inside paragraph body text isn't a
        // label and must not be touched.
        let src = "This paragraph mentions the category and template words.\n";
        let out = migrate_labels_in_source(src).expect("migrate ok");
        assert!(!out.is_modified(), "body words must not be rewritten");
        assert_eq!(out.rewritten, src);
    }

    #[test]
    fn collect_in_table_recurses_into_cell_block_children() {
        // Regression for Copilot's PR 581 callout: `ContentItem::Table`
        // returns `None` from `children()`, so the generic walker
        // doesn't reach a legacy annotation that lives in a cell's
        // block-content `children` slot. Today's parser doesn't emit
        // block children in cells, but the AST surface allows it via
        // `TableCell::with_children`, and a future parser change must
        // not silently lose migrations.
        //
        // Permissive mode preserves the original spelling, so the AST
        // label value matches the source slice (no canonical rewrite).
        use crate::lex::ast::elements::annotation::Annotation;
        use crate::lex::ast::elements::data::Data;
        use crate::lex::ast::elements::label::Label;
        use crate::lex::ast::elements::table::{Table, TableCell, TableRow};
        use crate::lex::ast::elements::typed_content::ContentElement;
        use crate::lex::ast::elements::verbatim::VerbatimBlockMode;
        use crate::lex::ast::range::{Position, Range as AstRange};
        use crate::lex::ast::text_content::TextContent;
        use crate::lex::ast::Document as LexDocument;

        // The crafted src places `category` at bytes 3..11 (after `:: `).
        let src = ":: category ::\n";
        let label_span = std::ops::Range { start: 3, end: 11 };
        let label = Label {
            value: "category".to_string(),
            location: AstRange::new(label_span, Position::new(0, 3), Position::new(0, 11)),
            form: crate::lex::ast::elements::label::LabelForm::Canonical,
        };
        let inner_annotation = Annotation::from_data(Data::new(label, Vec::new()), Vec::new());

        // Hand-build a one-cell table whose cell carries the
        // annotation as block content.
        let cell = TableCell::new(TextContent::from_string("cell".into(), None))
            .with_children(vec![ContentElement::Annotation(inner_annotation)]);
        let row = TableRow::new(vec![cell]);
        let table = Table::new(
            TextContent::from_string("Data".into(), None),
            Vec::new(),
            vec![row],
            VerbatimBlockMode::Inflow,
        );

        let mut doc = LexDocument::new();
        doc.root
            .children
            .as_mut_vec()
            .push(ContentItem::Table(Box::new(table)));

        // Call the walker directly; `src` is only used to slice the
        // label text out of the span.
        let mut sites = Vec::new();
        collect_sites(&doc, src, &mut sites);

        assert_eq!(
            sites.len(),
            1,
            "legacy annotation inside a table cell's block children must be discovered"
        );
        assert_eq!(sites[0].from, "category");
        assert_eq!(sites[0].to, "metadata.category");
        assert_eq!(sites[0].byte_range, 3..11);
    }

    #[test]
    fn migrations_have_correct_byte_ranges() {
        // Span sanity: `from` slice from the input at the migration's
        // byte range must equal the legacy label string.
        let src = ":: category :: foo\n\nBody.\n";
        let out = migrate_labels_in_source(src).expect("migrate ok");
        let m = &out.migrations[0];
        let slice = &src[m.byte_range.clone()];
        assert_eq!(slice, m.from, "byte range must point at the legacy text");
    }
}