dprint_plugin_jupyter/
format_text.rs

1use std::borrow::Cow;
2use std::path::Path;
3use std::path::PathBuf;
4
5use crate::text_changes::TextChange;
6use crate::text_changes::apply_text_changes;
7use anyhow::Result;
8use jsonc_parser::CollectOptions;
9use jsonc_parser::CommentCollectionStrategy;
10use jsonc_parser::ParseOptions;
11
12pub fn format_text(
13  input_text: &str,
14  format_with_host: impl FnMut(&Path, String) -> Result<Option<String>>,
15) -> Result<Option<String>> {
16  let had_bom = input_text.starts_with("\u{FEFF}");
17  let input_text = if had_bom { &input_text[3..] } else { input_text };
18  let result = format_inner(input_text, format_with_host)?;
19  if result.is_none() && had_bom {
20    Ok(Some(input_text.to_string()))
21  } else {
22    Ok(result)
23  }
24}
25
26fn format_inner(
27  input_text: &str,
28  format_with_host: impl FnMut(&Path, String) -> Result<Option<String>>,
29) -> Result<Option<String>> {
30  let parse_result = jsonc_parser::parse_to_ast(
31    input_text,
32    &CollectOptions {
33      comments: CommentCollectionStrategy::Off,
34      tokens: false,
35    },
36    &ParseOptions {
37      allow_comments: true,
38      allow_loose_object_property_names: true,
39      allow_trailing_commas: true,
40    },
41  )?;
42  let Some(root_value) = parse_result.value else {
43    return Ok(None);
44  };
45
46  Ok(match format_root(input_text, &root_value, format_with_host) {
47    Some(text) => {
48      #[cfg(debug_assertions)]
49      validate_output_json(&text)?;
50      Some(text)
51    }
52    None => None,
53  })
54}
55
56fn format_root(
57  input_text: &str,
58  root_value: &jsonc_parser::ast::Value,
59  mut format_with_host: impl FnMut(&Path, String) -> Result<Option<String>>,
60) -> Option<String> {
61  let root_obj = root_value.as_object()?;
62  let maybe_default_language = get_metadata_language(root_obj);
63  let cells = root_value.as_object()?.get_array("cells")?;
64
65  let text_changes: Vec<TextChange> = cells
66    .elements
67    .iter()
68    .filter_map(|element| get_cell_text_change(input_text, element, maybe_default_language, &mut format_with_host))
69    .collect();
70
71  if text_changes.is_empty() {
72    None
73  } else {
74    Some(apply_text_changes(input_text, text_changes))
75  }
76}
77
78#[cfg(debug_assertions)]
79fn validate_output_json(text: &str) -> Result<()> {
80  // ensures the output is correct in debug mode
81
82  let result = jsonc_parser::parse_to_ast(
83    text,
84    &CollectOptions {
85      comments: CommentCollectionStrategy::Off,
86      tokens: false,
87    },
88    &ParseOptions {
89      allow_comments: true,
90      allow_loose_object_property_names: false,
91      allow_trailing_commas: true,
92    },
93  );
94  match result {
95    Ok(_) => Ok(()),
96    Err(err) => {
97      anyhow::bail!(
98        "dprint-plugin-jupyter produced invalid json. Please open an issue with reproduction steps at https://github.com/dprint/dprint-plugin-jupyter/issues\n{:#}\n\n== TEXT ==\n{}",
99        err,
100        text
101      );
102    }
103  }
104}
105
106fn get_cell_text_change(
107  file_text: &str,
108  cell: &jsonc_parser::ast::Value,
109  maybe_default_language: Option<&str>,
110  format_with_host: &mut impl FnMut(&Path, String) -> Result<Option<String>>,
111) -> Option<TextChange> {
112  let cell = cell.as_object()?;
113  let cell_language = get_cell_vscode_language_id(cell).or_else(|| {
114    let cell_type = cell.get_string("cell_type")?;
115    match cell_type.value.as_ref() {
116      "markdown" => Some("markdown"),
117      "code" => maybe_default_language,
118      _ => None,
119    }
120  })?;
121  let code_block = analyze_code_block(cell, file_text)?;
122  let file_path = language_to_path(cell_language)?;
123  let formatted_text = format_with_host(&file_path, code_block.source).ok()??;
124  // many plugins will add a final newline, but that doesn't look nice in notebooks, so trim it off
125  let formatted_text = formatted_text.trim_end();
126
127  let new_text = if code_block.is_array {
128    build_array_json_text(formatted_text, code_block.indent_text)
129  } else {
130    serde_json::to_string(&formatted_text).unwrap()
131  };
132
133  Some(TextChange {
134    range: code_block.replace_range,
135    new_text,
136  })
137}
138
/// The source text of one notebook cell plus what is needed to write a replacement back.
struct CodeBlockText<'a> {
  /// Whether the cell's `source` is an array of strings rather than a single string; the
  /// notebook schema permits either form.
  /// (https://github.com/jupyter/nbformat/blob/0708dd627d9ef81b12f231defb0d94dd7e80e3f4/nbformat/v4/nbformat.v4.5.schema.json#L460C7-L468C8)
  is_array: bool,
  /// Indentation in front of the first array element, reused for every rewritten element.
  /// Empty when `source` is a single string.
  indent_text: &'a str,
  /// Byte range of the file text that the formatted source replaces.
  replace_range: std::ops::Range<usize>,
  /// The cell's source, with array elements joined into one string.
  source: String,
}
147
148fn analyze_code_block<'a>(cell: &jsonc_parser::ast::Object<'a>, file_text: &'a str) -> Option<CodeBlockText<'a>> {
149  let mut indent_text = "";
150  let mut replace_range = std::ops::Range::default();
151  let mut is_array = false;
152  let cell_source = match &cell.get("source")?.value {
153    jsonc_parser::ast::Value::Array(items) => {
154      is_array = true;
155      let mut strings = Vec::with_capacity(items.elements.len());
156      for (i, element) in items.elements.iter().enumerate() {
157        let string_lit = element.as_string_lit()?;
158        if i == 0 {
159          indent_text = get_indent_text(file_text, string_lit.range.start);
160          replace_range.start = string_lit.range.start;
161        }
162        if i == items.elements.len() - 1 {
163          replace_range.end = string_lit.range.end;
164        }
165        strings.push(&string_lit.value);
166      }
167
168      let mut text = String::with_capacity(strings.iter().map(|s| s.len()).sum::<usize>());
169      for string in strings {
170        text.push_str(string);
171      }
172      text
173    }
174    jsonc_parser::ast::Value::StringLit(string) => {
175      replace_range = string.range.start..string.range.end;
176      string.value.to_string()
177    }
178    _ => return None,
179  };
180  Some(CodeBlockText {
181    is_array,
182    indent_text,
183    replace_range,
184    source: cell_source,
185  })
186}
187
188/// Turn the formatted text into a json array, split up by line breaks.
189fn build_array_json_text(formatted_text: &str, indent_text: &str) -> String {
190  let mut new_text = String::new();
191  let mut current_end_index = 0;
192  for (i, line) in formatted_text.split('\n').enumerate() {
193    current_end_index += line.len();
194    if i > 0 {
195      new_text.push_str(",\n");
196      new_text.push_str(indent_text);
197    }
198    let is_last_line = current_end_index == formatted_text.len();
199    new_text.push_str(
200      &serde_json::to_string(
201        if is_last_line {
202          Cow::Borrowed(line)
203        } else {
204          Cow::Owned(format!("{}\n", line))
205        }
206        .as_ref(),
207      )
208      .unwrap(),
209    );
210    current_end_index += 1;
211  }
212  new_text
213}
214
215fn get_metadata_language<'a>(root_obj: &'a jsonc_parser::ast::Object<'a>) -> Option<&'a str> {
216  let language_info = root_obj.get_object("metadata")?.get_object("language_info")?;
217  Some(&language_info.get_string("name")?.value)
218}
219
220fn get_cell_vscode_language_id<'a>(cell: &'a jsonc_parser::ast::Object<'a>) -> Option<&'a str> {
221  let cell_metadata = cell.get_object("metadata")?;
222  let cell_language_info = cell_metadata.get_object("vscode")?;
223  Some(&cell_language_info.get_string("languageId")?.value)
224}
225
/// Maps a language name to a synthetic `code_block.<ext>` path.
///
/// The name is matched case-insensitively. Returns `None` for languages without a known
/// extension.
fn language_to_path(language: &str) -> Option<PathBuf> {
  let normalized = language.to_lowercase();
  let extension = match normalized.as_str() {
    "bash" => "sh",
    "c++" => "cpp",
    "css" => "css",
    "csharp" => "cs",
    "html" => "html",
    "go" => "go",
    "kotlin" => "kt",
    "json" => "json",
    "julia" => "jl",
    "markdown" => "md",
    "typescript" => "ts",
    "javascript" => "js",
    "perl" => "perl",
    "php" => "php",
    "python" | "python3" => "py",
    "r" => "r",
    "ruby" => "rb",
    "scala" => "scala",
    "sql" => "sql",
    "yaml" => "yml",
    _ => return None,
  };
  Some(PathBuf::from(format!("code_block.{}", extension)))
}
252
/// Returns the indentation that sits directly in front of `start_pos`.
///
/// Only the whitespace run immediately before `start_pos` is considered: if that run contains a
/// newline, the text after its last newline is returned, otherwise the whole run is.
fn get_indent_text(file_text: &str, start_pos: usize) -> &str {
  let text_before = &file_text[..start_pos];
  let trailing_whitespace = &text_before[text_before.trim_end().len()..];
  match trailing_whitespace.rsplit_once('\n') {
    Some((_, indent)) => indent,
    None => trailing_whitespace,
  }
}
262
#[cfg(test)]
mod test {
  use super::*;

  /// Indentation is whatever whitespace follows the last newline before the position.
  #[test]
  fn test_get_indent_text() {
    let cases = [
      ("  hello", 2, "  "),
      ("\n  hello", 3, "  "),
      ("t\n  hello", 4, "  "),
      ("t\n\t\thello", 4, "\t\t"),
      ("hello", 0, ""),
      ("\nhello", 1, ""),
      ("\nhello", 2, ""),
    ];
    for (file_text, start_pos, expected) in cases {
      assert_eq!(
        get_indent_text(file_text, start_pos),
        expected,
        "file_text: {:?}, start_pos: {}",
        file_text,
        start_pos
      );
    }
  }

  /// A leading byte order mark is stripped whether or not any cell changes.
  #[test]
  fn formats_with_bom() {
    // no changes to code other than bom
    let bom_only_input = "\u{FEFF}{\"cells\":[{\"cell_type\":\"code\",\"source\":\"let x = 5;\"}]}";
    let bom_only_output = format_text(bom_only_input, |_, text| Ok(Some(text)))
      .unwrap()
      .unwrap();
    assert_eq!(
      bom_only_output,
      "{\"cells\":[{\"cell_type\":\"code\",\"source\":\"let x = 5;\"}]}"
    );

    // other changes as well
    let changed_input = "\u{FEFF}{
  \"cells\":[{
    \"cell_type\":\"code\",
    \"metadata\": {
      \"vscode\": {
       \"languageId\": \"typescript\"
      }
    },
    \"source\": \"let x = 5;\"
  }]
}
";
    let expected_output = "{
  \"cells\":[{
    \"cell_type\":\"code\",
    \"metadata\": {
      \"vscode\": {
       \"languageId\": \"typescript\"
      }
    },
    \"source\": \"let x = 5;_formatted\"
  }]
}
";
    let changed_output = format_text(changed_input, |_, text| Ok(Some(format!("{}_formatted", text))))
      .unwrap()
      .unwrap();
    assert_eq!(changed_output, expected_output);
  }
}