1use crate::writer::JsonlWriter;
14use crate::{
15 Declaration, Diagnostic, DiagnosticSeverity, GoalState, LeanFile, Provenance, Result,
16 TraceConfig, capture_provenance, collect_imports, detect_declaration, discover_lean_files,
17 run_lean_file,
18};
19use camino::{Utf8Component, Utf8Path, Utf8PathBuf};
20use serde::{Deserialize, Serialize};
21use std::collections::HashSet;
22use std::time::Duration;
23use tracing::{info, warn};
24
25const INSTRUCTIONS: &str = "Replace only the target span with a Lean proof that compiles.";
27
28const DEFAULT_EXCLUDE: &str = ".lake/";
30
31#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
33#[serde(rename_all = "snake_case")]
34pub enum MineKind {
35 Sorry,
37 Admit,
39 Error,
41}
42
43impl MineKind {
44 #[must_use]
46 pub const fn placeholder_keyword(self) -> Option<&'static str> {
47 match self {
48 Self::Sorry => Some("sorry"),
49 Self::Admit => Some("admit"),
50 Self::Error => None,
51 }
52 }
53}
54
55#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
62pub struct TargetSpan {
63 pub start_line: u32,
65 pub start_column: u32,
67 pub end_line: u32,
69 pub end_column: u32,
71 pub text: String,
73}
74
75#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
77pub struct AllowedEdit {
78 pub file: Utf8PathBuf,
80 pub start_line: u32,
82 pub end_line: u32,
84}
85
86#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
88pub struct MineTask {
89 pub task_id: String,
92 pub project: String,
94 pub file: LeanFile,
96 #[serde(skip_serializing_if = "Option::is_none")]
98 pub declaration: Option<Declaration>,
99 pub kind: MineKind,
101 pub line: u32,
103 pub column: u32,
105 pub imports: Vec<String>,
107 pub source_before: String,
109 pub target_span: TargetSpan,
111 pub source_after: String,
113 #[serde(skip_serializing_if = "Option::is_none")]
115 pub diagnostic: Option<Diagnostic>,
116 #[serde(skip_serializing_if = "Option::is_none")]
118 pub goal_state: Option<GoalState>,
119 pub allowed_edit: AllowedEdit,
121 pub instructions: String,
123}
124
125#[derive(Clone, Debug, Eq, PartialEq)]
127pub struct MineOptions {
128 pub kind: MineKind,
130 pub project: String,
132 pub lake_root: Utf8PathBuf,
134 pub recursive: bool,
136 pub timeout: Duration,
138 pub exclude: Vec<String>,
140}
141
142impl MineOptions {
143 #[must_use]
146 pub fn new(kind: MineKind, project: String, lake_root: Utf8PathBuf) -> Self {
147 Self {
148 kind,
149 project,
150 lake_root,
151 recursive: false,
152 timeout: Duration::from_secs(60),
153 exclude: vec![DEFAULT_EXCLUDE.to_owned()],
154 }
155 }
156}
157
158#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
160pub struct MineSummary {
161 pub files_scanned: usize,
163 pub tasks_written: usize,
165}
166
167pub async fn run_mine(
170 options: &MineOptions,
171 roots: &[Utf8PathBuf],
172 writer: &mut JsonlWriter,
173) -> Result<MineSummary> {
174 let files = collect_files(options, roots)?;
175 info!(count = files.len(), kind = ?options.kind, "discovered Lean files");
176
177 let provenance = match options.kind {
178 MineKind::Error => capture_provenance(options.lake_root.as_path()).await,
179 MineKind::Sorry | MineKind::Admit => Provenance::default(),
180 };
181
182 let mut summary = MineSummary::default();
183 for file in files {
184 let source = match std::fs::read_to_string(file.as_path()) {
185 Ok(source) => source,
186 Err(err) => {
187 warn!(%file, error = %err, "skipping unreadable file");
188 continue;
189 }
190 };
191 summary.files_scanned += 1;
192
193 let tasks = match options.kind {
194 MineKind::Sorry | MineKind::Admit => mine_placeholders(
195 &file,
196 &source,
197 options.kind,
198 &options.project,
199 options.lake_root.as_path(),
200 ),
201 MineKind::Error => {
202 let mut config =
203 TraceConfig::new(options.lake_root.clone()).timeout(options.timeout);
204 config.include_warnings = false;
205 let trace = run_lean_file(&config, &provenance, file.clone()).await;
206 mine_errors(
207 &file,
208 &source,
209 &trace.diagnostics,
210 &options.project,
211 options.lake_root.as_path(),
212 )
213 }
214 };
215
216 for task in &tasks {
217 writer.write_record(task)?;
218 summary.tasks_written += 1;
219 }
220 }
221
222 writer.flush()?;
223 Ok(summary)
224}
225
226#[must_use]
231pub fn mine_placeholders(
232 file: &LeanFile,
233 source: &str,
234 kind: MineKind,
235 project: &str,
236 root: &Utf8Path,
237) -> Vec<MineTask> {
238 let Some(keyword) = kind.placeholder_keyword() else {
239 return Vec::new();
240 };
241 let lines: Vec<&str> = source.lines().collect();
242 let imports = collect_imports(&lines);
243 let module = module_name(file.as_path(), root);
244 let mut seen_ids: HashSet<String> = HashSet::new();
245 let mut tasks = Vec::new();
246
247 for hit in scan_placeholders(source, keyword) {
248 let target_idx = (hit.line.saturating_sub(1)) as usize;
249 let declaration = detect_declaration(&lines, target_idx);
250 let span = SpanBytes {
251 start_byte: hit.byte_start,
252 end_byte: hit.byte_end,
253 start_line: hit.line,
254 start_column: hit.column,
255 end_line: hit.line,
256 end_column: hit.end_column,
257 };
258 tasks.push(build_task(
259 file,
260 source,
261 project,
262 &module,
263 &imports,
264 declaration,
265 kind,
266 hit.line,
267 hit.column,
268 &span,
269 None,
270 None,
271 &mut seen_ids,
272 ));
273 }
274 tasks
275}
276
277#[must_use]
283pub fn mine_errors(
284 file: &LeanFile,
285 source: &str,
286 diagnostics: &[Diagnostic],
287 project: &str,
288 root: &Utf8Path,
289) -> Vec<MineTask> {
290 let lines: Vec<&str> = source.lines().collect();
291 let spans = line_content_spans(source);
292 let imports = collect_imports(&lines);
293 let module = module_name(file.as_path(), root);
294 let mut seen_ids: HashSet<String> = HashSet::new();
295 let mut seen_spans: HashSet<(usize, usize)> = HashSet::new();
296 let mut tasks = Vec::new();
297
298 for diag in diagnostics
299 .iter()
300 .filter(|d| d.severity == DiagnosticSeverity::Error)
301 {
302 if let Some(diag_file) = &diag.file {
303 if diag_file.file_name() != file.as_path().file_name() {
304 continue;
305 }
306 }
307 let Some(line) = diag.line else { continue };
308 let target_idx = (line.saturating_sub(1)) as usize;
309 if target_idx >= spans.len() {
310 continue;
311 }
312
313 let declaration = detect_declaration(&lines, target_idx);
314 let span = match declaration_span(declaration.as_ref(), line, source, &spans) {
315 Some(span) => span,
316 None => continue,
317 };
318 if !seen_spans.insert((span.start_byte, span.end_byte)) {
319 continue;
320 }
321
322 let column = diag.column.unwrap_or(0);
323 tasks.push(build_task(
324 file,
325 source,
326 project,
327 &module,
328 &imports,
329 declaration,
330 MineKind::Error,
331 line,
332 column,
333 &span,
334 Some(diag.clone()),
335 diag.goal_state.clone(),
336 &mut seen_ids,
337 ));
338 }
339 tasks
340}
341
342struct SpanBytes {
344 start_byte: usize,
345 end_byte: usize,
346 start_line: u32,
347 start_column: u32,
348 end_line: u32,
349 end_column: u32,
350}
351
352fn declaration_span(
355 declaration: Option<&Declaration>,
356 error_line: u32,
357 source: &str,
358 spans: &[(usize, usize)],
359) -> Option<SpanBytes> {
360 let (start_line, end_line) = match declaration {
361 Some(decl) => (decl.start_line, decl.end_line),
362 None => (error_line, error_line),
363 };
364 let start = spans.get((start_line.saturating_sub(1)) as usize)?;
365 let end = spans.get((end_line.saturating_sub(1)) as usize)?;
366 let end_column = codepoint_len(source.get(end.0..end.1).unwrap_or(""));
367 Some(SpanBytes {
368 start_byte: start.0,
369 end_byte: end.1,
370 start_line,
371 start_column: 0,
372 end_line,
373 end_column,
374 })
375}
376
377#[allow(clippy::too_many_arguments)]
379fn build_task(
380 file: &LeanFile,
381 source: &str,
382 project: &str,
383 module: &str,
384 imports: &[String],
385 declaration: Option<Declaration>,
386 kind: MineKind,
387 line: u32,
388 column: u32,
389 span: &SpanBytes,
390 diagnostic: Option<Diagnostic>,
391 goal_state: Option<GoalState>,
392 seen_ids: &mut HashSet<String>,
393) -> MineTask {
394 let text = source
395 .get(span.start_byte..span.end_byte)
396 .unwrap_or("")
397 .to_owned();
398 let source_before = source.get(..span.start_byte).unwrap_or("").to_owned();
399 let source_after = source.get(span.end_byte..).unwrap_or("").to_owned();
400
401 let decl_name = declaration.as_ref().and_then(|decl| decl.name.clone());
402 let base_id = match &decl_name {
403 Some(name) => format!("{module}.{name}:{line}"),
404 None => format!("{module}:{line}"),
405 };
406 let task_id = if seen_ids.contains(&base_id) {
407 format!("{base_id}:{column}")
408 } else {
409 base_id
410 };
411 seen_ids.insert(task_id.clone());
412
413 MineTask {
414 task_id,
415 project: project.to_owned(),
416 file: file.clone(),
417 declaration,
418 kind,
419 line,
420 column,
421 imports: imports.to_vec(),
422 source_before,
423 target_span: TargetSpan {
424 start_line: span.start_line,
425 start_column: span.start_column,
426 end_line: span.end_line,
427 end_column: span.end_column,
428 text,
429 },
430 source_after,
431 diagnostic,
432 goal_state,
433 allowed_edit: AllowedEdit {
434 file: file.as_path().clone(),
435 start_line: span.start_line,
436 end_line: span.end_line,
437 },
438 instructions: INSTRUCTIONS.to_owned(),
439 }
440}
441
442fn collect_files(options: &MineOptions, roots: &[Utf8PathBuf]) -> Result<Vec<LeanFile>> {
444 let mut files = Vec::new();
445 for root in roots {
446 files.extend(discover_lean_files(root, options.recursive)?);
447 }
448 files.sort();
449 files.dedup();
450 files.retain(|file| {
451 let path = file.as_path().as_str();
452 !options
453 .exclude
454 .iter()
455 .any(|pattern| path.contains(pattern.as_str()))
456 });
457 Ok(files)
458}
459
460pub(crate) fn module_name(file: &Utf8Path, root: &Utf8Path) -> String {
462 let relative = file.strip_prefix(root).unwrap_or(file);
463 let no_extension = relative.with_extension("");
464 let parts: Vec<&str> = no_extension
465 .components()
466 .filter_map(|component| match component {
467 Utf8Component::Normal(part) => Some(part),
468 _ => None,
469 })
470 .collect();
471 if parts.is_empty() {
472 no_extension.as_str().to_owned()
473 } else {
474 parts.join(".")
475 }
476}
477
478fn codepoint_len(text: &str) -> u32 {
480 text.chars().count() as u32
481}
482
483fn line_content_spans(source: &str) -> Vec<(usize, usize)> {
487 let bytes = source.as_bytes();
488 let mut spans = Vec::new();
489 let mut start = 0usize;
490 let mut i = 0usize;
491 while i < bytes.len() {
492 if bytes[i] == b'\n' {
493 let mut end = i;
494 if end > start && bytes[end - 1] == b'\r' {
495 end -= 1;
496 }
497 spans.push((start, end));
498 start = i + 1;
499 }
500 i += 1;
501 }
502 if start < bytes.len() {
503 let mut end = bytes.len();
504 if end > start && bytes[end - 1] == b'\r' {
505 end -= 1;
506 }
507 spans.push((start, end));
508 }
509 spans
510}
511
512struct Hit {
514 byte_start: usize,
515 byte_end: usize,
516 line: u32,
517 column: u32,
518 end_column: u32,
519}
520
521enum ScanState {
523 Normal,
524 LineComment,
525 BlockComment(u32),
526 Str,
527}
528
529fn is_word_char(c: char) -> bool {
531 c.is_alphanumeric() || c == '_' || c == '\''
532}
533
534fn peek(chars: &[(usize, char)], idx: usize) -> Option<char> {
536 chars.get(idx).map(|&(_, c)| c)
537}
538
539fn word_matches(chars: &[(usize, char)], i: usize, j: usize, keyword: &str) -> bool {
541 let expected: Vec<char> = keyword.chars().collect();
542 if j - i != expected.len() {
543 return false;
544 }
545 chars[i..j]
546 .iter()
547 .map(|&(_, c)| c)
548 .eq(expected.iter().copied())
549}
550
551fn scan_placeholders(source: &str, keyword: &str) -> Vec<Hit> {
554 let chars: Vec<(usize, char)> = source.char_indices().collect();
555 let n = chars.len();
556 let mut hits = Vec::new();
557 let mut i = 0usize;
558 let mut line = 1u32;
559 let mut col = 0u32;
560 let mut state = ScanState::Normal;
561
562 while i < n {
563 let (byte, c) = chars[i];
564 match state {
565 ScanState::LineComment => {
566 if c == '\n' {
567 state = ScanState::Normal;
568 i += 1;
569 line += 1;
570 col = 0;
571 } else {
572 i += 1;
573 col += 1;
574 }
575 }
576 ScanState::BlockComment(depth) => {
577 if c == '/' && peek(&chars, i + 1) == Some('-') {
578 state = ScanState::BlockComment(depth + 1);
579 i += 2;
580 col += 2;
581 } else if c == '-' && peek(&chars, i + 1) == Some('/') {
582 let next = depth - 1;
583 state = if next == 0 {
584 ScanState::Normal
585 } else {
586 ScanState::BlockComment(next)
587 };
588 i += 2;
589 col += 2;
590 } else if c == '\n' {
591 i += 1;
592 line += 1;
593 col = 0;
594 } else {
595 i += 1;
596 col += 1;
597 }
598 }
599 ScanState::Str => {
600 if c == '\\' {
601 i += 1;
602 col += 1;
603 if let Some(escaped) = peek(&chars, i) {
604 if escaped == '\n' {
605 line += 1;
606 col = 0;
607 } else {
608 col += 1;
609 }
610 i += 1;
611 }
612 } else if c == '"' {
613 state = ScanState::Normal;
614 i += 1;
615 col += 1;
616 } else if c == '\n' {
617 i += 1;
618 line += 1;
619 col = 0;
620 } else {
621 i += 1;
622 col += 1;
623 }
624 }
625 ScanState::Normal => {
626 if c == '-' && peek(&chars, i + 1) == Some('-') {
627 state = ScanState::LineComment;
628 i += 2;
629 col += 2;
630 } else if c == '/' && peek(&chars, i + 1) == Some('-') {
631 state = ScanState::BlockComment(1);
632 i += 2;
633 col += 2;
634 } else if c == '"' {
635 state = ScanState::Str;
636 i += 1;
637 col += 1;
638 } else if is_word_char(c) {
639 let start_line = line;
640 let start_col = col;
641 let mut j = i;
642 while j < n && is_word_char(chars[j].1) {
643 j += 1;
644 }
645 let word_len = (j - i) as u32;
646 let end_byte = if j < n { chars[j].0 } else { source.len() };
647 let preceded_by_dot = i > 0 && chars[i - 1].1 == '.';
648 if !preceded_by_dot && word_matches(&chars, i, j, keyword) {
649 hits.push(Hit {
650 byte_start: byte,
651 byte_end: end_byte,
652 line: start_line,
653 column: start_col,
654 end_column: start_col + word_len,
655 });
656 }
657 i = j;
658 col += word_len;
659 } else if c == '\n' {
660 i += 1;
661 line += 1;
662 col = 0;
663 } else {
664 i += 1;
665 col += 1;
666 }
667 }
668 }
669 }
670 hits
671}
672
673#[cfg(test)]
674mod tests {
675 use super::*;
676
677 const SAMPLE: &str = "import Init\n\nnamespace Demo\n\ntheorem foo (n : Nat) : n = n := by\n sorry\n\ndef bar : Nat := by admit\n\nend Demo\n";
678
679 fn lean(path: &str) -> LeanFile {
680 LeanFile(Utf8PathBuf::from(path))
681 }
682
683 fn root() -> Utf8PathBuf {
684 Utf8PathBuf::from(".")
685 }
686
687 #[test]
688 fn reconstructs_file_from_before_span_after() {
689 let file = lean("Demo.lean");
690 let tasks = mine_placeholders(&file, SAMPLE, MineKind::Sorry, "demo", root().as_path());
691 assert_eq!(tasks.len(), 1);
692 let task = &tasks[0];
693 let rebuilt = format!(
694 "{}{}{}",
695 task.source_before, task.target_span.text, task.source_after
696 );
697 assert_eq!(rebuilt, SAMPLE);
698 }
699
700 #[test]
701 fn sorry_task_has_precise_single_span() {
702 let file = lean("Demo.lean");
703 let tasks = mine_placeholders(&file, SAMPLE, MineKind::Sorry, "demo", root().as_path());
704 let task = &tasks[0];
705 assert_eq!(task.kind, MineKind::Sorry);
706 assert_eq!(task.target_span.text, "sorry");
707 assert_eq!(task.line, 6);
708 assert_eq!(task.column, 2);
709 assert_eq!(task.target_span.start_line, task.target_span.end_line);
710 assert_eq!(task.allowed_edit.start_line, 6);
711 assert_eq!(task.allowed_edit.end_line, 6);
712 assert_eq!(task.target_span.end_column, 7);
713 assert_eq!(task.instructions, INSTRUCTIONS);
714 }
715
716 #[test]
717 fn task_id_uses_module_and_declaration() {
718 let file = lean("Demo/Basic.lean");
719 let tasks = mine_placeholders(&file, SAMPLE, MineKind::Sorry, "demo", Utf8Path::new("."));
720 assert_eq!(tasks[0].task_id, "Demo.Basic.foo:6");
721 assert_eq!(tasks[0].imports, vec!["import Init"]);
722 let decl = tasks[0].declaration.as_ref().and_then(|d| d.name.clone());
723 assert_eq!(decl.as_deref(), Some("foo"));
724 }
725
726 #[test]
727 fn admit_is_mined_only_for_admit_kind() {
728 let file = lean("Demo.lean");
729 let sorry_tasks =
730 mine_placeholders(&file, SAMPLE, MineKind::Sorry, "demo", root().as_path());
731 assert!(sorry_tasks.iter().all(|t| t.target_span.text == "sorry"));
732 let admit_tasks =
733 mine_placeholders(&file, SAMPLE, MineKind::Admit, "demo", root().as_path());
734 assert_eq!(admit_tasks.len(), 1);
735 assert_eq!(admit_tasks[0].target_span.text, "admit");
736 assert_eq!(admit_tasks[0].line, 8);
737 }
738
739 #[test]
740 fn skips_placeholders_in_comments_and_strings() {
741 let source = "-- a stray sorry here\n/- block sorry -/\ndef s : String := \"sorry\"\ntheorem real : True := by\n sorry\n";
742 let file = lean("S.lean");
743 let tasks = mine_placeholders(&file, source, MineKind::Sorry, "demo", root().as_path());
744 assert_eq!(tasks.len(), 1);
745 assert_eq!(tasks[0].line, 5);
746 }
747
748 #[test]
749 fn skips_qualified_and_partial_words() {
750 let source = "def a := Foo.sorry\ndef b := sorryAx\ndef c := mysorry\n";
751 let file = lean("Q.lean");
752 let tasks = mine_placeholders(&file, source, MineKind::Sorry, "demo", root().as_path());
753 assert!(tasks.is_empty());
754 }
755
756 #[test]
757 fn error_task_targets_enclosing_declaration() {
758 let source = "import Init\n\ntheorem broken : 1 = 2 := by\n rfl\n";
759 let file = lean("Broken.lean");
760 let diag = Diagnostic {
761 file: Some(Utf8PathBuf::from("Broken.lean")),
762 line: Some(4),
763 column: Some(2),
764 severity: DiagnosticSeverity::Error,
765 message: "error: unsolved goals\n⊢ 1 = 2".to_owned(),
766 goal_state: Some(GoalState("⊢ 1 = 2".to_owned())),
767 };
768 let tasks = mine_errors(&file, source, &[diag], "demo", root().as_path());
769 assert_eq!(tasks.len(), 1);
770 let task = &tasks[0];
771 assert_eq!(task.kind, MineKind::Error);
772 assert_eq!(task.target_span.start_line, 3);
773 assert_eq!(task.target_span.end_line, 4);
774 assert!(task.target_span.text.contains("theorem broken"));
775 assert!(task.target_span.text.contains("rfl"));
776 assert_eq!(task.allowed_edit.start_line, 3);
777 assert_eq!(task.allowed_edit.end_line, 4);
778 assert!(task.goal_state.is_some());
779 assert!(task.diagnostic.is_some());
780 let rebuilt = format!(
781 "{}{}{}",
782 task.source_before, task.target_span.text, task.source_after
783 );
784 assert_eq!(rebuilt, source);
785 }
786
787 #[test]
788 fn error_tasks_dedup_by_span() {
789 let source = "theorem broken : 1 = 2 := by\n rfl\n";
790 let file = lean("Broken.lean");
791 let make = |line: u32| Diagnostic {
792 file: Some(Utf8PathBuf::from("Broken.lean")),
793 line: Some(line),
794 column: Some(0),
795 severity: DiagnosticSeverity::Error,
796 message: "error: something".to_owned(),
797 goal_state: None,
798 };
799 let tasks = mine_errors(&file, source, &[make(1), make(2)], "demo", root().as_path());
800 assert_eq!(tasks.len(), 1);
801 }
802
803 #[test]
804 fn warnings_are_ignored_for_error_mining() {
805 let source = "theorem t : True := trivial\n";
806 let file = lean("T.lean");
807 let warning = Diagnostic {
808 file: Some(Utf8PathBuf::from("T.lean")),
809 line: Some(1),
810 column: Some(0),
811 severity: DiagnosticSeverity::Warning,
812 message: "warning: declaration uses 'sorry'".to_owned(),
813 goal_state: None,
814 };
815 let tasks = mine_errors(&file, source, &[warning], "demo", root().as_path());
816 assert!(tasks.is_empty());
817 }
818
819 #[test]
820 fn module_name_drops_root_and_extension() {
821 assert_eq!(
822 module_name(Utf8Path::new("./Demo/Basic.lean"), Utf8Path::new(".")),
823 "Demo.Basic"
824 );
825 assert_eq!(
826 module_name(
827 Utf8Path::new("/tmp/proj/Demo.lean"),
828 Utf8Path::new("/tmp/proj")
829 ),
830 "Demo"
831 );
832 }
833
834 #[test]
835 fn line_content_spans_match_str_lines() {
836 let source = "abc\n\ndef\n";
837 let spans = line_content_spans(source);
838 let rebuilt: Vec<&str> = spans.iter().map(|&(a, b)| &source[a..b]).collect();
839 let expected: Vec<&str> = source.lines().collect();
840 assert_eq!(rebuilt, expected);
841 }
842}