// NOTE(review): lines 1-110 of this chunk contained only the bare integers
// 1..=110 — stray line-number residue from a broken extraction, not valid Rust.
// Removed so the file parses; nothing substantive was lost. If this region
// originally held code, recover it from version control.
//! Full pipeline integration for data ingestion, inference, and export
//!
//! This module provides a complete pipeline that orchestrates:
//! - Data ingestion into staging database
//! - Schema inference from staged data
//! - LLM-enhanced schema refinement (optional)
//! - Target schema mapping (optional)
//! - Data export to Parquet/target format
//! - ODCS contract generation
//!
//! # Example
//!
//! ```rust,ignore
//! use data_modelling_core::pipeline::{PipelineConfig, PipelineExecutor, PipelineStage};
//!
//! let config = PipelineConfig::new()
//! .with_source("/data/input")
//! .with_database("staging.duckdb")
//! .with_output_dir("/data/output")
//! .with_stages(vec![
//! PipelineStage::Ingest,
//! PipelineStage::Infer,
//! PipelineStage::Export,
//! ]);
//!
//! let mut executor = PipelineExecutor::new(config)?;
//! let report = executor.run()?;
//!
//! println!("Pipeline completed in {}", report.duration_formatted());
//! ```
//!
//! # Pipeline Stages
//!
//! 1. **Ingest**: Load JSON/JSONL files into staging database
//! 2. **Infer**: Infer schema from staged data using statistical analysis
//! 3. **Refine** (optional): Enhance schema using LLM with documentation context
//! 4. **Map** (optional): Map inferred schema to target schema
//! 5. **Export**: Export data to Parquet or other target format
//! 6. **Generate**: Generate ODCS data contracts
//!
//! # Checkpointing
//!
//! The pipeline supports checkpointing for resume functionality:
//!
//! ```rust,ignore
//! // Resume from previous run
//! let config = PipelineConfig::new()
//! .with_source("/data/input")
//! .with_resume(true);
//!
//! let mut executor = PipelineExecutor::new(config)?;
//! let report = executor.run()?; // Continues from last checkpoint
//! ```
//!
//! # Dry Run
//!
//! Validate inputs without executing:
//!
//! ```rust,ignore
//! let config = PipelineConfig::new()
//! .with_source("/data/input")
//! .with_dry_run(true);
//!
//! let mut executor = PipelineExecutor::new(config)?;
//! let report = executor.run()?; // Validates but doesn't execute
//! ```
// NOTE(review): these four re-exports are truncated — `pub use ;` is not valid
// Rust (the paths after `use` were lost). The module docs above reference
// `PipelineConfig`, `PipelineExecutor`, `PipelineStage`, and a report type
// exposing `duration_formatted()`, so these presumably re-exported those four
// items from submodules (e.g. `pub use config::PipelineConfig;`) — the actual
// module paths cannot be determined from this file alone. TODO: restore the
// original paths from version control; the file will not compile until then.
pub use ;
pub use ;
pub use ;
pub use ;
/// Run a pipeline with the given configuration
///
/// This is a convenience function for simple pipeline execution.