bq_schema_gen/
lib.rs

1//! BigQuery Schema Generator
2//!
3//! A Rust library and CLI tool for generating BigQuery schemas from JSON or CSV data.
4//!
5//! Unlike BigQuery's built-in auto-detect which only examines the first 500 records,
6//! this tool processes all records in the input to generate a complete schema.
7//!
8//! # Example
9//!
10//! ```rust
11//! use bq_schema_gen::{SchemaGenerator, GeneratorConfig, SchemaMap};
12//! use serde_json::json;
13//!
14//! let config = GeneratorConfig::default();
15//! let mut generator = SchemaGenerator::new(config);
16//! let mut schema_map = SchemaMap::new();
17//!
18//! // Process records
19//! let record = json!({"name": "test", "count": 42});
20//! generator.process_record(&record, &mut schema_map).unwrap();
21//!
22//! // Get the BigQuery schema
23//! let schema = generator.flatten_schema(&schema_map);
24//! ```
25
26pub mod diff;
27pub mod error;
28pub mod inference;
29pub mod input;
30pub mod output;
31pub mod schema;
32pub mod validate;
33pub mod watch;
34
35// Re-export commonly used types
36pub use error::{Error, ErrorLog, Result};
37pub use input::{CsvRecordIterator, JsonRecordIterator};
38pub use output::{
39    schema_to_json_string, write_schema_ddl, write_schema_debug_map, write_schema_json,
40    write_schema_json_schema, OutputFormat,
41};
42pub use schema::{
43    bq_schema_to_map, read_existing_schema_from_file, BqMode, BqSchemaField, BqType, EntryStatus,
44    GeneratorConfig, InputFormat, SchemaEntry, SchemaGenerator, SchemaMap,
45};
46pub use validate::{
47    validate_json_data, SchemaValidator, ValidationError, ValidationErrorType, ValidationOptions,
48    ValidationResult,
49};
50pub use watch::{run_watch, WatchConfig, WatchState};
51
52use std::io::{BufRead, Read, Write};
53
54/// High-level function to generate schema from a JSON reader.
55///
56/// If `existing_schema` is provided, the generated schema will be merged with it.
57pub fn generate_schema_from_json<R: BufRead, W: Write>(
58    input: R,
59    output: &mut W,
60    config: GeneratorConfig,
61    ignore_invalid_lines: bool,
62    debugging_interval: Option<usize>,
63    existing_schema: Option<SchemaMap>,
64) -> Result<Vec<ErrorLog>> {
65    let mut generator = SchemaGenerator::new(config);
66    let mut schema_map = existing_schema.unwrap_or_default();
67
68    let iter = JsonRecordIterator::new(input, ignore_invalid_lines);
69
70    for result in iter {
71        let (line_num, record) = result?;
72
73        if let Some(interval) = debugging_interval {
74            if line_num % interval == 0 {
75                eprintln!("Processing line {}", line_num);
76            }
77        }
78
79        if let Err(e) = generator.process_record(&record, &mut schema_map) {
80            if !ignore_invalid_lines {
81                return Err(e);
82            }
83        }
84    }
85
86    eprintln!("Processed {} lines", generator.line_number());
87
88    let schema = generator.flatten_schema(&schema_map);
89    write_schema_json(&schema, output)?;
90
91    Ok(generator.error_logs().to_vec())
92}
93
94/// High-level function to generate schema from a CSV reader.
95///
96/// If `existing_schema` is provided, the generated schema will be merged with it.
97pub fn generate_schema_from_csv<R: Read, W: Write>(
98    input: R,
99    output: &mut W,
100    config: GeneratorConfig,
101    debugging_interval: Option<usize>,
102    existing_schema: Option<SchemaMap>,
103) -> Result<Vec<ErrorLog>> {
104    let mut generator = SchemaGenerator::new(config);
105    let mut schema_map = existing_schema.unwrap_or_default();
106
107    let iter = CsvRecordIterator::new(input)?;
108
109    for result in iter {
110        let (line_num, record) = result?;
111
112        if let Some(interval) = debugging_interval {
113            if line_num % interval == 0 {
114                eprintln!("Processing line {}", line_num);
115            }
116        }
117
118        generator.process_record(&record, &mut schema_map)?;
119    }
120
121    eprintln!("Processed {} lines", generator.line_number());
122
123    let schema = generator.flatten_schema(&schema_map);
124    write_schema_json(&schema, output)?;
125
126    Ok(generator.error_logs().to_vec())
127}
128
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Quoted field names that both end-to-end tests expect in the written schema.
    const EXPECTED_FIELDS: [&str; 3] = ["\"name\"", "\"value\"", "\"active\""];

    /// Decodes the written schema as UTF-8 and checks that every expected field is mentioned.
    fn assert_schema_mentions_fields(written: Vec<u8>) {
        let rendered = String::from_utf8(written).unwrap();
        for field in EXPECTED_FIELDS.iter() {
            assert!(rendered.contains(*field));
        }
    }

    /// Two JSON records, the second introducing an extra field, yield all three fields.
    #[test]
    fn test_generate_schema_from_json() {
        let records = r#"{"name": "test", "value": 42}
{"name": "foo", "value": 123, "active": true}"#;
        let mut sink = Vec::new();

        let logs = generate_schema_from_json(
            Cursor::new(records),
            &mut sink,
            GeneratorConfig::default(),
            false,
            None,
            None,
        )
        .unwrap();

        assert!(logs.is_empty());
        assert_schema_mentions_fields(sink);
    }

    /// A CSV input with a header row and two data rows yields all three columns.
    #[test]
    fn test_generate_schema_from_csv() {
        let table = "name,value,active\ntest,42,true\nfoo,123,false";
        let mut sink = Vec::new();

        let csv_config = GeneratorConfig {
            input_format: InputFormat::Csv,
            ..Default::default()
        };

        let logs =
            generate_schema_from_csv(Cursor::new(table), &mut sink, csv_config, None, None)
                .unwrap();

        assert!(logs.is_empty());
        assert_schema_mentions_fields(sink);
    }
}