1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
//! Extract bibliographic data from MARC records and export to CSV.
//!
//! This example demonstrates reading MARC records (from gzip-compressed files or raw binary)
//! and extracting publication dates, authors, and titles into a CSV format.
//!
//! # Usage
//!
//! ```sh
//! cargo run --example marc_to_csv -- <input_file.mrc[.gz]> [output_file.csv]
//! ```
//!
//! If no output file is specified, writes to stdout.
//!
//! # Examples
//!
//! ```sh
//! cargo run --example marc_to_csv -- records.mrc
//! cargo run --example marc_to_csv -- records.mrc.gz output.csv
//! cargo run --example marc_to_csv -- BooksAll.2016.part01.utf8.gz books.csv
//! ```
use std::env;
use std::fs::File;
use std::io::{BufReader, Write};
use flate2::read::GzDecoder;
use mrrc::MarcReader;
/// Command-line entry point: converts a MARC file into a three-column CSV.
///
/// The first argument is the input path; it is read through a gzip decoder when its
/// extension is `gz` (compared case-insensitively) and as raw bytes otherwise. The
/// optional second argument is the CSV output path; without it the CSV goes to stdout.
///
/// The output is a `title,author,publication_date` header followed by one row per
/// record. Values come from 245$a, 100$a (falling back to 110$a) and 260$c (falling
/// back to the four-digit year in bytes 7..11 of control field 008); a missing value
/// is written as `N/A`. Read errors and the final summary are printed to stderr.
///
/// # Errors
///
/// Returns an error if the input file cannot be opened, the output file cannot be
/// created, writing or flushing the output fails, or the reader fails too many times
/// in a row (see `MAX_CONSECUTIVE_ERRORS` in the body).
fn main() -> anyhow::Result<()> {
    // Give up after this many back-to-back read failures. A failed read is normally
    // skipped, but a reader that keeps failing would otherwise spin here forever.
    // NOTE(review): assumes `read_record` can keep returning `Err` on a broken stream
    // (e.g. corrupt gzip data) — confirm against the mrrc reader's behaviour.
    const MAX_CONSECUTIVE_ERRORS: u32 = 1000;

    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
        // The argument list may be empty, so do not index `args[0]` unconditionally.
        let program = args.first().map_or("marc_to_csv", String::as_str);
        eprintln!("Usage: {program} <input_file.mrc[.gz]> [output_file.csv]");
        eprintln!();
        eprintln!("Arguments:");
        eprintln!(" input_file Path to MARC file (supports .gz compression)");
        eprintln!(" output_file Optional CSV output file (default: stdout)");
        std::process::exit(1);
    }

    let input_path = &args[1];
    let output_path = args.get(2).map(String::as_str);

    // Open input file
    let file = File::open(input_path)
        .map_err(|e| anyhow::anyhow!("Failed to open input file '{input_path}': {e}"))?;

    // Compression is detected from the file extension only, not from the file contents.
    let is_gzip = std::path::Path::new(input_path)
        .extension()
        .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"));

    // Create appropriate reader based on compression
    let reader: Box<dyn std::io::Read> = if is_gzip {
        Box::new(GzDecoder::new(BufReader::new(file)))
    } else {
        Box::new(BufReader::new(file))
    };
    let mut marc_reader = MarcReader::new(reader);

    // Prepare output. The sink is buffered because rows are emitted as many small
    // writes; stdout is locked once instead of on every write.
    let sink: Box<dyn Write> = match output_path {
        Some(path) => Box::new(
            File::create(path)
                .map_err(|e| anyhow::anyhow!("Failed to create output file '{path}': {e}"))?,
        ),
        None => Box::new(std::io::stdout().lock()),
    };
    let mut output = std::io::BufWriter::new(sink);

    // Write CSV header
    writeln!(output, "title,author,publication_date")?;

    let mut record_count: u64 = 0;
    let mut error_count: u64 = 0;
    let mut consecutive_errors: u32 = 0;

    // Process records
    loop {
        match marc_reader.read_record() {
            Ok(Some(record)) => {
                record_count += 1;
                consecutive_errors = 0;

                // Extract title (field 245, subfield 'a')
                let title = record
                    .get_field("245")
                    .and_then(|f| f.get_subfield('a'))
                    .unwrap_or("N/A");

                // Extract author (field 100, subfield 'a' - primary author),
                // falling back to field 110 (corporate author)
                let author = record
                    .get_field("100")
                    .and_then(|f| f.get_subfield('a'))
                    .or_else(|| record.get_field("110").and_then(|f| f.get_subfield('a')))
                    .unwrap_or("N/A");

                // Extract publication date: field 260 (Publication, Distribution, Etc.)
                // subfield 'c' first, then the year stored in control field 008.
                let pub_date = record
                    .get_field("260")
                    .and_then(|f| f.get_subfield('c'))
                    .or_else(|| {
                        record
                            .get_control_field("008")
                            // `get` returns None for a short field or a range that does
                            // not fall on character boundaries, instead of panicking.
                            .and_then(|field_008| field_008.get(7..11))
                            // Only accept something that looks like a year: four ASCII
                            // digits (which also rules out blanks) other than "0000".
                            .filter(|year| {
                                *year != "0000" && year.chars().all(|c| c.is_ascii_digit())
                            })
                    })
                    .unwrap_or("N/A");

                // Write CSV row with proper escaping
                write_csv_row(&mut output, &[title, author, pub_date])?;
            }
            Ok(None) => {
                // End of file
                break;
            }
            Err(e) => {
                error_count += 1;
                consecutive_errors += 1;
                eprintln!("Error reading record {}: {}", record_count + 1, e);
                if consecutive_errors >= MAX_CONSECUTIVE_ERRORS {
                    // Keep the rows produced so far before reporting the failure.
                    output.flush()?;
                    anyhow::bail!(
                        "Aborting after {consecutive_errors} consecutive read errors \
                         ({record_count} records processed)"
                    );
                }
                // Otherwise continue processing remaining records
            }
        }
    }

    // Flush explicitly: dropping a BufWriter discards any error from the final write.
    output.flush()?;

    eprintln!("Processed {record_count} records with {error_count} errors");
    if let Some(path) = output_path {
        eprintln!("CSV written to: {path}");
    }
    Ok(())
}
/// Writes one CSV row to `writer`, terminated by a single `\n`.
///
/// Fields are separated by commas. A field that contains a comma, a double quote, a
/// line feed or a carriage return is wrapped in double quotes with every embedded
/// double quote doubled; any other field is written unchanged. An empty `fields`
/// slice produces just the line terminator.
///
/// # Errors
///
/// Returns an error if any write to `writer` fails.
fn write_csv_row<W: Write>(writer: &mut W, fields: &[&str]) -> anyhow::Result<()> {
    for (i, field) in fields.iter().enumerate() {
        if i > 0 {
            writer.write_all(b",")?;
        }
        // A bare '\r' is a line break to many CSV readers, so it must force quoting
        // just like '\n' does.
        let needs_quoting = field.contains(|c: char| matches!(c, ',' | '"' | '\n' | '\r'));
        if needs_quoting {
            // Escape the whole field up front so it goes out in one write.
            write!(writer, "\"{}\"", field.replace('"', "\"\""))?;
        } else {
            writer.write_all(field.as_bytes())?;
        }
    }
    writer.write_all(b"\n")?;
    Ok(())
}