wilayah 0.6.0

Location lookup for Indonesian villages by GPS coordinates or name
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
//! Build pipeline for constructing the `wilayah` location database.
//!
//! This module provides an end-to-end pipeline that:
//! 1. Downloads the official Kemendagri PDF listing all Indonesian villages
//! 2. Extracts and parses village records from the PDF text
//! 3. Fetches village polygon boundaries from the BIG ArcGIS API and computes centroids
//! 4. Merges the data, using kecamatan centroids as fallback for new villages
//! 5. Builds a SQLite database with an RTree spatial index and FTS5 full-text search
//!
//! # Example
//!
//! ```no_run
//! use wilayah::builder::Pipeline;
//!
//! let output = Pipeline::new()
//!     .output(std::path::Path::new("data/locations.db"))
//!     .run()
//!     .expect("pipeline failed");
//!
//! println!("Built database with {} villages", output.village_count);
//! ```
//!
//! The pipeline is designed to be reproducible and transparent, sourcing data from
//! official government publications and APIs. The resulting database is embedded
//! into the `wilayah` crate at compile time via the build script.

mod big_api;
mod db_create;
mod parse;
mod pdf;
mod spatial;
mod util;

pub use parse::ParseOutputDetail;

use std::path::{Path, PathBuf};

/// The government decree number and year that the Kemendagri PDF data is based on.
///
/// [`Pipeline::new`] uses this as the default decree string.
pub const DATA_DECREE: &str = "Kepmendagri No 300.2.2-2138 Tahun 2025";

/// Default download URL for the Kemendagri PDF (a Google Drive download link).
///
/// [`Pipeline::new`] uses this unless overridden with `Pipeline::pdf_url`.
const PDF_URL: &str =
    "https://drive.google.com/uc?export=download&id=1o_m621D00TtwCwQMLn8XUnV3nolamPDm";
/// Default BIG ArcGIS endpoint (the `MapServer/2/query` path of the
/// `BAPANAS/Batas_Administrasi` service).
///
/// [`Pipeline::new`] uses this unless overridden with `Pipeline::big_api_url`.
const BIG_API_URL: &str =
    "https://geoservices.big.go.id/gis/rest/services/BAPANAS/Batas_Administrasi/MapServer/2/query";

/// How to classify multi-ring polygon features.
///
/// BIG ArcGIS data can return features with multiple rings. Some rings are
/// separate outer boundaries (e.g., an island village spanning multiple
/// islands), while, rarely, a ring is a hole (an enclave).
///
/// [`Pipeline::new`] defaults to [`RingClassification::SeparateRings`]; change it
/// with [`Pipeline::ring_classification`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RingClassification {
    /// Treat all rings as separate outer polygons (MultiPolygon).
    ///
    /// A point in an enclave would match both the surrounding village and
    /// the enclave village. This is correct for 99%+ of Indonesian village
    /// boundaries where holes are essentially nonexistent.
    SeparateRings,
    /// Use spatial containment to detect holes.
    ///
    /// Rings contained within an outer ring become interior rings (holes).
    /// A point inside a hole will NOT match the outer village. This is fully
    /// correct but adds ~50 lines of classification logic at build time.
    ClassifyHoles,
}

/// Error type returned when a pipeline step fails.
///
/// Carries a descriptive message indicating what went wrong during pipeline
/// execution (e.g., download failure, parsing error, database creation
/// failure) and, optionally, the lower-level error that caused it.
///
/// * `Display` prints only the message.
/// * [`std::error::Error::source`] exposes the underlying cause, if any.
/// * `Debug` prints the message followed by the *entire* cause chain, so the
///   root cause is still visible in `expect()` / `unwrap()` panics.
pub struct PipelineError {
    /// Human-readable description of the failure.
    message: String,
    /// The error that triggered this one, if it was produced by wrapping.
    source: Option<Box<dyn std::error::Error + Send + Sync>>,
}

impl PipelineError {
    /// Wraps an arbitrary error, reusing its `Display` text as the message and
    /// keeping the error itself as the source.
    fn from_error<E: std::error::Error + Send + Sync + 'static>(e: E) -> Self {
        PipelineError {
            message: e.to_string(),
            source: Some(Box::new(e)),
        }
    }

    /// Creates a new `PipelineError` with the given message and no source.
    pub fn new(msg: impl Into<String>) -> Self {
        PipelineError {
            message: msg.into(),
            source: None,
        }
    }

    /// Wraps this error with additional context message, preserving the original as `source`.
    pub fn context(self, msg: impl Into<String>) -> Self {
        PipelineError {
            message: msg.into(),
            source: Some(Box::new(self)),
        }
    }
}

impl std::fmt::Display for PipelineError {
    /// Writes only the top-level message; causes are reachable via `source()`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.message)
    }
}

impl std::fmt::Debug for PipelineError {
    /// Writes `PipelineError(<message>)`, appending `, source: <cause>` when a
    /// cause exists and `: <deeper cause>` for every further level of the chain.
    ///
    /// Output for errors with zero or one cause is `PipelineError(msg)` and
    /// `PipelineError(msg, source: cause)` respectively.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "PipelineError({}", self.message)?;

        // Walk the full chain rather than stopping at the first cause, so the
        // root cause of a multiply-wrapped error is not silently dropped.
        let mut separator = ", source: ";
        let mut cause = std::error::Error::source(self);
        while let Some(err) = cause {
            write!(f, "{}{}", separator, err)?;
            separator = ": ";
            cause = err.source();
        }

        f.write_str(")")
    }
}

impl std::error::Error for PipelineError {
    /// Returns the wrapped cause, if this error was created from another error.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        self.source
            .as_deref()
            .map(|cause| cause as &(dyn std::error::Error + 'static))
    }
}

/// Extension trait that attaches a context message to a fallible result.
trait PipelineResultExt<T> {
    /// Converts the error side into a [`PipelineError`] whose message is `msg`
    /// and whose source is the original error. `Ok` values pass through as-is.
    fn ctx(self, msg: impl Into<String>) -> Result<T, PipelineError>;
}

impl<T, E: std::error::Error + Send + Sync + 'static> PipelineResultExt<T> for Result<T, E> {
    fn ctx(self, msg: impl Into<String>) -> Result<T, PipelineError> {
        match self {
            Ok(value) => Ok(value),
            // `msg` is only converted when there is actually an error to wrap.
            Err(cause) => Err(PipelineError {
                message: msg.into(),
                source: Some(Box::new(cause)),
            }),
        }
    }
}

impl From<std::io::Error> for PipelineError {
    fn from(e: std::io::Error) -> Self {
        Self::from_error(e)
    }
}

impl From<rusqlite::Error> for PipelineError {
    fn from(e: rusqlite::Error) -> Self {
        Self::from_error(e)
    }
}

impl From<serde_json::Error> for PipelineError {
    fn from(e: serde_json::Error) -> Self {
        Self::from_error(e)
    }
}

/// Output of a successful pipeline run.
///
/// Returned by `Pipeline::run`. Every `parsed_*_path` field is `Some` only when
/// `save_parsed_villages` was configured on the pipeline; `poly_db_path` is
/// `Some` only when `include_polygons(true)` was set.
///
/// Implements `Debug` and `Clone` so results can be logged and passed around
/// by callers.
#[derive(Debug, Clone)]
pub struct PipelineOutput {
    /// Path to the built SQLite database file.
    pub db_path: PathBuf,
    /// Path to the built polygon database file, if `include_polygons(true)` was set.
    pub poly_db_path: Option<PathBuf>,
    /// Path to the saved parsed villages JSON, if `save_parsed_villages` was set.
    pub parsed_villages_path: Option<PathBuf>,
    /// Path to the saved parsed districts JSON, if `save_parsed_villages` was set.
    pub parsed_districts_path: Option<PathBuf>,
    /// Path to the saved parsed provinces JSON, if `save_parsed_villages` was set.
    pub parsed_provinces_path: Option<PathBuf>,
    /// Path to the saved parsed cities JSON, if `save_parsed_villages` was set.
    pub parsed_cities_path: Option<PathBuf>,
    /// Path to the saved parsed island summaries JSON, if `save_parsed_villages` was set.
    pub parsed_island_summaries_path: Option<PathBuf>,
    /// Path to the saved parsed islands JSON, if `save_parsed_villages` was set.
    pub parsed_islands_path: Option<PathBuf>,
    /// Number of villages in the database.
    pub village_count: usize,
    /// SHA-256 hash of the main database file (`db_path`), in hexadecimal.
    pub sha256: String,
}

/// Builder for configuring and running the database build pipeline.
///
/// The pipeline fetches data from official sources (Kemendagri PDF and BIG ArcGIS API),
/// merges and validates it, then constructs a SQLite database with RTree and FTS5
/// indexes. The resulting database is used by the `wilayah` crate at compile time.
///
/// Create one with [`Pipeline::new`], adjust it with the builder methods, then
/// call [`Pipeline::run`].
pub struct Pipeline {
    /// URL the Kemendagri PDF is downloaded from.
    pdf_url: String,
    /// BIG ArcGIS query endpoint used to fetch village polygons.
    big_api_url: String,
    /// Directory for cached downloads and optional parsed-JSON dumps.
    cache_dir: PathBuf,
    /// Destination path of the main SQLite database.
    output: PathBuf,
    /// Decree string passed to the database builder as metadata.
    decree: String,
    /// Passed to the BIG fetch step to request fresh data instead of the cache.
    force_refresh_big: bool,
    /// Ring handling mode passed to the polygon database builder.
    ring_classification: RingClassification,
    /// Whether to also build the polygon database.
    include_polygons: bool,
    /// Detail level for the parsed-JSON dumps; `None` disables them.
    save_parsed_villages: Option<parse::ParseOutputDetail>,
}

impl Pipeline {
    /// Creates a new `Pipeline` with default configuration.
    ///
    /// Defaults:
    /// - PDF URL: Kemendagri official PDF from Google Drive
    /// - BIG API URL: `https://geoservices.big.go.id/...`
    /// - Cache directory: `data/cache` (relative to current working directory)
    /// - Output database: `data/locations.db`
    /// - Decree: `DATA_DECREE` constant
    /// - `force_refresh_big`: `false`
    ///
    /// To change any of these, use the builder methods (`pdf_url()`, `cache_dir()`,
    /// etc.) before calling `run()`.
    pub fn new() -> Self {
        Self {
            pdf_url: PDF_URL.to_string(),
            big_api_url: BIG_API_URL.to_string(),
            cache_dir: PathBuf::from("data/cache"),
            output: PathBuf::from("data/locations.db"),
            decree: DATA_DECREE.to_string(),
            force_refresh_big: false,
            ring_classification: RingClassification::SeparateRings,
            include_polygons: false,
            save_parsed_villages: None,
        }
    }

    /// Overrides the default Kemendagri PDF download URL.
    ///
    /// The URL should point to a PDF file containing the official village listing.
    /// The default is the Google Drive link used by the Ministry of Home Affairs.
    pub fn pdf_url(mut self, url: &str) -> Self {
        self.pdf_url = url.to_string();
        self
    }

    /// Overrides the default BIG (Badan Informasi Geospasial) ArcGIS API endpoint.
    ///
    /// The pipeline queries this service for village polygon boundaries and computes
    /// centroids. The default is the public BAPANAS service.
    pub fn big_api_url(mut self, url: &str) -> Self {
        self.big_api_url = url.to_string();
        self
    }

    /// Sets the directory where intermediate files are cached.
    ///
    /// This includes the downloaded PDF (`kemendagri.pdf`) and cached BIG data
    /// (`big_villages.json`). The directory is created if it does not exist.
    pub fn cache_dir(mut self, dir: &Path) -> Self {
        self.cache_dir = dir.to_path_buf();
        self
    }

    /// Sets the output path for the final SQLite database.
    ///
    /// This file will be overwritten if it already exists. The parent directory
    /// must be writable.
    pub fn output(mut self, path: &Path) -> Self {
        self.output = path.to_path_buf();
        self
    }

    /// Overrides the government decree string stored in the database metadata.
    ///
    /// This value is for informational purposes only and appears in `DataInfo`.
    /// The default is `DATA_DECREE`.
    pub fn decree(mut self, decree: &str) -> Self {
        self.decree = decree.to_string();
        self
    }

    /// Forces re-downloading BIG API data even if a cached copy exists.
    ///
    /// By default, the pipeline uses the cached `big_villages.json` if present.
    /// Set this to `true` to fetch fresh data from the API.
    pub fn force_refresh_big(mut self, yes: bool) -> Self {
        self.force_refresh_big = yes;
        self
    }

    /// Sets how multi-ring polygon features are classified.
    ///
    /// Defaults to [`RingClassification::SeparateRings`] — all rings are treated
    /// as separate outer polygons. Use [`RingClassification::ClassifyHoles`] to
    /// detect holes (enclaves) via spatial containment tests.
    pub fn ring_classification(mut self, mode: RingClassification) -> Self {
        self.ring_classification = mode;
        self
    }

    /// Enables building a separate polygon database alongside the main database.
    ///
    /// When `true`, the pipeline preserves raw polygon geometry from the BIG API
    /// and writes it to a `locations-poly.db` file (same directory as the output).
    /// This enables [`LocateMethod::Contained`](crate::types::LocateMethod::Contained)
    /// lookups via `Database::open_with_polygons()`.
    pub fn include_polygons(mut self, yes: bool) -> Self {
        self.include_polygons = yes;
        self
    }

    /// Enable saving parsed village records to a JSON file in the cache directory.
    ///
    /// The output detail level controls how much information is included:
    /// - `Minimal`: code + cleaned name + district + city + province
    /// - `WithRawName`: adds `raw_name` (original text before note stripping)
    /// - `Full`: adds `note_keyword` and `note_boundary` for parser auditing
    ///
    /// When set, the pipeline writes `parsed_villages.json` to the cache directory
    /// and includes its path in [`PipelineOutput::parsed_villages_path`].
    pub fn save_parsed_villages(mut self, detail: parse::ParseOutputDetail) -> Self {
        self.save_parsed_villages = Some(detail);
        self
    }

    /// Executes the full pipeline.
    ///
    /// Steps:
    /// 1. Ensure Kemendagri PDF is downloaded (cached if already present)
    /// 2. Extract text from PDF using `pdftotext`
    /// 3. Parse village records from the extracted text
    /// 4. Fetch BIG polygon data (cached or fresh with retries)
    /// 5. Merge villages with coordinates, using kecamatan centroids as fallback
    /// 6. Build the SQLite database with indexes and optimize
    /// 7. Compute SHA-256 of the final database
    ///
    /// Returns `PipelineOutput` on success, or `PipelineError` if any step fails.
    ///
    /// NOTE: The pipeline does not currently support resuming from a failed step.
    /// If step 6 (build_db) fails after spending significant time in steps 1–5,
    /// re-running will redo all work. Cached files (PDF, BIG JSON) help avoid
    /// redundant downloads, but parsing and merging are repeated. A future
    /// improvement could checkpoint intermediate results (e.g., parsed villages)
    /// to allow resuming from the point of failure.
    pub fn run(self) -> Result<PipelineOutput, PipelineError> {
        eprintln!("Starting pipeline...");

        let pdf_path = pdf::ensure_pdf(&self.pdf_url, &self.cache_dir)?;
        let text = pdf::extract_text(&pdf_path)?;
        let villages = parse::parse_villages(&text);

        let parsed_villages_path = if let Some(detail) = self.save_parsed_villages {
            let path = self.cache_dir.join("parsed_villages.json");
            parse::save_parsed_villages(&villages, detail, &path)?;

            let districts = parse::extract_districts(&villages);
            let dist_path = self.cache_dir.join("parsed_districts.json");
            parse::save_parsed_districts(&districts, &dist_path)?;

            Some(path)
        } else {
            None
        };

        let parsed_districts_path = if self.save_parsed_villages.is_some() {
            Some(self.cache_dir.join("parsed_districts.json"))
        } else {
            None
        };

        let (parsed_provinces_path, parsed_cities_path) = if self.save_parsed_villages.is_some() {
            let provinces = parse::extract_provinces(&text);
            let prov_path = self.cache_dir.join("parsed_provinces.json");
            parse::save_parsed_provinces(&provinces, &prov_path)?;

            let cities = parse::extract_cities(&text);
            let city_path = self.cache_dir.join("parsed_cities.json");
            parse::save_parsed_cities(&cities, &city_path)?;

            (Some(prov_path), Some(city_path))
        } else {
            (None, None)
        };

        let (parsed_island_summaries_path, parsed_islands_path) =
            if self.save_parsed_villages.is_some() {
                let (summaries, islands) = parse::extract_islands(&text);
                let sum_path = self.cache_dir.join("parsed_island_summaries.json");
                parse::save_parsed_island_summaries(&summaries, &sum_path)?;

                let isl_path = self.cache_dir.join("parsed_islands.json");
                parse::save_parsed_islands(&islands, &isl_path)?;

                (Some(sum_path), Some(isl_path))
            } else {
                (None, None)
            };

        let big_data = big_api::fetch_big_data(
            &self.big_api_url,
            &self.cache_dir,
            self.force_refresh_big,
            self.include_polygons,
        )?;
        let merged = db_create::merge_villages(&villages, &big_data);

        let build_date = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_secs();

        db_create::build_db(&merged, &self.output, &self.decree, "official", build_date)?;

        let poly_db_path = if self.include_polygons {
            let poly_path = self.output.with_extension("poly.db");
            db_create::build_poly_db(&big_data, &poly_path, self.ring_classification)?;
            Some(poly_path)
        } else {
            None
        };

        let sha256 = util::hash_file(&self.output)?;

        let village_count = merged.len();

        eprintln!("Pipeline completed successfully.");
        Ok(PipelineOutput {
            db_path: self.output,
            poly_db_path,
            parsed_villages_path,
            parsed_districts_path,
            parsed_provinces_path,
            parsed_cities_path,
            parsed_island_summaries_path,
            parsed_islands_path,
            village_count,
            sha256,
        })
    }
}

impl Default for Pipeline {
    fn default() -> Self {
        Self::new()
    }
}