//! ref_solver/web/server.rs — axum web server for the reference-identification UI and API.
1use axum::http::header;
2use axum::{
3    extract::{DefaultBodyLimit, Multipart, Query, State},
4    http::{HeaderName, HeaderValue, StatusCode},
5    response::{Html, IntoResponse, Json, Response},
6    routing::{get, post},
7    Router,
8};
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11use std::net::SocketAddr;
12use std::sync::Arc;
13use std::time::Duration;
14use tokio::net::TcpListener;
15use tower::limit::ConcurrencyLimitLayer;
16use tower::ServiceBuilder;
17use tower_governor::{governor::GovernorConfigBuilder, GovernorLayer};
18use tower_http::set_header::SetResponseHeaderLayer;
19use tower_http::timeout::TimeoutLayer;
20
21use crate::catalog::store::ReferenceCatalog;
22use crate::cli::ServeArgs;
23use crate::matching::engine::{MatchingConfig, MatchingEngine, ScoringWeights};
24use crate::matching::Suggestion;
25use crate::utils::validation::{validate_upload, ValidationError};
26use crate::web::format_detection::{
27    detect_format, parse_binary_file, parse_binary_file_from_path, parse_with_format, FileFormat,
28};
29
/// Security configuration constants to prevent `DoS` attacks
pub const MAX_MULTIPART_FIELDS: usize = 10;
/// Per-field size cap for uploaded (text) files.
pub const MAX_FILE_FIELD_SIZE: usize = 16 * 1024 * 1024; // 16MB for text files
/// Per-field size cap for plain text form fields.
pub const MAX_TEXT_FIELD_SIZE: usize = 1024 * 1024; // 1MB

/// Maximum bytes to read from a binary upload before attempting header parse.
/// BAM/CRAM headers are typically < 1 MB; 64 MB provides generous headroom.
pub const BINARY_HEADER_READ_LIMIT: usize = 64 * 1024 * 1024; // 64MB

/// Axum body limit — raised to allow large binary uploads to stream through.
/// This does not cause memory bloat because `field.chunk()` reads lazily from
/// the underlying HTTP body stream.
const MAX_BODY_SIZE: usize = 256 * 1024 * 1024; // 256MB
43
/// Convert a `usize` count to `f64`, explicitly acknowledging possible
/// precision loss (counts above 2^53 cannot be represented exactly in `f64`).
#[inline]
fn count_to_f64(count: usize) -> f64 {
    // Scope the lint allowance to the single cast statement.
    #[allow(clippy::cast_precision_loss)]
    let converted = count as f64;
    converted
}
52
/// Shared application state
pub struct AppState {
    /// Embedded reference catalog queried by the matching engine.
    pub catalog: ReferenceCatalog,
    /// Refget server configuration; `None` disables refget enrichment.
    pub refget_config: Option<crate::refget::RefgetConfig>,
}
58
/// Binary content from an upload, either fully buffered or streamed to a temp file.
///
/// Which variant is produced depends on the upload's detected format.
#[derive(Debug)]
enum BinaryContent {
    /// Small binary file fully buffered in memory (e.g. from a non-binary-format upload)
    InMemory(Vec<u8>),
    /// Large binary file streamed to a temp file (BAM/CRAM uploads)
    TempFile(tempfile::NamedTempFile),
}
67
/// Input data extracted from multipart form
///
/// At most one of `text_content` / `binary_content` is expected to be used
/// downstream by `parse_input_data`.
#[derive(Debug)]
struct InputData {
    /// Text content (if provided via textarea or text file)
    text_content: Option<String>,
    /// Binary file content (if provided)
    binary_content: Option<BinaryContent>,
    /// Original filename
    filename: Option<String>,
    /// Detected or specified format
    format: Option<FileFormat>,
}
80
/// Typed error categories for API error responses.
///
/// Serialized as `snake_case` strings in JSON to maintain backwards compatibility.
/// Each variant names the condition it reports; variants are attached to
/// client-safe messages via `create_safe_error_response`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ErrorType {
    FieldLimitExceeded,
    FileTooLarge,
    TextTooLarge,
    InternalError,
    InvalidMatchId,
    FilenameTooLong,
    InvalidFilename,
    FormatMismatch,
    InvalidContent,
    ValidationFailed,
    MissingInput,
    FormatDetectionFailed,
    ParseFailed,
    BinaryParseFailed,
}
102
/// Enhanced error response
#[derive(Serialize)]
pub struct ErrorResponse {
    /// Client-safe, human-readable error message.
    pub error: String,
    /// Machine-readable error category (serialized as `snake_case`).
    pub error_type: ErrorType,
    /// Always `None` when built via `create_safe_error_response`; internal
    /// details are deliberately withheld from clients.
    pub details: Option<String>,
}
110
/// Matching configuration extracted from the request and echoed back in responses.
#[derive(Serialize)]
struct ConfigurationInfo {
    /// Minimum composite score for a match to be reported (feeds `MatchingConfig::min_score`).
    score_threshold: f64,
    /// Maximum number of matches returned (passed to `find_matches`).
    result_limit: usize,
    /// Weights applied when computing composite scores.
    scoring_weights: ScoringWeights,
}
117
/// Query parameters for detailed mode
///
/// Read by `identify_handler`; pagination defaults/caps are applied in
/// `handle_detailed_response`.
#[derive(Deserialize)]
struct DetailedQueryParams {
    /// Mode: "detailed" for detailed contig breakdown, omit for summary
    mode: Option<String>,
    /// Match index to get details for (0-based)
    match_id: Option<usize>,
    /// Page number for query contigs (0-based)
    query_page: Option<usize>,
    /// Page size for query contigs (default: 100, max: 500)
    query_page_size: Option<usize>,
    /// Page number for reference contigs (0-based)
    ref_page: Option<usize>,
    /// Page size for reference contigs (default: 100, max: 500)
    ref_page_size: Option<usize>,
}
134
135/// Create a safe error response that prevents information disclosure
136/// while logging detailed errors server-side for debugging
137pub fn create_safe_error_response(
138    error_type: ErrorType,
139    user_message: &str,
140    internal_error: Option<&str>,
141) -> ErrorResponse {
142    // Log detailed error server-side for debugging (not exposed to client)
143    if let Some(internal_msg) = internal_error {
144        tracing::error!("Internal error ({:?}): {}", error_type, internal_msg);
145    }
146
147    ErrorResponse {
148        error: user_message.to_string(),
149        error_type,
150        details: None, // Never expose internal details to prevent information disclosure
151    }
152}
153
154/// Run the web server
155///
156/// # Errors
157///
158/// Returns an error if the tokio runtime cannot be created or the server fails to start.
159pub fn run(args: ServeArgs) -> anyhow::Result<()> {
160    // Build tokio runtime
161    let rt = tokio::runtime::Runtime::new()?;
162    rt.block_on(async move { run_server(args).await })
163}
164
165/// Create the application router with all routes and middleware configured.
166///
167/// Pass `None` for `refget_config` to disable refget enrichment.
168///
169/// # Errors
170///
171/// Returns an error if the catalog cannot be loaded.
172#[allow(clippy::missing_panics_doc)] // Panics only on invalid governor config (constants are valid)
173pub fn create_router(refget_config: Option<crate::refget::RefgetConfig>) -> anyhow::Result<Router> {
174    // Load catalog
175    let catalog = ReferenceCatalog::load_embedded()?;
176    let state = Arc::new(AppState {
177        catalog,
178        refget_config,
179    });
180
181    // Configure IP-based rate limiting
182    let governor_conf = GovernorConfigBuilder::default()
183        .per_second(10) // 10 requests per second per IP
184        .burst_size(50) // Allow bursts of 50 requests
185        .finish()
186        .unwrap();
187
188    // Build router with comprehensive security layers
189    let app = Router::new()
190        .route("/", get(index_handler))
191        .route("/api/identify", post(identify_handler))
192        .route("/api/catalog", get(catalog_handler))
193        // Static file routes
194        .route("/static/css/styles.css", get(styles_css_handler))
195        .route("/static/js/main.js", get(main_js_handler))
196        .route("/static/js/utils/helpers.js", get(helpers_js_handler))
197        .route(
198            "/static/js/managers/ConfigurationManager.js",
199            get(config_manager_js_handler),
200        )
201        .route(
202            "/static/js/managers/TabManager.js",
203            get(tab_manager_js_handler),
204        )
205        .route(
206            "/static/js/managers/ResultsManager.js",
207            get(results_manager_js_handler),
208        )
209        .route(
210            "/static/js/managers/SplitViewManager.js",
211            get(split_view_manager_js_handler),
212        )
213        .route(
214            "/static/js/utils/headerExtractor.js",
215            get(header_extractor_js_handler),
216        )
217        .with_state(state)
218        .layer(
219            ServiceBuilder::new()
220                // Security headers for browser protection
221                .layer(SetResponseHeaderLayer::if_not_present(
222                    HeaderName::from_static("x-content-type-options"),
223                    HeaderValue::from_static("nosniff"),
224                ))
225                .layer(SetResponseHeaderLayer::if_not_present(
226                    HeaderName::from_static("x-frame-options"),
227                    HeaderValue::from_static("DENY"),
228                ))
229                .layer(SetResponseHeaderLayer::if_not_present(
230                    HeaderName::from_static("x-xss-protection"),
231                    HeaderValue::from_static("1; mode=block"),
232                ))
233                .layer(SetResponseHeaderLayer::if_not_present(
234                    HeaderName::from_static("strict-transport-security"),
235                    HeaderValue::from_static("max-age=31536000; includeSubDomains"),
236                ))
237                .layer(SetResponseHeaderLayer::if_not_present(
238                    HeaderName::from_static("referrer-policy"),
239                    HeaderValue::from_static("strict-origin-when-cross-origin"),
240                ))
241                // IP-based rate limiting to prevent abuse
242                .layer(GovernorLayer {
243                    config: Arc::new(governor_conf),
244                })
245                // Request timeout to prevent slow client attacks
246                .layer(TimeoutLayer::with_status_code(
247                    StatusCode::REQUEST_TIMEOUT,
248                    Duration::from_secs(30),
249                ))
250                // Limit concurrent requests to prevent DOS
251                .layer(ConcurrencyLimitLayer::new(100))
252                // Limit request body size — raised for binary streaming; actual read
253                // limits are enforced per-field in extract_request_data()
254                .layer(DefaultBodyLimit::max(MAX_BODY_SIZE)),
255        );
256
257    Ok(app)
258}
259
260async fn run_server(args: ServeArgs) -> anyhow::Result<()> {
261    let refget_config = if args.no_refget {
262        None
263    } else {
264        Some(crate::refget::RefgetConfig::new(&args.refget_server))
265    };
266    let app = create_router(refget_config)?;
267
268    let addr = format!("{}:{}", args.address, args.port);
269    println!("Starting ref-solver web server at http://{addr}");
270
271    if args.open {
272        let _ = open::that(format!("http://{addr}"));
273    }
274
275    let listener = TcpListener::bind(&addr).await?;
276    axum::serve(
277        listener,
278        app.into_make_service_with_connect_info::<SocketAddr>(),
279    )
280    .await?;
281
282    Ok(())
283}
284
/// Main page handler
///
/// Serves the single-page UI embedded at compile time.
async fn index_handler() -> Html<&'static str> {
    Html(include_str!("templates/index.html"))
}
289
/// Static CSS handler
///
/// Serves the embedded stylesheet with an explicit `text/css` content type.
async fn styles_css_handler() -> impl IntoResponse {
    (
        [(header::CONTENT_TYPE, "text/css; charset=utf-8")],
        include_str!("static/css/styles.css"),
    )
}
297
/// Static JS handlers for ES6 modules
///
/// Serves the embedded `main.js` entry module with a JavaScript content type.
async fn main_js_handler() -> impl IntoResponse {
    (
        [(
            header::CONTENT_TYPE,
            "application/javascript; charset=utf-8",
        )],
        include_str!("static/js/main.js"),
    )
}
308
/// Serves the embedded `utils/helpers.js` module with a JavaScript content type.
async fn helpers_js_handler() -> impl IntoResponse {
    (
        [(
            header::CONTENT_TYPE,
            "application/javascript; charset=utf-8",
        )],
        include_str!("static/js/utils/helpers.js"),
    )
}
318
/// Serves the embedded `managers/ConfigurationManager.js` module.
async fn config_manager_js_handler() -> impl IntoResponse {
    (
        [(
            header::CONTENT_TYPE,
            "application/javascript; charset=utf-8",
        )],
        include_str!("static/js/managers/ConfigurationManager.js"),
    )
}
328
/// Serves the embedded `managers/TabManager.js` module.
async fn tab_manager_js_handler() -> impl IntoResponse {
    (
        [(
            header::CONTENT_TYPE,
            "application/javascript; charset=utf-8",
        )],
        include_str!("static/js/managers/TabManager.js"),
    )
}
338
/// Serves the embedded `managers/ResultsManager.js` module.
async fn results_manager_js_handler() -> impl IntoResponse {
    (
        [(
            header::CONTENT_TYPE,
            "application/javascript; charset=utf-8",
        )],
        include_str!("static/js/managers/ResultsManager.js"),
    )
}
348
/// Serves the embedded `managers/SplitViewManager.js` module.
async fn split_view_manager_js_handler() -> impl IntoResponse {
    (
        [(
            header::CONTENT_TYPE,
            "application/javascript; charset=utf-8",
        )],
        include_str!("static/js/managers/SplitViewManager.js"),
    )
}
358
/// Serves the embedded `utils/headerExtractor.js` module.
async fn header_extractor_js_handler() -> impl IntoResponse {
    (
        [(
            header::CONTENT_TYPE,
            "application/javascript; charset=utf-8",
        )],
        include_str!("static/js/utils/headerExtractor.js"),
    )
}
368
/// API endpoint for identifying references
///
/// Flow: extract multipart input + configuration, parse the uploaded header,
/// run the matching engine, then return either the summary JSON below or —
/// when `?mode=detailed` — delegate to `handle_detailed_response`.
#[allow(clippy::too_many_lines)] // TODO: Refactor into smaller functions
async fn identify_handler(
    State(state): State<Arc<AppState>>,
    Query(params): Query<DetailedQueryParams>,
    mut multipart: Multipart,
) -> impl IntoResponse {
    // Wall-clock timer reported back as `processing_time_ms`.
    let start_time = std::time::Instant::now();

    // Extract input data and configuration from multipart form
    let (input_data, config) = match extract_request_data(&mut multipart).await {
        Ok(data) => data,
        Err(error_response) => return error_response,
    };

    // Parse input using intelligent format detection
    let (query, parse_warnings) = match parse_input_data(&input_data) {
        Ok(result) => result,
        Err(error_response) => return *error_response,
    };

    // Create matching engine with configuration
    let matching_config = MatchingConfig {
        min_score: config.score_threshold,
        scoring_weights: config.scoring_weights.clone(),
    };

    let engine = MatchingEngine::new(&state.catalog, matching_config);
    let matches = engine.find_matches(&query, config.result_limit);

    // Check if detailed mode is requested
    if params.mode.as_deref() == Some("detailed") {
        return handle_detailed_response(
            &params,
            &matches,
            &query,
            start_time,
            &config,
            state.refget_config.as_ref(),
        )
        .await;
    }

    // Build enhanced response: one JSON object per match, combining reference
    // identity, score breakdown, diagnosis details, and actionable suggestions.
    let results: Vec<serde_json::Value> = matches
        .iter()
        .map(|m| {
            serde_json::json!({
                "reference": {
                    "id": m.reference.id.0,
                    "display_name": m.reference.display_name,
                    "assembly": format!("{}", m.reference.assembly),
                    "source": format!("{}", m.reference.source),
                    "download_url": m.reference.download_url,
                },
                "score": {
                    "composite": m.score.composite,
                    "confidence": format!("{:?}", m.score.confidence),
                    "detailed_scores": {
                        "md5_jaccard": m.score.md5_jaccard,
                        "name_length_jaccard": m.score.name_length_jaccard,
                        "md5_query_coverage": m.score.md5_query_coverage,
                        "order_score": m.score.order_score,
                    },
                },
                "match_type": format!("{:?}", m.diagnosis.match_type),
                "reordered": m.diagnosis.reordered,
                "exact_matches": m.diagnosis.exact_matches.len(),
                "renamed_matches": m.diagnosis.renamed_matches.len(),
                "conflicts": m.diagnosis.conflicts.len(),
                "query_only": m.diagnosis.query_only.len(),
                "diagnosis": {
                    // `ContigMatch` carries no payload, so exact matches are emitted
                    // as bare type markers only.
                    "exact_matches": m.diagnosis.exact_matches.iter().map(|_| {
                        serde_json::json!({"type": "exact"})
                    }).collect::<Vec<_>>(),
                    "renamed_matches": m.diagnosis.renamed_matches.iter().map(|r| {
                        serde_json::json!({
                            "query_name": r.query_name,
                            "reference_name": r.reference_name
                        })
                    }).collect::<Vec<_>>(),
                    "conflicts": m.diagnosis.conflicts.iter().map(|c| {
                        serde_json::json!({
                            "query_contig": {
                                "name": c.query_contig.name,
                                "length": c.query_contig.length,
                                "md5": c.query_contig.md5
                            },
                            "conflict_type": format!("{:?}", c.conflict_type),
                            "description": c.description
                        })
                    }).collect::<Vec<_>>(),
                },
                // Each suggestion variant maps to a distinct JSON shape keyed by "type".
                "suggestions": m.diagnosis.suggestions.iter().map(|s| {
                    match s {
                        Suggestion::RenameContigs { command_hint, .. } => {
                            serde_json::json!({"type": "rename", "command": command_hint})
                        }
                        Suggestion::ReorderContigs { command_hint } => {
                            serde_json::json!({"type": "reorder", "command": command_hint})
                        }
                        Suggestion::ReplaceContig { contig_name, reason, source } => {
                            serde_json::json!({"type": "replace", "contig": contig_name, "reason": reason, "source": source})
                        }
                        Suggestion::UseAsIs { warnings } => {
                            serde_json::json!({"type": "use_as_is", "warnings": warnings})
                        }
                        Suggestion::Realign { reason, suggested_reference } => {
                            serde_json::json!({"type": "realign", "reason": reason, "reference": suggested_reference})
                        }
                    }
                }).collect::<Vec<_>>(),
            })
        })
        .collect();

    #[allow(clippy::cast_possible_truncation)] // Processing time won't exceed u64
    let processing_time = start_time.elapsed().as_millis() as u64;

    Json(serde_json::json!({
        "query": {
            "contig_count": query.contigs.len(),
            "has_md5": query.has_md5s(),
            "md5_coverage": query.md5_coverage(),
            "naming_convention": format!("{:?}", query.naming_convention),
        },
        "warnings": parse_warnings,
        "matches": results,
        "processing_info": {
            "detected_format": input_data.format.as_ref().map_or("unknown", super::format_detection::FileFormat::display_name),
            "processing_time_ms": processing_time,
            "configuration": {
                "score_threshold": config.score_threshold,
                "result_limit": config.result_limit,
                "scoring_weights": config.scoring_weights,
            }
        }
    }))
    .into_response()
}
509
/// Handle detailed response mode for contig breakdown
///
/// Selects one match (`match_id`, default 0), paginates query and reference
/// contigs, classifies every contig on the current page as
/// exact/renamed/conflict/missing/unknown, and optionally enriches unmatched
/// query contigs via refget. Returns 400 for an out-of-range `match_id`.
#[allow(clippy::cast_possible_truncation, clippy::too_many_lines)] // JSON indices; TODO: refactor
async fn handle_detailed_response(
    params: &DetailedQueryParams,
    matches: &[crate::matching::engine::MatchResult],
    query: &crate::core::header::QueryHeader,
    start_time: std::time::Instant,
    config: &ConfigurationInfo,
    refget_config: Option<&crate::refget::RefgetConfig>,
) -> Response {
    use crate::core::contig::Contig;

    // Get the specific match or default to first match
    let match_index = params.match_id.unwrap_or(0);
    let Some(selected_match) = matches.get(match_index) else {
        return (
            StatusCode::BAD_REQUEST,
            Json(create_safe_error_response(
                ErrorType::InvalidMatchId,
                "Invalid match ID specified",
                Some("Match index out of bounds"),
            )),
        )
            .into_response();
    };

    // Set up pagination parameters (page size capped at 500).
    let query_page = params.query_page.unwrap_or(0);
    let query_page_size = params.query_page_size.unwrap_or(100).min(500);
    let ref_page = params.ref_page.unwrap_or(0);
    let ref_page_size = params.ref_page_size.unwrap_or(100).min(500);

    // Extract query contigs with pagination; an out-of-range page yields an
    // empty slice rather than an error.
    let total_query_contigs = query.contigs.len();
    let query_start = query_page * query_page_size;
    let query_end = (query_start + query_page_size).min(total_query_contigs);
    let query_contigs_page: Vec<&Contig> = if query_start < total_query_contigs {
        query.contigs[query_start..query_end].iter().collect()
    } else {
        Vec::new()
    };

    // Extract reference contigs with pagination
    let total_ref_contigs = selected_match.reference.contigs.len();
    let ref_start = ref_page * ref_page_size;
    let ref_end = (ref_start + ref_page_size).min(total_ref_contigs);
    let ref_contigs_page: Vec<&Contig> = if ref_start < total_ref_contigs {
        selected_match.reference.contigs[ref_start..ref_end]
            .iter()
            .collect()
    } else {
        Vec::new()
    };

    // Build detailed mapping information
    let mut exact_match_mappings = Vec::new();
    let mut renamed_match_mappings = Vec::new();
    let mut conflict_mappings = Vec::new();
    let mut query_only_indices = Vec::new();
    let mut reference_only_indices = Vec::new();

    // Create lookup maps for efficient indexing (contig name -> position).
    let query_name_to_index: std::collections::HashMap<&str, usize> = query
        .contigs
        .iter()
        .enumerate()
        .map(|(i, c)| (c.name.as_str(), i))
        .collect();

    let ref_name_to_index: std::collections::HashMap<&str, usize> = selected_match
        .reference
        .contigs
        .iter()
        .enumerate()
        .map(|(i, c)| (c.name.as_str(), i))
        .collect();

    // Process exact matches (need to map back to contigs since ContigMatch is empty)
    for (i, _) in selected_match.diagnosis.exact_matches.iter().enumerate() {
        // Since ContigMatch doesn't contain contig data, we need to reconstruct
        // the mapping by analyzing the query and reference contigs
        // This is a limitation of the current data structure
        exact_match_mappings.push(serde_json::json!({
            "type": "exact",
            "query_index": i, // This is approximate - we'd need better data structure
            "reference_index": i // This is approximate - we'd need better data structure
        }));
    }

    // Process renamed matches; entries whose names are missing from either
    // lookup map are silently skipped.
    for rename in &selected_match.diagnosis.renamed_matches {
        if let (Some(&query_idx), Some(&ref_idx)) = (
            query_name_to_index.get(rename.query_name.as_str()),
            ref_name_to_index.get(rename.reference_name.as_str()),
        ) {
            renamed_match_mappings.push(serde_json::json!({
                "type": "renamed",
                "query_index": query_idx,
                "reference_index": ref_idx,
                "query_name": rename.query_name,
                "reference_name": rename.reference_name
            }));
        }
    }

    // Process conflicts; `reference_index` may serialize as null when the
    // expected contig is unknown.
    for conflict in &selected_match.diagnosis.conflicts {
        if let Some(&query_idx) = query_name_to_index.get(conflict.query_contig.name.as_str()) {
            let ref_idx = conflict
                .expected
                .as_ref()
                .and_then(|expected| ref_name_to_index.get(expected.name.as_str()));

            conflict_mappings.push(serde_json::json!({
                "type": "conflict",
                "query_index": query_idx,
                "reference_index": ref_idx,
                "conflict_type": format!("{:?}", conflict.conflict_type),
                "description": conflict.description
            }));
        }
    }

    // Process query-only contigs
    for contig in &selected_match.diagnosis.query_only {
        if let Some(&index) = query_name_to_index.get(contig.name.as_str()) {
            query_only_indices.push(index);
        }
    }

    // Identify reference-only contigs (those not matched by any query contig)
    let mut matched_ref_indices = std::collections::HashSet::new();
    #[allow(clippy::cast_possible_truncation)] // Contig indices bounded by MAX_CONTIGS
    for mapping in &exact_match_mappings {
        if let Some(ref_idx) = mapping
            .get("reference_index")
            .and_then(serde_json::Value::as_u64)
        {
            matched_ref_indices.insert(ref_idx as usize);
        }
    }
    #[allow(clippy::cast_possible_truncation)]
    for mapping in &renamed_match_mappings {
        if let Some(ref_idx) = mapping
            .get("reference_index")
            .and_then(serde_json::Value::as_u64)
        {
            matched_ref_indices.insert(ref_idx as usize);
        }
    }
    #[allow(clippy::cast_possible_truncation)]
    for mapping in &conflict_mappings {
        if let Some(ref_idx) = mapping
            .get("reference_index")
            .and_then(serde_json::Value::as_u64)
        {
            matched_ref_indices.insert(ref_idx as usize);
        }
    }

    for (i, _) in selected_match.reference.contigs.iter().enumerate() {
        if !matched_ref_indices.contains(&i) {
            reference_only_indices.push(i);
        }
    }

    // Enrich unmatched (missing) contigs on the current page via refget
    let enriched_map: std::collections::HashMap<String, crate::refget::EnrichedContig> =
        if let Some(refget_cfg) = refget_config {
            // Only enrich contigs that are both unmatched and on the current page
            let page_unmatched: Vec<&Contig> = selected_match
                .diagnosis
                .query_only
                .iter()
                .filter(|c| {
                    query_name_to_index
                        .get(c.name.as_str())
                        .is_some_and(|&idx| idx >= query_start && idx < query_end)
                })
                .collect();
            if page_unmatched.is_empty() {
                std::collections::HashMap::new()
            } else {
                // Collect owned contigs for the enrichment call
                let to_enrich: Vec<Contig> = page_unmatched.into_iter().cloned().collect();
                let enriched =
                    crate::refget::enrichment::enrich_contigs(&to_enrich, refget_cfg).await;
                enriched.into_iter().map(|e| (e.name.clone(), e)).collect()
            }
        } else {
            std::collections::HashMap::new()
        };

    // Build response
    #[allow(clippy::cast_possible_truncation)] // Processing time won't exceed u64
    let processing_time = start_time.elapsed().as_millis() as u64;

    Json(serde_json::json!({
        "mode": "detailed",
        "match_id": match_index,
        "query": {
            "contigs": query_contigs_page.iter().enumerate().map(|(page_idx, contig)| {
                let global_idx = query_start + page_idx;
                // Determine match status for this contig; precedence is
                // missing > conflict > renamed > exact > unknown.
                let match_status = if query_only_indices.contains(&global_idx) {
                    "missing"
                } else if conflict_mappings.iter().any(|c| c.get("query_index").and_then(serde_json::Value::as_u64).map(|i| i as usize) == Some(global_idx)) {
                    "conflict"
                } else if renamed_match_mappings.iter().any(|r| r.get("query_index").and_then(serde_json::Value::as_u64).map(|i| i as usize) == Some(global_idx)) {
                    "renamed"
                } else if exact_match_mappings.iter().any(|e| e.get("query_index").and_then(serde_json::Value::as_u64).map(|i| i as usize) == Some(global_idx)) {
                    "exact"
                } else {
                    "unknown"
                };

                let mut entry = serde_json::json!({
                    "index": global_idx,
                    "name": contig.name,
                    "length": contig.length,
                    "md5": contig.md5,
                    "sha512t24u": contig.sha512t24u,
                    "sequence_role": format!("{:?}", contig.sequence_role),
                    "aliases": contig.aliases,
                    "match_status": match_status
                });

                // Attach refget metadata for missing contigs that were enriched
                if match_status == "missing" {
                    if let Some(enriched) = enriched_map.get(&contig.name) {
                        entry["refget_metadata"] = serde_json::json!(&enriched.refget_metadata);
                    }
                }

                entry
            }).collect::<Vec<_>>(),
            "pagination": {
                "page": query_page,
                "page_size": query_page_size,
                "total_count": total_query_contigs,
                "total_pages": total_query_contigs.div_ceil(query_page_size)
            }
        },
        "reference": {
            "id": selected_match.reference.id.0,
            "display_name": selected_match.reference.display_name,
            "assembly": format!("{}", selected_match.reference.assembly),
            "contigs": ref_contigs_page.iter().enumerate().map(|(page_idx, contig)| {
                let global_idx = ref_start + page_idx;
                // Determine match status for this reference contig (same
                // precedence as the query side).
                let match_status = if reference_only_indices.contains(&global_idx) {
                    "missing"
                } else if conflict_mappings.iter().any(|c| c.get("reference_index").and_then(serde_json::Value::as_u64).map(|i| i as usize) == Some(global_idx)) {
                    "conflict"
                } else if renamed_match_mappings.iter().any(|r| r.get("reference_index").and_then(serde_json::Value::as_u64).map(|i| i as usize) == Some(global_idx)) {
                    "renamed"
                } else if exact_match_mappings.iter().any(|e| e.get("reference_index").and_then(serde_json::Value::as_u64).map(|i| i as usize) == Some(global_idx)) {
                    "exact"
                } else {
                    "unknown"
                };

                serde_json::json!({
                    "index": global_idx,
                    "name": contig.name,
                    "length": contig.length,
                    "md5": contig.md5,
                    "sha512t24u": contig.sha512t24u,
                    "sequence_role": format!("{:?}", contig.sequence_role),
                    "aliases": contig.aliases,
                    "match_status": match_status
                })
            }).collect::<Vec<_>>(),
            "pagination": {
                "page": ref_page,
                "page_size": ref_page_size,
                "total_count": total_ref_contigs,
                "total_pages": total_ref_contigs.div_ceil(ref_page_size)
            }
        },
        "mappings": {
            "exact_matches": exact_match_mappings,
            "renamed_matches": renamed_match_mappings,
            "conflicts": conflict_mappings,
            "query_only": query_only_indices,
            "reference_only": reference_only_indices
        },
        "match_summary": {
            "match_type": format!("{:?}", selected_match.diagnosis.match_type),
            "reordered": selected_match.diagnosis.reordered,
            "score": {
                "composite": selected_match.score.composite,
                "confidence": format!("{:?}", selected_match.score.confidence)
            }
        },
        "processing_info": {
            "processing_time_ms": processing_time,
            "configuration": {
                "score_threshold": config.score_threshold,
                "result_limit": config.result_limit,
                "scoring_weights": config.scoring_weights,
            }
        }
    }))
    .into_response()
}
816
/// Extract input data and configuration from multipart form.
///
/// Iterates over the multipart fields (bounded by [`MAX_MULTIPART_FIELDS`]),
/// recognizing:
/// - `file`: an uploaded file. BAM/CRAM uploads (detected from the filename)
///   are streamed to a temp file via `read_binary_chunks`; everything else is
///   buffered in memory, size-checked against [`MAX_FILE_FIELD_SIZE`], and
///   passed through `validate_upload`.
/// - `header_text`: pasted header text, size-checked against
///   [`MAX_TEXT_FIELD_SIZE`]; ignored if blank.
/// - `score_threshold`, `result_limit`, `scoring_weights`: optional overrides
///   for the matching configuration; unparsable values are silently ignored
///   and the defaults kept.
///
/// Unknown fields are skipped. On success returns the collected input plus
/// the effective configuration.
///
/// # Errors
///
/// Returns a ready-to-send error `Response` (JSON body without internal
/// details) when limits are exceeded, upload validation fails, or no usable
/// input was provided at all.
#[allow(clippy::too_many_lines)] // TODO: Refactor into smaller functions
async fn extract_request_data(
    multipart: &mut Multipart,
) -> Result<(InputData, ConfigurationInfo), Response> {
    let mut input_data = InputData {
        text_content: None,
        binary_content: None,
        filename: None,
        format: None,
    };

    let mut config = ConfigurationInfo {
        score_threshold: 0.1, // Default 10%
        result_limit: 10,
        scoring_weights: ScoringWeights::default(),
    };

    let mut fields_received = 0usize;
    // Field read/parse failures are recorded rather than returned immediately
    // so that a more specific error message can be chosen after the loop.
    let mut had_parse_error = false;

    // Process multipart fields
    loop {
        // Check field count limit before processing
        if fields_received >= MAX_MULTIPART_FIELDS {
            return Err((
                StatusCode::BAD_REQUEST,
                Json(ErrorResponse {
                    error: "Too many form fields".to_string(),
                    error_type: ErrorType::FieldLimitExceeded,
                    details: None, // No internal details for security
                }),
            )
                .into_response());
        }

        match multipart.next_field().await {
            Ok(Some(field)) => {
                fields_received += 1;
                let name = field.name().unwrap_or_default().to_string();

                match name.as_str() {
                    "file" => {
                        let filename = field.file_name().map(std::string::ToString::to_string);

                        // Detect format from filename before reading the body
                        let detected_format = if let Some(ref name) = filename {
                            detect_binary_format(name).unwrap_or(FileFormat::Auto)
                        } else {
                            FileFormat::Auto
                        };

                        // For BAM/CRAM: stream chunks to a temp file (header-only read)
                        if matches!(detected_format, FileFormat::Bam | FileFormat::Cram) {
                            match read_binary_chunks(field, detected_format).await {
                                Ok((temp_file, _bytes_read)) => {
                                    input_data.filename = filename;
                                    input_data.binary_content =
                                        Some(BinaryContent::TempFile(temp_file));
                                    input_data.format = Some(detected_format);
                                }
                                Err(err_response) => return Err(err_response),
                            }
                        } else {
                            // Text and other formats: buffer fully in memory
                            match field.bytes().await {
                                Ok(bytes) => {
                                    // Validate field size before processing
                                    if bytes.len() > MAX_FILE_FIELD_SIZE {
                                        return Err((
                                            StatusCode::PAYLOAD_TOO_LARGE,
                                            Json(ErrorResponse {
                                                error: "File size exceeds limit".to_string(),
                                                error_type: ErrorType::FileTooLarge,
                                                details: None,
                                            }),
                                        )
                                            .into_response());
                                    }

                                    // Use comprehensive validation function for security
                                    match validate_upload(
                                        filename.as_deref(),
                                        &bytes,
                                        detected_format,
                                    ) {
                                        Ok(validated_filename) => {
                                            input_data.filename = validated_filename;

                                            // Detect if content is binary or text
                                            if is_binary_content(&bytes) {
                                                input_data.binary_content =
                                                    Some(BinaryContent::InMemory(bytes.to_vec()));
                                                input_data.format = Some(detected_format);
                                            } else {
                                                // Lossy UTF-8 conversion: invalid bytes become U+FFFD
                                                input_data.text_content = Some(
                                                    String::from_utf8_lossy(&bytes).to_string(),
                                                );
                                            }
                                        }
                                        Err(ValidationError::FilenameTooLong) => {
                                            return Err((
                                                StatusCode::BAD_REQUEST,
                                                Json(create_safe_error_response(
                                                    ErrorType::FilenameTooLong,
                                                    "Filename exceeds maximum length limit",
                                                    Some("Filename validation failed due to length constraints")
                                                )),
                                            ).into_response());
                                        }
                                        Err(ValidationError::InvalidFilename) => {
                                            return Err((
                                                StatusCode::BAD_REQUEST,
                                                Json(create_safe_error_response(
                                                    ErrorType::InvalidFilename,
                                                    "Filename contains invalid or dangerous characters",
                                                    Some("Filename validation failed due to invalid characters")
                                                )),
                                            ).into_response());
                                        }
                                        Err(ValidationError::FormatValidationFailed) => {
                                            return Err((
                                                StatusCode::BAD_REQUEST,
                                                Json(create_safe_error_response(
                                                    ErrorType::FormatMismatch,
                                                    "File content does not match the expected format based on filename",
                                                    Some("Format validation failed")
                                                )),
                                            ).into_response());
                                        }
                                        Err(ValidationError::InvalidFileContent) => {
                                            return Err((
                                                StatusCode::BAD_REQUEST,
                                                Json(create_safe_error_response(
                                                    ErrorType::InvalidContent,
                                                    "File content appears malformed or corrupted",
                                                    None,
                                                )),
                                            )
                                                .into_response());
                                        }
                                        // Catch-all for any other validation failure variants
                                        Err(_) => {
                                            return Err((
                                                StatusCode::BAD_REQUEST,
                                                Json(create_safe_error_response(
                                                    ErrorType::ValidationFailed,
                                                    "File validation failed",
                                                    None,
                                                )),
                                            )
                                                .into_response());
                                        }
                                    }
                                }
                                Err(_) => had_parse_error = true,
                            }
                        }
                    }
                    "header_text" => match field.text().await {
                        Ok(text) => {
                            // Validate text field size
                            if text.len() > MAX_TEXT_FIELD_SIZE {
                                return Err((
                                    StatusCode::PAYLOAD_TOO_LARGE,
                                    Json(ErrorResponse {
                                        error: "Text field size exceeds limit".to_string(),
                                        error_type: ErrorType::TextTooLarge,
                                        details: None,
                                    }),
                                )
                                    .into_response());
                            }

                            // Whitespace-only text is treated as "no input"
                            if !text.trim().is_empty() {
                                input_data.text_content = Some(text);
                            }
                        }
                        Err(_) => had_parse_error = true,
                    },
                    "score_threshold" => {
                        if let Ok(text) = field.text().await {
                            if let Ok(threshold) = text.parse::<f64>() {
                                config.score_threshold = threshold.clamp(0.0, 1.0);
                            }
                        }
                    }
                    "result_limit" => {
                        if let Ok(text) = field.text().await {
                            if let Ok(limit) = text.parse::<usize>() {
                                config.result_limit = limit.clamp(1, 50); // Reasonable limits
                            }
                        }
                    }
                    "scoring_weights" => {
                        if let Ok(text) = field.text().await {
                            if let Ok(weights) = serde_json::from_str::<HashMap<String, f64>>(&text)
                            {
                                config.scoring_weights = parse_scoring_weights(&weights);
                            }
                        }
                    }
                    _ => {} // Ignore unknown fields
                }
            }
            Ok(None) => break, // No more fields
            Err(_) => {
                had_parse_error = true;
                break;
            }
        }
    }

    // Validate that we have some input
    if input_data.text_content.is_none() && input_data.binary_content.is_none() {
        let error_msg = if had_parse_error {
            "Failed to parse upload. Please check the file format."
        } else if fields_received == 0 {
            "No data received. Please upload a file or paste header text."
        } else {
            "No valid header data found in upload."
        };

        return Err((
            StatusCode::BAD_REQUEST,
            Json(create_safe_error_response(
                ErrorType::MissingInput,
                error_msg,
                None, // Never include details for consistency
            )),
        )
            .into_response());
    }

    Ok((input_data, config))
}
1052
1053/// Parse input data using intelligent format detection.
1054///
1055/// Returns the parsed query header and a list of warnings (e.g. whitespace normalization).
1056fn parse_input_data(
1057    input_data: &InputData,
1058) -> Result<(crate::core::header::QueryHeader, Vec<String>), Box<Response>> {
1059    let mut warnings: Vec<String> = Vec::new();
1060
1061    if let Some(text_content) = &input_data.text_content {
1062        // Normalize space-separated SAM headers before detection and parsing
1063        let (normalized_content, was_normalized) =
1064            crate::parsing::sam::normalize_sam_whitespace(text_content);
1065        if was_normalized {
1066            warnings.push(
1067                "Input contained spaces instead of tabs between SAM header fields. \
1068                 Fields were automatically converted to tab-separated format."
1069                    .to_string(),
1070            );
1071        }
1072        let text_content = &normalized_content;
1073
1074        // Text-based parsing with format detection
1075        let Ok(detected_format) = detect_format(text_content, input_data.filename.as_deref())
1076        else {
1077            return Err(Box::new(
1078                (
1079                    StatusCode::BAD_REQUEST,
1080                    Json(create_safe_error_response(
1081                        ErrorType::FormatDetectionFailed,
1082                        "Unable to detect file format. Please check the file type and try again.",
1083                        Some("Format detection failed during parsing"),
1084                    )),
1085                )
1086                    .into_response(),
1087            ));
1088        };
1089
1090        match parse_with_format(text_content, detected_format) {
1091            Ok(query) => Ok((query, warnings)),
1092            Err(_) => Err(Box::new((
1093                StatusCode::BAD_REQUEST,
1094                Json(create_safe_error_response(
1095                    ErrorType::ParseFailed,
1096                    "Unable to process file content. Please check the file format and try again.",
1097                    Some("File parsing failed during content processing"),
1098                )),
1099            )
1100                .into_response())),
1101        }
1102    } else if let Some(binary_content) = &input_data.binary_content {
1103        // Binary file parsing
1104        let format = input_data.format.unwrap_or(FileFormat::Bam);
1105
1106        let result = match binary_content {
1107            BinaryContent::InMemory(bytes) => parse_binary_file(bytes, format),
1108            BinaryContent::TempFile(temp) => parse_binary_file_from_path(temp.path(), format),
1109        };
1110
1111        match result {
1112            Ok(query) => Ok((query, Vec::new())),
1113            Err(_) => Err(Box::new((
1114                StatusCode::BAD_REQUEST,
1115                Json(create_safe_error_response(
1116                    ErrorType::BinaryParseFailed,
1117                    "Unable to process binary file. Please verify the file format and try again.",
1118                    Some("Binary file parsing failed during processing"),
1119                )),
1120            )
1121                .into_response())),
1122        }
1123    } else {
1124        Err(Box::new(
1125            (
1126                StatusCode::INTERNAL_SERVER_ERROR,
1127                Json(ErrorResponse {
1128                    error: "Internal error: no input data".to_string(),
1129                    error_type: ErrorType::InternalError,
1130                    details: None,
1131                }),
1132            )
1133                .into_response(),
1134        ))
1135    }
1136}
1137
1138/// Read a binary file upload in chunks, writing to a temp file.
1139///
1140/// Stops after [`BINARY_HEADER_READ_LIMIT`] bytes — enough for any BAM/CRAM header.
1141/// Returns the temp file (kept alive for parsing) and total bytes written.
1142async fn read_binary_chunks(
1143    mut field: axum::extract::multipart::Field<'_>,
1144    format: FileFormat,
1145) -> Result<(tempfile::NamedTempFile, usize), Response> {
1146    use std::io::Write;
1147
1148    let extension = match format {
1149        FileFormat::Bam => ".bam",
1150        FileFormat::Cram => ".cram",
1151        _ => ".bin",
1152    };
1153
1154    let mut temp_file = tempfile::NamedTempFile::with_suffix(extension).map_err(|e| {
1155        tracing::error!("Failed to create temp file for binary upload: {e}");
1156        (
1157            StatusCode::INTERNAL_SERVER_ERROR,
1158            Json(ErrorResponse {
1159                error: "Internal error processing upload".to_string(),
1160                error_type: ErrorType::InternalError,
1161                details: None,
1162            }),
1163        )
1164            .into_response()
1165    })?;
1166
1167    let mut bytes_written: usize = 0;
1168
1169    loop {
1170        match field.chunk().await {
1171            Ok(Some(chunk)) => {
1172                let remaining = BINARY_HEADER_READ_LIMIT.saturating_sub(bytes_written);
1173                if remaining == 0 {
1174                    break;
1175                }
1176
1177                let to_write = chunk.len().min(remaining);
1178                temp_file.write_all(&chunk[..to_write]).map_err(|e| {
1179                    tracing::error!("Failed to write binary upload to temp file: {e}");
1180                    (
1181                        StatusCode::INTERNAL_SERVER_ERROR,
1182                        Json(ErrorResponse {
1183                            error: "Internal error processing upload".to_string(),
1184                            error_type: ErrorType::InternalError,
1185                            details: None,
1186                        }),
1187                    )
1188                        .into_response()
1189                })?;
1190                bytes_written += to_write;
1191
1192                if to_write < chunk.len() {
1193                    break; // Hit the limit mid-chunk
1194                }
1195            }
1196            Ok(None) => break, // End of field
1197            Err(_) => {
1198                return Err((
1199                    StatusCode::BAD_REQUEST,
1200                    Json(create_safe_error_response(
1201                        ErrorType::InvalidContent,
1202                        "Failed to read uploaded file",
1203                        Some("Error reading multipart chunk during binary upload"),
1204                    )),
1205                )
1206                    .into_response());
1207            }
1208        }
1209    }
1210
1211    if bytes_written == 0 {
1212        return Err((
1213            StatusCode::BAD_REQUEST,
1214            Json(create_safe_error_response(
1215                ErrorType::MissingInput,
1216                "Uploaded file is empty",
1217                None,
1218            )),
1219        )
1220            .into_response());
1221    }
1222
1223    Ok((temp_file, bytes_written))
1224}
1225
/// Check if content appears to be binary.
///
/// Heuristic: samples up to the first 1024 bytes and treats the content as
/// binary when more than 1% of the sampled bytes are non-printable (outside
/// tab/newline/CR controls 9–13 and the printable ASCII range 32–126).
/// Samples shorter than 10 bytes are always considered text to avoid false
/// positives.
fn is_binary_content(bytes: &[u8]) -> bool {
    let sample_size = bytes.len().min(1024);

    // Too little data to judge — assume text.
    if sample_size < 10 {
        return false;
    }

    let non_printable = bytes[..sample_size]
        .iter()
        .filter(|&&b| !matches!(b, 9..=13 | 32..=126))
        .count();

    // Exact integer form of `non_printable > sample_size * 0.01`:
    // multiply both sides by 100 to avoid floating point entirely.
    non_printable * 100 > sample_size
}
1244
1245/// Detect binary format from filename
1246fn detect_binary_format(filename: &str) -> Option<FileFormat> {
1247    let lower = filename.to_lowercase();
1248    if std::path::Path::new(&lower)
1249        .extension()
1250        .is_some_and(|ext| ext.eq_ignore_ascii_case("bam"))
1251    {
1252        Some(FileFormat::Bam)
1253    } else if std::path::Path::new(&lower)
1254        .extension()
1255        .is_some_and(|ext| ext.eq_ignore_ascii_case("cram"))
1256    {
1257        Some(FileFormat::Cram)
1258    } else {
1259        None
1260    }
1261}
1262
1263/// Parse scoring weights from frontend format
1264fn parse_scoring_weights(weights: &HashMap<String, f64>) -> ScoringWeights {
1265    // Note: The frontend sends percentages (0-100), but the backend expects ratios (0-1)
1266    // New scoring model: contig_match, coverage, order, and conflict_penalty
1267    let contig_match = weights.get("contigMatch").unwrap_or(&70.0) / 100.0;
1268    let coverage = weights.get("coverage").unwrap_or(&20.0) / 100.0;
1269    let order = weights.get("orderScore").unwrap_or(&10.0) / 100.0;
1270    // Conflict penalty is a multiplier (0-1), not a weight percentage
1271    let conflict_penalty = weights.get("conflictPenalty").unwrap_or(&10.0) / 100.0;
1272
1273    ScoringWeights {
1274        contig_match,
1275        coverage,
1276        order,
1277        conflict_penalty,
1278    }
1279}
1280
1281/// Return list of references in catalog
1282async fn catalog_handler(State(state): State<Arc<AppState>>) -> Json<serde_json::Value> {
1283    let refs: Vec<serde_json::Value> = state
1284        .catalog
1285        .references
1286        .iter()
1287        .map(|r| {
1288            serde_json::json!({
1289                "id": r.id.0,
1290                "display_name": r.display_name,
1291                "assembly": format!("{}", r.assembly),
1292                "source": format!("{}", r.source),
1293                "contig_count": r.contigs.len(),
1294                "has_decoy": r.has_decoy(),
1295                "has_alt": r.has_alt(),
1296                "tags": r.tags,
1297            })
1298        })
1299        .collect();
1300
1301    Json(serde_json::json!({
1302        "count": refs.len(),
1303        "references": refs,
1304    }))
1305}