Skip to main content

ref_solver/web/
server.rs

1use axum::http::header;
2use axum::{
3    extract::{DefaultBodyLimit, Multipart, Query, State},
4    http::{HeaderName, HeaderValue, StatusCode},
5    response::{Html, IntoResponse, Json, Response},
6    routing::{get, post},
7    Router,
8};
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11use std::net::SocketAddr;
12use std::sync::Arc;
13use std::time::Duration;
14use tokio::net::TcpListener;
15use tower::limit::ConcurrencyLimitLayer;
16use tower::ServiceBuilder;
17use tower_governor::{governor::GovernorConfigBuilder, GovernorLayer};
18use tower_http::set_header::SetResponseHeaderLayer;
19use tower_http::timeout::TimeoutLayer;
20
21use crate::catalog::store::ReferenceCatalog;
22use crate::cli::ServeArgs;
23use crate::matching::engine::{MatchingConfig, MatchingEngine, ScoringWeights};
24use crate::matching::Suggestion;
25use crate::utils::validation::{validate_upload, ValidationError};
26use crate::web::format_detection::{
27    detect_format, parse_binary_file, parse_with_format, FileFormat,
28};
29
/// Security configuration constants to prevent `DoS` attacks
///
/// Maximum number of multipart form fields accepted per request; the
/// multipart extraction loop rejects the request with `400 Bad Request`
/// once this many fields have been received.
pub const MAX_MULTIPART_FIELDS: usize = 10;
/// Maximum size in bytes of the uploaded `file` field; larger uploads are
/// rejected with `413 Payload Too Large`.
pub const MAX_FILE_FIELD_SIZE: usize = 16 * 1024 * 1024; // 16MB
/// Maximum size in bytes for a text form field.
// NOTE(review): the enforcement site is outside the visible part of this file — confirm usage.
pub const MAX_TEXT_FIELD_SIZE: usize = 1024 * 1024; // 1MB
34
/// Converts a `usize` count into an `f64`.
///
/// Counts above 2^53 cannot be represented exactly as `f64`; that precision
/// loss is accepted here, which is why the lint is explicitly allowed.
#[inline]
#[allow(clippy::cast_precision_loss)] // Precision loss is intentional for large counts
fn count_to_f64(count: usize) -> f64 {
    count as f64
}
43
/// Shared application state
///
/// Wrapped in an `Arc` by `create_router` and handed to handlers through
/// axum's `State` extractor.
pub struct AppState {
    /// Reference catalog that the matching engine searches; loaded via
    /// `ReferenceCatalog::load_embedded` when the router is built.
    pub catalog: ReferenceCatalog,
}
48
/// Input data extracted from multipart form
///
/// Populated field by field while the multipart body is read; every member
/// starts out as `None`.
#[derive(Debug)]
struct InputData {
    /// Text content (if provided via textarea or text file)
    ///
    /// Uploaded files that are not classified as binary are decoded with a
    /// lossy UTF-8 conversion and stored here.
    text_content: Option<String>,
    /// Binary file content (if provided)
    binary_content: Option<Vec<u8>>,
    /// Original filename
    ///
    /// For uploads this is the value returned by `validate_upload`, not the
    /// raw client-supplied name.
    filename: Option<String>,
    /// Detected or specified format
    format: Option<FileFormat>,
}
61
/// Enhanced error response
///
/// Serialized as the JSON body of error replies.
#[derive(Serialize)]
pub struct ErrorResponse {
    /// Human-readable message that is safe to show to the client.
    pub error: String,
    /// Short machine-readable error category (e.g. `"file_too_large"`).
    pub error_type: String,
    /// Optional extra detail; `create_safe_error_response` always leaves this
    /// as `None` so internal information is never sent to the client.
    pub details: Option<String>,
}
69
/// Matching configuration for a single request.
///
/// Built with defaults while the multipart form is read and echoed back in
/// the `processing_info.configuration` section of the JSON response.
#[derive(Serialize)]
struct ConfigurationInfo {
    /// Minimum score a match must reach; passed to the engine as `min_score`
    /// (default `0.1`).
    score_threshold: f64,
    /// Maximum number of matches requested from the engine (default `10`).
    result_limit: usize,
    /// Weights used to combine the individual scores (default
    /// `ScoringWeights::default()`).
    scoring_weights: ScoringWeights,
}
76
/// Query parameters for detailed mode
///
/// All parameters are optional; they are read from the URL query string of
/// the identify endpoint.
#[derive(Deserialize)]
struct DetailedQueryParams {
    /// Mode: "detailed" for detailed contig breakdown, omit for summary
    mode: Option<String>,
    /// Match index to get details for (0-based); defaults to the first match
    match_id: Option<usize>,
    /// Page number for query contigs (0-based, default: 0)
    query_page: Option<usize>,
    /// Page size for query contigs (default: 100, max: 500)
    query_page_size: Option<usize>,
    /// Page number for reference contigs (0-based, default: 0)
    ref_page: Option<usize>,
    /// Page size for reference contigs (default: 100, max: 500)
    ref_page_size: Option<usize>,
}
93
94/// Create a safe error response that prevents information disclosure
95/// while logging detailed errors server-side for debugging
96pub fn create_safe_error_response(
97    error_type: &str,
98    user_message: &str,
99    internal_error: Option<&str>,
100) -> ErrorResponse {
101    // Log detailed error server-side for debugging (not exposed to client)
102    if let Some(internal_msg) = internal_error {
103        tracing::error!("Internal error ({}): {}", error_type, internal_msg);
104    }
105
106    ErrorResponse {
107        error: user_message.to_string(),
108        error_type: error_type.to_string(),
109        details: None, // Never expose internal details to prevent information disclosure
110    }
111}
112
113/// Run the web server
114///
115/// # Errors
116///
117/// Returns an error if the tokio runtime cannot be created or the server fails to start.
118pub fn run(args: ServeArgs) -> anyhow::Result<()> {
119    // Build tokio runtime
120    let rt = tokio::runtime::Runtime::new()?;
121    rt.block_on(async move { run_server(args).await })
122}
123
124/// Create the application router with all routes and middleware configured.
125///
126/// # Errors
127///
128/// Returns an error if the catalog cannot be loaded.
129#[allow(clippy::missing_panics_doc)] // Panics only on invalid governor config (constants are valid)
130pub fn create_router() -> anyhow::Result<Router> {
131    // Load catalog
132    let catalog = ReferenceCatalog::load_embedded()?;
133    let state = Arc::new(AppState { catalog });
134
135    // Configure IP-based rate limiting
136    let governor_conf = GovernorConfigBuilder::default()
137        .per_second(10) // 10 requests per second per IP
138        .burst_size(50) // Allow bursts of 50 requests
139        .finish()
140        .unwrap();
141
142    // Build router with comprehensive security layers
143    let app = Router::new()
144        .route("/", get(index_handler))
145        .route("/api/identify", post(identify_handler))
146        .route("/api/catalog", get(catalog_handler))
147        // Static file routes
148        .route("/static/css/styles.css", get(styles_css_handler))
149        .route("/static/js/main.js", get(main_js_handler))
150        .route("/static/js/utils/helpers.js", get(helpers_js_handler))
151        .route(
152            "/static/js/managers/ConfigurationManager.js",
153            get(config_manager_js_handler),
154        )
155        .route(
156            "/static/js/managers/TabManager.js",
157            get(tab_manager_js_handler),
158        )
159        .route(
160            "/static/js/managers/ResultsManager.js",
161            get(results_manager_js_handler),
162        )
163        .route(
164            "/static/js/managers/SplitViewManager.js",
165            get(split_view_manager_js_handler),
166        )
167        .with_state(state)
168        .layer(
169            ServiceBuilder::new()
170                // Security headers for browser protection
171                .layer(SetResponseHeaderLayer::if_not_present(
172                    HeaderName::from_static("x-content-type-options"),
173                    HeaderValue::from_static("nosniff"),
174                ))
175                .layer(SetResponseHeaderLayer::if_not_present(
176                    HeaderName::from_static("x-frame-options"),
177                    HeaderValue::from_static("DENY"),
178                ))
179                .layer(SetResponseHeaderLayer::if_not_present(
180                    HeaderName::from_static("x-xss-protection"),
181                    HeaderValue::from_static("1; mode=block"),
182                ))
183                .layer(SetResponseHeaderLayer::if_not_present(
184                    HeaderName::from_static("strict-transport-security"),
185                    HeaderValue::from_static("max-age=31536000; includeSubDomains"),
186                ))
187                .layer(SetResponseHeaderLayer::if_not_present(
188                    HeaderName::from_static("referrer-policy"),
189                    HeaderValue::from_static("strict-origin-when-cross-origin"),
190                ))
191                // IP-based rate limiting to prevent abuse
192                .layer(GovernorLayer {
193                    config: Arc::new(governor_conf),
194                })
195                // Request timeout to prevent slow client attacks
196                .layer(TimeoutLayer::with_status_code(
197                    StatusCode::REQUEST_TIMEOUT,
198                    Duration::from_secs(30),
199                ))
200                // Limit concurrent requests to prevent DOS
201                .layer(ConcurrencyLimitLayer::new(100))
202                // Limit request body size (accommodate largest file + multipart overhead)
203                .layer(DefaultBodyLimit::max(20 * 1024 * 1024)), // 20MB limit
204        );
205
206    Ok(app)
207}
208
209async fn run_server(args: ServeArgs) -> anyhow::Result<()> {
210    let app = create_router()?;
211
212    let addr = format!("{}:{}", args.address, args.port);
213    println!("Starting ref-solver web server at http://{addr}");
214
215    if args.open {
216        let _ = open::that(format!("http://{addr}"));
217    }
218
219    let listener = TcpListener::bind(&addr).await?;
220    axum::serve(
221        listener,
222        app.into_make_service_with_connect_info::<SocketAddr>(),
223    )
224    .await?;
225
226    Ok(())
227}
228
229/// Main page handler
230async fn index_handler() -> Html<&'static str> {
231    Html(include_str!("templates/index.html"))
232}
233
234/// Static CSS handler
235async fn styles_css_handler() -> impl IntoResponse {
236    (
237        [(header::CONTENT_TYPE, "text/css; charset=utf-8")],
238        include_str!("static/css/styles.css"),
239    )
240}
241
242/// Static JS handlers for ES6 modules
243async fn main_js_handler() -> impl IntoResponse {
244    (
245        [(
246            header::CONTENT_TYPE,
247            "application/javascript; charset=utf-8",
248        )],
249        include_str!("static/js/main.js"),
250    )
251}
252
253async fn helpers_js_handler() -> impl IntoResponse {
254    (
255        [(
256            header::CONTENT_TYPE,
257            "application/javascript; charset=utf-8",
258        )],
259        include_str!("static/js/utils/helpers.js"),
260    )
261}
262
263async fn config_manager_js_handler() -> impl IntoResponse {
264    (
265        [(
266            header::CONTENT_TYPE,
267            "application/javascript; charset=utf-8",
268        )],
269        include_str!("static/js/managers/ConfigurationManager.js"),
270    )
271}
272
273async fn tab_manager_js_handler() -> impl IntoResponse {
274    (
275        [(
276            header::CONTENT_TYPE,
277            "application/javascript; charset=utf-8",
278        )],
279        include_str!("static/js/managers/TabManager.js"),
280    )
281}
282
283async fn results_manager_js_handler() -> impl IntoResponse {
284    (
285        [(
286            header::CONTENT_TYPE,
287            "application/javascript; charset=utf-8",
288        )],
289        include_str!("static/js/managers/ResultsManager.js"),
290    )
291}
292
293async fn split_view_manager_js_handler() -> impl IntoResponse {
294    (
295        [(
296            header::CONTENT_TYPE,
297            "application/javascript; charset=utf-8",
298        )],
299        include_str!("static/js/managers/SplitViewManager.js"),
300    )
301}
302
303/// API endpoint for identifying references
304#[allow(clippy::too_many_lines)] // TODO: Refactor into smaller functions
305async fn identify_handler(
306    State(state): State<Arc<AppState>>,
307    Query(params): Query<DetailedQueryParams>,
308    mut multipart: Multipart,
309) -> impl IntoResponse {
310    let start_time = std::time::Instant::now();
311
312    // Extract input data and configuration from multipart form
313    let (input_data, config) = match extract_request_data(&mut multipart).await {
314        Ok(data) => data,
315        Err(error_response) => return error_response,
316    };
317
318    // Parse input using intelligent format detection
319    let query = match parse_input_data(&input_data) {
320        Ok(query) => query,
321        Err(error_response) => return *error_response,
322    };
323
324    // Create matching engine with configuration
325    let matching_config = MatchingConfig {
326        min_score: config.score_threshold,
327        scoring_weights: config.scoring_weights.clone(),
328    };
329
330    let engine = MatchingEngine::new(&state.catalog, matching_config);
331    let matches = engine.find_matches(&query, config.result_limit);
332
333    // Check if detailed mode is requested
334    if params.mode.as_deref() == Some("detailed") {
335        return handle_detailed_response(&params, &matches, &query, start_time, &config).await;
336    }
337
338    // Build enhanced response
339    let results: Vec<serde_json::Value> = matches
340        .iter()
341        .map(|m| {
342            serde_json::json!({
343                "reference": {
344                    "id": m.reference.id.0,
345                    "display_name": m.reference.display_name,
346                    "assembly": format!("{}", m.reference.assembly),
347                    "source": format!("{}", m.reference.source),
348                    "download_url": m.reference.download_url,
349                },
350                "score": {
351                    "composite": m.score.composite,
352                    "confidence": format!("{:?}", m.score.confidence),
353                    "detailed_scores": {
354                        "md5_jaccard": m.score.md5_jaccard,
355                        "name_length_jaccard": m.score.name_length_jaccard,
356                        "md5_query_coverage": m.score.md5_query_coverage,
357                        "order_score": m.score.order_score,
358                    },
359                },
360                "match_type": format!("{:?}", m.diagnosis.match_type),
361                "reordered": m.diagnosis.reordered,
362                "exact_matches": m.diagnosis.exact_matches.len(),
363                "renamed_matches": m.diagnosis.renamed_matches.len(),
364                "conflicts": m.diagnosis.conflicts.len(),
365                "query_only": m.diagnosis.query_only.len(),
366                "diagnosis": {
367                    "exact_matches": m.diagnosis.exact_matches.iter().map(|_| {
368                        serde_json::json!({"type": "exact"})
369                    }).collect::<Vec<_>>(),
370                    "renamed_matches": m.diagnosis.renamed_matches.iter().map(|r| {
371                        serde_json::json!({
372                            "query_name": r.query_name,
373                            "reference_name": r.reference_name
374                        })
375                    }).collect::<Vec<_>>(),
376                    "conflicts": m.diagnosis.conflicts.iter().map(|c| {
377                        serde_json::json!({
378                            "query_contig": {
379                                "name": c.query_contig.name,
380                                "length": c.query_contig.length,
381                                "md5": c.query_contig.md5
382                            },
383                            "conflict_type": format!("{:?}", c.conflict_type),
384                            "description": c.description
385                        })
386                    }).collect::<Vec<_>>(),
387                },
388                "suggestions": m.diagnosis.suggestions.iter().map(|s| {
389                    match s {
390                        Suggestion::RenameContigs { command_hint, .. } => {
391                            serde_json::json!({"type": "rename", "command": command_hint})
392                        }
393                        Suggestion::ReorderContigs { command_hint } => {
394                            serde_json::json!({"type": "reorder", "command": command_hint})
395                        }
396                        Suggestion::ReplaceContig { contig_name, reason, source } => {
397                            serde_json::json!({"type": "replace", "contig": contig_name, "reason": reason, "source": source})
398                        }
399                        Suggestion::UseAsIs { warnings } => {
400                            serde_json::json!({"type": "use_as_is", "warnings": warnings})
401                        }
402                        Suggestion::Realign { reason, suggested_reference } => {
403                            serde_json::json!({"type": "realign", "reason": reason, "reference": suggested_reference})
404                        }
405                    }
406                }).collect::<Vec<_>>(),
407            })
408        })
409        .collect();
410
411    #[allow(clippy::cast_possible_truncation)] // Processing time won't exceed u64
412    let processing_time = start_time.elapsed().as_millis() as u64;
413
414    Json(serde_json::json!({
415        "query": {
416            "contig_count": query.contigs.len(),
417            "has_md5": query.has_md5s(),
418            "md5_coverage": query.md5_coverage(),
419            "naming_convention": format!("{:?}", query.naming_convention),
420        },
421        "matches": results,
422        "processing_info": {
423            "detected_format": input_data.format.as_ref().map_or("unknown", super::format_detection::FileFormat::display_name),
424            "processing_time_ms": processing_time,
425            "configuration": {
426                "score_threshold": config.score_threshold,
427                "result_limit": config.result_limit,
428                "scoring_weights": config.scoring_weights,
429            }
430        }
431    }))
432    .into_response()
433}
434
435/// Handle detailed response mode for contig breakdown
436#[allow(
437    clippy::cast_possible_truncation,
438    clippy::unused_async,
439    clippy::too_many_lines
440)] // JSON indices; TODO: refactor
441async fn handle_detailed_response(
442    params: &DetailedQueryParams,
443    matches: &[crate::matching::engine::MatchResult],
444    query: &crate::core::header::QueryHeader,
445    start_time: std::time::Instant,
446    config: &ConfigurationInfo,
447) -> Response {
448    use crate::core::contig::Contig;
449
450    // Get the specific match or default to first match
451    let match_index = params.match_id.unwrap_or(0);
452    let Some(selected_match) = matches.get(match_index) else {
453        return (
454            StatusCode::BAD_REQUEST,
455            Json(create_safe_error_response(
456                "invalid_match_id",
457                "Invalid match ID specified",
458                Some("Match index out of bounds"),
459            )),
460        )
461            .into_response();
462    };
463
464    // Set up pagination parameters
465    let query_page = params.query_page.unwrap_or(0);
466    let query_page_size = params.query_page_size.unwrap_or(100).min(500);
467    let ref_page = params.ref_page.unwrap_or(0);
468    let ref_page_size = params.ref_page_size.unwrap_or(100).min(500);
469
470    // Extract query contigs with pagination
471    let total_query_contigs = query.contigs.len();
472    let query_start = query_page * query_page_size;
473    let query_end = (query_start + query_page_size).min(total_query_contigs);
474    let query_contigs_page: Vec<&Contig> = if query_start < total_query_contigs {
475        query.contigs[query_start..query_end].iter().collect()
476    } else {
477        Vec::new()
478    };
479
480    // Extract reference contigs with pagination
481    let total_ref_contigs = selected_match.reference.contigs.len();
482    let ref_start = ref_page * ref_page_size;
483    let ref_end = (ref_start + ref_page_size).min(total_ref_contigs);
484    let ref_contigs_page: Vec<&Contig> = if ref_start < total_ref_contigs {
485        selected_match.reference.contigs[ref_start..ref_end]
486            .iter()
487            .collect()
488    } else {
489        Vec::new()
490    };
491
492    // Build detailed mapping information
493    let mut exact_match_mappings = Vec::new();
494    let mut renamed_match_mappings = Vec::new();
495    let mut conflict_mappings = Vec::new();
496    let mut query_only_indices = Vec::new();
497    let mut reference_only_indices = Vec::new();
498
499    // Create lookup maps for efficient indexing
500    let query_name_to_index: std::collections::HashMap<&str, usize> = query
501        .contigs
502        .iter()
503        .enumerate()
504        .map(|(i, c)| (c.name.as_str(), i))
505        .collect();
506
507    let ref_name_to_index: std::collections::HashMap<&str, usize> = selected_match
508        .reference
509        .contigs
510        .iter()
511        .enumerate()
512        .map(|(i, c)| (c.name.as_str(), i))
513        .collect();
514
515    // Process exact matches (need to map back to contigs since ContigMatch is empty)
516    for (i, _) in selected_match.diagnosis.exact_matches.iter().enumerate() {
517        // Since ContigMatch doesn't contain contig data, we need to reconstruct
518        // the mapping by analyzing the query and reference contigs
519        // This is a limitation of the current data structure
520        exact_match_mappings.push(serde_json::json!({
521            "type": "exact",
522            "query_index": i, // This is approximate - we'd need better data structure
523            "reference_index": i // This is approximate - we'd need better data structure
524        }));
525    }
526
527    // Process renamed matches
528    for rename in &selected_match.diagnosis.renamed_matches {
529        if let (Some(&query_idx), Some(&ref_idx)) = (
530            query_name_to_index.get(rename.query_name.as_str()),
531            ref_name_to_index.get(rename.reference_name.as_str()),
532        ) {
533            renamed_match_mappings.push(serde_json::json!({
534                "type": "renamed",
535                "query_index": query_idx,
536                "reference_index": ref_idx,
537                "query_name": rename.query_name,
538                "reference_name": rename.reference_name
539            }));
540        }
541    }
542
543    // Process conflicts
544    for conflict in &selected_match.diagnosis.conflicts {
545        if let Some(&query_idx) = query_name_to_index.get(conflict.query_contig.name.as_str()) {
546            let ref_idx = conflict
547                .expected
548                .as_ref()
549                .and_then(|expected| ref_name_to_index.get(expected.name.as_str()));
550
551            conflict_mappings.push(serde_json::json!({
552                "type": "conflict",
553                "query_index": query_idx,
554                "reference_index": ref_idx,
555                "conflict_type": format!("{:?}", conflict.conflict_type),
556                "description": conflict.description
557            }));
558        }
559    }
560
561    // Process query-only contigs
562    for contig in &selected_match.diagnosis.query_only {
563        if let Some(&index) = query_name_to_index.get(contig.name.as_str()) {
564            query_only_indices.push(index);
565        }
566    }
567
568    // Identify reference-only contigs (those not matched by any query contig)
569    let mut matched_ref_indices = std::collections::HashSet::new();
570    #[allow(clippy::cast_possible_truncation)] // Contig indices bounded by MAX_CONTIGS
571    for mapping in &exact_match_mappings {
572        if let Some(ref_idx) = mapping
573            .get("reference_index")
574            .and_then(serde_json::Value::as_u64)
575        {
576            matched_ref_indices.insert(ref_idx as usize);
577        }
578    }
579    #[allow(clippy::cast_possible_truncation)]
580    for mapping in &renamed_match_mappings {
581        if let Some(ref_idx) = mapping
582            .get("reference_index")
583            .and_then(serde_json::Value::as_u64)
584        {
585            matched_ref_indices.insert(ref_idx as usize);
586        }
587    }
588    #[allow(clippy::cast_possible_truncation)]
589    for mapping in &conflict_mappings {
590        if let Some(ref_idx) = mapping
591            .get("reference_index")
592            .and_then(serde_json::Value::as_u64)
593        {
594            matched_ref_indices.insert(ref_idx as usize);
595        }
596    }
597
598    for (i, _) in selected_match.reference.contigs.iter().enumerate() {
599        if !matched_ref_indices.contains(&i) {
600            reference_only_indices.push(i);
601        }
602    }
603
604    // Build response
605    #[allow(clippy::cast_possible_truncation)] // Processing time won't exceed u64
606    let processing_time = start_time.elapsed().as_millis() as u64;
607
608    Json(serde_json::json!({
609        "mode": "detailed",
610        "match_id": match_index,
611        "query": {
612            "contigs": query_contigs_page.iter().enumerate().map(|(page_idx, contig)| {
613                let global_idx = query_start + page_idx;
614                // Determine match status for this contig
615                let match_status = if query_only_indices.contains(&global_idx) {
616                    "missing"
617                } else if conflict_mappings.iter().any(|c| c.get("query_index").and_then(serde_json::Value::as_u64).map(|i| i as usize) == Some(global_idx)) {
618                    "conflict"
619                } else if renamed_match_mappings.iter().any(|r| r.get("query_index").and_then(serde_json::Value::as_u64).map(|i| i as usize) == Some(global_idx)) {
620                    "renamed"
621                } else if exact_match_mappings.iter().any(|e| e.get("query_index").and_then(serde_json::Value::as_u64).map(|i| i as usize) == Some(global_idx)) {
622                    "exact"
623                } else {
624                    "unknown"
625                };
626
627                serde_json::json!({
628                    "index": global_idx,
629                    "name": contig.name,
630                    "length": contig.length,
631                    "md5": contig.md5,
632                    "sha512t24u": contig.sha512t24u,
633                    "sequence_role": format!("{:?}", contig.sequence_role),
634                    "aliases": contig.aliases,
635                    "match_status": match_status
636                })
637            }).collect::<Vec<_>>(),
638            "pagination": {
639                "page": query_page,
640                "page_size": query_page_size,
641                "total_count": total_query_contigs,
642                "total_pages": total_query_contigs.div_ceil(query_page_size)
643            }
644        },
645        "reference": {
646            "id": selected_match.reference.id.0,
647            "display_name": selected_match.reference.display_name,
648            "assembly": format!("{}", selected_match.reference.assembly),
649            "contigs": ref_contigs_page.iter().enumerate().map(|(page_idx, contig)| {
650                let global_idx = ref_start + page_idx;
651                // Determine match status for this reference contig
652                let match_status = if reference_only_indices.contains(&global_idx) {
653                    "missing"
654                } else if conflict_mappings.iter().any(|c| c.get("reference_index").and_then(serde_json::Value::as_u64).map(|i| i as usize) == Some(global_idx)) {
655                    "conflict"
656                } else if renamed_match_mappings.iter().any(|r| r.get("reference_index").and_then(serde_json::Value::as_u64).map(|i| i as usize) == Some(global_idx)) {
657                    "renamed"
658                } else if exact_match_mappings.iter().any(|e| e.get("reference_index").and_then(serde_json::Value::as_u64).map(|i| i as usize) == Some(global_idx)) {
659                    "exact"
660                } else {
661                    "unknown"
662                };
663
664                serde_json::json!({
665                    "index": global_idx,
666                    "name": contig.name,
667                    "length": contig.length,
668                    "md5": contig.md5,
669                    "sha512t24u": contig.sha512t24u,
670                    "sequence_role": format!("{:?}", contig.sequence_role),
671                    "aliases": contig.aliases,
672                    "match_status": match_status
673                })
674            }).collect::<Vec<_>>(),
675            "pagination": {
676                "page": ref_page,
677                "page_size": ref_page_size,
678                "total_count": total_ref_contigs,
679                "total_pages": total_ref_contigs.div_ceil(ref_page_size)
680            }
681        },
682        "mappings": {
683            "exact_matches": exact_match_mappings,
684            "renamed_matches": renamed_match_mappings,
685            "conflicts": conflict_mappings,
686            "query_only": query_only_indices,
687            "reference_only": reference_only_indices
688        },
689        "match_summary": {
690            "match_type": format!("{:?}", selected_match.diagnosis.match_type),
691            "reordered": selected_match.diagnosis.reordered,
692            "score": {
693                "composite": selected_match.score.composite,
694                "confidence": format!("{:?}", selected_match.score.confidence)
695            }
696        },
697        "processing_info": {
698            "processing_time_ms": processing_time,
699            "configuration": {
700                "score_threshold": config.score_threshold,
701                "result_limit": config.result_limit,
702                "scoring_weights": config.scoring_weights,
703            }
704        }
705    }))
706    .into_response()
707}
708
709/// Extract input data and configuration from multipart form
710#[allow(clippy::too_many_lines)] // TODO: Refactor into smaller functions
711async fn extract_request_data(
712    multipart: &mut Multipart,
713) -> Result<(InputData, ConfigurationInfo), Response> {
714    let mut input_data = InputData {
715        text_content: None,
716        binary_content: None,
717        filename: None,
718        format: None,
719    };
720
721    let mut config = ConfigurationInfo {
722        score_threshold: 0.1, // Default 10%
723        result_limit: 10,
724        scoring_weights: ScoringWeights::default(),
725    };
726
727    let mut fields_received = 0usize;
728    let mut had_parse_error = false;
729
730    // Process multipart fields
731    loop {
732        // Check field count limit before processing
733        if fields_received >= MAX_MULTIPART_FIELDS {
734            return Err((
735                StatusCode::BAD_REQUEST,
736                Json(ErrorResponse {
737                    error: "Too many form fields".to_string(),
738                    error_type: "field_limit_exceeded".to_string(),
739                    details: None, // No internal details for security
740                }),
741            )
742                .into_response());
743        }
744
745        match multipart.next_field().await {
746            Ok(Some(field)) => {
747                fields_received += 1;
748                let name = field.name().unwrap_or_default().to_string();
749
750                match name.as_str() {
751                    "file" => {
752                        let filename = field.file_name().map(std::string::ToString::to_string);
753
754                        match field.bytes().await {
755                            Ok(bytes) => {
756                                // Validate field size before processing
757                                if bytes.len() > MAX_FILE_FIELD_SIZE {
758                                    return Err((
759                                        StatusCode::PAYLOAD_TOO_LARGE,
760                                        Json(ErrorResponse {
761                                            error: "File size exceeds limit".to_string(),
762                                            error_type: "file_too_large".to_string(),
763                                            details: None,
764                                        }),
765                                    )
766                                        .into_response());
767                                }
768
769                                // Detect format from filename for validation
770                                let detected_format = if let Some(ref name) = filename {
771                                    detect_binary_format(name).unwrap_or(FileFormat::Auto)
772                                } else {
773                                    FileFormat::Auto
774                                };
775
776                                // Use comprehensive validation function for security
777                                match validate_upload(filename.as_deref(), &bytes, detected_format)
778                                {
779                                    Ok(validated_filename) => {
780                                        input_data.filename = validated_filename;
781
782                                        // Detect if content is binary or text
783                                        if is_binary_content(&bytes) {
784                                            input_data.binary_content = Some(bytes.to_vec());
785                                            input_data.format = Some(detected_format);
786                                        } else {
787                                            input_data.text_content =
788                                                Some(String::from_utf8_lossy(&bytes).to_string());
789                                        }
790                                    }
791                                    Err(ValidationError::FilenameTooLong) => {
792                                        return Err((
793                                            StatusCode::BAD_REQUEST,
794                                            Json(create_safe_error_response(
795                                                "filename_too_long",
796                                                "Filename exceeds maximum length limit",
797                                                Some("Filename validation failed due to length constraints")
798                                            )),
799                                        ).into_response());
800                                    }
801                                    Err(ValidationError::InvalidFilename) => {
802                                        return Err((
803                                            StatusCode::BAD_REQUEST,
804                                            Json(create_safe_error_response(
805                                                "invalid_filename",
806                                                "Filename contains invalid or dangerous characters",
807                                                Some("Filename validation failed due to invalid characters")
808                                            )),
809                                        ).into_response());
810                                    }
811                                    Err(ValidationError::FormatValidationFailed) => {
812                                        return Err((
813                                            StatusCode::BAD_REQUEST,
814                                            Json(create_safe_error_response(
815                                                "format_mismatch",
816                                                "File content does not match the expected format based on filename",
817                                                Some("Format validation failed")
818                                            )),
819                                        ).into_response());
820                                    }
821                                    Err(ValidationError::InvalidFileContent) => {
822                                        return Err((
823                                            StatusCode::BAD_REQUEST,
824                                            Json(create_safe_error_response(
825                                                "invalid_content",
826                                                "File content appears malformed or corrupted",
827                                                None,
828                                            )),
829                                        )
830                                            .into_response());
831                                    }
832                                    Err(_) => {
833                                        return Err((
834                                            StatusCode::BAD_REQUEST,
835                                            Json(create_safe_error_response(
836                                                "validation_failed",
837                                                "File validation failed",
838                                                None,
839                                            )),
840                                        )
841                                            .into_response());
842                                    }
843                                }
844                            }
845                            Err(_) => had_parse_error = true,
846                        }
847                    }
848                    "header_text" => match field.text().await {
849                        Ok(text) => {
850                            // Validate text field size
851                            if text.len() > MAX_TEXT_FIELD_SIZE {
852                                return Err((
853                                    StatusCode::PAYLOAD_TOO_LARGE,
854                                    Json(ErrorResponse {
855                                        error: "Text field size exceeds limit".to_string(),
856                                        error_type: "text_too_large".to_string(),
857                                        details: None,
858                                    }),
859                                )
860                                    .into_response());
861                            }
862
863                            if !text.trim().is_empty() {
864                                input_data.text_content = Some(text);
865                            }
866                        }
867                        Err(_) => had_parse_error = true,
868                    },
869                    "score_threshold" => {
870                        if let Ok(text) = field.text().await {
871                            if let Ok(threshold) = text.parse::<f64>() {
872                                config.score_threshold = threshold.clamp(0.0, 1.0);
873                            }
874                        }
875                    }
876                    "result_limit" => {
877                        if let Ok(text) = field.text().await {
878                            if let Ok(limit) = text.parse::<usize>() {
879                                config.result_limit = limit.clamp(1, 50); // Reasonable limits
880                            }
881                        }
882                    }
883                    "scoring_weights" => {
884                        if let Ok(text) = field.text().await {
885                            if let Ok(weights) = serde_json::from_str::<HashMap<String, f64>>(&text)
886                            {
887                                config.scoring_weights = parse_scoring_weights(&weights);
888                            }
889                        }
890                    }
891                    _ => {} // Ignore unknown fields
892                }
893            }
894            Ok(None) => break, // No more fields
895            Err(_) => {
896                had_parse_error = true;
897                break;
898            }
899        }
900    }
901
902    // Validate that we have some input
903    if input_data.text_content.is_none() && input_data.binary_content.is_none() {
904        let error_msg = if had_parse_error {
905            "Failed to parse upload. Please check the file format."
906        } else if fields_received == 0 {
907            "No data received. Please upload a file or paste header text."
908        } else {
909            "No valid header data found in upload."
910        };
911
912        return Err((
913            StatusCode::BAD_REQUEST,
914            Json(create_safe_error_response(
915                "missing_input",
916                error_msg,
917                None, // Never include details for consistency
918            )),
919        )
920            .into_response());
921    }
922
923    Ok((input_data, config))
924}
925
926/// Parse input data using intelligent format detection
927fn parse_input_data(
928    input_data: &InputData,
929) -> Result<crate::core::header::QueryHeader, Box<Response>> {
930    if let Some(text_content) = &input_data.text_content {
931        // Text-based parsing with format detection
932        let Ok(detected_format) = detect_format(text_content, input_data.filename.as_deref())
933        else {
934            return Err(Box::new(
935                (
936                    StatusCode::BAD_REQUEST,
937                    Json(create_safe_error_response(
938                        "format_detection_failed",
939                        "Unable to detect file format. Please check the file type and try again.",
940                        Some("Format detection failed during parsing"),
941                    )),
942                )
943                    .into_response(),
944            ));
945        };
946
947        match parse_with_format(text_content, detected_format) {
948            Ok(query) => Ok(query),
949            Err(_) => Err(Box::new((
950                StatusCode::BAD_REQUEST,
951                Json(create_safe_error_response(
952                    "parse_failed",
953                    "Unable to process file content. Please check the file format and try again.",
954                    Some("File parsing failed during content processing"),
955                )),
956            )
957                .into_response())),
958        }
959    } else if let Some(binary_content) = &input_data.binary_content {
960        // Binary file parsing
961        let format = input_data.format.unwrap_or(FileFormat::Bam);
962
963        match parse_binary_file(binary_content, format) {
964            Ok(query) => Ok(query),
965            Err(_) => Err(Box::new((
966                StatusCode::BAD_REQUEST,
967                Json(create_safe_error_response(
968                    "binary_parse_failed",
969                    "Unable to process binary file. Please verify the file format and try again.",
970                    Some("Binary file parsing failed during processing"),
971                )),
972            )
973                .into_response())),
974        }
975    } else {
976        Err(Box::new(
977            (
978                StatusCode::INTERNAL_SERVER_ERROR,
979                Json(ErrorResponse {
980                    error: "Internal error: no input data".to_string(),
981                    error_type: "internal_error".to_string(),
982                    details: None,
983                }),
984            )
985                .into_response(),
986        ))
987    }
988}
989
990/// Check if content appears to be binary
991fn is_binary_content(bytes: &[u8]) -> bool {
992    // Simple heuristic: if more than 1% of first 1024 bytes are non-printable, consider binary
993    let sample_size = std::cmp::min(bytes.len(), 1024);
994
995    // For very small samples, use a minimum threshold to avoid false positives
996    if sample_size < 10 {
997        return false; // Assume text for very small samples
998    }
999
1000    let non_printable_count = bytes[..sample_size]
1001        .iter()
1002        .filter(|&&b| b < 9 || (b > 13 && b < 32) || b > 126)
1003        .count();
1004
1005    // Use floating-point math to maintain consistent 1% threshold
1006    count_to_f64(non_printable_count) > (count_to_f64(sample_size) * 0.01)
1007}
1008
1009/// Detect binary format from filename
1010fn detect_binary_format(filename: &str) -> Option<FileFormat> {
1011    let lower = filename.to_lowercase();
1012    if std::path::Path::new(&lower)
1013        .extension()
1014        .is_some_and(|ext| ext.eq_ignore_ascii_case("bam"))
1015    {
1016        Some(FileFormat::Bam)
1017    } else if std::path::Path::new(&lower)
1018        .extension()
1019        .is_some_and(|ext| ext.eq_ignore_ascii_case("cram"))
1020    {
1021        Some(FileFormat::Cram)
1022    } else {
1023        None
1024    }
1025}
1026
1027/// Parse scoring weights from frontend format
1028fn parse_scoring_weights(weights: &HashMap<String, f64>) -> ScoringWeights {
1029    // Note: The frontend sends percentages (0-100), but the backend expects ratios (0-1)
1030    // New scoring model: contig_match, coverage, order, and conflict_penalty
1031    let contig_match = weights.get("contigMatch").unwrap_or(&70.0) / 100.0;
1032    let coverage = weights.get("coverage").unwrap_or(&20.0) / 100.0;
1033    let order = weights.get("orderScore").unwrap_or(&10.0) / 100.0;
1034    // Conflict penalty is a multiplier (0-1), not a weight percentage
1035    let conflict_penalty = weights.get("conflictPenalty").unwrap_or(&10.0) / 100.0;
1036
1037    ScoringWeights {
1038        contig_match,
1039        coverage,
1040        order,
1041        conflict_penalty,
1042    }
1043}
1044
1045/// Return list of references in catalog
1046async fn catalog_handler(State(state): State<Arc<AppState>>) -> Json<serde_json::Value> {
1047    let refs: Vec<serde_json::Value> = state
1048        .catalog
1049        .references
1050        .iter()
1051        .map(|r| {
1052            serde_json::json!({
1053                "id": r.id.0,
1054                "display_name": r.display_name,
1055                "assembly": format!("{}", r.assembly),
1056                "source": format!("{}", r.source),
1057                "contig_count": r.contigs.len(),
1058                "has_decoy": r.has_decoy(),
1059                "has_alt": r.has_alt(),
1060                "tags": r.tags,
1061            })
1062        })
1063        .collect();
1064
1065    Json(serde_json::json!({
1066        "count": refs.len(),
1067        "references": refs,
1068    }))
1069}