diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 13e1af9..c48d227 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,3 +30,24 @@ jobs: - name: Lint (cargo fmt + clippy) run: ./lint.sh + + test: + name: test + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Rust + uses: dtolnay/rust-toolchain@stable + + - name: Cache Rust build + uses: Swatinem/rust-cache@v2 + with: + workspaces: rust + key: test + + - name: Test + run: ./test.sh diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index c000012..7959f57 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -1,9 +1,7 @@ name: Coverage on: - push: - branches: [main] - pull_request: + workflow_dispatch: permissions: contents: read diff --git a/AGENTS.md b/AGENTS.md index 25de3cc..d3fc05f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -16,6 +16,6 @@ production limit measures production code, not test scaffolding. Test files should still be split when they mix unrelated behavior or become hard to scan. When a production file grows past 500 lines, split it before adding more -behavior. Temporary exceptions must be listed in -`rust/bioscript-core/tests/source_size.rs` with their current line count, and -that count should not increase. +behavior. Temporary exceptions must be listed in this file under +`Current Refactor Backlog`; the source-size guard reads that list and fails when +it drifts from the code. 
diff --git a/lint.sh b/lint.sh index 95fe95e..9e37793 100755 --- a/lint.sh +++ b/lint.sh @@ -25,3 +25,5 @@ filter_vendored() { } cargo clippy "${PKG_ARGS[@]}" --all-targets --color=never -- -D warnings 2> >(filter_vendored >&2) + +cargo test -p bioscript-core --test source_size -- --nocapture diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 2e747e4..bf9a5dc 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -124,6 +124,7 @@ dependencies = [ "bioscript-core", "bioscript-formats", "bioscript-runtime", + "bioscript-schema", "jni", "monty", "serde", diff --git a/rust/bioscript-cli/src/commands.rs b/rust/bioscript-cli/src/commands.rs index 933abb9..ddf30b1 100644 --- a/rust/bioscript-cli/src/commands.rs +++ b/rust/bioscript-cli/src/commands.rs @@ -180,3 +180,167 @@ where Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use bioscript_schema::ValidationReport; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn temp_dir(label: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("clock drift") + .as_nanos(); + let dir = std::env::temp_dir().join(format!( + "bioscript-cli-commands-{label}-{}-{nanos}", + std::process::id() + )); + fs::create_dir_all(&dir).unwrap(); + dir + } + + fn empty_report() -> ValidationReport { + ValidationReport { + files_scanned: 1, + reports: Vec::new(), + } + } + + #[test] + fn command_parsers_report_prepare_and_inspect_argument_errors() { + for (args, expected) in [ + (vec!["--root"], "--root requires a directory"), + (vec!["--input-file"], "--input-file requires a path"), + (vec!["--reference-file"], "--reference-file requires a path"), + (vec!["--input-format"], "--input-format requires a value"), + (vec!["--input-format", "bad"], "invalid --input-format"), + (vec!["--cache-dir"], "--cache-dir requires a path"), + (vec!["--unexpected"], "unexpected argument"), + ] { + let err = run_prepare(args.into_iter().map(str::to_owned).collect()).unwrap_err(); + assert!(err.contains(expected), "{err}"); + 
} + + for (args, expected) in [ + (Vec::<&str>::new(), "usage: bioscript inspect"), + (vec!["--input-index"], "--input-index requires a path"), + (vec!["--reference-file"], "--reference-file requires a path"), + ( + vec!["--reference-index"], + "--reference-index requires a path", + ), + (vec!["input.txt", "extra"], "unexpected argument"), + ] { + let err = run_inspect(args.into_iter().map(str::to_owned).collect()).unwrap_err(); + assert!(err.contains(expected), "{err}"); + } + } + + #[test] + fn validation_command_covers_report_success_and_error_paths() { + let dir = temp_dir("validation"); + let input = dir.join("input.yaml"); + fs::write(&input, "schema: bioscript:variant:1.0\n").unwrap(); + let report = dir.join("reports/report.txt"); + + run_validation_command( + vec![ + input.display().to_string(), + "--report".to_owned(), + report.display().to_string(), + ], + "usage", + |_| Ok(empty_report()), + ) + .unwrap(); + assert!( + fs::read_to_string(&report) + .unwrap() + .contains("files_scanned") + ); + + let err = + run_validation_command(Vec::new(), "usage text", |_| Ok(empty_report())).unwrap_err(); + assert_eq!(err, "usage text"); + + let err = + run_validation_command(vec!["--report".to_owned()], "usage", |_| Ok(empty_report())) + .unwrap_err(); + assert!(err.contains("--report requires a path")); + + let err = run_validation_command(vec!["one".to_owned(), "two".to_owned()], "usage", |_| { + Ok(empty_report()) + }) + .unwrap_err(); + assert!(err.contains("unexpected argument")); + + let err = run_validation_command(vec!["input".to_owned()], "usage", |_| { + Err("validator failed".to_owned()) + }) + .unwrap_err(); + assert_eq!(err, "validator failed"); + } + + #[test] + fn public_validation_and_inspect_commands_cover_successful_argument_branches() { + let dir = temp_dir("public-commands"); + let vcf = dir.join("sample.vcf"); + fs::write( + &vcf, + "##fileformat=VCFv4.3\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 
1\t10\trs10\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + let index = dir.join("sample.vcf.tbi"); + let reference = dir.join("ref.fa"); + let reference_index = dir.join("ref.fa.fai"); + fs::write(&index, b"index").unwrap(); + fs::write(&reference, b">chr1\nA\n").unwrap(); + fs::write(&reference_index, b"chr1\t1\t6\t1\t2\n").unwrap(); + + run_inspect(vec![ + vcf.display().to_string(), + "--input-index".to_owned(), + index.display().to_string(), + "--reference-file".to_owned(), + reference.display().to_string(), + "--reference-index".to_owned(), + reference_index.display().to_string(), + ]) + .unwrap(); + + let invalid_variant = dir.join("invalid-variant.yaml"); + fs::write(&invalid_variant, "schema: bioscript:variant:1.0\n").unwrap(); + let variant_report = dir.join("variant-report.txt"); + let err = run_validate_variants(vec![ + invalid_variant.display().to_string(), + "--report".to_owned(), + variant_report.display().to_string(), + ]) + .unwrap_err(); + assert!(err.contains("validation found")); + assert!( + fs::read_to_string(&variant_report) + .unwrap() + .contains("errors:") + ); + + let invalid_panel = dir.join("invalid-panel.yaml"); + fs::write(&invalid_panel, "schema: bioscript:panel:1.0\n").unwrap(); + let panel_report = dir.join("panel-report.txt"); + let err = run_validate_panels(vec![ + invalid_panel.display().to_string(), + "--report".to_owned(), + panel_report.display().to_string(), + ]) + .unwrap_err(); + assert!(err.contains("validation found")); + assert!( + fs::read_to_string(&panel_report) + .unwrap() + .contains("errors:") + ); + } +} diff --git a/rust/bioscript-ffi/Cargo.toml b/rust/bioscript-ffi/Cargo.toml index d82ca2b..95f4b93 100644 --- a/rust/bioscript-ffi/Cargo.toml +++ b/rust/bioscript-ffi/Cargo.toml @@ -10,6 +10,7 @@ crate-type = ["rlib", "staticlib", "cdylib"] bioscript-core = { path = "../bioscript-core" } bioscript-formats = { path = "../bioscript-formats" } bioscript-runtime = { path = "../bioscript-runtime" } +bioscript-schema = 
{ path = "../bioscript-schema" } jni = "0.21" monty = { path = "../../monty/crates/monty" } serde = { version = "1.0", features = ["derive"] } diff --git a/rust/bioscript-ffi/src/c_api.rs b/rust/bioscript-ffi/src/c_api.rs new file mode 100644 index 0000000..01c3b75 --- /dev/null +++ b/rust/bioscript-ffi/src/c_api.rs @@ -0,0 +1,83 @@ +use std::{ + ffi::{CStr, CString}, + os::raw::c_char, +}; + +use crate::{RunFileRequest, RunFileResult, run_file_request, types::FfiResult}; + +#[unsafe(no_mangle)] +/// Executes a bioscript run request encoded as a UTF-8 JSON C string. +/// +/// # Safety +/// +/// `request_json` must either be null or point to a valid, NUL-terminated C +/// string that remains alive for the duration of this call. +pub unsafe extern "C" fn bioscript_run_file_json(request_json: *const c_char) -> *mut c_char { + let response = unsafe { + if request_json.is_null() { + FfiResult:: { + ok: false, + value: None, + error: Some("request_json was null".to_owned()), + } + } else { + parse_and_run_request(request_json) + } + }; + + encode_response(&response) +} + +unsafe fn parse_and_run_request(request_json: *const c_char) -> FfiResult { + match unsafe { CStr::from_ptr(request_json) }.to_str() { + Ok(value) => match serde_json::from_str::(value) { + Ok(request) => match run_file_request(request) { + Ok(result) => FfiResult { + ok: true, + value: Some(result), + error: None, + }, + Err(error) => FfiResult:: { + ok: false, + value: None, + error: Some(error), + }, + }, + Err(error) => FfiResult:: { + ok: false, + value: None, + error: Some(format!("invalid request JSON: {error}")), + }, + }, + Err(error) => FfiResult:: { + ok: false, + value: None, + error: Some(format!("request_json was not valid UTF-8: {error}")), + }, + } +} + +fn encode_response(response: &FfiResult) -> *mut c_char { + match serde_json::to_string(response) { + Ok(json) => match CString::new(json) { + Ok(value) => value.into_raw(), + Err(_) => std::ptr::null_mut(), + }, + Err(_) => 
std::ptr::null_mut(), + } +} + +#[unsafe(no_mangle)] +/// Frees a string previously returned by [`bioscript_run_file_json`]. +/// +/// # Safety +/// +/// `ptr` must be null or a pointer returned by [`CString::into_raw`] from this +/// library, and it must not be freed more than once. +pub unsafe extern "C" fn bioscript_free_string(ptr: *mut c_char) { + if !ptr.is_null() { + unsafe { + let _ = CString::from_raw(ptr); + } + } +} diff --git a/rust/bioscript-ffi/src/lib.rs b/rust/bioscript-ffi/src/lib.rs index 75e0c0b..5263bf1 100644 --- a/rust/bioscript-ffi/src/lib.rs +++ b/rust/bioscript-ffi/src/lib.rs @@ -1,302 +1,16 @@ -use std::{ - env, - ffi::{CStr, CString}, - fmt::Write as _, - fs, - os::raw::c_char, - path::PathBuf, - time::{Duration, Instant}, +mod c_api; +mod limits; +mod run_file; +mod types; +mod variant_yaml; + +pub use c_api::{bioscript_free_string, bioscript_run_file_json}; +pub use run_file::run_file_request; +pub use types::{ + RunFileRequest, RunFileResult, RunVariantYamlRequest, RunVariantYamlResult, + VariantObservationResult, }; - -use bioscript_formats::{ - GenotypeLoadOptions, GenotypeSourceFormat, PrepareRequest, prepare_indexes, -}; -use bioscript_runtime::{BioscriptRuntime, RuntimeConfig, StageTiming}; -use monty::{MontyObject, ResourceLimits}; -use serde::{Deserialize, Serialize}; - -const DEFAULT_MAX_DURATION_MS: u64 = 100; -const DEFAULT_MAX_MEMORY_BYTES: usize = 8 * 1024 * 1024; -const DEFAULT_MAX_ALLOCATIONS: usize = 200_000; -const DEFAULT_MAX_RECURSION_DEPTH: usize = 200; -const HARD_MAX_DURATION_MS: u64 = 60_000; -const HARD_MAX_MEMORY_BYTES: usize = 256 * 1024 * 1024; -const HARD_MAX_ALLOCATIONS: usize = 10_000_000; -const HARD_MAX_RECURSION_DEPTH: usize = 10_000; - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct RunFileRequest { - pub script_path: String, - pub root: Option, - pub input_file: Option, - pub output_file: Option, - pub participant_id: Option, - pub trace_report_path: Option, - pub 
timing_report_path: Option, - pub input_format: Option, - pub input_index: Option, - pub reference_file: Option, - pub reference_index: Option, - pub allow_md5_mismatch: Option, - pub auto_index: Option, - pub cache_dir: Option, - pub max_duration_ms: Option, - pub max_memory_bytes: Option, - pub max_allocations: Option, - pub max_recursion_depth: Option, -} - -#[derive(Debug, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct RunFileResult { - pub ok: bool, -} - -#[derive(Debug, Serialize)] -#[serde(rename_all = "camelCase")] -struct FfiResult { - ok: bool, - #[serde(skip_serializing_if = "Option::is_none")] - value: Option, - #[serde(skip_serializing_if = "Option::is_none")] - error: Option, -} - -/// Runs a bioscript file request described by a JSON-compatible Rust struct. -/// -/// # Errors -/// -/// Returns an error string when request parsing, optional index preparation, -/// runtime construction, script execution, or report writing fails. -pub fn run_file_request(request: RunFileRequest) -> Result { - let script_path = PathBuf::from(&request.script_path); - let runtime_root = runtime_root(&request)?; - let mut loader = build_loader(&request)?; - let limits = build_limits(&request); - - let mut ffi_timings: Vec = Vec::new(); - if request.auto_index.unwrap_or(false) { - let auto_index_started = Instant::now(); - let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; - let effective_cache = request.cache_dir.as_ref().map_or_else( - || runtime_root.join(".bioscript-cache"), - |dir| { - let path = PathBuf::from(dir); - if path.is_absolute() { - path - } else { - runtime_root.join(path) - } - }, - ); - let prepare_request = PrepareRequest { - root: runtime_root.clone(), - cwd: cwd.clone(), - cache_dir: effective_cache, - input_file: request.input_file.clone(), - input_format: loader.format, - reference_file: loader - .reference_file - .as_ref() - .map(|p| p.to_string_lossy().to_string()), - }; - let prepared = 
prepare_indexes(&prepare_request)?; - if let Some(idx) = prepared.input_index - && loader.input_index.is_none() - { - loader.input_index = Some(idx); - } - if let Some(ref_file) = prepared.reference_file { - loader.reference_file = Some(ref_file); - } - if let Some(ref_idx) = prepared.reference_index - && loader.reference_index.is_none() - { - loader.reference_index = Some(ref_idx); - } - ffi_timings.push(StageTiming { - stage: "auto_index".to_owned(), - duration_ms: auto_index_started.elapsed().as_millis(), - detail: "prepare_indexes".to_owned(), - }); - } - - let runtime = BioscriptRuntime::with_config(runtime_root, RuntimeConfig { limits, loader }) - .map_err(|err| err.to_string())?; - - let mut inputs = Vec::new(); - if let Some(input_file) = request.input_file { - inputs.push(("input_file", MontyObject::String(input_file))); - } - if let Some(output_file) = request.output_file { - inputs.push(("output_file", MontyObject::String(output_file))); - } - if let Some(participant_id) = request.participant_id { - inputs.push(("participant_id", MontyObject::String(participant_id))); - } - - runtime - .run_file( - &script_path, - request - .trace_report_path - .as_deref() - .map(std::path::Path::new), - inputs, - ) - .map_err(|err| err.to_string())?; - - if let Some(timing_path) = request.timing_report_path { - let mut all_timings = ffi_timings; - all_timings.extend(runtime.timing_snapshot()); - write_timing_report(&PathBuf::from(timing_path), &all_timings)?; - } - - Ok(RunFileResult { ok: true }) -} - -fn runtime_root(request: &RunFileRequest) -> Result { - match request.root.as_deref() { - Some(dir) => Ok(PathBuf::from(dir)), - None => env::current_dir().map_err(|err| format!("failed to get current directory: {err}")), - } -} - -fn build_loader(request: &RunFileRequest) -> Result { - let mut loader = GenotypeLoadOptions::default(); - if let Some(value) = request.input_format.as_deref() { - if value.eq_ignore_ascii_case("auto") { - loader.format = None; - } else { - 
let parsed = value - .parse::() - .map_err(|err| format!("invalid inputFormat value {value}: {err}"))?; - loader.format = Some(parsed); - } - } - loader.input_index = request.input_index.clone().map(PathBuf::from); - loader.reference_file = request.reference_file.clone().map(PathBuf::from); - loader.reference_index = request.reference_index.clone().map(PathBuf::from); - loader.allow_reference_md5_mismatch = request.allow_md5_mismatch.unwrap_or(false); - Ok(loader) -} - -fn build_limits(request: &RunFileRequest) -> ResourceLimits { - let mut limits = ResourceLimits::new() - .max_duration(Duration::from_millis(DEFAULT_MAX_DURATION_MS)) - .max_memory(DEFAULT_MAX_MEMORY_BYTES) - .max_allocations(DEFAULT_MAX_ALLOCATIONS) - .gc_interval(1000) - .max_recursion_depth(Some(DEFAULT_MAX_RECURSION_DEPTH)); - - if let Some(value) = request.max_duration_ms { - limits = limits.max_duration(Duration::from_millis(value.min(HARD_MAX_DURATION_MS))); - } - if let Some(value) = request.max_memory_bytes { - limits = limits.max_memory(value.min(HARD_MAX_MEMORY_BYTES)); - } - if let Some(value) = request.max_allocations { - limits = limits.max_allocations(value.min(HARD_MAX_ALLOCATIONS)); - } - if let Some(value) = request.max_recursion_depth { - limits = limits.max_recursion_depth(Some(value.min(HARD_MAX_RECURSION_DEPTH))); - } - - limits -} - -fn write_timing_report(path: &PathBuf, timings: &[StageTiming]) -> Result<(), String> { - if let Some(parent) = path.parent() { - fs::create_dir_all(parent).map_err(|err| { - format!( - "failed to create timing report dir {}: {err}", - parent.display() - ) - })?; - } - let mut output = String::from("stage\tduration_ms\tdetail\n"); - for timing in timings { - let _ = writeln!( - output, - "{}\t{}\t{}\n", - timing.stage, - timing.duration_ms, - timing.detail.replace('\t', " ") - ); - } - fs::write(path, output) - .map_err(|err| format!("failed to write timing report {}: {err}", path.display())) -} - -#[unsafe(no_mangle)] -/// Executes a bioscript 
run request encoded as a UTF-8 JSON C string. -/// -/// # Safety -/// -/// `request_json` must either be null or point to a valid, NUL-terminated C -/// string that remains alive for the duration of this call. -pub unsafe extern "C" fn bioscript_run_file_json(request_json: *const c_char) -> *mut c_char { - let response = unsafe { - if request_json.is_null() { - FfiResult:: { - ok: false, - value: None, - error: Some("request_json was null".to_owned()), - } - } else { - match CStr::from_ptr(request_json).to_str() { - Ok(value) => match serde_json::from_str::(value) { - Ok(request) => match run_file_request(request) { - Ok(result) => FfiResult { - ok: true, - value: Some(result), - error: None, - }, - Err(error) => FfiResult:: { - ok: false, - value: None, - error: Some(error), - }, - }, - Err(error) => FfiResult:: { - ok: false, - value: None, - error: Some(format!("invalid request JSON: {error}")), - }, - }, - Err(error) => FfiResult:: { - ok: false, - value: None, - error: Some(format!("request_json was not valid UTF-8: {error}")), - }, - } - } - }; - - match serde_json::to_string(&response) { - Ok(json) => match CString::new(json) { - Ok(value) => value.into_raw(), - Err(_) => std::ptr::null_mut(), - }, - Err(_) => std::ptr::null_mut(), - } -} - -#[unsafe(no_mangle)] -/// Frees a string previously returned by [`bioscript_run_file_json`]. -/// -/// # Safety -/// -/// `ptr` must be null or a pointer returned by [`CString::into_raw`] from this -/// library, and it must not be freed more than once. 
-pub unsafe extern "C" fn bioscript_free_string(ptr: *mut c_char) { - if !ptr.is_null() { - unsafe { - let _ = CString::from_raw(ptr); - } - } -} +pub use variant_yaml::run_variant_yaml_request; #[cfg(target_os = "android")] pub mod android { @@ -342,44 +56,3 @@ pub mod android { .expect("jni new_string should succeed") } } - -#[cfg(test)] -mod tests { - use super::*; - - fn request_with_limits() -> RunFileRequest { - RunFileRequest { - script_path: "script.py".to_owned(), - root: None, - input_file: None, - output_file: None, - participant_id: None, - trace_report_path: None, - timing_report_path: None, - input_format: None, - input_index: None, - reference_file: None, - reference_index: None, - allow_md5_mismatch: None, - auto_index: None, - cache_dir: None, - max_duration_ms: Some(u64::MAX), - max_memory_bytes: Some(usize::MAX), - max_allocations: Some(usize::MAX), - max_recursion_depth: Some(usize::MAX), - } - } - - #[test] - fn ffi_resource_limits_are_clamped_to_hard_ceilings() { - let limits = build_limits(&request_with_limits()); - - assert_eq!( - limits.max_duration, - Some(Duration::from_millis(HARD_MAX_DURATION_MS)) - ); - assert_eq!(limits.max_memory, Some(HARD_MAX_MEMORY_BYTES)); - assert_eq!(limits.max_allocations, Some(HARD_MAX_ALLOCATIONS)); - assert_eq!(limits.max_recursion_depth, Some(HARD_MAX_RECURSION_DEPTH)); - } -} diff --git a/rust/bioscript-ffi/src/limits.rs b/rust/bioscript-ffi/src/limits.rs new file mode 100644 index 0000000..84ede1c --- /dev/null +++ b/rust/bioscript-ffi/src/limits.rs @@ -0,0 +1,82 @@ +use std::time::Duration; + +use monty::ResourceLimits; + +use crate::RunFileRequest; + +pub(crate) const DEFAULT_MAX_DURATION_MS: u64 = 100; +pub(crate) const DEFAULT_MAX_MEMORY_BYTES: usize = 8 * 1024 * 1024; +pub(crate) const DEFAULT_MAX_ALLOCATIONS: usize = 200_000; +pub(crate) const DEFAULT_MAX_RECURSION_DEPTH: usize = 200; +pub(crate) const HARD_MAX_DURATION_MS: u64 = 60_000; +pub(crate) const HARD_MAX_MEMORY_BYTES: usize = 256 * 1024 * 
1024; +pub(crate) const HARD_MAX_ALLOCATIONS: usize = 10_000_000; +pub(crate) const HARD_MAX_RECURSION_DEPTH: usize = 10_000; + +pub(crate) fn build_limits(request: &RunFileRequest) -> ResourceLimits { + let mut limits = ResourceLimits::new() + .max_duration(Duration::from_millis(DEFAULT_MAX_DURATION_MS)) + .max_memory(DEFAULT_MAX_MEMORY_BYTES) + .max_allocations(DEFAULT_MAX_ALLOCATIONS) + .gc_interval(1000) + .max_recursion_depth(Some(DEFAULT_MAX_RECURSION_DEPTH)); + + if let Some(value) = request.max_duration_ms { + limits = limits.max_duration(Duration::from_millis(value.min(HARD_MAX_DURATION_MS))); + } + if let Some(value) = request.max_memory_bytes { + limits = limits.max_memory(value.min(HARD_MAX_MEMORY_BYTES)); + } + if let Some(value) = request.max_allocations { + limits = limits.max_allocations(value.min(HARD_MAX_ALLOCATIONS)); + } + if let Some(value) = request.max_recursion_depth { + limits = limits.max_recursion_depth(Some(value.min(HARD_MAX_RECURSION_DEPTH))); + } + + limits +} + +#[cfg(test)] +mod tests { + use super::*; + + fn request_with_limits() -> RunFileRequest { + RunFileRequest { + script_path: "script.py".to_owned(), + script_contents: None, + root: None, + input_file: None, + input_contents: None, + output_file: None, + file_contents: None, + participant_id: None, + trace_report_path: None, + timing_report_path: None, + input_format: None, + input_index: None, + reference_file: None, + reference_index: None, + allow_md5_mismatch: None, + auto_index: None, + cache_dir: None, + max_duration_ms: Some(u64::MAX), + max_memory_bytes: Some(usize::MAX), + max_allocations: Some(usize::MAX), + max_recursion_depth: Some(usize::MAX), + } + } + + #[test] + fn ffi_resource_limits_are_clamped_to_hard_ceilings() { + let limits = build_limits(&request_with_limits()); + + assert_eq!( + limits.max_duration, + Some(Duration::from_millis(HARD_MAX_DURATION_MS)) + ); + assert_eq!(limits.max_memory, Some(HARD_MAX_MEMORY_BYTES)); + assert_eq!(limits.max_allocations, 
Some(HARD_MAX_ALLOCATIONS)); + assert_eq!(limits.max_recursion_depth, Some(HARD_MAX_RECURSION_DEPTH)); + } +} diff --git a/rust/bioscript-ffi/src/run_file.rs b/rust/bioscript-ffi/src/run_file.rs new file mode 100644 index 0000000..fe32e61 --- /dev/null +++ b/rust/bioscript-ffi/src/run_file.rs @@ -0,0 +1,217 @@ +use std::{ + env, + fmt::Write as _, + fs, + path::{Path, PathBuf}, + time::Instant, +}; + +use bioscript_formats::{ + GenotypeLoadOptions, GenotypeSourceFormat, PrepareRequest, prepare_indexes, +}; +use bioscript_runtime::{BioscriptRuntime, RuntimeConfig, StageTiming}; +use monty::MontyObject; + +use crate::{ + limits::build_limits, + types::{RunFileRequest, RunFileResult}, +}; + +/// Runs a bioscript file request described by a JSON-compatible Rust struct. +/// +/// # Errors +/// +/// Returns an error string when request parsing, optional index preparation, +/// runtime construction, script execution, or report writing fails. +pub fn run_file_request(request: RunFileRequest) -> Result { + let runtime_root = runtime_root(&request)?; + let mut loader = build_loader(&request)?; + let limits = build_limits(&request); + materialize_inline_files(&request, &runtime_root)?; + let script_path = materialize_script(&request, &runtime_root)?; + + let mut ffi_timings: Vec = Vec::new(); + if request.auto_index.unwrap_or(false) { + run_auto_index(&request, &runtime_root, &mut loader, &mut ffi_timings)?; + } + + let runtime = BioscriptRuntime::with_config(runtime_root, RuntimeConfig { limits, loader }) + .map_err(|err| err.to_string())?; + let inputs = runtime_inputs(&request); + + runtime + .run_file( + &script_path, + request + .trace_report_path + .as_deref() + .map(std::path::Path::new), + inputs, + ) + .map_err(|err| err.to_string())?; + + if let Some(timing_path) = request.timing_report_path { + let mut all_timings = ffi_timings; + all_timings.extend(runtime.timing_snapshot()); + write_timing_report(&PathBuf::from(timing_path), &all_timings)?; + } + + 
Ok(RunFileResult { ok: true }) +} + +fn runtime_root(request: &RunFileRequest) -> Result { + match request.root.as_deref() { + Some(dir) => Ok(PathBuf::from(dir)), + None => env::current_dir().map_err(|err| format!("failed to get current directory: {err}")), + } +} + +fn materialize_inline_files(request: &RunFileRequest, runtime_root: &Path) -> Result<(), String> { + if let Some(files) = &request.file_contents { + for (path, contents) in files { + write_runtime_file(runtime_root, path, contents)?; + } + } + if let (Some(input_file), Some(contents)) = (&request.input_file, &request.input_contents) { + write_runtime_file(runtime_root, input_file, contents)?; + } + Ok(()) +} + +fn materialize_script(request: &RunFileRequest, runtime_root: &Path) -> Result { + if let Some(contents) = &request.script_contents { + write_runtime_file(runtime_root, &request.script_path, contents)?; + } + Ok(resolve_runtime_path(runtime_root, &request.script_path)) +} + +fn write_runtime_file(runtime_root: &Path, path: &str, contents: &str) -> Result<(), String> { + let target = resolve_runtime_path(runtime_root, path); + if let Some(parent) = target.parent() { + fs::create_dir_all(parent).map_err(|err| { + format!( + "failed to create runtime file dir {}: {err}", + parent.display() + ) + })?; + } + fs::write(&target, contents) + .map_err(|err| format!("failed to write runtime file {}: {err}", target.display())) +} + +fn resolve_runtime_path(runtime_root: &Path, path: &str) -> PathBuf { + let candidate = PathBuf::from(path); + if candidate.is_absolute() { + candidate + } else { + runtime_root.join(candidate) + } +} + +fn run_auto_index( + request: &RunFileRequest, + runtime_root: &Path, + loader: &mut GenotypeLoadOptions, + ffi_timings: &mut Vec, +) -> Result<(), String> { + let auto_index_started = Instant::now(); + let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; + let effective_cache = request.cache_dir.as_ref().map_or_else( + || 
runtime_root.join(".bioscript-cache"), + |dir| { + let path = PathBuf::from(dir); + if path.is_absolute() { + path + } else { + runtime_root.join(path) + } + }, + ); + let prepare_request = PrepareRequest { + root: runtime_root.to_path_buf(), + cwd, + cache_dir: effective_cache, + input_file: request.input_file.clone(), + input_format: loader.format, + reference_file: loader + .reference_file + .as_ref() + .map(|p| p.to_string_lossy().to_string()), + }; + let prepared = prepare_indexes(&prepare_request)?; + if let Some(idx) = prepared.input_index + && loader.input_index.is_none() + { + loader.input_index = Some(idx); + } + if let Some(ref_file) = prepared.reference_file { + loader.reference_file = Some(ref_file); + } + if let Some(ref_idx) = prepared.reference_index + && loader.reference_index.is_none() + { + loader.reference_index = Some(ref_idx); + } + ffi_timings.push(StageTiming { + stage: "auto_index".to_owned(), + duration_ms: auto_index_started.elapsed().as_millis(), + detail: "prepare_indexes".to_owned(), + }); + Ok(()) +} + +fn runtime_inputs(request: &RunFileRequest) -> Vec<(&'static str, MontyObject)> { + let mut inputs = Vec::new(); + if let Some(input_file) = request.input_file.clone() { + inputs.push(("input_file", MontyObject::String(input_file))); + } + if let Some(output_file) = request.output_file.clone() { + inputs.push(("output_file", MontyObject::String(output_file))); + } + if let Some(participant_id) = request.participant_id.clone() { + inputs.push(("participant_id", MontyObject::String(participant_id))); + } + inputs +} + +pub(crate) fn build_loader(request: &RunFileRequest) -> Result { + let mut loader = GenotypeLoadOptions::default(); + if let Some(value) = request.input_format.as_deref() { + if value.eq_ignore_ascii_case("auto") { + loader.format = None; + } else { + let parsed = value + .parse::() + .map_err(|err| format!("invalid inputFormat value {value}: {err}"))?; + loader.format = Some(parsed); + } + } + loader.input_index = 
request.input_index.clone().map(PathBuf::from); + loader.reference_file = request.reference_file.clone().map(PathBuf::from); + loader.reference_index = request.reference_index.clone().map(PathBuf::from); + loader.allow_reference_md5_mismatch = request.allow_md5_mismatch.unwrap_or(false); + Ok(loader) +} + +fn write_timing_report(path: &PathBuf, timings: &[StageTiming]) -> Result<(), String> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).map_err(|err| { + format!( + "failed to create timing report dir {}: {err}", + parent.display() + ) + })?; + } + let mut output = String::from("stage\tduration_ms\tdetail\n"); + for timing in timings { + let _ = writeln!( + output, + "{}\t{}\t{}\n", + timing.stage, + timing.duration_ms, + timing.detail.replace('\t', " ") + ); + } + fs::write(path, output) + .map_err(|err| format!("failed to write timing report {}: {err}", path.display())) +} diff --git a/rust/bioscript-ffi/src/types.rs b/rust/bioscript-ffi/src/types.rs new file mode 100644 index 0000000..86c6e03 --- /dev/null +++ b/rust/bioscript-ffi/src/types.rs @@ -0,0 +1,119 @@ +use bioscript_core::{VariantObservation, VariantSpec}; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RunFileRequest { + pub script_path: String, + pub script_contents: Option, + pub root: Option, + pub input_file: Option, + pub input_contents: Option, + pub output_file: Option, + pub file_contents: Option>, + pub participant_id: Option, + pub trace_report_path: Option, + pub timing_report_path: Option, + pub input_format: Option, + pub input_index: Option, + pub reference_file: Option, + pub reference_index: Option, + pub allow_md5_mismatch: Option, + pub auto_index: Option, + pub cache_dir: Option, + pub max_duration_ms: Option, + pub max_memory_bytes: Option, + pub max_allocations: Option, + pub max_recursion_depth: Option, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct 
RunFileResult { + pub ok: bool, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RunVariantYamlRequest { + pub yaml_path: String, + pub genome_path: String, + pub input_format: Option, + pub input_index: Option, + pub reference_file: Option, + pub reference_index: Option, + pub allow_md5_mismatch: Option, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct RunVariantYamlResult { + pub observations: Vec, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct VariantObservationResult { + pub name: String, + pub backend: String, + #[serde(rename = "ref", skip_serializing_if = "Option::is_none")] + pub reference: Option, + #[serde(rename = "alt", skip_serializing_if = "Option::is_none")] + pub alternate: Option, + #[serde(rename = "matchedRsid", skip_serializing_if = "Option::is_none")] + pub matched_rsid: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub assembly: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub genotype: Option, + #[serde(rename = "refCount", skip_serializing_if = "Option::is_none")] + pub ref_count: Option, + #[serde(rename = "altCount", skip_serializing_if = "Option::is_none")] + pub alt_count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub depth: Option, + #[serde(rename = "rawCounts")] + pub raw_counts: std::collections::BTreeMap, + #[serde(skip_serializing_if = "Option::is_none")] + pub decision: Option, + pub evidence: Vec, +} + +#[derive(Debug, Clone)] +pub(crate) struct NamedVariantSpec { + pub(crate) name: String, + pub(crate) spec: VariantSpec, +} + +pub(crate) fn observation_result( + variant: NamedVariantSpec, + observation: VariantObservation, +) -> VariantObservationResult { + VariantObservationResult { + name: variant.name, + backend: observation.backend, + reference: variant.spec.reference, + alternate: variant.spec.alternate, + matched_rsid: observation.matched_rsid, + assembly: observation + 
.assembly + .map(super::variant_yaml::assembly_label), + genotype: observation.genotype, + ref_count: observation.ref_count, + alt_count: observation.alt_count, + depth: observation.depth, + raw_counts: observation.raw_counts, + decision: observation.decision, + evidence: observation.evidence, + } +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "camelCase")] +pub(crate) struct FfiResult { + pub(crate) ok: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) value: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) error: Option, +} diff --git a/rust/bioscript-ffi/src/variant_yaml.rs b/rust/bioscript-ffi/src/variant_yaml.rs new file mode 100644 index 0000000..69c2547 --- /dev/null +++ b/rust/bioscript-ffi/src/variant_yaml.rs @@ -0,0 +1,221 @@ +use std::{fs, path::PathBuf}; + +use bioscript_core::{Assembly, VariantKind, VariantSpec}; +use bioscript_formats::{GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore}; +use bioscript_schema::load_variant_manifest_text_for_lookup; + +use crate::types::{ + NamedVariantSpec, RunVariantYamlRequest, RunVariantYamlResult, observation_result, +}; + +/// Runs a BioScript variant YAML assay against a supported genome file. +/// +/// The native desktop/mobile path uses this instead of the web WASM exports. +/// It intentionally mirrors the web variant YAML flow: compile YAML through +/// `bioscript-schema`, choose the preferred assembly-specific variant, and +/// execute lookup through `bioscript-formats`. 
+pub fn run_variant_yaml_request( + request: RunVariantYamlRequest, +) -> Result { + let yaml_path = PathBuf::from(&request.yaml_path); + let yaml_text = fs::read_to_string(&yaml_path) + .map_err(|err| format!("failed to read YAML assay {}: {err}", yaml_path.display()))?; + let variants = compile_variant_yaml_named( + yaml_path + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or("variant.yaml"), + &yaml_text, + )?; + let selected = select_preferred_assembly_variants(&request.genome_path, variants); + let loader = variant_loader(&request)?; + + let genome_path = PathBuf::from(&request.genome_path); + let store = GenotypeStore::from_file_with_options(&genome_path, &loader) + .map_err(|err| format!("failed to load genome {}: {err}", genome_path.display()))?; + let specs = selected + .iter() + .map(|variant| variant.spec.clone()) + .collect::>(); + let observations = store + .lookup_variants(&specs) + .map_err(|err| format!("variant lookup failed: {err}"))?; + + Ok(RunVariantYamlResult { + observations: selected + .into_iter() + .zip(observations) + .map(|(variant, observation)| observation_result(variant, observation)) + .collect(), + }) +} + +fn variant_loader(request: &RunVariantYamlRequest) -> Result { + let mut loader = GenotypeLoadOptions::default(); + if let Some(value) = request.input_format.as_deref() { + if value.eq_ignore_ascii_case("auto") { + loader.format = None; + } else { + loader.format = Some( + value + .parse::() + .map_err(|err| format!("invalid inputFormat value {value}: {err}"))?, + ); + } + } + loader.input_index = request.input_index.clone().map(PathBuf::from); + loader.reference_file = request.reference_file.clone().map(PathBuf::from); + loader.reference_index = request.reference_index.clone().map(PathBuf::from); + loader.allow_reference_md5_mismatch = request.allow_md5_mismatch.unwrap_or(false); + Ok(loader) +} + +fn compile_variant_yaml_named(name: &str, text: &str) -> Result, String> { + let manifest = 
load_variant_manifest_text_for_lookup(name, text) + .map_err(|err| format!("compile variant YAML failed: {err}"))?; + let mut out = Vec::new(); + if let Some(locus) = manifest.spec.grch38.clone() { + let mut spec = manifest.spec.clone(); + spec.grch37 = None; + spec.grch38 = Some(locus); + out.push(NamedVariantSpec { + name: manifest.name.clone(), + spec, + }); + } + if let Some(locus) = manifest.spec.grch37.clone() { + let mut spec = manifest.spec; + spec.grch37 = Some(locus); + spec.grch38 = None; + out.push(NamedVariantSpec { + name: if out.is_empty() { + manifest.name.clone() + } else { + format!("{}_grch37", manifest.name) + }, + spec, + }); + } + if out.is_empty() { + return Err(format!("variant {} has no coordinates", manifest.name)); + } + Ok(out) +} + +fn select_preferred_assembly_variants( + genome_name: &str, + variants: Vec, +) -> Vec { + let target = infer_genome_assembly(genome_name).unwrap_or(Assembly::Grch38); + let mut groups: Vec<(String, Vec)> = Vec::new(); + for variant in variants { + let key = variant_group_key(&variant); + if let Some((_, values)) = groups.iter_mut().find(|(candidate, _)| candidate == &key) { + values.push(variant); + } else { + groups.push((key, vec![variant])); + } + } + + groups + .into_iter() + .flat_map(|(_, group)| select_preferred_group(group, target)) + .collect() +} + +fn select_preferred_group(group: Vec, target: Assembly) -> Vec { + let has_multiple_assemblies = group + .iter() + .filter_map(|variant| variant_assembly(&variant.spec)) + .fold(Vec::new(), |mut assemblies, assembly| { + if !assemblies.contains(&assembly) { + assemblies.push(assembly); + } + assemblies + }) + .len() + > 1; + if group.len() <= 1 || !has_multiple_assemblies { + return group; + } + let fallback = group[0].clone(); + vec![ + group + .iter() + .find(|variant| variant_assembly(&variant.spec) == Some(target)) + .or_else(|| { + group + .iter() + .find(|variant| variant_assembly(&variant.spec) == Some(Assembly::Grch38)) + }) + .cloned() + 
.unwrap_or(fallback), + ] +} + +fn infer_genome_assembly(name: &str) -> Option { + let lower = name.to_ascii_lowercase(); + if lower.contains("grch38") || lower.contains("hg38") { + return Some(Assembly::Grch38); + } + if lower.contains("grch37") || lower.contains("hg19") { + return Some(Assembly::Grch37); + } + None +} + +fn variant_group_key(variant: &NamedVariantSpec) -> String { + if let Some(rsid) = variant.spec.rsids.first() { + return format!( + "{}|{}|{}|{}", + rsid.to_ascii_lowercase(), + variant + .spec + .reference + .as_deref() + .unwrap_or("") + .to_ascii_uppercase(), + variant + .spec + .alternate + .as_deref() + .unwrap_or("") + .to_ascii_uppercase(), + variant_kind_label(variant.spec.kind) + ); + } + variant + .name + .trim_end_matches("_grch37") + .trim_end_matches("_grch38") + .to_ascii_lowercase() +} + +fn variant_assembly(spec: &VariantSpec) -> Option { + if spec.grch37.is_some() { + return Some(Assembly::Grch37); + } + if spec.grch38.is_some() { + return Some(Assembly::Grch38); + } + None +} + +pub(crate) fn assembly_label(assembly: Assembly) -> String { + match assembly { + Assembly::Grch37 => "grch37", + Assembly::Grch38 => "grch38", + } + .to_owned() +} + +fn variant_kind_label(kind: Option) -> &'static str { + match kind { + Some(VariantKind::Snp) => "snv", + Some(VariantKind::Insertion) => "insertion", + Some(VariantKind::Deletion) => "deletion", + Some(VariantKind::Indel) => "indel", + Some(VariantKind::Other) => "other", + None => "", + } +} diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 4c98424..754545c 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -1,138 +1,70 @@ use std::{ - collections::{BTreeMap, BTreeSet, HashMap}, - fmt::Write as _, + collections::HashMap, fs::File, - io::{BufRead, BufReader, Cursor, Read, Seek}, - path::{Path, PathBuf}, - str::FromStr, + io::{BufReader, Cursor}, + path::Path, }; -use noodles::bgzf; -use 
noodles::core::{Position, Region}; -use noodles::cram; -use noodles::csi::{self, BinningIndex}; -use noodles::sam::alignment::{ - Record as _, - record::{Cigar as _, QualityScores as _, Sequence as _, cigar::op::Kind as CigarOpKind}, -}; -use noodles::tabix; use zip::ZipArchive; -use bioscript_core::{ - Assembly, GenomicLocus, RuntimeError, VariantKind, VariantObservation, VariantSpec, -}; - -use crate::alignment::{self, AlignmentOpKind, AlignmentRecord}; - -const COMMENT_PREFIXES: [&str; 2] = ["#", "//"]; -const DEFAULT_MPILEUP_MIN_BASE_QUALITY: u8 = 13; -const DEFAULT_MPILEUP_MIN_MAPPING_QUALITY: u8 = 0; -const MAX_ZIP_ENTRY_BYTES: u64 = 128 * 1024 * 1024; - -const RSID_ALIASES: &[&str] = &["rsid", "name", "snp", "marker", "id", "snpid"]; -const CHROM_ALIASES: &[&str] = &["chromosome", "chr", "chrom"]; -const POSITION_ALIASES: &[&str] = &[ - "position", - "pos", - "coordinate", - "basepairposition", - "basepair", -]; -const GENOTYPE_ALIASES: &[&str] = &[ - "genotype", - "gt", - "result", - "results", - "result1", - "call", - "calls", - "yourcode", - "code", - "genotypevalue", - "variation", -]; -const ALLELE1_ALIASES: &[&str] = &["allele1", "allelea", "allele_a", "allele1top"]; -const ALLELE2_ALIASES: &[&str] = &["allele2", "alleleb", "allele_b", "allele2top"]; - -#[derive(Debug, Clone)] -pub struct GenotypeStore { - backend: QueryBackend, -} - -#[derive(Debug, Clone)] -enum QueryBackend { - RsidMap(RsidMapBackend), - Delimited(DelimitedBackend), - Vcf(VcfBackend), - Cram(CramBackend), -} - -#[derive(Debug, Clone)] -struct RsidMapBackend { - format: GenotypeSourceFormat, - values: HashMap, -} - -#[derive(Debug, Clone)] -struct DelimitedBackend { - format: GenotypeSourceFormat, - path: PathBuf, - zip_entry_name: Option, -} - -#[derive(Debug, Clone)] -struct VcfBackend { - path: PathBuf, -} - -#[derive(Debug, Clone)] -struct CramBackend { - path: PathBuf, - options: GenotypeLoadOptions, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum QueryKind { - 
GenotypeByRsid, - GenotypeByLocus, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct BackendCapabilities { - pub rsid_lookup: bool, - pub locus_lookup: bool, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum GenotypeSourceFormat { - Text, - Zip, - Vcf, - Cram, -} +#[cfg(test)] +use bioscript_core::{Assembly, GenomicLocus, VariantKind}; +use bioscript_core::{RuntimeError, VariantObservation, VariantSpec}; -impl FromStr for GenotypeSourceFormat { - type Err = String; +mod common; +mod cram_backend; +mod delimited; +mod io; +mod types; +mod vcf; +mod vcf_tokens; - fn from_str(value: &str) -> Result { - match value.trim().to_ascii_lowercase().as_str() { - "txt" | "text" | "genotype" => Ok(Self::Text), - "zip" => Ok(Self::Zip), - "vcf" => Ok(Self::Vcf), - "cram" => Ok(Self::Cram), - other => Err(format!("unsupported input format: {other}")), - } - } -} +#[cfg(test)] +use common::chrom_sort_key; +pub(crate) use common::{describe_query, normalize_genotype, variant_sort_key}; +#[cfg(test)] +use cram_backend::{ + SnpPileupCounts, anchor_window, choose_variant_locus, classify_expected_indel, + describe_copy_number_decision_rule, describe_locus, describe_snp_decision_rule, + detect_reference_assembly, first_base, indel_at_anchor, infer_copy_number_genotype, + infer_snp_genotype, len_as_i64, normalize_pileup_base, record_overlaps_locus, spans_position, +}; +pub use cram_backend::{observe_cram_indel_with_reader, observe_cram_snp_with_reader}; +#[cfg(test)] +use delimited::{ + Delimiter, GENOTYPE_ALIASES, parse_streaming_row, split_csv_line, strip_bom, + strip_inline_comment, +}; +use delimited::{RowParser, detect_delimiter, scan_delimited_variants}; +#[cfg(test)] +use delimited::{ + build_column_indexes, default_column_indexes, find_header_index, looks_like_header_fields, + normalize_name, +}; +#[cfg(test)] +use io::looks_like_vcf_lines; +use io::{ + detect_source_format, is_bgzf_path, read_lines_from_reader, read_zip_entry_limited, + select_zip_entry, 
+}; +pub use types::{ + BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind, +}; +use types::{CramBackend, DelimitedBackend, QueryBackend, RsidMapBackend, VcfBackend}; +pub use vcf::observe_vcf_snp_with_reader; +#[cfg(test)] +use vcf::{ + choose_variant_locus_for_assembly, detect_vcf_assembly, extract_vcf_sample_genotype, + normalize_chromosome_name, parse_vcf_record, vcf_row_matches_variant, +}; +use vcf::{lookup_indexed_vcf_variants, scan_vcf_variants}; +use vcf_tokens::genotype_from_vcf_gt; +#[cfg(test)] +use vcf_tokens::{ + is_symbolic_vcf_alt, normalize_sequence_token, vcf_alt_token, vcf_reference_token, +}; -#[derive(Debug, Clone, Default)] -pub struct GenotypeLoadOptions { - pub format: Option, - pub input_index: Option, - pub reference_file: Option, - pub reference_index: Option, - pub allow_reference_md5_mismatch: bool, -} +const MAX_ZIP_ENTRY_BYTES: u64 = 128 * 1024 * 1024; impl GenotypeStore { pub fn from_file(path: &Path) -> Result { @@ -150,7 +82,7 @@ impl GenotypeStore { None, )), GenotypeSourceFormat::Zip => Self::from_zip_file(path), - GenotypeSourceFormat::Vcf => Ok(Self::from_vcf_file(path)), + GenotypeSourceFormat::Vcf => Ok(Self::from_vcf_file(path, options)), GenotypeSourceFormat::Cram => Self::from_cram_file(path, options), } } @@ -215,10 +147,11 @@ impl GenotypeStore { Self::from_delimited_lines(GenotypeSourceFormat::Zip, lines) } - fn from_vcf_file(path: &Path) -> Self { + fn from_vcf_file(path: &Path, options: &GenotypeLoadOptions) -> Self { Self { backend: QueryBackend::Vcf(VcfBackend { path: path.to_path_buf(), + options: options.clone(), }), } } @@ -298,2370 +231,215 @@ impl GenotypeStore { values.insert(rsid.to_owned(), genotype); } } - - Ok(Self::from_rsid_map(GenotypeSourceFormat::Vcf, values)) - } - - fn from_delimited_lines( - format: GenotypeSourceFormat, - lines: Vec, - ) -> Result { - let delimiter = detect_delimiter(&lines); - let mut parser = RowParser::new(delimiter); - let mut values = 
HashMap::new(); - for line in lines { - if let Some((rsid, genotype)) = parser.consume_line(&line)? { - values.insert(rsid, genotype); - } - } - Ok(Self::from_rsid_map(format, values)) - } - - fn from_rsid_map(format: GenotypeSourceFormat, values: HashMap) -> Self { - Self { - backend: QueryBackend::RsidMap(RsidMapBackend { format, values }), - } - } - - fn from_delimited_file( - path: &Path, - format: GenotypeSourceFormat, - zip_entry_name: Option, - ) -> Self { - Self { - backend: QueryBackend::Delimited(DelimitedBackend { - format, - path: path.to_path_buf(), - zip_entry_name, - }), - } - } - - pub fn capabilities(&self) -> BackendCapabilities { - match &self.backend { - QueryBackend::RsidMap(_) => BackendCapabilities { - rsid_lookup: true, - locus_lookup: false, - }, - QueryBackend::Delimited(_) | QueryBackend::Vcf(_) => BackendCapabilities { - rsid_lookup: true, - locus_lookup: true, - }, - QueryBackend::Cram(_) => BackendCapabilities { - rsid_lookup: false, - locus_lookup: true, - }, - } - } - - pub fn supports(&self, query: QueryKind) -> bool { - let caps = self.capabilities(); - match query { - QueryKind::GenotypeByRsid => caps.rsid_lookup, - QueryKind::GenotypeByLocus => caps.locus_lookup, - } - } - - pub fn backend_name(&self) -> &'static str { - match &self.backend { - QueryBackend::RsidMap(map) => map.backend_name(), - QueryBackend::Delimited(backend) => backend.backend_name(), - QueryBackend::Vcf(backend) => backend.backend_name(), - QueryBackend::Cram(backend) => backend.backend_name(), - } - } - - pub fn get(&self, rsid: &str) -> Result, RuntimeError> { - match &self.backend { - QueryBackend::RsidMap(map) => Ok(map.values.get(rsid).cloned()), - QueryBackend::Delimited(backend) => backend.get(rsid), - QueryBackend::Vcf(backend) => backend.get(rsid), - QueryBackend::Cram(backend) => backend - .lookup_variant(&VariantSpec { - rsids: vec![rsid.to_owned()], - ..VariantSpec::default() - }) - .map(|obs| obs.genotype), - } - } - - pub fn lookup_variant( - 
&self, - variant: &VariantSpec, - ) -> Result { - match &self.backend { - QueryBackend::RsidMap(map) => map.lookup_variant(variant), - QueryBackend::Delimited(backend) => backend.lookup_variant(variant), - QueryBackend::Vcf(backend) => backend.lookup_variant(variant), - QueryBackend::Cram(backend) => backend.lookup_variant(variant), - } - } - - pub fn lookup_variants( - &self, - variants: &[VariantSpec], - ) -> Result, RuntimeError> { - if let QueryBackend::Delimited(backend) = &self.backend { - return backend.lookup_variants(variants); - } - if let QueryBackend::Vcf(backend) = &self.backend { - return backend.lookup_variants(variants); - } - let mut indexed: Vec<(usize, &VariantSpec)> = variants.iter().enumerate().collect(); - indexed.sort_by_cached_key(|(_, variant)| variant_sort_key(variant)); - - let mut results = vec![VariantObservation::default(); variants.len()]; - for (original_idx, variant) in indexed { - results[original_idx] = self.lookup_variant(variant)?; - } - Ok(results) - } -} - -impl RsidMapBackend { - fn backend_name(&self) -> &'static str { - match self.format { - GenotypeSourceFormat::Text => "text", - GenotypeSourceFormat::Zip => "zip", - GenotypeSourceFormat::Vcf => "vcf", - GenotypeSourceFormat::Cram => "cram", - } - } - - fn lookup_variant(&self, variant: &VariantSpec) -> Result { - for rsid in &variant.rsids { - if let Some(value) = self.values.get(rsid) { - return Ok(VariantObservation { - backend: self.backend_name().to_owned(), - matched_rsid: Some(rsid.clone()), - genotype: Some(value.clone()), - evidence: vec![format!("resolved by rsid {rsid}")], - ..VariantObservation::default() - }); - } - } - - Ok(VariantObservation { - backend: self.backend_name().to_owned(), - evidence: vec!["no matching rsid found".to_owned()], - ..VariantObservation::default() - }) - } -} - -#[derive(Debug, Clone)] -struct ParsedDelimitedRow { - rsid: Option, - chrom: Option, - position: Option, - genotype: String, -} - -impl DelimitedBackend { - fn 
backend_name(&self) -> &'static str { - match self.format { - GenotypeSourceFormat::Text => "text", - GenotypeSourceFormat::Zip => "zip", - GenotypeSourceFormat::Vcf => "vcf", - GenotypeSourceFormat::Cram => "cram", - } - } - - fn get(&self, rsid: &str) -> Result, RuntimeError> { - let results = self.lookup_variants(&[VariantSpec { - rsids: vec![rsid.to_owned()], - ..VariantSpec::default() - }])?; - Ok(results.into_iter().next().and_then(|obs| obs.genotype)) - } - - fn lookup_variant(&self, variant: &VariantSpec) -> Result { - let mut results = self.lookup_variants(std::slice::from_ref(variant))?; - Ok(results.pop().unwrap_or_default()) - } - - fn lookup_variants( - &self, - variants: &[VariantSpec], - ) -> Result, RuntimeError> { - scan_delimited_variants(self, variants) - } -} - -impl VcfBackend { - fn backend_name(&self) -> &'static str { - "vcf" - } - - fn get(&self, rsid: &str) -> Result, RuntimeError> { - let results = self.lookup_variants(&[VariantSpec { - rsids: vec![rsid.to_owned()], - ..VariantSpec::default() - }])?; - Ok(results.into_iter().next().and_then(|obs| obs.genotype)) - } - - fn lookup_variant(&self, variant: &VariantSpec) -> Result { - let mut results = self.lookup_variants(std::slice::from_ref(variant))?; - Ok(results.pop().unwrap_or_default()) - } - - fn lookup_variants( - &self, - variants: &[VariantSpec], - ) -> Result, RuntimeError> { - scan_vcf_variants(self, variants) - } -} - -impl CramBackend { - fn backend_name(&self) -> &'static str { - "cram" - } - - fn lookup_variant(&self, variant: &VariantSpec) -> Result { - let Some(reference_file) = self.options.reference_file.as_ref() else { - return Err(RuntimeError::Unsupported(format!( - "backend '{}' cannot satisfy query '{}' for {} without --reference-file", - self.backend_name(), - describe_query(variant), - self.path.display() - ))); - }; - - let Some((assembly, locus)) = choose_variant_locus(variant, reference_file) else { - let mut detail = format!( - "backend '{}' cannot satisfy 
query '{}' for {} using reference {}", - self.backend_name(), - describe_query(variant), - self.path.display(), - reference_file.display() - ); - detail.push_str(". This backend needs GRCh37/GRCh38 coordinates, not only rsIDs"); - if let Some(reference_index) = self.options.reference_index.as_ref() { - let _ = write!(detail, " (reference index {})", reference_index.display()); - } - if let Some(input_index) = self.options.input_index.as_ref() { - let _ = write!(detail, " (input index {})", input_index.display()); - } - return Err(RuntimeError::Unsupported(detail)); - }; - - let observation = match variant.kind.unwrap_or(VariantKind::Other) { - VariantKind::Snp => self.observe_snp(variant, assembly, &locus, reference_file)?, - VariantKind::Deletion => { - self.observe_deletion(variant, assembly, &locus, reference_file)? - } - VariantKind::Insertion | VariantKind::Indel => { - self.observe_indel(variant, assembly, &locus, reference_file)? - } - VariantKind::Other => { - return Err(RuntimeError::Unsupported(format!( - "backend '{}' does not yet support {:?} observation for {}", - self.backend_name(), - variant.kind.unwrap_or(VariantKind::Other), - self.path.display() - ))); - } - }; - - Ok(observation) - } - - fn observe_snp( - &self, - variant: &VariantSpec, - assembly: Assembly, - locus: &GenomicLocus, - reference_file: &Path, - ) -> Result { - let reference = variant - .reference - .as_deref() - .and_then(first_base) - .ok_or_else(|| { - RuntimeError::InvalidArguments("SNP variant requires ref/reference".to_owned()) - })?; - let alternate = variant - .alternate - .as_deref() - .and_then(first_base) - .ok_or_else(|| { - RuntimeError::InvalidArguments("SNP variant requires alt/alternate".to_owned()) - })?; - - let target_pos = locus.start; - let pileup = observe_snp_pileup( - &self.path, - &self.options, - reference_file, - locus, - reference, - alternate, - )?; - let ref_count = pileup.filtered_ref_count; - let alt_count = pileup.filtered_alt_count; - let depth = 
pileup.filtered_depth; - - let evidence = pileup.evidence_lines(&describe_locus(locus), target_pos); - - Ok(VariantObservation { - backend: self.backend_name().to_owned(), - matched_rsid: variant.rsids.first().cloned(), - assembly: Some(assembly), - genotype: infer_snp_genotype(reference, alternate, ref_count, alt_count, depth), - ref_count: Some(ref_count), - alt_count: Some(alt_count), - depth: Some(depth), - raw_counts: pileup.raw_base_counts, - decision: Some(describe_snp_decision_rule( - reference, alternate, ref_count, alt_count, depth, - )), - evidence, - }) - } - - fn observe_deletion( - &self, - variant: &VariantSpec, - assembly: Assembly, - locus: &GenomicLocus, - reference_file: &Path, - ) -> Result { - let deletion_length = variant.deletion_length.ok_or_else(|| { - RuntimeError::InvalidArguments("deletion variant requires deletion_length".to_owned()) - })?; - let reference = variant.reference.clone().unwrap_or_else(|| "I".to_owned()); - let alternate = variant.alternate.clone().unwrap_or_else(|| "D".to_owned()); - let anchor_pos = locus.start.saturating_sub(1); - - let mut alt_count = 0u32; - let mut ref_count = 0u32; - let mut depth = 0u32; - - alignment::for_each_cram_record( - &self.path, - &self.options, - reference_file, - &anchor_window(locus), - |record| { - if record.is_unmapped || !spans_position(&record, anchor_pos) { - return Ok(true); - } - depth += 1; - match indel_at_anchor(&record, anchor_pos) { - Some((AlignmentOpKind::Deletion, len)) if len == deletion_length => { - alt_count += 1; - } - _ => ref_count += 1, - } - Ok(true) - }, - )?; - - Ok(VariantObservation { - backend: self.backend_name().to_owned(), - matched_rsid: variant.rsids.first().cloned(), - assembly: Some(assembly), - genotype: infer_copy_number_genotype( - &reference, &alternate, ref_count, alt_count, depth, - ), - ref_count: Some(ref_count), - alt_count: Some(alt_count), - depth: Some(depth), - raw_counts: BTreeMap::new(), - decision: 
Some(describe_copy_number_decision_rule( - &reference, &alternate, ref_count, alt_count, depth, - )), - evidence: vec![format!( - "observed deletion anchor {}:{} len={} depth={} ref_count={} alt_count={}", - locus.chrom, anchor_pos, deletion_length, depth, ref_count, alt_count - )], - }) - } - - fn observe_indel( - &self, - variant: &VariantSpec, - assembly: Assembly, - locus: &GenomicLocus, - reference_file: &Path, - ) -> Result { - let reference = variant.reference.clone().ok_or_else(|| { - RuntimeError::InvalidArguments("indel variant requires ref/reference".to_owned()) - })?; - let alternate = variant.alternate.clone().ok_or_else(|| { - RuntimeError::InvalidArguments("indel variant requires alt/alternate".to_owned()) - })?; - let records = - alignment::query_cram_records(&self.path, &self.options, reference_file, locus)?; - - let mut alt_count = 0u32; - let mut ref_count = 0u32; - let mut depth = 0u32; - let mut matching_alt_lengths = BTreeSet::new(); - - for record in records { - if record.is_unmapped { - continue; - } - if !record_overlaps_locus(&record, locus) { - continue; - } - let classification = - classify_expected_indel(&record, locus, reference.len(), &alternate)?; - if !classification.covering { - continue; - } - depth += 1; - if classification.matches_alt { - alt_count += 1; - matching_alt_lengths.insert(classification.observed_len); - } else if classification.reference_like { - ref_count += 1; - } - } - - let evidence_label = if matching_alt_lengths.is_empty() { - "none".to_owned() - } else { - matching_alt_lengths - .into_iter() - .map(|len| len.to_string()) - .collect::>() - .join(",") - }; - - Ok(VariantObservation { - backend: self.backend_name().to_owned(), - matched_rsid: variant.rsids.first().cloned(), - assembly: Some(assembly), - genotype: infer_copy_number_genotype( - &reference, &alternate, ref_count, alt_count, depth, - ), - ref_count: Some(ref_count), - alt_count: Some(alt_count), - depth: Some(depth), - raw_counts: BTreeMap::new(), - 
decision: Some(describe_copy_number_decision_rule( - &reference, &alternate, ref_count, alt_count, depth, - )), - evidence: vec![format!( - "observed indel at {} depth={} ref_count={} alt_count={} matching_alt_lengths={}", - describe_locus(locus), - depth, - ref_count, - alt_count, - evidence_label - )], - }) - } -} - -fn choose_variant_locus( - variant: &VariantSpec, - reference_file: &Path, -) -> Option<(Assembly, GenomicLocus)> { - match detect_reference_assembly(reference_file) { - Some(Assembly::Grch38) => variant - .grch38 - .clone() - .map(|locus| (Assembly::Grch38, locus)) - .or_else(|| { - variant - .grch37 - .clone() - .map(|locus| (Assembly::Grch37, locus)) - }), - Some(Assembly::Grch37) => variant - .grch37 - .clone() - .map(|locus| (Assembly::Grch37, locus)) - .or_else(|| { - variant - .grch38 - .clone() - .map(|locus| (Assembly::Grch38, locus)) - }), - None => variant - .grch38 - .clone() - .map(|locus| (Assembly::Grch38, locus)) - .or_else(|| { - variant - .grch37 - .clone() - .map(|locus| (Assembly::Grch37, locus)) - }), - } -} - -fn detect_reference_assembly(reference_file: &Path) -> Option { - let lower = reference_file.to_string_lossy().to_ascii_lowercase(); - if lower.contains("grch38") || lower.contains("hg38") || lower.contains("assembly38") { - Some(Assembly::Grch38) - } else if lower.contains("grch37") || lower.contains("hg19") || lower.contains("assembly37") { - Some(Assembly::Grch37) - } else { - None - } -} - -fn describe_locus(locus: &GenomicLocus) -> String { - format!("{}:{}-{}", locus.chrom, locus.start, locus.end) -} - -fn anchor_window(locus: &GenomicLocus) -> GenomicLocus { - let anchor = locus.start.saturating_sub(1); - GenomicLocus { - chrom: locus.chrom.clone(), - start: anchor, - end: anchor, - } -} - -fn first_base(value: &str) -> Option { - value - .trim() - .chars() - .next() - .map(|ch| ch.to_ascii_uppercase()) -} - -fn infer_snp_genotype( - reference: char, - alternate: char, - ref_count: u32, - alt_count: u32, - depth: 
u32, -) -> Option { - if depth == 0 || ref_count + alt_count == 0 { - return None; - } - let alt_fraction = f64::from(alt_count) / f64::from(depth); - if alt_fraction >= 0.8 { - Some(format!("{alternate}{alternate}")) - } else if alt_fraction <= 0.2 { - Some(format!("{reference}{reference}")) - } else { - Some(format!("{reference}{alternate}")) - } -} - -fn describe_snp_decision_rule( - reference: char, - alternate: char, - ref_count: u32, - alt_count: u32, - depth: u32, -) -> String { - if depth == 0 { - return format!( - "no covering reads for SNP; genotype unresolved (ref={reference}, alt={alternate})" - ); - } - if ref_count + alt_count == 0 { - return format!( - "no reads matched the declared SNP alleles; genotype unresolved; counts ref={ref_count} alt={alt_count} depth={depth} for {reference}>{alternate}" - ); - } - - let alt_fraction = f64::from(alt_count) / f64::from(depth); - format!( - "SNP genotype rule: alt_fraction={alt_fraction:.3} with thresholds ref<=0.200, het=(0.200,0.800), alt>=0.800; counts ref={ref_count} alt={alt_count} depth={depth} for {reference}>{alternate}" - ) -} - -fn infer_copy_number_genotype( - reference: &str, - alternate: &str, - _ref_count: u32, - alt_count: u32, - depth: u32, -) -> Option { - if depth == 0 { - return None; - } - let alt_fraction = f64::from(alt_count) / f64::from(depth); - if alt_fraction >= 0.8 { - Some(format!("{alternate}{alternate}")) - } else if alt_fraction <= 0.2 { - Some(format!("{reference}{reference}")) - } else { - Some(format!("{reference}{alternate}")) - } -} - -fn describe_copy_number_decision_rule( - reference: &str, - alternate: &str, - ref_count: u32, - alt_count: u32, - depth: u32, -) -> String { - if depth == 0 { - return format!( - "no covering reads for copy-number style variant; genotype unresolved (ref={reference}, alt={alternate})" - ); - } - - let alt_fraction = f64::from(alt_count) / f64::from(depth); - format!( - "copy-number genotype rule: alt_fraction={alt_fraction:.3} with thresholds 
ref<=0.200, het=(0.200,0.800), alt>=0.800; counts ref={ref_count} alt={alt_count} depth={depth} for {reference}->{alternate}" - ) -} - -#[derive(Debug, Clone, Default)] -struct SnpPileupCounts { - filtered_depth: u32, - filtered_ref_count: u32, - filtered_alt_count: u32, - filtered_base_counts: BTreeMap, - raw_depth: u32, - raw_ref_count: u32, - raw_alt_count: u32, - raw_base_counts: BTreeMap, - filtered_low_base_quality: u32, - filtered_low_mapping_quality: u32, - filtered_non_acgt: u32, - filtered_unmapped: u32, - filtered_secondary: u32, - filtered_qc_fail: u32, - filtered_duplicate: u32, - filtered_improper_pair: u32, - raw_forward_counts: BTreeMap, - raw_reverse_counts: BTreeMap, -} - -impl SnpPileupCounts { - fn evidence_lines(&self, locus: &str, target_pos: i64) -> Vec { - vec![ - format!( - "observed SNP pileup at {locus} target_pos={target_pos} filtered_depth={} ref_count={} alt_count={}", - self.filtered_depth, self.filtered_ref_count, self.filtered_alt_count - ), - format!( - "raw pileup depth={} ref_count={} alt_count={} raw_counts={:?}", - self.raw_depth, self.raw_ref_count, self.raw_alt_count, self.raw_base_counts - ), - format!( - "raw strand counts: forward={:?} reverse={:?}", - self.raw_forward_counts, self.raw_reverse_counts - ), - format!( - "filters applied: min_base_quality={} min_mapping_quality={} filtered_low_base_quality={} filtered_low_mapping_quality={} filtered_non_acgt={} filtered_unmapped={} filtered_secondary={} filtered_qc_fail={} filtered_duplicate={} filtered_improper_pair={}", - DEFAULT_MPILEUP_MIN_BASE_QUALITY, - DEFAULT_MPILEUP_MIN_MAPPING_QUALITY, - self.filtered_low_base_quality, - self.filtered_low_mapping_quality, - self.filtered_non_acgt, - self.filtered_unmapped, - self.filtered_secondary, - self.filtered_qc_fail, - self.filtered_duplicate, - self.filtered_improper_pair - ), - ] - } -} - -fn observe_snp_pileup( - cram_path: &Path, - options: &GenotypeLoadOptions, - reference_file: &Path, - locus: &GenomicLocus, - 
reference: char, - alternate: char, -) -> Result { - let repository = alignment::build_reference_repository(reference_file)?; - let mut reader = - alignment::build_cram_indexed_reader_from_path(cram_path, options, repository)?; - let label = cram_path.display().to_string(); - snp_pileup_with_reader( - &mut reader, - &label, - locus, - reference, - alternate, - options.allow_reference_md5_mismatch, - ) -} - -fn snp_pileup_with_reader( - reader: &mut cram::io::indexed_reader::IndexedReader, - label: &str, - locus: &GenomicLocus, - reference: char, - alternate: char, - allow_reference_md5_mismatch: bool, -) -> Result { - let mut counts = SnpPileupCounts::default(); - let target_position = Position::try_from(usize::try_from(locus.start).map_err(|_| { - RuntimeError::InvalidArguments("SNP locus start is out of range".to_owned()) - })?) - .map_err(|_| RuntimeError::InvalidArguments("SNP locus start is out of range".to_owned()))?; - let reference_base = reference as u8; - - alignment::for_each_raw_cram_record_with_reader_inner( - reader, - label, - locus, - allow_reference_md5_mismatch, - |record| { - let flags = record - .flags() - .map_err(|err| RuntimeError::Io(format!("failed to read CRAM flags: {err}")))?; - if flags.is_unmapped() { - counts.filtered_unmapped += 1; - return Ok(true); - } - if flags.is_secondary() { - counts.filtered_secondary += 1; - return Ok(true); - } - if flags.is_qc_fail() { - counts.filtered_qc_fail += 1; - return Ok(true); - } - if flags.is_duplicate() { - counts.filtered_duplicate += 1; - return Ok(true); - } - if flags.is_segmented() && !flags.is_properly_segmented() { - counts.filtered_improper_pair += 1; - return Ok(true); - } - - let Some((base, base_quality)) = - cram_base_quality_at_reference_position(&record, target_position, reference_base)? 
- else { - return Ok(true); - }; - - let normalized_base = normalize_pileup_base(base); - record.mapping_quality().transpose().map_err(|err| { - RuntimeError::Io(format!("failed to read CRAM mapping quality: {err}")) - })?; - let is_reverse = flags.is_reverse_complemented(); - if let Some(base) = normalized_base { - counts.raw_depth += 1; - *counts.raw_base_counts.entry(base.to_string()).or_insert(0) += 1; - let strand_counts = if is_reverse { - &mut counts.raw_reverse_counts - } else { - &mut counts.raw_forward_counts - }; - *strand_counts.entry(base.to_string()).or_insert(0) += 1; - if base == reference { - counts.raw_ref_count += 1; - } else if base == alternate { - counts.raw_alt_count += 1; - } - } - - if base_quality < DEFAULT_MPILEUP_MIN_BASE_QUALITY { - counts.filtered_low_base_quality += 1; - return Ok(true); - } - - let Some(base) = normalized_base else { - counts.filtered_non_acgt += 1; - return Ok(true); - }; - - counts.filtered_depth += 1; - *counts - .filtered_base_counts - .entry(base.to_string()) - .or_insert(0) += 1; - if base == reference { - counts.filtered_ref_count += 1; - } else if base == alternate { - counts.filtered_alt_count += 1; - } - Ok(true) - }, - )?; - - Ok(counts) -} - -fn cram_base_quality_at_reference_position( - record: &cram::Record<'_>, - target_position: Position, - reference_base: u8, -) -> Result, RuntimeError> { - let Some(alignment_start) = record.alignment_start() else { - return Ok(None); - }; - let alignment_start = alignment_start - .map_err(|err| RuntimeError::Io(format!("failed to read CRAM alignment start: {err}")))?; - let mut reference_position = usize::from(alignment_start); - let target = usize::from(target_position); - let mut read_position = 0usize; - let sequence = record.sequence(); - let qualities = record.quality_scores(); - - for op in record.cigar().iter() { - let op = op.map_err(|err| RuntimeError::Io(format!("failed to read CRAM CIGAR: {err}")))?; - match op.kind() { - CigarOpKind::Match | 
CigarOpKind::SequenceMatch | CigarOpKind::SequenceMismatch => { - for offset in 0..op.len() { - if reference_position + offset == target { - let base = sequence - .get(read_position + offset) - .unwrap_or(reference_base); - let quality = qualities - .iter() - .nth(read_position + offset) - .transpose() - .map_err(|err| { - RuntimeError::Io(format!("failed to read CRAM base quality: {err}")) - })? - .unwrap_or(0); - return Ok(Some((base, quality))); - } - } - reference_position += op.len(); - read_position += op.len(); - } - CigarOpKind::Insertion | CigarOpKind::SoftClip => { - read_position += op.len(); - } - CigarOpKind::Deletion | CigarOpKind::Skip => { - if target >= reference_position && target < reference_position + op.len() { - return Ok(None); - } - reference_position += op.len(); - } - CigarOpKind::HardClip | CigarOpKind::Pad => {} - } - } - - Ok(None) -} - -/// Observe a SNP at `locus` over an already-built CRAM `IndexedReader` and -/// reference repository (held by the reader). Mirrors the internal -/// `CramBackend::observe_snp` but reader-based, so non-filesystem callers -/// (e.g. wasm with a JS-backed reader) don't need a `GenotypeStore` or paths. -/// -/// `matched_rsid` and `assembly` are passed through to the returned -/// observation unchanged — callers that already know them (e.g. from -/// compiling a YAML variant) should supply them; otherwise `None`. 
-pub fn observe_cram_snp_with_reader( - reader: &mut cram::io::indexed_reader::IndexedReader, - label: &str, - locus: &GenomicLocus, - reference: char, - alternate: char, - matched_rsid: Option, - assembly: Option, -) -> Result { - let pileup = snp_pileup_with_reader(reader, label, locus, reference, alternate, false)?; - let ref_count = pileup.filtered_ref_count; - let alt_count = pileup.filtered_alt_count; - let depth = pileup.filtered_depth; - let evidence = pileup.evidence_lines(&describe_locus(locus), locus.start); - - Ok(VariantObservation { - backend: "cram".to_owned(), - matched_rsid, - assembly, - genotype: infer_snp_genotype(reference, alternate, ref_count, alt_count, depth), - ref_count: Some(ref_count), - alt_count: Some(alt_count), - depth: Some(depth), - raw_counts: pileup.raw_base_counts, - decision: Some(describe_snp_decision_rule( - reference, alternate, ref_count, alt_count, depth, - )), - evidence, - }) -} - -/// Observe an insertion/indel-like variant at `locus` over an already-built -/// CRAM `IndexedReader`. 
-pub fn observe_cram_indel_with_reader( - reader: &mut cram::io::indexed_reader::IndexedReader, - label: &str, - locus: &GenomicLocus, - reference: &str, - alternate: &str, - matched_rsid: Option, - assembly: Option, -) -> Result { - let mut alt_count = 0u32; - let mut ref_count = 0u32; - let mut depth = 0u32; - let mut matching_alt_lengths = BTreeSet::new(); - - alignment::for_each_cram_record_with_reader(reader, label, locus, |record| { - if record.is_unmapped || !record_overlaps_locus(&record, locus) { - return Ok(true); - } - let classification = classify_expected_indel(&record, locus, reference.len(), alternate)?; - if !classification.covering { - return Ok(true); - } - depth += 1; - if classification.matches_alt { - alt_count += 1; - matching_alt_lengths.insert(classification.observed_len); - } else if classification.reference_like { - ref_count += 1; - } - Ok(true) - })?; - - let evidence_label = if matching_alt_lengths.is_empty() { - "none".to_owned() - } else { - matching_alt_lengths - .into_iter() - .map(|len| len.to_string()) - .collect::>() - .join(",") - }; - - Ok(VariantObservation { - backend: "cram".to_owned(), - matched_rsid, - assembly, - genotype: infer_copy_number_genotype(reference, alternate, ref_count, alt_count, depth), - ref_count: Some(ref_count), - alt_count: Some(alt_count), - depth: Some(depth), - raw_counts: BTreeMap::new(), - decision: Some(describe_copy_number_decision_rule( - reference, alternate, ref_count, alt_count, depth, - )), - evidence: vec![format!( - "observed indel at {} depth={} ref_count={} alt_count={} matching_alt_lengths={}", - describe_locus(locus), - depth, - ref_count, - alt_count, - evidence_label - )], - }) -} - -/// Observe a SNP at `locus` over an already-built tabix-indexed bgzipped VCF -/// reader. Mirrors the CRAM variant for VCF: caller builds -/// `csi::io::IndexedReader::new(reader, tabix_index)` once and calls this per -/// variant. 
Non-filesystem callers (wasm with a JS-backed reader) go through -/// this path. -/// -/// Only the locus-based match path is implemented — rsid-only variants would -/// need a linear scan, which is a follow-up. Pass `matched_rsid` through if -/// the caller already resolved it. -pub fn observe_vcf_snp_with_reader( - indexed: &mut csi::io::IndexedReader, tabix::Index>, - label: &str, - locus: &GenomicLocus, - reference: char, - alternate: char, - matched_rsid: Option, - assembly: Option, -) -> Result -where - R: Read + Seek, -{ - let locus_label = format!("{}:{}", locus.chrom, locus.start); - - let Some(seq_name) = resolve_vcf_chrom_name(indexed.index(), &locus.chrom) else { - return Ok(VariantObservation { - backend: "vcf".to_owned(), - matched_rsid, - assembly, - evidence: vec![format!( - "{label}: tabix index has no contig matching {} (tried chr-prefixed and bare forms)", - locus.chrom - )], - ..VariantObservation::default() - }); - }; - - let pos_usize = usize::try_from(locus.start).map_err(|err| { - RuntimeError::Io(format!( - "{label}: invalid VCF position {} for {locus_label}: {err}", - locus.start - )) - })?; - let position = Position::try_from(pos_usize).map_err(|err| { - RuntimeError::Io(format!( - "{label}: invalid VCF position {} for {locus_label}: {err}", - locus.start - )) - })?; - let region = Region::new(seq_name.as_str(), position..=position); - - let query = indexed.query(®ion).map_err(|err| { - RuntimeError::Io(format!("{label}: tabix query for {locus_label}: {err}")) - })?; - - let reference_str = reference.to_ascii_uppercase().to_string(); - let alternate_str = alternate.to_ascii_uppercase().to_string(); - - let mut saw_any = false; - for record_result in query { - let record = record_result - .map_err(|err| RuntimeError::Io(format!("{label}: tabix record iter: {err}")))?; - let line: &str = record.as_ref(); - let Some(row) = parse_vcf_record(line)? 
else { - continue; - }; - if row.position != locus.start { - continue; - } - saw_any = true; - if !row.reference.eq_ignore_ascii_case(&reference_str) { - continue; - } - if !row - .alternates - .iter() - .any(|alt| alt.eq_ignore_ascii_case(&alternate_str)) - { - continue; - } - - return Ok(VariantObservation { - backend: "vcf".to_owned(), - matched_rsid: matched_rsid.or_else(|| row.rsid.clone()), - assembly, - genotype: Some(row.genotype.clone()), - evidence: vec![format!("{label}: resolved by locus {locus_label}")], - ..VariantObservation::default() - }); - } - - let evidence = if saw_any { - vec![format!( - "{label}: {locus_label} present but ref={reference}/alt={alternate} did not match any record" - )] - } else { - vec![format!("{label}: no VCF record at {locus_label}")] - }; - Ok(VariantObservation { - backend: "vcf".to_owned(), - matched_rsid, - assembly, - evidence, - ..VariantObservation::default() - }) -} - -/// Match the user-provided chromosome name against the tabix index's set of -/// reference sequence names. VCFs vary: some use `chr22`, others `22`. Try -/// the user's spelling verbatim, then toggle the `chr` prefix, then fall back -/// to a case-insensitive compare against the normalized suffix. -fn resolve_vcf_chrom_name(index: &tabix::Index, user_chrom: &str) -> Option { - let header = index.header()?; - let names = header.reference_sequence_names(); - - let trimmed = user_chrom.trim(); - let stripped = trimmed.strip_prefix("chr").unwrap_or(trimmed); - - let candidates = [ - trimmed.to_owned(), - stripped.to_owned(), - format!("chr{stripped}"), - ]; - for cand in &candidates { - if names.contains(cand.as_bytes()) { - return Some(cand.clone()); - } - } - // Case-insensitive fallback against the full set. 
- let target = stripped.to_ascii_lowercase(); - for name in names { - let as_str = std::str::from_utf8(name.as_ref()).ok()?; - let as_stripped = as_str.strip_prefix("chr").unwrap_or(as_str); - if as_stripped.eq_ignore_ascii_case(&target) { - return Some(as_str.to_owned()); - } - } - None -} - -fn normalize_pileup_base(base: u8) -> Option { - match (base as char).to_ascii_uppercase() { - 'A' | 'C' | 'G' | 'T' => Some((base as char).to_ascii_uppercase()), - _ => None, - } -} - -#[derive(Debug, Clone, Copy)] -struct IndelClassification { - covering: bool, - reference_like: bool, - matches_alt: bool, - observed_len: usize, -} - -fn len_as_i64(len: usize) -> Option { - i64::try_from(len).ok() -} - -fn spans_position(record: &AlignmentRecord, pos: i64) -> bool { - pos >= record.start.saturating_sub(1) && pos <= record.end -} - -fn record_overlaps_locus(record: &AlignmentRecord, locus: &GenomicLocus) -> bool { - record.end >= locus.start && record.start <= locus.end -} - -fn indel_at_anchor(record: &AlignmentRecord, anchor_pos: i64) -> Option<(AlignmentOpKind, usize)> { - let mut ref_pos = record.start; - - for op in &record.cigar { - match op.kind { - AlignmentOpKind::Match - | AlignmentOpKind::SequenceMatch - | AlignmentOpKind::SequenceMismatch - | AlignmentOpKind::Skip => { - ref_pos += len_as_i64(op.len)?; - } - AlignmentOpKind::Insertion => { - let anchor = ref_pos.saturating_sub(1); - if anchor == anchor_pos { - return Some((AlignmentOpKind::Insertion, op.len)); - } - } - AlignmentOpKind::Deletion => { - let anchor = ref_pos.saturating_sub(1); - if anchor == anchor_pos { - return Some((AlignmentOpKind::Deletion, op.len)); - } - ref_pos += len_as_i64(op.len)?; - } - AlignmentOpKind::SoftClip | AlignmentOpKind::HardClip | AlignmentOpKind::Pad => {} - } - } - - None -} - -fn classify_expected_indel( - record: &AlignmentRecord, - locus: &GenomicLocus, - reference_len: usize, - alternate: &str, -) -> Result { - let alt_len = alternate.len(); - let anchor_start = 
locus.start.saturating_sub(1); - let anchor_end = locus.end; - - let covering = record.start <= locus.start && record.end >= locus.end; - if !covering { - return Ok(IndelClassification { - covering: false, - reference_like: false, - matches_alt: false, - observed_len: reference_len, - }); - } - - let mut observed_len = reference_len; - - for anchor in anchor_start..=anchor_end { - if let Some((kind, len)) = indel_at_anchor(record, anchor) { - observed_len = match kind { - AlignmentOpKind::Insertion => reference_len + len, - AlignmentOpKind::Deletion => reference_len.saturating_sub(len), - _ => reference_len, - }; - - return Ok(IndelClassification { - covering: true, - reference_like: false, - matches_alt: observed_len == alt_len, - observed_len, - }); - } - } - - Ok(IndelClassification { - covering: true, - reference_like: true, - matches_alt: false, - observed_len, - }) -} - -fn describe_query(variant: &VariantSpec) -> &'static str { - if variant.has_coordinates() { - "variant_by_locus" - } else { - "variant_by_rsid" - } -} - -fn variant_sort_key(variant: &VariantSpec) -> (u8, String, i64, i64, String) { - if let Some(locus) = &variant.grch38 { - return ( - 0, - chrom_sort_key(&locus.chrom), - locus.start, - locus.end, - variant.rsids.first().cloned().unwrap_or_default(), - ); - } - if let Some(locus) = &variant.grch37 { - return ( - 1, - chrom_sort_key(&locus.chrom), - locus.start, - locus.end, - variant.rsids.first().cloned().unwrap_or_default(), - ); - } - ( - 2, - "~".to_owned(), - i64::MAX, - i64::MAX, - variant.rsids.first().cloned().unwrap_or_default(), - ) -} - -fn chrom_sort_key(raw: &str) -> String { - let chrom = raw.trim().strip_prefix("chr").unwrap_or(raw.trim()); - if let Ok(value) = chrom.parse::() { - return format!("{value:03}"); - } - match chrom.to_ascii_uppercase().as_str() { - "X" => "023".to_owned(), - "Y" => "024".to_owned(), - "M" | "MT" => "025".to_owned(), - other => format!("999-{other}"), - } -} - -#[derive(Debug, Clone, Copy)] -enum 
Delimiter { - Tab, - Comma, - Space, -} - -fn detect_delimiter(lines: &[String]) -> Delimiter { - for line in lines { - let trimmed = line.trim(); - if trimmed.is_empty() - || COMMENT_PREFIXES - .iter() - .any(|prefix| trimmed.starts_with(prefix)) - { - continue; - } - if line.contains('\t') { - return Delimiter::Tab; - } - if line.contains(',') { - return Delimiter::Comma; - } - if trimmed.split_whitespace().count() > 1 { - return Delimiter::Space; - } - } - Delimiter::Tab -} - -#[allow(dead_code)] -struct RowParser { - delimiter: Delimiter, - header: Option>, - comment_header: Option>, - alias_map: HashMap<&'static str, BTreeSet<&'static str>>, -} - -#[allow(dead_code)] -impl RowParser { - fn new(delimiter: Delimiter) -> Self { - let mut alias_map = HashMap::new(); - alias_map.insert("rsid", RSID_ALIASES.iter().copied().collect()); - alias_map.insert("chromosome", CHROM_ALIASES.iter().copied().collect()); - alias_map.insert("position", POSITION_ALIASES.iter().copied().collect()); - alias_map.insert("genotype", GENOTYPE_ALIASES.iter().copied().collect()); - alias_map.insert("allele1", ALLELE1_ALIASES.iter().copied().collect()); - alias_map.insert("allele2", ALLELE2_ALIASES.iter().copied().collect()); - Self { - delimiter, - header: None, - comment_header: None, - alias_map, - } - } - - fn consume_line(&mut self, line: &str) -> Result, RuntimeError> { - Ok(self - .consume_record(line)? 
- .and_then(|row| row.rsid.map(|rsid| (rsid, row.genotype)))) - } - - fn consume_record(&mut self, line: &str) -> Result, RuntimeError> { - let trimmed = line.trim(); - if trimmed.is_empty() { - return Ok(None); - } - - let trimmed = strip_bom(trimmed); - if let Some(prefix) = COMMENT_PREFIXES - .iter() - .find(|prefix| trimmed.starts_with(**prefix)) - { - let candidate = trimmed.trim_start_matches(prefix).trim(); - if !candidate.is_empty() { - let fields = self.parse_fields(candidate); - if self.looks_like_header(&fields) { - self.comment_header = Some(fields); - } - } - return Ok(None); - } - - let fields = self.parse_fields(strip_bom(line)); - if fields.is_empty() { - return Ok(None); - } - - if self.header.is_none() { - if self.looks_like_header(&fields) { - self.header = Some(fields); - return Ok(None); - } - if let Some(header) = self.comment_header.take() { - self.header = Some(header); - } else { - self.header = Some(self.default_header(fields.len())); - } - } - - let header = self.header.as_ref().expect("header initialized"); - let mut row_map = HashMap::new(); - for (idx, value) in fields.into_iter().enumerate() { - if idx >= header.len() { - continue; - } - row_map.insert(normalize_name(&header[idx]), strip_inline_comment(&value)); - } - - let rsid = self - .lookup(&row_map, "rsid") - .filter(|value| !value.is_empty()); - let chrom = self - .lookup(&row_map, "chromosome") - .filter(|value| !value.is_empty()); - let position = self - .lookup(&row_map, "position") - .and_then(|value| value.parse::().ok()); - if rsid.is_none() && (chrom.is_none() || position.is_none()) { - return Ok(None); - } - - let genotype = if let Some(gt) = self.lookup(&row_map, "genotype") { - gt - } else { - let allele1 = self.lookup(&row_map, "allele1").unwrap_or_default(); - let allele2 = self.lookup(&row_map, "allele2").unwrap_or_default(); - format!("{allele1}{allele2}") - }; - - Ok(Some(ParsedDelimitedRow { - rsid, - chrom, - position, - genotype: normalize_genotype(&genotype), 
- })) - } - - fn parse_fields(&self, line: &str) -> Vec { - match self.delimiter { - Delimiter::Tab => line - .split('\t') - .map(|field| field.trim().to_owned()) - .collect(), - Delimiter::Space => line.split_whitespace().map(str::to_owned).collect(), - Delimiter::Comma => split_csv_line(line), - } - } - - fn looks_like_header(&self, fields: &[String]) -> bool { - fields.first().is_some_and(|first| { - self.alias_map - .get("rsid") - .is_some_and(|aliases| aliases.contains(normalize_name(first).as_str())) - }) - } - - fn lookup(&self, row_map: &HashMap, key: &str) -> Option { - let aliases = self.alias_map.get(key)?; - for alias in aliases { - let key = normalize_name(alias); - if let Some(value) = row_map.get(&key) - && !value.is_empty() - { - return Some(value.clone()); - } - } - None - } - - fn default_header(&self, field_count: usize) -> Vec { - let base = ["rsid", "chromosome", "position", "genotype"]; - if field_count <= base.len() { - base[..field_count] - .iter() - .map(|s| (*s).to_owned()) - .collect() - } else { - let mut header: Vec = base.iter().map(|s| (*s).to_owned()).collect(); - for idx in 0..(field_count - header.len()) { - header.push(format!("extra_{idx}")); - } - header - } - } -} -fn strip_bom(value: &str) -> &str { - value.strip_prefix('\u{feff}').unwrap_or(value) -} - -fn normalize_name(name: &str) -> String { - name.trim() - .to_ascii_lowercase() - .chars() - .filter(|ch| !matches!(ch, ' ' | '_' | '-')) - .collect() -} - -fn strip_inline_comment(value: &str) -> String { - for marker in ["#", "//"] { - if let Some(idx) = value.find(marker) { - return value[..idx].trim().to_owned(); - } - } - value.trim().to_owned() -} - -fn normalize_genotype(value: &str) -> String { - let cleaned = value.trim().replace(' ', "").to_ascii_uppercase(); - if cleaned.is_empty() || matches!(cleaned.as_str(), "NA" | "N/A" | "#N/A" | "NONE") { - return "--".to_owned(); - } - if cleaned.contains('/') { - let parts: Vec<&str> = cleaned.split('/').collect(); - if 
parts.iter().any(|part| part.is_empty() || *part == "-") { - return "ID".to_owned(); - } - return parts.concat(); - } - cleaned -} - -fn split_csv_line(line: &str) -> Vec { - let mut fields = Vec::new(); - let mut current = String::new(); - let mut in_quotes = false; - let chars = line.chars().peekable(); - - for ch in chars { - match ch { - '"' => in_quotes = !in_quotes, - ',' if !in_quotes => { - fields.push(current.trim().to_owned()); - current.clear(); - } - _ => current.push(ch), - } - } - fields.push(current.trim().to_owned()); - fields -} - -#[derive(Debug, Clone)] -struct ParsedVcfRow { - rsid: Option, - chrom: String, - position: i64, - reference: String, - alternates: Vec, - genotype: String, -} - -fn scan_vcf_variants( - backend: &VcfBackend, - variants: &[VariantSpec], -) -> Result, RuntimeError> { - let mut indexed: Vec<(usize, &VariantSpec)> = variants.iter().enumerate().collect(); - indexed.sort_by_cached_key(|(_, variant)| variant_sort_key(variant)); - - let mut probe_lines = Vec::new(); - let detected_assembly = { - let file = File::open(&backend.path).map_err(|err| { - RuntimeError::Io(format!( - "failed to open VCF file {}: {err}", - backend.path.display() - )) - })?; - let mut reader: Box = if is_bgzf_path(&backend.path) { - Box::new(BufReader::new(bgzf::io::Reader::new(file))) - } else { - Box::new(BufReader::new(file)) - }; - - let mut buf = String::new(); - for _ in 0..256 { - buf.clear(); - let bytes = reader.read_line(&mut buf).map_err(|err| { - RuntimeError::Io(format!( - "failed to read VCF file {}: {err}", - backend.path.display() - )) - })?; - if bytes == 0 { - break; - } - let line = buf.trim_end_matches(['\n', '\r']).to_owned(); - let stop = line.starts_with("#CHROM\t"); - probe_lines.push(line); - if stop { - break; - } - } - - detect_vcf_assembly(&backend.path, &probe_lines) - }; - - let mut rsid_targets: HashMap> = HashMap::new(); - let mut coord_targets: HashMap<(String, i64), Vec> = HashMap::new(); - let mut results = 
vec![VariantObservation::default(); variants.len()]; - let mut unresolved = variants.len(); - - for (idx, variant) in &indexed { - for rsid in &variant.rsids { - rsid_targets.entry(rsid.clone()).or_default().push(*idx); - } - - if let Some(locus) = choose_variant_locus_for_assembly(variant, detected_assembly) { - let chrom = normalize_chromosome_name(&locus.chrom); - coord_targets - .entry((chrom.clone(), locus.start)) - .or_default() - .push(*idx); - if matches!( - variant.kind, - Some(VariantKind::Deletion | VariantKind::Insertion | VariantKind::Indel) - ) { - let anchor = locus.start.saturating_sub(1); - coord_targets.entry((chrom, anchor)).or_default().push(*idx); - } - } - } - - let targets = VcfResolutionTargets { - variants, - detected_assembly, - rsid_targets: &rsid_targets, - coord_targets: &coord_targets, - }; - - let file = File::open(&backend.path).map_err(|err| { - RuntimeError::Io(format!( - "failed to open VCF file {}: {err}", - backend.path.display() - )) - })?; - let mut reader: Box = if is_bgzf_path(&backend.path) { - Box::new(BufReader::new(bgzf::io::Reader::new(file))) - } else { - Box::new(BufReader::new(file)) - }; - - let mut buf = String::new(); - loop { - buf.clear(); - let bytes = reader.read_line(&mut buf).map_err(|err| { - RuntimeError::Io(format!( - "failed to read VCF file {}: {err}", - backend.path.display() - )) - })?; - if bytes == 0 || unresolved == 0 { - break; - } - if let Some(row) = parse_vcf_record(buf.trim_end_matches(['\n', '\r']))? 
{ - resolve_vcf_row(backend, &row, &targets, &mut results, &mut unresolved); - } - } - - for (idx, variant) in indexed { - if results[idx].genotype.is_none() { - results[idx] = VariantObservation { - backend: backend.backend_name().to_owned(), - assembly: detected_assembly, - evidence: vec![format!( - "no matching rsid or locus found for {}", - describe_query(variant) - )], - ..VariantObservation::default() - }; - } - } - - Ok(results) -} - -struct VcfResolutionTargets<'a> { - variants: &'a [VariantSpec], - detected_assembly: Option, - rsid_targets: &'a HashMap>, - coord_targets: &'a HashMap<(String, i64), Vec>, -} - -fn resolve_vcf_row( - backend: &VcfBackend, - row: &ParsedVcfRow, - targets: &VcfResolutionTargets<'_>, - results: &mut [VariantObservation], - unresolved: &mut usize, -) { - if let Some(rsid) = row.rsid.as_ref() - && let Some(target_indexes) = targets.rsid_targets.get(rsid) - { - for &target_idx in target_indexes { - if results[target_idx].genotype.is_none() { - results[target_idx] = VariantObservation { - backend: backend.backend_name().to_owned(), - matched_rsid: Some(rsid.clone()), - assembly: targets.detected_assembly, - genotype: Some(row.genotype.clone()), - evidence: vec![format!("resolved by rsid {rsid}")], - ..VariantObservation::default() - }; - *unresolved = (*unresolved).saturating_sub(1); - } - } - } - - if *unresolved == 0 { - return; - } - - let key = (normalize_chromosome_name(&row.chrom), row.position); - if let Some(target_indexes) = targets.coord_targets.get(&key) { - for &target_idx in target_indexes { - if results[target_idx].genotype.is_none() - && vcf_row_matches_variant( - row, - &targets.variants[target_idx], - targets.detected_assembly, - ) - { - results[target_idx] = VariantObservation { - backend: backend.backend_name().to_owned(), - matched_rsid: row.rsid.clone(), - assembly: targets.detected_assembly, - genotype: Some(row.genotype.clone()), - evidence: vec![format!("resolved by locus {}:{}", row.chrom, row.position)], - 
..VariantObservation::default() - }; - *unresolved = (*unresolved).saturating_sub(1); - } - } - } -} - -fn parse_vcf_record(line: &str) -> Result, RuntimeError> { - let trimmed = line.trim(); - if trimmed.is_empty() || trimmed.starts_with('#') { - return Ok(None); - } - - let fields: Vec<&str> = trimmed.split('\t').collect(); - if fields.len() < 10 { - return Ok(None); - } - - let chrom = fields[0].trim(); - let position = fields[1].trim().parse::().map_err(|err| { - RuntimeError::Io(format!( - "failed to parse VCF position '{}': {err}", - fields[1].trim() - )) - })?; - let rsid = { - let value = fields[2].trim(); - (!value.is_empty() && value != ".").then(|| value.to_owned()) - }; - let reference = fields[3].trim(); - if reference.is_empty() || reference == "." { - return Ok(None); - } - - let alternates: Vec = fields[4] - .split(',') - .map(str::trim) - .filter(|alt| !alt.is_empty() && *alt != ".") - .map(ToOwned::to_owned) - .collect(); - if alternates.is_empty() { - return Ok(None); - } - - let genotype = extract_vcf_sample_genotype(fields[8], fields[9], reference, &alternates) - .unwrap_or_else(|| "--".to_owned()); - - Ok(Some(ParsedVcfRow { - rsid, - chrom: chrom.to_owned(), - position, - reference: reference.to_owned(), - alternates, - genotype, - })) -} - -fn extract_vcf_sample_genotype( - format_field: &str, - sample_field: &str, - reference: &str, - alternates: &[String], -) -> Option { - let gt_index = format_field - .split(':') - .position(|field| field.eq_ignore_ascii_case("GT"))?; - let sample_parts: Vec<&str> = sample_field.split(':').collect(); - let sample_gt = sample_parts.get(gt_index).copied().unwrap_or("."); - let alternate_refs: Vec<&str> = alternates.iter().map(String::as_str).collect(); - genotype_from_vcf_gt(sample_gt, reference, &alternate_refs) -} - -fn detect_vcf_assembly(path: &Path, probe_lines: &[String]) -> Option { - let combined = probe_lines.join("\n").to_ascii_lowercase(); - if combined.contains("assembly=b37") - || 
combined.contains("assembly=grch37") - || combined.contains("assembly=hg19") - || combined.contains("reference=grch37") - || combined.contains("reference=hg19") - { - return Some(Assembly::Grch37); - } - if combined.contains("assembly=b38") - || combined.contains("assembly=grch38") - || combined.contains("assembly=hg38") - || combined.contains("reference=grch38") - || combined.contains("reference=hg38") - { - return Some(Assembly::Grch38); - } - - let lower = path.to_string_lossy().to_ascii_lowercase(); - if lower.contains("grch37") || lower.contains("hg19") || lower.contains("b37") { - Some(Assembly::Grch37) - } else if lower.contains("grch38") || lower.contains("hg38") || lower.contains("b38") { - Some(Assembly::Grch38) - } else { - None - } -} - -fn choose_variant_locus_for_assembly( - variant: &VariantSpec, - assembly: Option, -) -> Option { - match assembly { - Some(Assembly::Grch37) => variant.grch37.clone().or_else(|| variant.grch38.clone()), - Some(Assembly::Grch38) => variant.grch38.clone().or_else(|| variant.grch37.clone()), - None => variant.grch37.clone().or_else(|| variant.grch38.clone()), - } -} - -fn normalize_chromosome_name(value: &str) -> String { - value.trim().trim_start_matches("chr").to_ascii_lowercase() -} - -fn vcf_row_matches_variant( - row: &ParsedVcfRow, - variant: &VariantSpec, - assembly: Option, -) -> bool { - let Some(locus) = choose_variant_locus_for_assembly(variant, assembly) else { - return false; - }; - - if normalize_chromosome_name(&row.chrom) != normalize_chromosome_name(&locus.chrom) { - return false; - } - - match variant.kind.unwrap_or(VariantKind::Other) { - VariantKind::Snp => { - row.position == locus.start - && variant - .reference - .as_ref() - .is_none_or(|reference| reference.eq_ignore_ascii_case(&row.reference)) - && variant.alternate.as_ref().is_none_or(|alternate| { - row.alternates - .iter() - .any(|candidate| candidate.eq_ignore_ascii_case(alternate)) - }) - } - VariantKind::Deletion => { - let expected_len = 
variant.deletion_length.unwrap_or(0); - row.position == locus.start.saturating_sub(1) - && row.alternates.iter().any(|alternate| { - let actual_len = row.reference.len().saturating_sub(alternate.len()); - (expected_len == 0 || actual_len == expected_len) - && alternate.len() < row.reference.len() - }) - } - VariantKind::Insertion | VariantKind::Indel => { - row.position == locus.start.saturating_sub(1) - } - VariantKind::Other => row.position == locus.start, - } -} - -fn is_bgzf_path(path: &Path) -> bool { - path.extension() - .and_then(|ext| ext.to_str()) - .is_some_and(|ext| ext.eq_ignore_ascii_case("gz")) -} - -fn read_plain_lines(path: &Path) -> Result, RuntimeError> { - let file = File::open(path).map_err(|err| { - RuntimeError::Io(format!( - "failed to open genotype file {}: {err}", - path.display() - )) - })?; - read_lines_from_reader(BufReader::new(file), path) -} - -fn select_zip_entry(path: &Path) -> Result { - let file = File::open(path).map_err(|err| { - RuntimeError::Io(format!( - "failed to open genotype zip {}: {err}", - path.display() - )) - })?; - let mut archive = ZipArchive::new(file).map_err(|err| { - RuntimeError::Io(format!( - "failed to read genotype zip {}: {err}", - path.display() - )) - })?; - - let mut selected_name: Option = None; - for idx in 0..archive.len() { - let entry = archive.by_index(idx).map_err(|err| { - RuntimeError::Io(format!( - "failed to inspect genotype zip {}: {err}", - path.display() - )) - })?; - if entry.is_dir() { - continue; - } - let name = entry.name().to_owned(); - let lower = name.to_ascii_lowercase(); - if lower.ends_with(".txt") - || lower.ends_with(".csv") - || lower.ends_with(".tsv") - || lower.ends_with(".vcf") - || lower.ends_with(".vcf.gz") - { - return Ok(name); - } - if selected_name.is_none() { - selected_name = Some(name); - } - } - - selected_name.ok_or_else(|| { - RuntimeError::Unsupported(format!( - "zip archive {} does not contain a supported genotype file", - path.display() - )) - }) -} - -fn 
scan_delimited_variants( - backend: &DelimitedBackend, - variants: &[VariantSpec], -) -> Result, RuntimeError> { - let mut indexed: Vec<(usize, &VariantSpec)> = variants.iter().enumerate().collect(); - indexed.sort_by_cached_key(|(_, variant)| variant_sort_key(variant)); - - let mut rsid_targets: HashMap> = HashMap::new(); - let mut coord_targets: HashMap<(String, i64), Vec> = HashMap::new(); - let mut results = vec![VariantObservation::default(); variants.len()]; - let mut unresolved = variants.len(); - - for (idx, variant) in &indexed { - for rsid in &variant.rsids { - rsid_targets.entry(rsid.clone()).or_default().push(*idx); - } - if let Some(locus) = variant.grch38.as_ref().or(variant.grch37.as_ref()) { - coord_targets - .entry(( - locus.chrom.trim_start_matches("chr").to_ascii_lowercase(), - locus.start, - )) - .or_default() - .push(*idx); - } - } - - let mut scan_reader = |reader: &mut dyn BufRead| -> Result<(), RuntimeError> { - let mut probe_lines = Vec::new(); - let mut buf = String::new(); - for _ in 0..8 { - buf.clear(); - let bytes = reader.read_line(&mut buf).map_err(|err| { - RuntimeError::Io(format!( - "failed to read genotype stream {}: {err}", - backend.path.display() - )) - })?; - if bytes == 0 { - break; - } - probe_lines.push(buf.trim_end_matches(['\n', '\r']).to_owned()); - } - - let delimiter = detect_delimiter(&probe_lines); - let mut column_indexes: Option = None; - let mut comment_header: Option> = None; - - let mut process_line = |line: &str| -> Result { - let Some(row) = - parse_streaming_row(line, delimiter, &mut column_indexes, &mut comment_header)? 
- else { - return Ok(unresolved == 0); - }; - - if let Some(rsid) = row.rsid.as_ref() - && let Some(target_indexes) = rsid_targets.get(rsid) - { - for &target_idx in target_indexes { - if results[target_idx].genotype.is_none() { - results[target_idx] = VariantObservation { - backend: backend.backend_name().to_owned(), - matched_rsid: Some(rsid.clone()), - genotype: Some(row.genotype.clone()), - evidence: vec![format!("resolved by rsid {rsid}")], - ..VariantObservation::default() - }; - unresolved = unresolved.saturating_sub(1); - } - } - } - - if unresolved == 0 { - return Ok(true); - } - - if let (Some(chrom), Some(position)) = (row.chrom.as_ref(), row.position) { - let key = ( - chrom.trim_start_matches("chr").to_ascii_lowercase(), - position, - ); - if let Some(target_indexes) = coord_targets.get(&key) { - for &target_idx in target_indexes { - if results[target_idx].genotype.is_none() { - results[target_idx] = VariantObservation { - backend: backend.backend_name().to_owned(), - matched_rsid: row.rsid.clone(), - genotype: Some(row.genotype.clone()), - evidence: vec![format!("resolved by locus {}:{}", chrom, position)], - ..VariantObservation::default() - }; - unresolved = unresolved.saturating_sub(1); - } - } - } - } - Ok(unresolved == 0) - }; - - for line in &probe_lines { - if process_line(line)? { - return Ok(()); - } - } - - loop { - buf.clear(); - let bytes = reader.read_line(&mut buf).map_err(|err| { - RuntimeError::Io(format!( - "failed to read genotype stream {}: {err}", - backend.path.display() - )) - })?; - if bytes == 0 { - break; - } - if process_line(buf.trim_end_matches(['\n', '\r']))? 
{ - break; - } - } - Ok(()) - }; - - match backend.format { - GenotypeSourceFormat::Text => { - let file = File::open(&backend.path).map_err(|err| { - RuntimeError::Io(format!( - "failed to open genotype file {}: {err}", - backend.path.display() - )) - })?; - let mut reader = BufReader::new(file); - scan_reader(&mut reader)?; - } - GenotypeSourceFormat::Zip => { - let entry_name = backend.zip_entry_name.as_ref().ok_or_else(|| { - RuntimeError::Unsupported(format!( - "zip backend missing selected entry for {}", - backend.path.display() - )) - })?; - let file = File::open(&backend.path).map_err(|err| { - RuntimeError::Io(format!( - "failed to open genotype zip {}: {err}", - backend.path.display() - )) - })?; - let mut archive = ZipArchive::new(file).map_err(|err| { - RuntimeError::Io(format!( - "failed to read genotype zip {}: {err}", - backend.path.display() - )) - })?; - let entry = archive.by_name(entry_name).map_err(|err| { - RuntimeError::Io(format!( - "failed to open genotype entry {entry_name} in {}: {err}", - backend.path.display() - )) - })?; - let mut reader = BufReader::new(entry); - scan_reader(&mut reader)?; - } - _ => { - return Err(RuntimeError::Unsupported( - "streaming delimited backend only supports text and zip".to_owned(), - )); - } + + Ok(Self::from_rsid_map(GenotypeSourceFormat::Vcf, values)) } - for (idx, variant) in indexed { - if results[idx].genotype.is_none() { - results[idx] = VariantObservation { - backend: backend.backend_name().to_owned(), - evidence: vec![format!( - "no matching rsid or locus found for {}", - describe_query(variant) - )], - ..VariantObservation::default() - }; + fn from_delimited_lines( + format: GenotypeSourceFormat, + lines: Vec, + ) -> Result { + let delimiter = detect_delimiter(&lines); + let mut parser = RowParser::new(delimiter); + let mut values = HashMap::new(); + for line in lines { + if let Some((rsid, genotype)) = parser.consume_line(&line)? 
{ + values.insert(rsid, genotype); + } } + Ok(Self::from_rsid_map(format, values)) } - Ok(results) -} - -#[derive(Debug, Clone, Copy)] -struct DelimitedColumnIndexes { - rsid: Option, - chrom: Option, - position: Option, - genotype: Option, - allele1: Option, - allele2: Option, -} - -fn parse_streaming_row( - line: &str, - delimiter: Delimiter, - column_indexes: &mut Option, - comment_header: &mut Option>, -) -> Result, RuntimeError> { - let trimmed = line.trim(); - if trimmed.is_empty() { - return Ok(None); + fn from_rsid_map(format: GenotypeSourceFormat, values: HashMap) -> Self { + Self { + backend: QueryBackend::RsidMap(RsidMapBackend { format, values }), + } } - let trimmed = strip_bom(trimmed); - if let Some(prefix) = COMMENT_PREFIXES - .iter() - .find(|prefix| trimmed.starts_with(**prefix)) - { - let candidate = trimmed.trim_start_matches(prefix).trim(); - if !candidate.is_empty() { - let fields = parse_owned_fields(candidate, delimiter); - if looks_like_header_fields(&fields) { - *comment_header = Some(fields); - } + fn from_delimited_file( + path: &Path, + format: GenotypeSourceFormat, + zip_entry_name: Option, + ) -> Self { + Self { + backend: QueryBackend::Delimited(DelimitedBackend { + format, + path: path.to_path_buf(), + zip_entry_name, + }), } - return Ok(None); } - let fields = parse_owned_fields(strip_bom(line), delimiter); - if fields.is_empty() { - return Ok(None); + pub fn capabilities(&self) -> BackendCapabilities { + match &self.backend { + QueryBackend::RsidMap(_) => BackendCapabilities { + rsid_lookup: true, + locus_lookup: false, + }, + QueryBackend::Delimited(_) | QueryBackend::Vcf(_) => BackendCapabilities { + rsid_lookup: true, + locus_lookup: true, + }, + QueryBackend::Cram(_) => BackendCapabilities { + rsid_lookup: false, + locus_lookup: true, + }, + } } - if column_indexes.is_none() { - if looks_like_header_fields(&fields) { - *column_indexes = Some(build_column_indexes(&fields)); - return Ok(None); - } - if let Some(header) = 
comment_header.take() { - *column_indexes = Some(build_column_indexes(&header)); - } else { - *column_indexes = Some(default_column_indexes(fields.len())); + pub fn supports(&self, query: QueryKind) -> bool { + let caps = self.capabilities(); + match query { + QueryKind::GenotypeByRsid => caps.rsid_lookup, + QueryKind::GenotypeByLocus => caps.locus_lookup, } } - let indexes = column_indexes.expect("streaming column indexes initialized"); - let rsid = indexes - .rsid - .and_then(|idx| fields.get(idx)) - .map(|value| strip_inline_comment(value).trim().to_owned()) - .filter(|value| !value.is_empty()); - let chrom = indexes - .chrom - .and_then(|idx| fields.get(idx)) - .map(|value| strip_inline_comment(value).trim().to_owned()) - .filter(|value| !value.is_empty()); - let position = indexes - .position - .and_then(|idx| fields.get(idx)) - .and_then(|value| strip_inline_comment(value).trim().parse::().ok()); - if rsid.is_none() && (chrom.is_none() || position.is_none()) { - return Ok(None); + pub fn backend_name(&self) -> &'static str { + match &self.backend { + QueryBackend::RsidMap(map) => map.backend_name(), + QueryBackend::Delimited(backend) => backend.backend_name(), + QueryBackend::Vcf(backend) => backend.backend_name(), + QueryBackend::Cram(backend) => backend.backend_name(), + } } - let genotype = if let Some(idx) = indexes.genotype { - fields - .get(idx) - .map(|value| strip_inline_comment(value)) - .unwrap_or_default() - .clone() - } else { - let allele1 = indexes - .allele1 - .and_then(|idx| fields.get(idx)) - .map(|value| strip_inline_comment(value)) - .unwrap_or_default(); - let allele2 = indexes - .allele2 - .and_then(|idx| fields.get(idx)) - .map(|value| strip_inline_comment(value)) - .unwrap_or_default(); - format!("{allele1}{allele2}") - }; - - Ok(Some(ParsedDelimitedRow { - rsid, - chrom, - position, - genotype: normalize_genotype(&genotype), - })) -} + pub fn get(&self, rsid: &str) -> Result, RuntimeError> { + match &self.backend { + 
QueryBackend::RsidMap(map) => Ok(map.values.get(rsid).cloned()), + QueryBackend::Delimited(backend) => backend.get(rsid), + QueryBackend::Vcf(backend) => backend.get(rsid), + QueryBackend::Cram(backend) => backend + .lookup_variant(&VariantSpec { + rsids: vec![rsid.to_owned()], + ..VariantSpec::default() + }) + .map(|obs| obs.genotype), + } + } -fn parse_owned_fields(line: &str, delimiter: Delimiter) -> Vec { - match delimiter { - Delimiter::Tab => line - .split('\t') - .map(|field| field.trim().to_owned()) - .collect(), - Delimiter::Space => line.split_whitespace().map(str::to_owned).collect(), - Delimiter::Comma => split_csv_line(line), + pub fn lookup_variant( + &self, + variant: &VariantSpec, + ) -> Result { + match &self.backend { + QueryBackend::RsidMap(map) => map.lookup_variant(variant), + QueryBackend::Delimited(backend) => backend.lookup_variant(variant), + QueryBackend::Vcf(backend) => backend.lookup_variant(variant), + QueryBackend::Cram(backend) => backend.lookup_variant(variant), + } } -} -fn looks_like_header_fields(fields: &[String]) -> bool { - fields - .first() - .is_some_and(|first| RSID_ALIASES.contains(&normalize_name(first).as_str())) -} + pub fn lookup_variants( + &self, + variants: &[VariantSpec], + ) -> Result, RuntimeError> { + if let QueryBackend::Delimited(backend) = &self.backend { + return backend.lookup_variants(variants); + } + if let QueryBackend::Vcf(backend) = &self.backend { + return backend.lookup_variants(variants); + } + let mut indexed: Vec<(usize, &VariantSpec)> = variants.iter().enumerate().collect(); + indexed.sort_by_cached_key(|(_, variant)| variant_sort_key(variant)); -fn build_column_indexes(header: &[String]) -> DelimitedColumnIndexes { - DelimitedColumnIndexes { - rsid: find_header_index(header, RSID_ALIASES), - chrom: find_header_index(header, CHROM_ALIASES), - position: find_header_index(header, POSITION_ALIASES), - genotype: find_header_index(header, GENOTYPE_ALIASES), - allele1: find_header_index(header, 
ALLELE1_ALIASES), - allele2: find_header_index(header, ALLELE2_ALIASES), + let mut results = vec![VariantObservation::default(); variants.len()]; + for (original_idx, variant) in indexed { + results[original_idx] = self.lookup_variant(variant)?; + } + Ok(results) } } -fn default_column_indexes(field_count: usize) -> DelimitedColumnIndexes { - DelimitedColumnIndexes { - rsid: (field_count > 0).then_some(0), - chrom: (field_count > 1).then_some(1), - position: (field_count > 2).then_some(2), - genotype: (field_count > 3).then_some(3), - allele1: None, - allele2: None, +impl RsidMapBackend { + fn backend_name(&self) -> &'static str { + match self.format { + GenotypeSourceFormat::Text => "text", + GenotypeSourceFormat::Zip => "zip", + GenotypeSourceFormat::Vcf => "vcf", + GenotypeSourceFormat::Cram => "cram", + } } -} -fn find_header_index(header: &[String], aliases: &[&str]) -> Option { - header.iter().position(|field| { - aliases - .iter() - .any(|alias| normalize_name(field) == normalize_name(alias)) - }) -} - -fn read_lines_from_reader( - mut reader: R, - path: &Path, -) -> Result, RuntimeError> { - let mut lines = Vec::new(); - let mut buf = String::new(); - loop { - buf.clear(); - let bytes = reader.read_line(&mut buf).map_err(|err| { - RuntimeError::Io(format!( - "failed to read genotype file {}: {err}", - path.display() - )) - })?; - if bytes == 0 { - break; + fn lookup_variant(&self, variant: &VariantSpec) -> Result { + for rsid in &variant.rsids { + if let Some(value) = self.values.get(rsid) { + return Ok(VariantObservation { + backend: self.backend_name().to_owned(), + matched_rsid: Some(rsid.clone()), + genotype: Some(value.clone()), + evidence: vec![format!("resolved by rsid {rsid}")], + ..VariantObservation::default() + }); + } } - lines.push(buf.trim_end_matches(['\n', '\r']).to_owned()); - } - Ok(lines) -} -fn read_zip_entry_limited( - reader: &mut R, - max_bytes: u64, - label: &str, -) -> Result, RuntimeError> { - let mut contents = Vec::new(); - 
reader - .take(max_bytes.saturating_add(1)) - .read_to_end(&mut contents) - .map_err(|err| RuntimeError::Io(format!("failed to read {label}: {err}")))?; - if u64::try_from(contents.len()).unwrap_or(u64::MAX) > max_bytes { - return Err(RuntimeError::InvalidArguments(format!( - "{label} exceeds decompressed limit of {max_bytes} bytes" - ))); + Ok(VariantObservation { + backend: self.backend_name().to_owned(), + evidence: vec!["no matching rsid found".to_owned()], + ..VariantObservation::default() + }) } - Ok(contents) } -fn detect_source_format( - path: &Path, - forced: Option, -) -> Result { - if let Some(format) = forced { - return Ok(format); +impl DelimitedBackend { + fn backend_name(&self) -> &'static str { + match self.format { + GenotypeSourceFormat::Text => "text", + GenotypeSourceFormat::Zip => "zip", + GenotypeSourceFormat::Vcf => "vcf", + GenotypeSourceFormat::Cram => "cram", + } } - let lower = path.to_string_lossy().to_ascii_lowercase(); - if lower.ends_with(".zip") { - return Ok(GenotypeSourceFormat::Zip); - } - if lower.ends_with(".cram") { - return Ok(GenotypeSourceFormat::Cram); - } - if lower.ends_with(".vcf") || lower.ends_with(".vcf.gz") { - return Ok(GenotypeSourceFormat::Vcf); + fn get(&self, rsid: &str) -> Result, RuntimeError> { + let results = self.lookup_variants(&[VariantSpec { + rsids: vec![rsid.to_owned()], + ..VariantSpec::default() + }])?; + Ok(results.into_iter().next().and_then(|obs| obs.genotype)) } - let lines = read_plain_lines(path)?; - if looks_like_vcf_lines(&lines) { - Ok(GenotypeSourceFormat::Vcf) - } else { - Ok(GenotypeSourceFormat::Text) + fn lookup_variant(&self, variant: &VariantSpec) -> Result { + let mut results = self.lookup_variants(std::slice::from_ref(variant))?; + Ok(results.pop().unwrap_or_default()) } -} -fn looks_like_vcf_lines(lines: &[String]) -> bool { - lines.iter().any(|line| { - let trimmed = line.trim_start(); - trimmed.starts_with("##fileformat=VCF") || trimmed.starts_with("#CHROM\t") - }) + fn 
lookup_variants( + &self, + variants: &[VariantSpec], + ) -> Result, RuntimeError> { + scan_delimited_variants(self, variants) + } } -fn genotype_from_vcf_gt(gt: &str, reference: &str, alternates: &[&str]) -> Option { - if matches!(gt.trim(), "" | "." | "./." | ".|.") { - return Some("--".to_owned()); +impl VcfBackend { + fn backend_name(&self) -> &'static str { + "vcf" } - let cleaned = gt.trim().replace('|', "/"); - let parts: Vec<&str> = cleaned.split('/').collect(); - if parts.len() != 2 || parts.contains(&".") { - return Some("--".to_owned()); + fn get(&self, rsid: &str) -> Result, RuntimeError> { + let results = self.lookup_variants(&[VariantSpec { + rsids: vec![rsid.to_owned()], + ..VariantSpec::default() + }])?; + Ok(results.into_iter().next().and_then(|obs| obs.genotype)) } - let ref_token = vcf_reference_token(reference, alternates); - let mut out = String::new(); - for part in parts { - let idx = part.parse::().ok()?; - if idx == 0 { - out.push_str(&ref_token); - } else { - let alt = alternates.get(idx - 1)?; - out.push_str(&vcf_alt_token(reference, alt)); - } + fn lookup_variant(&self, variant: &VariantSpec) -> Result { + let mut results = self.lookup_variants(std::slice::from_ref(variant))?; + Ok(results.pop().unwrap_or_default()) } - Some(normalize_genotype(&out)) -} - -fn vcf_reference_token(reference: &str, alternates: &[&str]) -> String { - let mut saw_shorter = false; - let mut saw_longer = false; - - for alt in alternates { - if is_symbolic_vcf_alt(alt) { - continue; - } - match alt.len().cmp(&reference.len()) { - std::cmp::Ordering::Less => saw_shorter = true, - std::cmp::Ordering::Greater => saw_longer = true, - std::cmp::Ordering::Equal => {} + fn lookup_variants( + &self, + variants: &[VariantSpec], + ) -> Result, RuntimeError> { + if let Some(results) = lookup_indexed_vcf_variants(self, variants)? 
{ + return Ok(results); } + scan_vcf_variants(self, variants) } - - match (saw_shorter, saw_longer) { - (true, false) => "I".to_owned(), - (false, true) => "D".to_owned(), - _ => normalize_sequence_token(reference), - } -} - -fn vcf_alt_token(reference: &str, alternate: &str) -> String { - if is_symbolic_vcf_alt(alternate) { - return "--".to_owned(); - } - match alternate.len().cmp(&reference.len()) { - std::cmp::Ordering::Less => "D".to_owned(), - std::cmp::Ordering::Greater => "I".to_owned(), - std::cmp::Ordering::Equal => normalize_sequence_token(alternate), - } -} - -fn is_symbolic_vcf_alt(alternate: &str) -> bool { - let trimmed = alternate.trim(); - trimmed.starts_with('<') && trimmed.ends_with('>') -} - -fn normalize_sequence_token(value: &str) -> String { - value.trim().to_ascii_uppercase() } #[cfg(test)] @@ -2670,12 +448,17 @@ mod tests { use std::{ fs, io::Write, + path::PathBuf, + str::FromStr, time::{SystemTime, UNIX_EPOCH}, }; + use noodles::bgzf; + use noodles::csi; use zip::write::SimpleFileOptions; - use crate::alignment::AlignmentOp; + use crate::alignment::{AlignmentOp, AlignmentOpKind, AlignmentRecord}; + use crate::genotype::{io::read_plain_lines, vcf::detect_vcf_assembly_from_path}; fn temp_dir(label: &str) -> PathBuf { let nanos = SystemTime::now() @@ -3117,6 +900,16 @@ mod tests { detect_source_format(&text, None).unwrap(), GenotypeSourceFormat::Text )); + let vcf_like = dir.join("sample.dat"); + fs::write(&vcf_like, "##fileformat=VCFv4.3\n#CHROM\tPOS\n").unwrap(); + assert!(matches!( + detect_source_format(&vcf_like, None).unwrap(), + GenotypeSourceFormat::Vcf + )); + assert!(is_bgzf_path(Path::new("sample.VCF.GZ"))); + assert!(!is_bgzf_path(Path::new("sample.vcf"))); + let missing = read_plain_lines(&dir.join("missing.txt")).unwrap_err(); + assert!(missing.to_string().contains("failed to open genotype file")); assert!(matches!( detect_source_format(&text, Some(GenotypeSourceFormat::Cram)).unwrap(), GenotypeSourceFormat::Cram @@ -3177,6 
+970,30 @@ mod tests { let zip_backend = GenotypeStore::from_file(&zip_path).unwrap(); assert_eq!(zip_backend.get("rs3").unwrap().as_deref(), Some("GG")); + let fallback_zip = dir.join("fallback.zip"); + let cursor = std::io::Cursor::new(Vec::new()); + let mut writer = zip::ZipWriter::new(cursor); + writer + .add_directory("nested/", SimpleFileOptions::default()) + .unwrap(); + writer + .start_file("nested/notes.bin", SimpleFileOptions::default()) + .unwrap(); + writer.write_all(b"notes\n").unwrap(); + let bytes = writer.finish().unwrap().into_inner(); + fs::write(&fallback_zip, bytes).unwrap(); + assert_eq!(select_zip_entry(&fallback_zip).unwrap(), "nested/notes.bin"); + + let empty_zip = dir.join("empty.zip"); + let cursor = std::io::Cursor::new(Vec::new()); + let writer = zip::ZipWriter::new(cursor); + fs::write(&empty_zip, writer.finish().unwrap().into_inner()).unwrap(); + let err = select_zip_entry(&empty_zip).unwrap_err(); + assert!( + err.to_string() + .contains("does not contain a supported genotype file") + ); + let unsupported_backend = DelimitedBackend { format: GenotypeSourceFormat::Vcf, path: text, @@ -3330,12 +1147,77 @@ mod tests { let err = scan_vcf_variants( &VcfBackend { path: dir.join("missing.vcf"), + options: GenotypeLoadOptions::default(), }, &[VariantSpec::default()], ) .unwrap_err(); assert!(err.to_string().contains("failed to open VCF file")); + assert_eq!( + detect_vcf_assembly_from_path(&vcf_path).unwrap(), + Some(Assembly::Grch38) + ); + + let no_index = VcfBackend { + path: vcf_path.clone(), + options: GenotypeLoadOptions::default(), + }; + assert!( + lookup_indexed_vcf_variants(&no_index, &[VariantSpec::default()]) + .unwrap() + .is_none() + ); + + let indexed = VcfBackend { + path: vcf_path.clone(), + options: GenotypeLoadOptions { + input_index: Some(dir.join("missing.tbi")), + ..GenotypeLoadOptions::default() + }, + }; + for variant in [ + VariantSpec::default(), + VariantSpec { + grch38: Some(locus("1", 10, 10)), + alternate: 
Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + VariantSpec { + grch38: Some(locus("1", 10, 10)), + reference: Some("AT".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + VariantSpec { + grch38: Some(locus("1", 10, 10)), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Deletion), + ..VariantSpec::default() + }, + ] { + assert!( + lookup_indexed_vcf_variants(&indexed, &[variant]) + .unwrap() + .is_none() + ); + } + let err = lookup_indexed_vcf_variants( + &indexed, + &[VariantSpec { + grch38: Some(locus("1", 10, 10)), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }], + ) + .unwrap_err(); + assert!(err.to_string().contains("failed to read VCF index")); + let bad_zip_backend = DelimitedBackend { format: GenotypeSourceFormat::Zip, path: zip_path.clone(), @@ -3359,6 +1241,15 @@ mod tests { err.to_string().contains("failed to open genotype entry"), "{err}" ); + + let missing_text_backend = DelimitedBackend { + format: GenotypeSourceFormat::Text, + path: dir.join("missing.txt"), + zip_entry_name: None, + }; + let err = + scan_delimited_variants(&missing_text_backend, &[VariantSpec::default()]).unwrap_err(); + assert!(err.to_string().contains("failed to open genotype file")); } #[test] diff --git a/rust/bioscript-formats/src/genotype/common.rs b/rust/bioscript-formats/src/genotype/common.rs new file mode 100644 index 0000000..95b1338 --- /dev/null +++ b/rust/bioscript-formats/src/genotype/common.rs @@ -0,0 +1,65 @@ +use bioscript_core::VariantSpec; + +pub(crate) fn describe_query(variant: &VariantSpec) -> &'static str { + if variant.has_coordinates() { + "variant_by_locus" + } else { + "variant_by_rsid" + } +} + +pub(crate) fn variant_sort_key(variant: &VariantSpec) -> (u8, String, i64, i64, String) { + if let Some(locus) = &variant.grch38 { 
+ return ( + 0, + chrom_sort_key(&locus.chrom), + locus.start, + locus.end, + variant.rsids.first().cloned().unwrap_or_default(), + ); + } + if let Some(locus) = &variant.grch37 { + return ( + 1, + chrom_sort_key(&locus.chrom), + locus.start, + locus.end, + variant.rsids.first().cloned().unwrap_or_default(), + ); + } + ( + 2, + "~".to_owned(), + i64::MAX, + i64::MAX, + variant.rsids.first().cloned().unwrap_or_default(), + ) +} + +pub(crate) fn chrom_sort_key(raw: &str) -> String { + let chrom = raw.trim().strip_prefix("chr").unwrap_or(raw.trim()); + if let Ok(value) = chrom.parse::() { + return format!("{value:03}"); + } + match chrom.to_ascii_uppercase().as_str() { + "X" => "023".to_owned(), + "Y" => "024".to_owned(), + "M" | "MT" => "025".to_owned(), + other => format!("999-{other}"), + } +} + +pub(crate) fn normalize_genotype(value: &str) -> String { + let cleaned = value.trim().replace(' ', "").to_ascii_uppercase(); + if cleaned.is_empty() || matches!(cleaned.as_str(), "NA" | "N/A" | "#N/A" | "NONE") { + return "--".to_owned(); + } + if cleaned.contains('/') { + let parts: Vec<&str> = cleaned.split('/').collect(); + if parts.iter().any(|part| part.is_empty() || *part == "-") { + return "ID".to_owned(); + } + return parts.concat(); + } + cleaned +} diff --git a/rust/bioscript-formats/src/genotype/cram_backend.rs b/rust/bioscript-formats/src/genotype/cram_backend.rs new file mode 100644 index 0000000..383bb46 --- /dev/null +++ b/rust/bioscript-formats/src/genotype/cram_backend.rs @@ -0,0 +1,422 @@ +use std::{ + collections::BTreeMap, + io::{Read, Seek}, + path::Path, +}; + +use noodles::core::Position; +use noodles::cram; +use noodles::sam::alignment::{ + Record as _, + record::{Cigar as _, QualityScores as _, Sequence as _, cigar::op::Kind as CigarOpKind}, +}; + +use bioscript_core::{Assembly, GenomicLocus, RuntimeError, VariantSpec}; + +use crate::alignment; + +use super::GenotypeLoadOptions; + +mod indel; +mod reader; +mod store; + +#[cfg(test)] +pub(crate) 
use indel::len_as_i64; +pub(crate) use indel::{ + classify_expected_indel, indel_at_anchor, record_overlaps_locus, spans_position, +}; +pub use reader::{observe_cram_indel_with_reader, observe_cram_snp_with_reader}; + +const DEFAULT_MPILEUP_MIN_BASE_QUALITY: u8 = 13; +const DEFAULT_MPILEUP_MIN_MAPPING_QUALITY: u8 = 0; + +pub(crate) fn choose_variant_locus( + variant: &VariantSpec, + reference_file: &Path, +) -> Option<(Assembly, GenomicLocus)> { + match detect_reference_assembly(reference_file) { + Some(Assembly::Grch38) => variant + .grch38 + .clone() + .map(|locus| (Assembly::Grch38, locus)) + .or_else(|| { + variant + .grch37 + .clone() + .map(|locus| (Assembly::Grch37, locus)) + }), + Some(Assembly::Grch37) => variant + .grch37 + .clone() + .map(|locus| (Assembly::Grch37, locus)) + .or_else(|| { + variant + .grch38 + .clone() + .map(|locus| (Assembly::Grch38, locus)) + }), + None => variant + .grch38 + .clone() + .map(|locus| (Assembly::Grch38, locus)) + .or_else(|| { + variant + .grch37 + .clone() + .map(|locus| (Assembly::Grch37, locus)) + }), + } +} + +pub(crate) fn detect_reference_assembly(reference_file: &Path) -> Option { + let lower = reference_file.to_string_lossy().to_ascii_lowercase(); + if lower.contains("grch38") || lower.contains("hg38") || lower.contains("assembly38") { + Some(Assembly::Grch38) + } else if lower.contains("grch37") || lower.contains("hg19") || lower.contains("assembly37") { + Some(Assembly::Grch37) + } else { + None + } +} + +pub(crate) fn describe_locus(locus: &GenomicLocus) -> String { + format!("{}:{}-{}", locus.chrom, locus.start, locus.end) +} + +pub(crate) fn anchor_window(locus: &GenomicLocus) -> GenomicLocus { + let anchor = locus.start.saturating_sub(1); + GenomicLocus { + chrom: locus.chrom.clone(), + start: anchor, + end: anchor, + } +} + +pub(crate) fn first_base(value: &str) -> Option { + value + .trim() + .chars() + .next() + .map(|ch| ch.to_ascii_uppercase()) +} + +pub(crate) fn infer_snp_genotype( + reference: 
char, + alternate: char, + ref_count: u32, + alt_count: u32, + depth: u32, +) -> Option { + if depth == 0 || ref_count + alt_count == 0 { + return None; + } + let alt_fraction = f64::from(alt_count) / f64::from(depth); + if alt_fraction >= 0.8 { + Some(format!("{alternate}{alternate}")) + } else if alt_fraction <= 0.2 { + Some(format!("{reference}{reference}")) + } else { + Some(format!("{reference}{alternate}")) + } +} + +pub(crate) fn describe_snp_decision_rule( + reference: char, + alternate: char, + ref_count: u32, + alt_count: u32, + depth: u32, +) -> String { + if depth == 0 { + return format!( + "no covering reads for SNP; genotype unresolved (ref={reference}, alt={alternate})" + ); + } + if ref_count + alt_count == 0 { + return format!( + "no reads matched the declared SNP alleles; genotype unresolved; counts ref={ref_count} alt={alt_count} depth={depth} for {reference}>{alternate}" + ); + } + + let alt_fraction = f64::from(alt_count) / f64::from(depth); + format!( + "SNP genotype rule: alt_fraction={alt_fraction:.3} with thresholds ref<=0.200, het=(0.200,0.800), alt>=0.800; counts ref={ref_count} alt={alt_count} depth={depth} for {reference}>{alternate}" + ) +} + +pub(crate) fn infer_copy_number_genotype( + reference: &str, + alternate: &str, + _ref_count: u32, + alt_count: u32, + depth: u32, +) -> Option { + if depth == 0 { + return None; + } + let alt_fraction = f64::from(alt_count) / f64::from(depth); + if alt_fraction >= 0.8 { + Some(format!("{alternate}{alternate}")) + } else if alt_fraction <= 0.2 { + Some(format!("{reference}{reference}")) + } else { + Some(format!("{reference}{alternate}")) + } +} + +pub(crate) fn describe_copy_number_decision_rule( + reference: &str, + alternate: &str, + ref_count: u32, + alt_count: u32, + depth: u32, +) -> String { + if depth == 0 { + return format!( + "no covering reads for copy-number style variant; genotype unresolved (ref={reference}, alt={alternate})" + ); + } + + let alt_fraction = f64::from(alt_count) / 
f64::from(depth); + format!( + "copy-number genotype rule: alt_fraction={alt_fraction:.3} with thresholds ref<=0.200, het=(0.200,0.800), alt>=0.800; counts ref={ref_count} alt={alt_count} depth={depth} for {reference}->{alternate}" + ) +} + +#[derive(Debug, Clone, Default)] +pub(crate) struct SnpPileupCounts { + pub(crate) filtered_depth: u32, + pub(crate) filtered_ref_count: u32, + pub(crate) filtered_alt_count: u32, + pub(crate) filtered_base_counts: BTreeMap, + pub(crate) raw_depth: u32, + pub(crate) raw_ref_count: u32, + pub(crate) raw_alt_count: u32, + pub(crate) raw_base_counts: BTreeMap, + pub(crate) filtered_low_base_quality: u32, + pub(crate) filtered_low_mapping_quality: u32, + pub(crate) filtered_non_acgt: u32, + pub(crate) filtered_unmapped: u32, + pub(crate) filtered_secondary: u32, + pub(crate) filtered_qc_fail: u32, + pub(crate) filtered_duplicate: u32, + pub(crate) filtered_improper_pair: u32, + pub(crate) raw_forward_counts: BTreeMap, + pub(crate) raw_reverse_counts: BTreeMap, +} + +impl SnpPileupCounts { + pub(crate) fn evidence_lines(&self, locus: &str, target_pos: i64) -> Vec { + vec![ + format!( + "observed SNP pileup at {locus} target_pos={target_pos} filtered_depth={} ref_count={} alt_count={}", + self.filtered_depth, self.filtered_ref_count, self.filtered_alt_count + ), + format!( + "raw pileup depth={} ref_count={} alt_count={} raw_counts={:?}", + self.raw_depth, self.raw_ref_count, self.raw_alt_count, self.raw_base_counts + ), + format!( + "raw strand counts: forward={:?} reverse={:?}", + self.raw_forward_counts, self.raw_reverse_counts + ), + format!( + "filters applied: min_base_quality={} min_mapping_quality={} filtered_low_base_quality={} filtered_low_mapping_quality={} filtered_non_acgt={} filtered_unmapped={} filtered_secondary={} filtered_qc_fail={} filtered_duplicate={} filtered_improper_pair={}", + DEFAULT_MPILEUP_MIN_BASE_QUALITY, + DEFAULT_MPILEUP_MIN_MAPPING_QUALITY, + self.filtered_low_base_quality, + 
self.filtered_low_mapping_quality, + self.filtered_non_acgt, + self.filtered_unmapped, + self.filtered_secondary, + self.filtered_qc_fail, + self.filtered_duplicate, + self.filtered_improper_pair + ), + ] + } +} + +fn observe_snp_pileup( + cram_path: &Path, + options: &GenotypeLoadOptions, + reference_file: &Path, + locus: &GenomicLocus, + reference: char, + alternate: char, +) -> Result { + let repository = alignment::build_reference_repository(reference_file)?; + let mut reader = + alignment::build_cram_indexed_reader_from_path(cram_path, options, repository)?; + let label = cram_path.display().to_string(); + snp_pileup_with_reader( + &mut reader, + &label, + locus, + reference, + alternate, + options.allow_reference_md5_mismatch, + ) +} + +fn snp_pileup_with_reader( + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + locus: &GenomicLocus, + reference: char, + alternate: char, + allow_reference_md5_mismatch: bool, +) -> Result { + let mut counts = SnpPileupCounts::default(); + let target_position = Position::try_from(usize::try_from(locus.start).map_err(|_| { + RuntimeError::InvalidArguments("SNP locus start is out of range".to_owned()) + })?) 
+ .map_err(|_| RuntimeError::InvalidArguments("SNP locus start is out of range".to_owned()))?; + let reference_base = reference as u8; + + alignment::for_each_raw_cram_record_with_reader_inner( + reader, + label, + locus, + allow_reference_md5_mismatch, + |record| { + let flags = record + .flags() + .map_err(|err| RuntimeError::Io(format!("failed to read CRAM flags: {err}")))?; + if flags.is_unmapped() { + counts.filtered_unmapped += 1; + return Ok(true); + } + if flags.is_secondary() { + counts.filtered_secondary += 1; + return Ok(true); + } + if flags.is_qc_fail() { + counts.filtered_qc_fail += 1; + return Ok(true); + } + if flags.is_duplicate() { + counts.filtered_duplicate += 1; + return Ok(true); + } + if flags.is_segmented() && !flags.is_properly_segmented() { + counts.filtered_improper_pair += 1; + return Ok(true); + } + + let Some((base, base_quality)) = + cram_base_quality_at_reference_position(&record, target_position, reference_base)? + else { + return Ok(true); + }; + + let normalized_base = normalize_pileup_base(base); + record.mapping_quality().transpose().map_err(|err| { + RuntimeError::Io(format!("failed to read CRAM mapping quality: {err}")) + })?; + let is_reverse = flags.is_reverse_complemented(); + if let Some(base) = normalized_base { + counts.raw_depth += 1; + *counts.raw_base_counts.entry(base.to_string()).or_insert(0) += 1; + let strand_counts = if is_reverse { + &mut counts.raw_reverse_counts + } else { + &mut counts.raw_forward_counts + }; + *strand_counts.entry(base.to_string()).or_insert(0) += 1; + if base == reference { + counts.raw_ref_count += 1; + } else if base == alternate { + counts.raw_alt_count += 1; + } + } + + if base_quality < DEFAULT_MPILEUP_MIN_BASE_QUALITY { + counts.filtered_low_base_quality += 1; + return Ok(true); + } + + let Some(base) = normalized_base else { + counts.filtered_non_acgt += 1; + return Ok(true); + }; + + counts.filtered_depth += 1; + *counts + .filtered_base_counts + .entry(base.to_string()) + 
.or_insert(0) += 1;
+            if base == reference {
+                counts.filtered_ref_count += 1;
+            } else if base == alternate {
+                counts.filtered_alt_count += 1;
+            }
+            Ok(true)
+        },
+    )?;
+
+    Ok(counts)
+}
+
+fn cram_base_quality_at_reference_position(
+    record: &cram::Record<'_>,
+    target_position: Position,
+    reference_base: u8,
+) -> Result<Option<(u8, u8)>, RuntimeError> {
+    let Some(alignment_start) = record.alignment_start() else {
+        return Ok(None);
+    };
+    let alignment_start = alignment_start
+        .map_err(|err| RuntimeError::Io(format!("failed to read CRAM alignment start: {err}")))?;
+    let mut reference_position = usize::from(alignment_start);
+    let target = usize::from(target_position);
+    let mut read_position = 0usize;
+    let sequence = record.sequence();
+    let qualities = record.quality_scores();
+
+    for op in record.cigar().iter() {
+        let op = op.map_err(|err| RuntimeError::Io(format!("failed to read CRAM CIGAR: {err}")))?;
+        match op.kind() {
+            CigarOpKind::Match | CigarOpKind::SequenceMatch | CigarOpKind::SequenceMismatch => {
+                for offset in 0..op.len() {
+                    if reference_position + offset == target {
+                        let base = sequence
+                            .get(read_position + offset)
+                            .unwrap_or(reference_base);
+                        let quality = qualities
+                            .iter()
+                            .nth(read_position + offset)
+                            .transpose()
+                            .map_err(|err| {
+                                RuntimeError::Io(format!("failed to read CRAM base quality: {err}"))
+                            })?
+                            .unwrap_or(0);
+                        return Ok(Some((base, quality)));
+                    }
+                }
+                reference_position += op.len();
+                read_position += op.len();
+            }
+            CigarOpKind::Insertion | CigarOpKind::SoftClip => {
+                read_position += op.len();
+            }
+            CigarOpKind::Deletion | CigarOpKind::Skip => {
+                if target >= reference_position && target < reference_position + op.len() {
+                    return Ok(None);
+                }
+                reference_position += op.len();
+            }
+            CigarOpKind::HardClip | CigarOpKind::Pad => {}
+        }
+    }
+
+    Ok(None)
+}
+
+pub(crate) fn normalize_pileup_base(base: u8) -> Option<char> {
+    match (base as char).to_ascii_uppercase() {
+        'A' | 'C' | 'G' | 'T' => Some((base as char).to_ascii_uppercase()),
+        _ => None,
+    }
+}
diff --git a/rust/bioscript-formats/src/genotype/cram_backend/indel.rs b/rust/bioscript-formats/src/genotype/cram_backend/indel.rs
new file mode 100644
index 0000000..2d8752e
--- /dev/null
+++ b/rust/bioscript-formats/src/genotype/cram_backend/indel.rs
@@ -0,0 +1,104 @@
+use bioscript_core::{GenomicLocus, RuntimeError};
+
+use crate::alignment::{AlignmentOpKind, AlignmentRecord};
+
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct IndelClassification {
+    pub(crate) covering: bool,
+    pub(crate) reference_like: bool,
+    pub(crate) matches_alt: bool,
+    pub(crate) observed_len: usize,
+}
+
+pub(crate) fn len_as_i64(len: usize) -> Option<i64> {
+    i64::try_from(len).ok()
+}
+
+pub(crate) fn spans_position(record: &AlignmentRecord, pos: i64) -> bool {
+    pos >= record.start.saturating_sub(1) && pos <= record.end
+}
+
+pub(crate) fn record_overlaps_locus(record: &AlignmentRecord, locus: &GenomicLocus) -> bool {
+    record.end >= locus.start && record.start <= locus.end
+}
+
+pub(crate) fn indel_at_anchor(
+    record: &AlignmentRecord,
+    anchor_pos: i64,
+) -> Option<(AlignmentOpKind, usize)> {
+    let mut ref_pos = record.start;
+
+    for op in &record.cigar {
+        match op.kind {
+            AlignmentOpKind::Match
+            | AlignmentOpKind::SequenceMatch
+            | AlignmentOpKind::SequenceMismatch
+            | AlignmentOpKind::Skip => {
+                ref_pos +=
len_as_i64(op.len)?; + } + AlignmentOpKind::Insertion => { + let anchor = ref_pos.saturating_sub(1); + if anchor == anchor_pos { + return Some((AlignmentOpKind::Insertion, op.len)); + } + } + AlignmentOpKind::Deletion => { + let anchor = ref_pos.saturating_sub(1); + if anchor == anchor_pos { + return Some((AlignmentOpKind::Deletion, op.len)); + } + ref_pos += len_as_i64(op.len)?; + } + AlignmentOpKind::SoftClip | AlignmentOpKind::HardClip | AlignmentOpKind::Pad => {} + } + } + + None +} + +pub(crate) fn classify_expected_indel( + record: &AlignmentRecord, + locus: &GenomicLocus, + reference_len: usize, + alternate: &str, +) -> Result { + let alt_len = alternate.len(); + let anchor_start = locus.start.saturating_sub(1); + let anchor_end = locus.end; + + let covering = record.start <= locus.start && record.end >= locus.end; + if !covering { + return Ok(IndelClassification { + covering: false, + reference_like: false, + matches_alt: false, + observed_len: reference_len, + }); + } + + let mut observed_len = reference_len; + + for anchor in anchor_start..=anchor_end { + if let Some((kind, len)) = indel_at_anchor(record, anchor) { + observed_len = match kind { + AlignmentOpKind::Insertion => reference_len + len, + AlignmentOpKind::Deletion => reference_len.saturating_sub(len), + _ => reference_len, + }; + + return Ok(IndelClassification { + covering: true, + reference_like: false, + matches_alt: observed_len == alt_len, + observed_len, + }); + } + } + + Ok(IndelClassification { + covering: true, + reference_like: true, + matches_alt: false, + observed_len, + }) +} diff --git a/rust/bioscript-formats/src/genotype/cram_backend/reader.rs b/rust/bioscript-formats/src/genotype/cram_backend/reader.rs new file mode 100644 index 0000000..82524da --- /dev/null +++ b/rust/bioscript-formats/src/genotype/cram_backend/reader.rs @@ -0,0 +1,122 @@ +use std::{ + collections::{BTreeMap, BTreeSet}, + io::{Read, Seek}, +}; + +use noodles::cram; + +use bioscript_core::{Assembly, 
GenomicLocus, RuntimeError, VariantObservation}; + +use crate::alignment; + +use super::{ + classify_expected_indel, describe_copy_number_decision_rule, describe_locus, + describe_snp_decision_rule, infer_copy_number_genotype, infer_snp_genotype, + record_overlaps_locus, snp_pileup_with_reader, +}; + +/// Observe a SNP at `locus` over an already-built CRAM `IndexedReader` and +/// reference repository (held by the reader). Mirrors the internal +/// `CramBackend::observe_snp` but reader-based, so non-filesystem callers +/// (e.g. wasm with a JS-backed reader) don't need a `GenotypeStore` or paths. +/// +/// `matched_rsid` and `assembly` are passed through to the returned +/// observation unchanged — callers that already know them (e.g. from +/// compiling a YAML variant) should supply them; otherwise `None`. +pub fn observe_cram_snp_with_reader( + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + locus: &GenomicLocus, + reference: char, + alternate: char, + matched_rsid: Option, + assembly: Option, +) -> Result { + let pileup = snp_pileup_with_reader(reader, label, locus, reference, alternate, false)?; + let ref_count = pileup.filtered_ref_count; + let alt_count = pileup.filtered_alt_count; + let depth = pileup.filtered_depth; + let evidence = pileup.evidence_lines(&describe_locus(locus), locus.start); + + Ok(VariantObservation { + backend: "cram".to_owned(), + matched_rsid, + assembly, + genotype: infer_snp_genotype(reference, alternate, ref_count, alt_count, depth), + ref_count: Some(ref_count), + alt_count: Some(alt_count), + depth: Some(depth), + raw_counts: pileup.raw_base_counts, + decision: Some(describe_snp_decision_rule( + reference, alternate, ref_count, alt_count, depth, + )), + evidence, + }) +} + +/// Observe an insertion/indel-like variant at `locus` over an already-built +/// CRAM `IndexedReader`. 
+pub fn observe_cram_indel_with_reader( + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + locus: &GenomicLocus, + reference: &str, + alternate: &str, + matched_rsid: Option, + assembly: Option, +) -> Result { + let mut alt_count = 0u32; + let mut ref_count = 0u32; + let mut depth = 0u32; + let mut matching_alt_lengths = BTreeSet::new(); + + alignment::for_each_cram_record_with_reader(reader, label, locus, |record| { + if record.is_unmapped || !record_overlaps_locus(&record, locus) { + return Ok(true); + } + let classification = classify_expected_indel(&record, locus, reference.len(), alternate)?; + if !classification.covering { + return Ok(true); + } + depth += 1; + if classification.matches_alt { + alt_count += 1; + matching_alt_lengths.insert(classification.observed_len); + } else if classification.reference_like { + ref_count += 1; + } + Ok(true) + })?; + + let evidence_label = if matching_alt_lengths.is_empty() { + "none".to_owned() + } else { + matching_alt_lengths + .into_iter() + .map(|len| len.to_string()) + .collect::>() + .join(",") + }; + + Ok(VariantObservation { + backend: "cram".to_owned(), + matched_rsid, + assembly, + genotype: infer_copy_number_genotype(reference, alternate, ref_count, alt_count, depth), + ref_count: Some(ref_count), + alt_count: Some(alt_count), + depth: Some(depth), + raw_counts: BTreeMap::new(), + decision: Some(describe_copy_number_decision_rule( + reference, alternate, ref_count, alt_count, depth, + )), + evidence: vec![format!( + "observed indel at {} depth={} ref_count={} alt_count={} matching_alt_lengths={}", + describe_locus(locus), + depth, + ref_count, + alt_count, + evidence_label + )], + }) +} diff --git a/rust/bioscript-formats/src/genotype/cram_backend/store.rs b/rust/bioscript-formats/src/genotype/cram_backend/store.rs new file mode 100644 index 0000000..2b6a27b --- /dev/null +++ b/rust/bioscript-formats/src/genotype/cram_backend/store.rs @@ -0,0 +1,266 @@ +use std::{ + collections::{BTreeMap, 
BTreeSet}, + fmt::Write as _, + path::Path, +}; + +use bioscript_core::{ + Assembly, GenomicLocus, RuntimeError, VariantKind, VariantObservation, VariantSpec, +}; + +use crate::alignment::{self, AlignmentOpKind}; + +use super::{ + anchor_window, choose_variant_locus, classify_expected_indel, + describe_copy_number_decision_rule, describe_locus, describe_snp_decision_rule, first_base, + indel_at_anchor, infer_copy_number_genotype, infer_snp_genotype, observe_snp_pileup, + record_overlaps_locus, spans_position, +}; +use crate::genotype::{describe_query, types::CramBackend}; + +impl CramBackend { + pub(crate) fn backend_name(&self) -> &'static str { + "cram" + } + + pub(crate) fn lookup_variant( + &self, + variant: &VariantSpec, + ) -> Result { + let Some(reference_file) = self.options.reference_file.as_ref() else { + return Err(RuntimeError::Unsupported(format!( + "backend '{}' cannot satisfy query '{}' for {} without --reference-file", + self.backend_name(), + describe_query(variant), + self.path.display() + ))); + }; + + let Some((assembly, locus)) = choose_variant_locus(variant, reference_file) else { + let mut detail = format!( + "backend '{}' cannot satisfy query '{}' for {} using reference {}", + self.backend_name(), + describe_query(variant), + self.path.display(), + reference_file.display() + ); + detail.push_str(". 
This backend needs GRCh37/GRCh38 coordinates, not only rsIDs"); + if let Some(reference_index) = self.options.reference_index.as_ref() { + let _ = write!(detail, " (reference index {})", reference_index.display()); + } + if let Some(input_index) = self.options.input_index.as_ref() { + let _ = write!(detail, " (input index {})", input_index.display()); + } + return Err(RuntimeError::Unsupported(detail)); + }; + + let observation = match variant.kind.unwrap_or(VariantKind::Other) { + VariantKind::Snp => self.observe_snp(variant, assembly, &locus, reference_file)?, + VariantKind::Deletion => { + self.observe_deletion(variant, assembly, &locus, reference_file)? + } + VariantKind::Insertion | VariantKind::Indel => { + self.observe_indel(variant, assembly, &locus, reference_file)? + } + VariantKind::Other => { + return Err(RuntimeError::Unsupported(format!( + "backend '{}' does not yet support {:?} observation for {}", + self.backend_name(), + variant.kind.unwrap_or(VariantKind::Other), + self.path.display() + ))); + } + }; + + Ok(observation) + } + + fn observe_snp( + &self, + variant: &VariantSpec, + assembly: Assembly, + locus: &GenomicLocus, + reference_file: &Path, + ) -> Result { + let reference = variant + .reference + .as_deref() + .and_then(first_base) + .ok_or_else(|| { + RuntimeError::InvalidArguments("SNP variant requires ref/reference".to_owned()) + })?; + let alternate = variant + .alternate + .as_deref() + .and_then(first_base) + .ok_or_else(|| { + RuntimeError::InvalidArguments("SNP variant requires alt/alternate".to_owned()) + })?; + + let target_pos = locus.start; + let pileup = observe_snp_pileup( + &self.path, + &self.options, + reference_file, + locus, + reference, + alternate, + )?; + let ref_count = pileup.filtered_ref_count; + let alt_count = pileup.filtered_alt_count; + let depth = pileup.filtered_depth; + + let evidence = pileup.evidence_lines(&describe_locus(locus), target_pos); + + Ok(VariantObservation { + backend: 
self.backend_name().to_owned(), + matched_rsid: variant.rsids.first().cloned(), + assembly: Some(assembly), + genotype: infer_snp_genotype(reference, alternate, ref_count, alt_count, depth), + ref_count: Some(ref_count), + alt_count: Some(alt_count), + depth: Some(depth), + raw_counts: pileup.raw_base_counts, + decision: Some(describe_snp_decision_rule( + reference, alternate, ref_count, alt_count, depth, + )), + evidence, + }) + } + + fn observe_deletion( + &self, + variant: &VariantSpec, + assembly: Assembly, + locus: &GenomicLocus, + reference_file: &Path, + ) -> Result { + let deletion_length = variant.deletion_length.ok_or_else(|| { + RuntimeError::InvalidArguments("deletion variant requires deletion_length".to_owned()) + })?; + let reference = variant.reference.clone().unwrap_or_else(|| "I".to_owned()); + let alternate = variant.alternate.clone().unwrap_or_else(|| "D".to_owned()); + let anchor_pos = locus.start.saturating_sub(1); + + let mut alt_count = 0u32; + let mut ref_count = 0u32; + let mut depth = 0u32; + + alignment::for_each_cram_record( + &self.path, + &self.options, + reference_file, + &anchor_window(locus), + |record| { + if record.is_unmapped || !spans_position(&record, anchor_pos) { + return Ok(true); + } + depth += 1; + match indel_at_anchor(&record, anchor_pos) { + Some((AlignmentOpKind::Deletion, len)) if len == deletion_length => { + alt_count += 1; + } + _ => ref_count += 1, + } + Ok(true) + }, + )?; + + Ok(VariantObservation { + backend: self.backend_name().to_owned(), + matched_rsid: variant.rsids.first().cloned(), + assembly: Some(assembly), + genotype: infer_copy_number_genotype( + &reference, &alternate, ref_count, alt_count, depth, + ), + ref_count: Some(ref_count), + alt_count: Some(alt_count), + depth: Some(depth), + raw_counts: BTreeMap::new(), + decision: Some(describe_copy_number_decision_rule( + &reference, &alternate, ref_count, alt_count, depth, + )), + evidence: vec![format!( + "observed deletion anchor {}:{} len={} depth={} 
ref_count={} alt_count={}", + locus.chrom, anchor_pos, deletion_length, depth, ref_count, alt_count + )], + }) + } + + fn observe_indel( + &self, + variant: &VariantSpec, + assembly: Assembly, + locus: &GenomicLocus, + reference_file: &Path, + ) -> Result { + let reference = variant.reference.clone().ok_or_else(|| { + RuntimeError::InvalidArguments("indel variant requires ref/reference".to_owned()) + })?; + let alternate = variant.alternate.clone().ok_or_else(|| { + RuntimeError::InvalidArguments("indel variant requires alt/alternate".to_owned()) + })?; + let records = + alignment::query_cram_records(&self.path, &self.options, reference_file, locus)?; + + let mut alt_count = 0u32; + let mut ref_count = 0u32; + let mut depth = 0u32; + let mut matching_alt_lengths = BTreeSet::new(); + + for record in records { + if record.is_unmapped { + continue; + } + if !record_overlaps_locus(&record, locus) { + continue; + } + let classification = + classify_expected_indel(&record, locus, reference.len(), &alternate)?; + if !classification.covering { + continue; + } + depth += 1; + if classification.matches_alt { + alt_count += 1; + matching_alt_lengths.insert(classification.observed_len); + } else if classification.reference_like { + ref_count += 1; + } + } + + let evidence_label = if matching_alt_lengths.is_empty() { + "none".to_owned() + } else { + matching_alt_lengths + .into_iter() + .map(|len| len.to_string()) + .collect::>() + .join(",") + }; + + Ok(VariantObservation { + backend: self.backend_name().to_owned(), + matched_rsid: variant.rsids.first().cloned(), + assembly: Some(assembly), + genotype: infer_copy_number_genotype( + &reference, &alternate, ref_count, alt_count, depth, + ), + ref_count: Some(ref_count), + alt_count: Some(alt_count), + depth: Some(depth), + raw_counts: BTreeMap::new(), + decision: Some(describe_copy_number_decision_rule( + &reference, &alternate, ref_count, alt_count, depth, + )), + evidence: vec![format!( + "observed indel at {} depth={} 
ref_count={} alt_count={} matching_alt_lengths={}", + describe_locus(locus), + depth, + ref_count, + alt_count, + evidence_label + )], + }) + } +} diff --git a/rust/bioscript-formats/src/genotype/delimited.rs b/rust/bioscript-formats/src/genotype/delimited.rs new file mode 100644 index 0000000..c4b8e81 --- /dev/null +++ b/rust/bioscript-formats/src/genotype/delimited.rs @@ -0,0 +1,417 @@ +use std::collections::{BTreeSet, HashMap}; + +use bioscript_core::RuntimeError; + +use super::normalize_genotype; + +mod scan; + +pub(crate) use scan::scan_delimited_variants; + +const COMMENT_PREFIXES: [&str; 2] = ["#", "//"]; +const RSID_ALIASES: &[&str] = &["rsid", "name", "snp", "marker", "id", "snpid"]; +const CHROM_ALIASES: &[&str] = &["chromosome", "chr", "chrom"]; +const POSITION_ALIASES: &[&str] = &[ + "position", + "pos", + "coordinate", + "basepairposition", + "basepair", +]; +pub(crate) const GENOTYPE_ALIASES: &[&str] = &[ + "genotype", + "gt", + "result", + "results", + "result1", + "call", + "calls", + "yourcode", + "code", + "genotypevalue", + "variation", +]; +const ALLELE1_ALIASES: &[&str] = &["allele1", "allelea", "allele_a", "allele1top"]; +const ALLELE2_ALIASES: &[&str] = &["allele2", "alleleb", "allele_b", "allele2top"]; + +#[derive(Debug, Clone)] +pub(crate) struct ParsedDelimitedRow { + pub(crate) rsid: Option, + pub(crate) chrom: Option, + pub(crate) position: Option, + pub(crate) genotype: String, +} + +#[derive(Debug, Clone, Copy)] +pub(crate) enum Delimiter { + Tab, + Comma, + Space, +} + +pub(crate) fn detect_delimiter(lines: &[String]) -> Delimiter { + for line in lines { + let trimmed = line.trim(); + if trimmed.is_empty() + || COMMENT_PREFIXES + .iter() + .any(|prefix| trimmed.starts_with(prefix)) + { + continue; + } + if line.contains('\t') { + return Delimiter::Tab; + } + if line.contains(',') { + return Delimiter::Comma; + } + if trimmed.split_whitespace().count() > 1 { + return Delimiter::Space; + } + } + Delimiter::Tab +} + +#[allow(dead_code)] 
+pub(crate) struct RowParser { + delimiter: Delimiter, + header: Option>, + comment_header: Option>, + alias_map: HashMap<&'static str, BTreeSet<&'static str>>, +} + +#[allow(dead_code)] +impl RowParser { + pub(crate) fn new(delimiter: Delimiter) -> Self { + let mut alias_map = HashMap::new(); + alias_map.insert("rsid", RSID_ALIASES.iter().copied().collect()); + alias_map.insert("chromosome", CHROM_ALIASES.iter().copied().collect()); + alias_map.insert("position", POSITION_ALIASES.iter().copied().collect()); + alias_map.insert("genotype", GENOTYPE_ALIASES.iter().copied().collect()); + alias_map.insert("allele1", ALLELE1_ALIASES.iter().copied().collect()); + alias_map.insert("allele2", ALLELE2_ALIASES.iter().copied().collect()); + Self { + delimiter, + header: None, + comment_header: None, + alias_map, + } + } + + pub(crate) fn consume_line( + &mut self, + line: &str, + ) -> Result, RuntimeError> { + Ok(self + .consume_record(line)? + .and_then(|row| row.rsid.map(|rsid| (rsid, row.genotype)))) + } + + pub(crate) fn consume_record( + &mut self, + line: &str, + ) -> Result, RuntimeError> { + let trimmed = line.trim(); + if trimmed.is_empty() { + return Ok(None); + } + + let trimmed = strip_bom(trimmed); + if let Some(prefix) = COMMENT_PREFIXES + .iter() + .find(|prefix| trimmed.starts_with(**prefix)) + { + let candidate = trimmed.trim_start_matches(prefix).trim(); + if !candidate.is_empty() { + let fields = self.parse_fields(candidate); + if self.looks_like_header(&fields) { + self.comment_header = Some(fields); + } + } + return Ok(None); + } + + let fields = self.parse_fields(strip_bom(line)); + if fields.is_empty() { + return Ok(None); + } + + if self.header.is_none() { + if self.looks_like_header(&fields) { + self.header = Some(fields); + return Ok(None); + } + if let Some(header) = self.comment_header.take() { + self.header = Some(header); + } else { + self.header = Some(self.default_header(fields.len())); + } + } + + let header = 
self.header.as_ref().expect("header initialized"); + let mut row_map = HashMap::new(); + for (idx, value) in fields.into_iter().enumerate() { + if idx >= header.len() { + continue; + } + row_map.insert(normalize_name(&header[idx]), strip_inline_comment(&value)); + } + + let rsid = self + .lookup(&row_map, "rsid") + .filter(|value| !value.is_empty()); + let chrom = self + .lookup(&row_map, "chromosome") + .filter(|value| !value.is_empty()); + let position = self + .lookup(&row_map, "position") + .and_then(|value| value.parse::().ok()); + if rsid.is_none() && (chrom.is_none() || position.is_none()) { + return Ok(None); + } + + let genotype = if let Some(gt) = self.lookup(&row_map, "genotype") { + gt + } else { + let allele1 = self.lookup(&row_map, "allele1").unwrap_or_default(); + let allele2 = self.lookup(&row_map, "allele2").unwrap_or_default(); + format!("{allele1}{allele2}") + }; + + Ok(Some(ParsedDelimitedRow { + rsid, + chrom, + position, + genotype: normalize_genotype(&genotype), + })) + } + + fn parse_fields(&self, line: &str) -> Vec { + parse_owned_fields(line, self.delimiter) + } + + fn looks_like_header(&self, fields: &[String]) -> bool { + fields.first().is_some_and(|first| { + self.alias_map + .get("rsid") + .is_some_and(|aliases| aliases.contains(normalize_name(first).as_str())) + }) + } + + fn lookup(&self, row_map: &HashMap, key: &str) -> Option { + let aliases = self.alias_map.get(key)?; + for alias in aliases { + let key = normalize_name(alias); + if let Some(value) = row_map.get(&key) + && !value.is_empty() + { + return Some(value.clone()); + } + } + None + } + + pub(crate) fn default_header(&self, field_count: usize) -> Vec { + let base = ["rsid", "chromosome", "position", "genotype"]; + if field_count <= base.len() { + base[..field_count] + .iter() + .map(|s| (*s).to_owned()) + .collect() + } else { + let mut header: Vec = base.iter().map(|s| (*s).to_owned()).collect(); + for idx in 0..(field_count - header.len()) { + 
header.push(format!("extra_{idx}")); + } + header + } + } +} + +pub(crate) fn strip_bom(value: &str) -> &str { + value.strip_prefix('\u{feff}').unwrap_or(value) +} + +pub(crate) fn normalize_name(name: &str) -> String { + name.trim() + .to_ascii_lowercase() + .chars() + .filter(|ch| !matches!(ch, ' ' | '_' | '-')) + .collect() +} + +pub(crate) fn strip_inline_comment(value: &str) -> String { + for marker in ["#", "//"] { + if let Some(idx) = value.find(marker) { + return value[..idx].trim().to_owned(); + } + } + value.trim().to_owned() +} + +pub(crate) fn split_csv_line(line: &str) -> Vec { + let mut fields = Vec::new(); + let mut current = String::new(); + let mut in_quotes = false; + let chars = line.chars().peekable(); + + for ch in chars { + match ch { + '"' => in_quotes = !in_quotes, + ',' if !in_quotes => { + fields.push(current.trim().to_owned()); + current.clear(); + } + _ => current.push(ch), + } + } + fields.push(current.trim().to_owned()); + fields +} + +#[derive(Debug, Clone, Copy)] +pub(crate) struct DelimitedColumnIndexes { + pub(crate) rsid: Option, + pub(crate) chrom: Option, + pub(crate) position: Option, + pub(crate) genotype: Option, + pub(crate) allele1: Option, + pub(crate) allele2: Option, +} + +pub(crate) fn parse_streaming_row( + line: &str, + delimiter: Delimiter, + column_indexes: &mut Option, + comment_header: &mut Option>, +) -> Result, RuntimeError> { + let trimmed = line.trim(); + if trimmed.is_empty() { + return Ok(None); + } + + let trimmed = strip_bom(trimmed); + if let Some(prefix) = COMMENT_PREFIXES + .iter() + .find(|prefix| trimmed.starts_with(**prefix)) + { + let candidate = trimmed.trim_start_matches(prefix).trim(); + if !candidate.is_empty() { + let fields = parse_owned_fields(candidate, delimiter); + if looks_like_header_fields(&fields) { + *comment_header = Some(fields); + } + } + return Ok(None); + } + + let fields = parse_owned_fields(strip_bom(line), delimiter); + if fields.is_empty() { + return Ok(None); + } + + if 
column_indexes.is_none() { + if looks_like_header_fields(&fields) { + *column_indexes = Some(build_column_indexes(&fields)); + return Ok(None); + } + if let Some(header) = comment_header.take() { + *column_indexes = Some(build_column_indexes(&header)); + } else { + *column_indexes = Some(default_column_indexes(fields.len())); + } + } + + let indexes = column_indexes.expect("streaming column indexes initialized"); + let rsid = indexes + .rsid + .and_then(|idx| fields.get(idx)) + .map(|value| strip_inline_comment(value).trim().to_owned()) + .filter(|value| !value.is_empty()); + let chrom = indexes + .chrom + .and_then(|idx| fields.get(idx)) + .map(|value| strip_inline_comment(value).trim().to_owned()) + .filter(|value| !value.is_empty()); + let position = indexes + .position + .and_then(|idx| fields.get(idx)) + .and_then(|value| strip_inline_comment(value).trim().parse::().ok()); + if rsid.is_none() && (chrom.is_none() || position.is_none()) { + return Ok(None); + } + + let genotype = if let Some(idx) = indexes.genotype { + fields + .get(idx) + .map(|value| strip_inline_comment(value)) + .unwrap_or_default() + .clone() + } else { + let allele1 = indexes + .allele1 + .and_then(|idx| fields.get(idx)) + .map(|value| strip_inline_comment(value)) + .unwrap_or_default(); + let allele2 = indexes + .allele2 + .and_then(|idx| fields.get(idx)) + .map(|value| strip_inline_comment(value)) + .unwrap_or_default(); + format!("{allele1}{allele2}") + }; + + Ok(Some(ParsedDelimitedRow { + rsid, + chrom, + position, + genotype: normalize_genotype(&genotype), + })) +} + +fn parse_owned_fields(line: &str, delimiter: Delimiter) -> Vec { + match delimiter { + Delimiter::Tab => line + .split('\t') + .map(|field| field.trim().to_owned()) + .collect(), + Delimiter::Space => line.split_whitespace().map(str::to_owned).collect(), + Delimiter::Comma => split_csv_line(line), + } +} + +pub(crate) fn looks_like_header_fields(fields: &[String]) -> bool { + fields + .first() + .is_some_and(|first| 
RSID_ALIASES.contains(&normalize_name(first).as_str())) +} + +pub(crate) fn build_column_indexes(header: &[String]) -> DelimitedColumnIndexes { + DelimitedColumnIndexes { + rsid: find_header_index(header, RSID_ALIASES), + chrom: find_header_index(header, CHROM_ALIASES), + position: find_header_index(header, POSITION_ALIASES), + genotype: find_header_index(header, GENOTYPE_ALIASES), + allele1: find_header_index(header, ALLELE1_ALIASES), + allele2: find_header_index(header, ALLELE2_ALIASES), + } +} + +pub(crate) fn default_column_indexes(field_count: usize) -> DelimitedColumnIndexes { + DelimitedColumnIndexes { + rsid: (field_count > 0).then_some(0), + chrom: (field_count > 1).then_some(1), + position: (field_count > 2).then_some(2), + genotype: (field_count > 3).then_some(3), + allele1: None, + allele2: None, + } +} + +pub(crate) fn find_header_index(header: &[String], aliases: &[&str]) -> Option { + header.iter().position(|field| { + aliases + .iter() + .any(|alias| normalize_name(field) == normalize_name(alias)) + }) +} diff --git a/rust/bioscript-formats/src/genotype/delimited/scan.rs b/rust/bioscript-formats/src/genotype/delimited/scan.rs new file mode 100644 index 0000000..5f51fd4 --- /dev/null +++ b/rust/bioscript-formats/src/genotype/delimited/scan.rs @@ -0,0 +1,199 @@ +use std::{ + collections::HashMap, + fs::File, + io::{BufRead, BufReader}, +}; + +use zip::ZipArchive; + +use bioscript_core::{RuntimeError, VariantObservation, VariantSpec}; + +use super::{ + super::{GenotypeSourceFormat, describe_query, types::DelimitedBackend, variant_sort_key}, + DelimitedColumnIndexes, detect_delimiter, parse_streaming_row, +}; + +pub(crate) fn scan_delimited_variants( + backend: &DelimitedBackend, + variants: &[VariantSpec], +) -> Result, RuntimeError> { + let mut indexed: Vec<(usize, &VariantSpec)> = variants.iter().enumerate().collect(); + indexed.sort_by_cached_key(|(_, variant)| variant_sort_key(variant)); + + let mut rsid_targets: HashMap> = HashMap::new(); + let 
mut coord_targets: HashMap<(String, i64), Vec> = HashMap::new(); + let mut results = vec![VariantObservation::default(); variants.len()]; + let mut unresolved = variants.len(); + + for (idx, variant) in &indexed { + for rsid in &variant.rsids { + rsid_targets.entry(rsid.clone()).or_default().push(*idx); + } + if let Some(locus) = variant.grch38.as_ref().or(variant.grch37.as_ref()) { + coord_targets + .entry(( + locus.chrom.trim_start_matches("chr").to_ascii_lowercase(), + locus.start, + )) + .or_default() + .push(*idx); + } + } + + let mut scan_reader = |reader: &mut dyn BufRead| -> Result<(), RuntimeError> { + let mut probe_lines = Vec::new(); + let mut buf = String::new(); + for _ in 0..8 { + buf.clear(); + let bytes = reader.read_line(&mut buf).map_err(|err| { + RuntimeError::Io(format!( + "failed to read genotype stream {}: {err}", + backend.path.display() + )) + })?; + if bytes == 0 { + break; + } + probe_lines.push(buf.trim_end_matches(['\n', '\r']).to_owned()); + } + + let delimiter = detect_delimiter(&probe_lines); + let mut column_indexes: Option = None; + let mut comment_header: Option> = None; + + let mut process_line = |line: &str| -> Result { + let Some(row) = + parse_streaming_row(line, delimiter, &mut column_indexes, &mut comment_header)? 
+ else { + return Ok(unresolved == 0); + }; + + if let Some(rsid) = row.rsid.as_ref() + && let Some(target_indexes) = rsid_targets.get(rsid) + { + for &target_idx in target_indexes { + if results[target_idx].genotype.is_none() { + results[target_idx] = VariantObservation { + backend: backend.backend_name().to_owned(), + matched_rsid: Some(rsid.clone()), + genotype: Some(row.genotype.clone()), + evidence: vec![format!("resolved by rsid {rsid}")], + ..VariantObservation::default() + }; + unresolved = unresolved.saturating_sub(1); + } + } + } + + if unresolved == 0 { + return Ok(true); + } + + if let (Some(chrom), Some(position)) = (row.chrom.as_ref(), row.position) { + let key = ( + chrom.trim_start_matches("chr").to_ascii_lowercase(), + position, + ); + if let Some(target_indexes) = coord_targets.get(&key) { + for &target_idx in target_indexes { + if results[target_idx].genotype.is_none() { + results[target_idx] = VariantObservation { + backend: backend.backend_name().to_owned(), + matched_rsid: row.rsid.clone(), + genotype: Some(row.genotype.clone()), + evidence: vec![format!("resolved by locus {}:{}", chrom, position)], + ..VariantObservation::default() + }; + unresolved = unresolved.saturating_sub(1); + } + } + } + } + Ok(unresolved == 0) + }; + + for line in &probe_lines { + if process_line(line)? { + return Ok(()); + } + } + + loop { + buf.clear(); + let bytes = reader.read_line(&mut buf).map_err(|err| { + RuntimeError::Io(format!( + "failed to read genotype stream {}: {err}", + backend.path.display() + )) + })?; + if bytes == 0 { + break; + } + if process_line(buf.trim_end_matches(['\n', '\r']))? 
{ + break; + } + } + Ok(()) + }; + + match backend.format { + GenotypeSourceFormat::Text => { + let file = File::open(&backend.path).map_err(|err| { + RuntimeError::Io(format!( + "failed to open genotype file {}: {err}", + backend.path.display() + )) + })?; + let mut reader = BufReader::new(file); + scan_reader(&mut reader)?; + } + GenotypeSourceFormat::Zip => { + let entry_name = backend.zip_entry_name.as_ref().ok_or_else(|| { + RuntimeError::Unsupported(format!( + "zip backend missing selected entry for {}", + backend.path.display() + )) + })?; + let file = File::open(&backend.path).map_err(|err| { + RuntimeError::Io(format!( + "failed to open genotype zip {}: {err}", + backend.path.display() + )) + })?; + let mut archive = ZipArchive::new(file).map_err(|err| { + RuntimeError::Io(format!( + "failed to read genotype zip {}: {err}", + backend.path.display() + )) + })?; + let entry = archive.by_name(entry_name).map_err(|err| { + RuntimeError::Io(format!( + "failed to open genotype entry {entry_name} in {}: {err}", + backend.path.display() + )) + })?; + let mut reader = BufReader::new(entry); + scan_reader(&mut reader)?; + } + _ => { + return Err(RuntimeError::Unsupported( + "streaming delimited backend only supports text and zip".to_owned(), + )); + } + } + + for (idx, variant) in indexed { + if results[idx].genotype.is_none() { + results[idx] = VariantObservation { + backend: backend.backend_name().to_owned(), + evidence: vec![format!( + "no matching rsid or locus found for {}", + describe_query(variant) + )], + ..VariantObservation::default() + }; + } + } + + Ok(results) +} diff --git a/rust/bioscript-formats/src/genotype/io.rs b/rust/bioscript-formats/src/genotype/io.rs new file mode 100644 index 0000000..d350fdb --- /dev/null +++ b/rust/bioscript-formats/src/genotype/io.rs @@ -0,0 +1,149 @@ +use std::{ + fs::File, + io::{BufRead, BufReader, Read}, + path::Path, +}; + +use zip::ZipArchive; + +use bioscript_core::RuntimeError; + +use super::GenotypeSourceFormat; + 
pub(crate) fn is_bgzf_path(path: &Path) -> bool {
    // NOTE(review): any `.gz` extension is accepted, so a plain-gzip file
    // would also be routed through the BGZF reader — confirm inputs are bgzip.
    path.extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
}

/// Read every line of a plain-text genotype file, with trailing CR/LF removed.
///
/// # Errors
/// Returns `RuntimeError::Io` when the file cannot be opened or read.
pub(crate) fn read_plain_lines(path: &Path) -> Result<Vec<String>, RuntimeError> {
    let file = File::open(path).map_err(|err| {
        RuntimeError::Io(format!(
            "failed to open genotype file {}: {err}",
            path.display()
        ))
    })?;
    read_lines_from_reader(BufReader::new(file), path)
}

/// Choose the genotype-bearing entry of a zip archive.
///
/// The first entry with a recognised genotype extension wins; failing that,
/// the first regular file is used as a fallback. Directory entries and macOS
/// `__MACOSX/` resource-fork entries are ignored.
///
/// # Errors
/// `RuntimeError::Io` when the archive cannot be opened or scanned, and
/// `RuntimeError::Unsupported` when it holds no usable entry.
pub(crate) fn select_zip_entry(path: &Path) -> Result<String, RuntimeError> {
    let file = File::open(path).map_err(|err| {
        RuntimeError::Io(format!(
            "failed to open genotype zip {}: {err}",
            path.display()
        ))
    })?;
    let mut archive = ZipArchive::new(file).map_err(|err| {
        RuntimeError::Io(format!(
            "failed to read genotype zip {}: {err}",
            path.display()
        ))
    })?;

    let mut selected_name: Option<String> = None;
    for idx in 0..archive.len() {
        let entry = archive.by_index(idx).map_err(|err| {
            RuntimeError::Io(format!(
                "failed to inspect genotype zip {}: {err}",
                path.display()
            ))
        })?;
        if entry.is_dir() {
            continue;
        }
        let name = entry.name().to_owned();
        // Fix: skip macOS metadata entries. `__MACOSX/._genome.txt` ends in
        // `.txt` and would otherwise shadow the real data file; the sibling
        // selector in inspect.rs already applies this guard.
        if name.starts_with("__MACOSX/") {
            continue;
        }
        let lower = name.to_ascii_lowercase();
        if lower.ends_with(".txt")
            || lower.ends_with(".csv")
            || lower.ends_with(".tsv")
            || lower.ends_with(".vcf")
            || lower.ends_with(".vcf.gz")
        {
            return Ok(name);
        }
        if selected_name.is_none() {
            selected_name = Some(name);
        }
    }

    selected_name.ok_or_else(|| {
        RuntimeError::Unsupported(format!(
            "zip archive {} does not contain a supported genotype file",
            path.display()
        ))
    })
}

/// Drain `reader` into a vector of lines, reusing one buffer and trimming
/// trailing newline characters; `path` is used only for error messages.
pub(crate) fn read_lines_from_reader<R: BufRead>(
    mut reader: R,
    path: &Path,
) -> Result<Vec<String>, RuntimeError> {
    let mut lines = Vec::new();
    let mut buf = String::new();
    loop {
        buf.clear();
        let bytes = reader.read_line(&mut buf).map_err(|err| {
            RuntimeError::Io(format!(
                "failed to read genotype file {}: {err}",
                path.display()
            ))
        })?;
        if bytes == 0 {
            break;
        }
        lines.push(buf.trim_end_matches(['\n', '\r']).to_owned());
    }
    Ok(lines)
}

/// Read at most `max_bytes` from a zip entry, erroring (rather than silently
/// truncating) when the decompressed payload exceeds the limit.
pub(crate) fn read_zip_entry_limited<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    label: &str,
) -> Result<Vec<u8>, RuntimeError> {
    let mut contents = Vec::new();
    // Take one extra byte so an over-limit payload is detectable.
    reader
        .take(max_bytes.saturating_add(1))
        .read_to_end(&mut contents)
        .map_err(|err| RuntimeError::Io(format!("failed to read {label}: {err}")))?;
    if u64::try_from(contents.len()).unwrap_or(u64::MAX) > max_bytes {
        return Err(RuntimeError::InvalidArguments(format!(
            "{label} exceeds decompressed limit of {max_bytes} bytes"
        )));
    }
    Ok(contents)
}

/// Decide which source format a genotype path holds: an explicit `forced`
/// override wins, then file-name extensions, then a content sniff that
/// distinguishes VCF headers from delimited genotype text.
///
/// # Errors
/// Propagates I/O errors from the content sniff.
pub(crate) fn detect_source_format(
    path: &Path,
    forced: Option<GenotypeSourceFormat>,
) -> Result<GenotypeSourceFormat, RuntimeError> {
    if let Some(format) = forced {
        return Ok(format);
    }

    let lower = path.to_string_lossy().to_ascii_lowercase();
    if lower.ends_with(".zip") {
        return Ok(GenotypeSourceFormat::Zip);
    }
    if lower.ends_with(".cram") {
        return Ok(GenotypeSourceFormat::Cram);
    }
    if lower.ends_with(".vcf") || lower.ends_with(".vcf.gz") {
        return Ok(GenotypeSourceFormat::Vcf);
    }

    // Extension was inconclusive: read the file and sniff for VCF markers.
    // NOTE(review): this loads the whole file just to sniff — consider
    // capping the probe if very large uncompressed inputs are expected.
    let lines = read_plain_lines(path)?;
    if looks_like_vcf_lines(&lines) {
        Ok(GenotypeSourceFormat::Vcf)
    } else {
        Ok(GenotypeSourceFormat::Text)
    }
}

/// True when any line carries a VCF signature (`##fileformat=VCF` or the
/// `#CHROM`-led header row).
pub(crate) fn looks_like_vcf_lines(lines: &[String]) -> bool {
    lines.iter().any(|line| {
        let trimmed = line.trim_start();
        trimmed.starts_with("##fileformat=VCF") || trimmed.starts_with("#CHROM\t")
    })
}
diff --git a/rust/bioscript-formats/src/genotype/types.rs b/rust/bioscript-formats/src/genotype/types.rs
new file mode 100644
index 0000000..756ce96
--- /dev/null
+++ b/rust/bioscript-formats/src/genotype/types.rs
@@ -0,0 +1,82 @@
use std::{collections::HashMap, path::PathBuf, str::FromStr};

/// Handle to a loaded genotype source; wraps the concrete query backend.
#[derive(Debug, Clone)]
pub struct GenotypeStore {
    pub(crate) backend: QueryBackend,
}

/// The concrete lookup strategies a `GenotypeStore` can dispatch to.
#[derive(Debug, Clone)]
pub(crate) enum QueryBackend {
    RsidMap(RsidMapBackend),
    Delimited(DelimitedBackend),
    Vcf(VcfBackend),
    Cram(CramBackend),
}

/// Fully-materialised rsid-to-genotype map backend.
#[derive(Debug, Clone)]
pub(crate) struct RsidMapBackend {
    pub(crate)
format: GenotypeSourceFormat,
    pub(crate) values: HashMap<String, String>,
}

/// Streaming delimited-text backend; rows are scanned on demand from `path`,
/// optionally from a single entry inside a zip archive.
#[derive(Debug, Clone)]
pub(crate) struct DelimitedBackend {
    pub(crate) format: GenotypeSourceFormat,
    pub(crate) path: PathBuf,
    pub(crate) zip_entry_name: Option<String>,
}

/// VCF-backed source together with its load-time options.
#[derive(Debug, Clone)]
pub(crate) struct VcfBackend {
    pub(crate) path: PathBuf,
    pub(crate) options: GenotypeLoadOptions,
}

/// CRAM-backed source together with its load-time options.
#[derive(Debug, Clone)]
pub(crate) struct CramBackend {
    pub(crate) path: PathBuf,
    pub(crate) options: GenotypeLoadOptions,
}

/// Kinds of lookup a caller may request from a backend.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QueryKind {
    GenotypeByRsid,
    GenotypeByLocus,
}

/// Which query kinds a given backend is able to answer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BackendCapabilities {
    pub rsid_lookup: bool,
    pub locus_lookup: bool,
}

/// Physical container format of a genotype input file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GenotypeSourceFormat {
    Text,
    Zip,
    Vcf,
    Cram,
}

impl FromStr for GenotypeSourceFormat {
    type Err = String;

    /// Parse a user-supplied format name, case-insensitively and ignoring
    /// surrounding whitespace; `txt`/`text`/`genotype` all map to `Text`.
    fn from_str(value: &str) -> Result<Self, Self::Err> {
        let normalized = value.trim().to_ascii_lowercase();
        match normalized.as_str() {
            "txt" | "text" | "genotype" => Ok(Self::Text),
            "zip" => Ok(Self::Zip),
            "vcf" => Ok(Self::Vcf),
            "cram" => Ok(Self::Cram),
            other => Err(format!("unsupported input format: {other}")),
        }
    }
}

/// Optional knobs supplied when opening a genotype source.
// NOTE(review): the generic arguments below are reconstructed from usage
// (`input_index`/`reference_file` are used as paths; `format` feeds
// `detect_source_format`) — confirm against the original declaration.
#[derive(Debug, Clone, Default)]
pub struct GenotypeLoadOptions {
    pub format: Option<GenotypeSourceFormat>,
    pub input_index: Option<PathBuf>,
    pub reference_file: Option<PathBuf>,
    pub reference_index: Option<PathBuf>,
    pub allow_reference_md5_mismatch: bool,
}
diff --git a/rust/bioscript-formats/src/genotype/vcf.rs b/rust/bioscript-formats/src/genotype/vcf.rs
new file mode 100644
index 0000000..288c8cf
--- /dev/null
+++ b/rust/bioscript-formats/src/genotype/vcf.rs
@@ -0,0 +1,460 @@
use std::{
    collections::HashMap,
    fs::File,
    io::{BufRead, BufReader},
    path::Path,
};

use noodles::bgzf;
use noodles::csi;

use bioscript_core::{
    Assembly, GenomicLocus, RuntimeError, VariantKind, VariantObservation, VariantSpec,
};

use crate::alignment;

use super::{
    describe_query,
genotype_from_vcf_gt, is_bgzf_path, types::VcfBackend, variant_sort_key, +}; + +mod reader; + +pub use reader::observe_vcf_snp_with_reader; + +#[derive(Debug, Clone)] +pub(crate) struct ParsedVcfRow { + pub(crate) rsid: Option, + pub(crate) chrom: String, + pub(crate) position: i64, + pub(crate) reference: String, + pub(crate) alternates: Vec, + pub(crate) genotype: String, +} + +pub(crate) fn scan_vcf_variants( + backend: &VcfBackend, + variants: &[VariantSpec], +) -> Result, RuntimeError> { + let mut indexed: Vec<(usize, &VariantSpec)> = variants.iter().enumerate().collect(); + indexed.sort_by_cached_key(|(_, variant)| variant_sort_key(variant)); + + let mut probe_lines = Vec::new(); + let detected_assembly = { + let file = File::open(&backend.path).map_err(|err| { + RuntimeError::Io(format!( + "failed to open VCF file {}: {err}", + backend.path.display() + )) + })?; + let mut reader: Box = if is_bgzf_path(&backend.path) { + Box::new(BufReader::new(bgzf::io::Reader::new(file))) + } else { + Box::new(BufReader::new(file)) + }; + + let mut buf = String::new(); + for _ in 0..256 { + buf.clear(); + let bytes = reader.read_line(&mut buf).map_err(|err| { + RuntimeError::Io(format!( + "failed to read VCF file {}: {err}", + backend.path.display() + )) + })?; + if bytes == 0 { + break; + } + let line = buf.trim_end_matches(['\n', '\r']).to_owned(); + let stop = line.starts_with("#CHROM\t"); + probe_lines.push(line); + if stop { + break; + } + } + + detect_vcf_assembly(&backend.path, &probe_lines) + }; + + let mut rsid_targets: HashMap> = HashMap::new(); + let mut coord_targets: HashMap<(String, i64), Vec> = HashMap::new(); + let mut results = vec![VariantObservation::default(); variants.len()]; + let mut unresolved = variants.len(); + + for (idx, variant) in &indexed { + for rsid in &variant.rsids { + rsid_targets.entry(rsid.clone()).or_default().push(*idx); + } + + if let Some(locus) = choose_variant_locus_for_assembly(variant, detected_assembly) { + let chrom = 
normalize_chromosome_name(&locus.chrom); + coord_targets + .entry((chrom.clone(), locus.start)) + .or_default() + .push(*idx); + if matches!( + variant.kind, + Some(VariantKind::Deletion | VariantKind::Insertion | VariantKind::Indel) + ) { + let anchor = locus.start.saturating_sub(1); + coord_targets.entry((chrom, anchor)).or_default().push(*idx); + } + } + } + + let targets = VcfResolutionTargets { + variants, + detected_assembly, + rsid_targets: &rsid_targets, + coord_targets: &coord_targets, + }; + + let file = File::open(&backend.path).map_err(|err| { + RuntimeError::Io(format!( + "failed to open VCF file {}: {err}", + backend.path.display() + )) + })?; + let mut reader: Box = if is_bgzf_path(&backend.path) { + Box::new(BufReader::new(bgzf::io::Reader::new(file))) + } else { + Box::new(BufReader::new(file)) + }; + + let mut buf = String::new(); + loop { + buf.clear(); + let bytes = reader.read_line(&mut buf).map_err(|err| { + RuntimeError::Io(format!( + "failed to read VCF file {}: {err}", + backend.path.display() + )) + })?; + if bytes == 0 || unresolved == 0 { + break; + } + if let Some(row) = parse_vcf_record(buf.trim_end_matches(['\n', '\r']))? 
{ + resolve_vcf_row(backend, &row, &targets, &mut results, &mut unresolved); + } + } + + for (idx, variant) in indexed { + if results[idx].genotype.is_none() { + results[idx] = VariantObservation { + backend: backend.backend_name().to_owned(), + assembly: detected_assembly, + evidence: vec![format!( + "no matching rsid or locus found for {}", + describe_query(variant) + )], + ..VariantObservation::default() + }; + } + } + + Ok(results) +} + +pub(crate) fn lookup_indexed_vcf_variants( + backend: &VcfBackend, + variants: &[VariantSpec], +) -> Result>, RuntimeError> { + let Some(input_index) = backend.options.input_index.as_ref() else { + return Ok(None); + }; + let detected_assembly = detect_vcf_assembly_from_path(&backend.path)?; + let mut indexed_variants = Vec::with_capacity(variants.len()); + for (idx, variant) in variants.iter().enumerate() { + let Some(locus) = choose_variant_locus_for_assembly(variant, detected_assembly) else { + return Ok(None); + }; + let Some(reference) = first_single_base_allele(variant.reference.as_deref()) else { + return Ok(None); + }; + let Some(alternate) = first_single_base_allele(variant.alternate.as_deref()) else { + return Ok(None); + }; + if !matches!(variant.kind, None | Some(VariantKind::Snp)) { + return Ok(None); + } + indexed_variants.push((idx, variant, locus, reference, alternate)); + } + + let tabix_index = alignment::parse_tbi_bytes(&std::fs::read(input_index).map_err(|err| { + RuntimeError::Io(format!( + "failed to read VCF index {}: {err}", + input_index.display() + )) + })?)?; + let mut indexed = csi::io::IndexedReader::new( + File::open(&backend.path).map_err(|err| { + RuntimeError::Io(format!( + "failed to open VCF file {}: {err}", + backend.path.display() + )) + })?, + tabix_index, + ); + + let mut results = vec![VariantObservation::default(); variants.len()]; + for (idx, variant, locus, reference, alternate) in indexed_variants { + results[idx] = observe_vcf_snp_with_reader( + &mut indexed, + 
&backend.path.display().to_string(), + &locus, + reference, + alternate, + variant.rsids.first().cloned(), + detected_assembly, + )?; + } + Ok(Some(results)) +} + +pub(crate) fn detect_vcf_assembly_from_path(path: &Path) -> Result, RuntimeError> { + let mut probe_lines = Vec::new(); + let file = File::open(path).map_err(|err| { + RuntimeError::Io(format!("failed to open VCF file {}: {err}", path.display())) + })?; + let mut reader: Box = if is_bgzf_path(path) { + Box::new(BufReader::new(bgzf::io::Reader::new(file))) + } else { + Box::new(BufReader::new(file)) + }; + + let mut buf = String::new(); + for _ in 0..256 { + buf.clear(); + let bytes = reader.read_line(&mut buf).map_err(|err| { + RuntimeError::Io(format!("failed to read VCF file {}: {err}", path.display())) + })?; + if bytes == 0 { + break; + } + let line = buf.trim_end_matches(['\n', '\r']).to_owned(); + let stop = line.starts_with("#CHROM\t"); + probe_lines.push(line); + if stop { + break; + } + } + Ok(detect_vcf_assembly(path, &probe_lines)) +} + +fn first_single_base_allele(value: Option<&str>) -> Option { + let value = value?; + let mut chars = value.chars(); + let base = chars.next()?; + chars.next().is_none().then_some(base) +} + +struct VcfResolutionTargets<'a> { + variants: &'a [VariantSpec], + detected_assembly: Option, + rsid_targets: &'a HashMap>, + coord_targets: &'a HashMap<(String, i64), Vec>, +} + +fn resolve_vcf_row( + backend: &VcfBackend, + row: &ParsedVcfRow, + targets: &VcfResolutionTargets<'_>, + results: &mut [VariantObservation], + unresolved: &mut usize, +) { + if let Some(rsid) = row.rsid.as_ref() + && let Some(target_indexes) = targets.rsid_targets.get(rsid) + { + for &target_idx in target_indexes { + if results[target_idx].genotype.is_none() { + results[target_idx] = VariantObservation { + backend: backend.backend_name().to_owned(), + matched_rsid: Some(rsid.clone()), + assembly: targets.detected_assembly, + genotype: Some(row.genotype.clone()), + evidence: 
vec![format!("resolved by rsid {rsid}")], + ..VariantObservation::default() + }; + *unresolved = (*unresolved).saturating_sub(1); + } + } + } + + if *unresolved == 0 { + return; + } + + let key = (normalize_chromosome_name(&row.chrom), row.position); + if let Some(target_indexes) = targets.coord_targets.get(&key) { + for &target_idx in target_indexes { + if results[target_idx].genotype.is_none() + && vcf_row_matches_variant( + row, + &targets.variants[target_idx], + targets.detected_assembly, + ) + { + results[target_idx] = VariantObservation { + backend: backend.backend_name().to_owned(), + matched_rsid: row.rsid.clone(), + assembly: targets.detected_assembly, + genotype: Some(row.genotype.clone()), + evidence: vec![format!("resolved by locus {}:{}", row.chrom, row.position)], + ..VariantObservation::default() + }; + *unresolved = (*unresolved).saturating_sub(1); + } + } + } +} + +pub(crate) fn parse_vcf_record(line: &str) -> Result, RuntimeError> { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.starts_with('#') { + return Ok(None); + } + + let fields: Vec<&str> = trimmed.split('\t').collect(); + if fields.len() < 10 { + return Ok(None); + } + + let chrom = fields[0].trim(); + let position = fields[1].trim().parse::().map_err(|err| { + RuntimeError::Io(format!( + "failed to parse VCF position '{}': {err}", + fields[1].trim() + )) + })?; + let rsid = { + let value = fields[2].trim(); + (!value.is_empty() && value != ".").then(|| value.to_owned()) + }; + let reference = fields[3].trim(); + if reference.is_empty() || reference == "." 
{ + return Ok(None); + } + + let alternates: Vec = fields[4] + .split(',') + .map(str::trim) + .filter(|alt| !alt.is_empty() && *alt != ".") + .map(ToOwned::to_owned) + .collect(); + if alternates.is_empty() { + return Ok(None); + } + + let genotype = extract_vcf_sample_genotype(fields[8], fields[9], reference, &alternates) + .unwrap_or_else(|| "--".to_owned()); + + Ok(Some(ParsedVcfRow { + rsid, + chrom: chrom.to_owned(), + position, + reference: reference.to_owned(), + alternates, + genotype, + })) +} + +pub(crate) fn extract_vcf_sample_genotype( + format_field: &str, + sample_field: &str, + reference: &str, + alternates: &[String], +) -> Option { + let gt_index = format_field + .split(':') + .position(|field| field.eq_ignore_ascii_case("GT"))?; + let sample_parts: Vec<&str> = sample_field.split(':').collect(); + let sample_gt = sample_parts.get(gt_index).copied().unwrap_or("."); + let alternate_refs: Vec<&str> = alternates.iter().map(String::as_str).collect(); + genotype_from_vcf_gt(sample_gt, reference, &alternate_refs) +} + +pub(crate) fn detect_vcf_assembly(path: &Path, probe_lines: &[String]) -> Option { + let combined = probe_lines.join("\n").to_ascii_lowercase(); + if combined.contains("assembly=b37") + || combined.contains("assembly=grch37") + || combined.contains("assembly=hg19") + || combined.contains("reference=grch37") + || combined.contains("reference=hg19") + { + return Some(Assembly::Grch37); + } + if combined.contains("assembly=b38") + || combined.contains("assembly=grch38") + || combined.contains("assembly=hg38") + || combined.contains("reference=grch38") + || combined.contains("reference=hg38") + { + return Some(Assembly::Grch38); + } + + let lower = path.to_string_lossy().to_ascii_lowercase(); + if lower.contains("grch37") || lower.contains("hg19") || lower.contains("b37") { + Some(Assembly::Grch37) + } else if lower.contains("grch38") || lower.contains("hg38") || lower.contains("b38") { + Some(Assembly::Grch38) + } else { + None + } +} + 
+pub(crate) fn choose_variant_locus_for_assembly( + variant: &VariantSpec, + assembly: Option, +) -> Option { + match assembly { + Some(Assembly::Grch37) => variant.grch37.clone().or_else(|| variant.grch38.clone()), + Some(Assembly::Grch38) => variant.grch38.clone().or_else(|| variant.grch37.clone()), + None => variant.grch37.clone().or_else(|| variant.grch38.clone()), + } +} + +pub(crate) fn normalize_chromosome_name(value: &str) -> String { + value.trim().trim_start_matches("chr").to_ascii_lowercase() +} + +pub(crate) fn vcf_row_matches_variant( + row: &ParsedVcfRow, + variant: &VariantSpec, + assembly: Option, +) -> bool { + let Some(locus) = choose_variant_locus_for_assembly(variant, assembly) else { + return false; + }; + + if normalize_chromosome_name(&row.chrom) != normalize_chromosome_name(&locus.chrom) { + return false; + } + + match variant.kind.unwrap_or(VariantKind::Other) { + VariantKind::Snp => { + row.position == locus.start + && variant + .reference + .as_ref() + .is_none_or(|reference| reference.eq_ignore_ascii_case(&row.reference)) + && variant.alternate.as_ref().is_none_or(|alternate| { + row.alternates + .iter() + .any(|candidate| candidate.eq_ignore_ascii_case(alternate)) + }) + } + VariantKind::Deletion => { + let expected_len = variant.deletion_length.unwrap_or(0); + row.position == locus.start.saturating_sub(1) + && row.alternates.iter().any(|alternate| { + let actual_len = row.reference.len().saturating_sub(alternate.len()); + (expected_len == 0 || actual_len == expected_len) + && alternate.len() < row.reference.len() + }) + } + VariantKind::Insertion | VariantKind::Indel => { + row.position == locus.start.saturating_sub(1) + } + VariantKind::Other => row.position == locus.start, + } +} diff --git a/rust/bioscript-formats/src/genotype/vcf/reader.rs b/rust/bioscript-formats/src/genotype/vcf/reader.rs new file mode 100644 index 0000000..3bc2038 --- /dev/null +++ b/rust/bioscript-formats/src/genotype/vcf/reader.rs @@ -0,0 +1,139 @@ +use 
std::io::{Read, Seek}; + +use noodles::bgzf; +use noodles::core::{Position, Region}; +use noodles::csi::{self, BinningIndex}; +use noodles::tabix; + +use bioscript_core::{Assembly, GenomicLocus, RuntimeError, VariantObservation}; + +use super::parse_vcf_record; + +/// Observe a SNP at `locus` over an already-built tabix-indexed bgzipped VCF +/// reader. Caller builds `csi::io::IndexedReader::new(reader, tabix_index)` +/// once and calls this per variant. +pub fn observe_vcf_snp_with_reader( + indexed: &mut csi::io::IndexedReader, tabix::Index>, + label: &str, + locus: &GenomicLocus, + reference: char, + alternate: char, + matched_rsid: Option, + assembly: Option, +) -> Result +where + R: Read + Seek, +{ + let locus_label = format!("{}:{}", locus.chrom, locus.start); + + let Some(seq_name) = resolve_vcf_chrom_name(indexed.index(), &locus.chrom) else { + return Ok(VariantObservation { + backend: "vcf".to_owned(), + matched_rsid, + assembly, + evidence: vec![format!( + "{label}: tabix index has no contig matching {} (tried chr-prefixed and bare forms)", + locus.chrom + )], + ..VariantObservation::default() + }); + }; + + let pos_usize = usize::try_from(locus.start).map_err(|err| { + RuntimeError::Io(format!( + "{label}: invalid VCF position {} for {locus_label}: {err}", + locus.start + )) + })?; + let position = Position::try_from(pos_usize).map_err(|err| { + RuntimeError::Io(format!( + "{label}: invalid VCF position {} for {locus_label}: {err}", + locus.start + )) + })?; + let region = Region::new(seq_name.as_str(), position..=position); + + let query = indexed.query(®ion).map_err(|err| { + RuntimeError::Io(format!("{label}: tabix query for {locus_label}: {err}")) + })?; + + let reference_str = reference.to_ascii_uppercase().to_string(); + let alternate_str = alternate.to_ascii_uppercase().to_string(); + + let mut saw_any = false; + for record_result in query { + let record = record_result + .map_err(|err| RuntimeError::Io(format!("{label}: tabix record iter: 
{err}")))?; + let line: &str = record.as_ref(); + let Some(row) = parse_vcf_record(line)? else { + continue; + }; + if row.position != locus.start { + continue; + } + saw_any = true; + if !row.reference.eq_ignore_ascii_case(&reference_str) { + continue; + } + if !row + .alternates + .iter() + .any(|alt| alt.eq_ignore_ascii_case(&alternate_str)) + { + continue; + } + + return Ok(VariantObservation { + backend: "vcf".to_owned(), + matched_rsid: matched_rsid.or_else(|| row.rsid.clone()), + assembly, + genotype: Some(row.genotype.clone()), + evidence: vec![format!("{label}: resolved by locus {locus_label}")], + ..VariantObservation::default() + }); + } + + let evidence = if saw_any { + vec![format!( + "{label}: {locus_label} present but ref={reference}/alt={alternate} did not match any record" + )] + } else { + vec![format!("{label}: no VCF record at {locus_label}")] + }; + Ok(VariantObservation { + backend: "vcf".to_owned(), + matched_rsid, + assembly, + evidence, + ..VariantObservation::default() + }) +} + +fn resolve_vcf_chrom_name(index: &tabix::Index, user_chrom: &str) -> Option { + let header = index.header()?; + let names = header.reference_sequence_names(); + + let trimmed = user_chrom.trim(); + let stripped = trimmed.strip_prefix("chr").unwrap_or(trimmed); + + let candidates = [ + trimmed.to_owned(), + stripped.to_owned(), + format!("chr{stripped}"), + ]; + for cand in &candidates { + if names.contains(cand.as_bytes()) { + return Some(cand.clone()); + } + } + + let target = stripped.to_ascii_lowercase(); + for name in names { + let as_str = std::str::from_utf8(name.as_ref()).ok()?; + let as_stripped = as_str.strip_prefix("chr").unwrap_or(as_str); + if as_stripped.eq_ignore_ascii_case(&target) { + return Some(as_str.to_owned()); + } + } + None +} diff --git a/rust/bioscript-formats/src/genotype/vcf_tokens.rs b/rust/bioscript-formats/src/genotype/vcf_tokens.rs new file mode 100644 index 0000000..5cd5650 --- /dev/null +++ 
b/rust/bioscript-formats/src/genotype/vcf_tokens.rs
@@ -0,0 +1,73 @@
use super::normalize_genotype;

/// Render a VCF `GT` value as a two-allele genotype token string.
///
/// Fully-missing, haploid, polyploid, or half-missing calls all collapse to
/// the no-call token `--`; a malformed or out-of-range allele index yields
/// `None`.
pub(crate) fn genotype_from_vcf_gt(
    gt: &str,
    reference: &str,
    alternates: &[&str],
) -> Option<String> {
    let trimmed = gt.trim();
    if matches!(trimmed, "" | "." | "./." | ".|.") {
        return Some("--".to_owned());
    }

    // Phasing is irrelevant for token rendering, so treat '|' like '/'.
    let unphased = trimmed.replace('|', "/");
    let alleles: Vec<&str> = unphased.split('/').collect();
    if alleles.len() != 2 || alleles.contains(&".") {
        return Some("--".to_owned());
    }

    let ref_token = vcf_reference_token(reference, alternates);
    let mut rendered = String::new();
    for allele in alleles {
        let index = allele.parse::<usize>().ok()?;
        if index == 0 {
            rendered.push_str(&ref_token);
        } else {
            // GT indexes ALT alleles from 1; an out-of-range index aborts.
            let alternate = alternates.get(index - 1)?;
            rendered.push_str(&vcf_alt_token(reference, alternate));
        }
    }

    Some(normalize_genotype(&rendered))
}

/// Token for the REF allele. At a pure-deletion site REF is rendered `I`, at
/// a pure-insertion site `D`; otherwise the literal uppercased sequence.
pub(crate) fn vcf_reference_token(reference: &str, alternates: &[&str]) -> String {
    use std::cmp::Ordering;

    let mut any_shorter_alt = false;
    let mut any_longer_alt = false;
    for alt in alternates {
        if is_symbolic_vcf_alt(alt) {
            continue;
        }
        match alt.len().cmp(&reference.len()) {
            Ordering::Less => any_shorter_alt = true,
            Ordering::Greater => any_longer_alt = true,
            Ordering::Equal => {}
        }
    }

    if any_shorter_alt && !any_longer_alt {
        "I".to_owned()
    } else if any_longer_alt && !any_shorter_alt {
        "D".to_owned()
    } else {
        normalize_sequence_token(reference)
    }
}

/// Token for one ALT allele: `D`/`I` for length changes relative to REF,
/// `--` for symbolic alternates, otherwise the uppercased sequence.
pub(crate) fn vcf_alt_token(reference: &str, alternate: &str) -> String {
    use std::cmp::Ordering;

    if is_symbolic_vcf_alt(alternate) {
        return "--".to_owned();
    }
    match alternate.len().cmp(&reference.len()) {
        Ordering::Less => "D".to_owned(),
        Ordering::Greater => "I".to_owned(),
        Ordering::Equal => normalize_sequence_token(alternate),
    }
}

/// True for bracketed symbolic alternates such as `<DEL>` or `<INS>`.
pub(crate) fn is_symbolic_vcf_alt(alternate: &str) -> bool {
    let trimmed = alternate.trim();
    trimmed.starts_with('<') && trimmed.ends_with('>')
}

/// Uppercase a raw allele sequence, dropping surrounding whitespace.
pub(crate) fn normalize_sequence_token(value: &str) -> String {
value.trim().to_ascii_uppercase() +} diff --git a/rust/bioscript-formats/src/inspect.rs b/rust/bioscript-formats/src/inspect.rs index 1c70dea..add8e70 100644 --- a/rust/bioscript-formats/src/inspect.rs +++ b/rust/bioscript-formats/src/inspect.rs @@ -1,8 +1,4 @@ -use std::{ - fs::File, - io::{BufRead, BufReader, Cursor, Read}, - path::{Path, PathBuf}, -}; +use std::path::{Path, PathBuf}; #[cfg(not(target_arch = "wasm32"))] use std::time::Instant; @@ -34,10 +30,15 @@ impl StubDuration { } use bioscript_core::{Assembly, RuntimeError}; -use noodles::bgzf; -use zip::ZipArchive; -const MAX_ZIP_SAMPLE_ENTRY_BYTES: u64 = 128 * 1024 * 1024; +mod heuristics; +mod io; +mod render; + +pub(crate) use heuristics::*; +pub(crate) use io::*; +#[cfg(test)] +pub(crate) use render::*; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum FileContainer { @@ -96,62 +97,6 @@ pub struct FileInspection { pub duration_ms: u128, } -impl FileInspection { - #[must_use] - pub fn render_text(&self) -> String { - let mut lines = Vec::new(); - lines.push(format!("path\t{}", self.path.display())); - lines.push(format!("container\t{}", render_container(self.container))); - lines.push(format!("kind\t{}", render_kind(self.detected_kind))); - lines.push(format!( - "confidence\t{}", - render_confidence(self.confidence) - )); - lines.push(format!("assembly\t{}", render_assembly(self.assembly))); - lines.push(format!("phased\t{}", render_bool(self.phased))); - lines.push(format!( - "selected_entry\t{}", - self.selected_entry.as_deref().unwrap_or("") - )); - lines.push(format!("has_index\t{}", render_bool(self.has_index))); - lines.push(format!( - "index_path\t{}", - self.index_path - .as_ref() - .map(|path| path.display().to_string()) - .unwrap_or_default() - )); - lines.push(format!( - "reference_matches\t{}", - render_bool(self.reference_matches) - )); - if let Some(source) = &self.source { - lines.push(format!( - "vendor\t{}", - source.vendor.as_deref().unwrap_or_default() - )); - lines.push(format!( - 
"platform_version\t{}", - source.platform_version.as_deref().unwrap_or_default() - )); - lines.push(format!( - "source_confidence\t{}", - render_confidence(source.confidence) - )); - lines.push(format!("source_evidence\t{}", source.evidence.join(" | "))); - } else { - lines.push("vendor\t".to_owned()); - lines.push("platform_version\t".to_owned()); - lines.push("source_confidence\t".to_owned()); - lines.push("source_evidence\t".to_owned()); - } - lines.push(format!("evidence\t{}", self.evidence.join(" | "))); - lines.push(format!("warnings\t{}", self.warnings.join(" | "))); - lines.push(format!("duration_ms\t{}", self.duration_ms)); - lines.join("\n") - } -} - /// Classify a file from in-memory bytes. Mirrors `inspect_file` but sources /// its sample lines / zip entries from a byte buffer instead of the /// filesystem. Needed by wasm targets where `std::fs` isn't available. @@ -255,74 +200,6 @@ pub fn inspect_bytes( }) } -fn read_plain_sample_lines_from_bytes( - lower_name: &str, - bytes: &[u8], -) -> Result, RuntimeError> { - if lower_name.ends_with(".vcf.gz") { - return read_sample_lines_from_reader(BufReader::new(bgzf::io::Reader::new(Cursor::new( - bytes, - )))); - } - read_sample_lines_from_reader(BufReader::new(Cursor::new(bytes))) -} - -fn read_zip_sample_lines_from_bytes( - bytes: &[u8], - selected_entry: &str, -) -> Result, RuntimeError> { - let mut archive = ZipArchive::new(Cursor::new(bytes)) - .map_err(|err| RuntimeError::Io(format!("failed to read zip bytes: {err}")))?; - let mut entry = archive.by_name(selected_entry).map_err(|err| { - RuntimeError::Io(format!( - "failed to open zip entry {selected_entry} from bytes: {err}" - )) - })?; - if selected_entry.to_ascii_lowercase().ends_with(".vcf.gz") { - let inner = read_entry_limited( - &mut entry, - MAX_ZIP_SAMPLE_ENTRY_BYTES, - &format!("compressed zip entry {selected_entry}"), - )?; - let reader = bgzf::io::Reader::new(Cursor::new(inner)); - return 
read_sample_lines_from_reader(BufReader::new(reader)); - } - read_sample_lines_from_reader(BufReader::new(entry)) -} - -fn select_zip_entry_from_bytes(bytes: &[u8]) -> Result { - let mut archive = ZipArchive::new(Cursor::new(bytes)) - .map_err(|err| RuntimeError::Io(format!("failed to read zip bytes: {err}")))?; - let mut fallback = None; - for idx in 0..archive.len() { - let entry = archive - .by_index(idx) - .map_err(|err| RuntimeError::Io(format!("failed to inspect zip bytes: {err}")))?; - if entry.is_dir() { - continue; - } - let name = entry.name().to_owned(); - if name.starts_with("__MACOSX/") { - continue; - } - let lower = name.to_ascii_lowercase(); - if lower.ends_with(".vcf") - || lower.ends_with(".vcf.gz") - || lower.ends_with(".txt") - || lower.ends_with(".tsv") - || lower.ends_with(".csv") - { - return Ok(name); - } - if fallback.is_none() { - fallback = Some(name); - } - } - fallback.ok_or_else(|| { - RuntimeError::Unsupported("zip archive does not contain a supported file".to_owned()) - }) -} - pub fn inspect_file(path: &Path, options: &InspectOptions) -> Result { let started = Instant::now(); let lower = path.to_string_lossy().to_ascii_lowercase(); @@ -455,515 +332,11 @@ fn inspect_from_textual_sample( } } -fn read_plain_sample_lines(path: &Path) -> Result, RuntimeError> { - let lower = path.to_string_lossy().to_ascii_lowercase(); - let file = File::open(path) - .map_err(|err| RuntimeError::Io(format!("failed to open {}: {err}", path.display())))?; - if lower.ends_with(".vcf.gz") { - return read_sample_lines_from_reader(BufReader::new(bgzf::io::Reader::new(file))); - } - read_sample_lines_from_reader(BufReader::new(file)) -} - -fn read_zip_sample_lines(path: &Path, selected_entry: &str) -> Result, RuntimeError> { - let file = File::open(path) - .map_err(|err| RuntimeError::Io(format!("failed to open zip {}: {err}", path.display())))?; - let mut archive = ZipArchive::new(file) - .map_err(|err| RuntimeError::Io(format!("failed to read zip {}: {err}", 
path.display())))?; - let mut entry = archive.by_name(selected_entry).map_err(|err| { - RuntimeError::Io(format!( - "failed to open zip entry {selected_entry} in {}: {err}", - path.display() - )) - })?; - - if selected_entry.to_ascii_lowercase().ends_with(".vcf.gz") { - let bytes = read_entry_limited( - &mut entry, - MAX_ZIP_SAMPLE_ENTRY_BYTES, - &format!( - "compressed zip entry {selected_entry} in {}", - path.display() - ), - )?; - let reader = bgzf::io::Reader::new(Cursor::new(bytes)); - return read_sample_lines_from_reader(BufReader::new(reader)); - } - - read_sample_lines_from_reader(BufReader::new(entry)) -} - -fn read_entry_limited( - reader: &mut R, - max_bytes: u64, - label: &str, -) -> Result, RuntimeError> { - let mut bytes = Vec::new(); - reader - .take(max_bytes.saturating_add(1)) - .read_to_end(&mut bytes) - .map_err(|err| RuntimeError::Io(format!("failed to read {label}: {err}")))?; - if u64::try_from(bytes.len()).unwrap_or(u64::MAX) > max_bytes { - return Err(RuntimeError::InvalidArguments(format!( - "{label} exceeds decompressed limit of {max_bytes} bytes" - ))); - } - Ok(bytes) -} - -fn read_sample_lines_from_reader(mut reader: R) -> Result, RuntimeError> { - let mut out = Vec::new(); - let mut buf = String::new(); - for _ in 0..64 { - buf.clear(); - let bytes = reader - .read_line(&mut buf) - .map_err(|err| RuntimeError::Io(format!("failed to read sample lines: {err}")))?; - if bytes == 0 { - break; - } - out.push(buf.trim_end_matches(['\n', '\r']).to_owned()); - } - Ok(out) -} - -fn select_zip_entry(path: &Path) -> Result { - let file = File::open(path) - .map_err(|err| RuntimeError::Io(format!("failed to open zip {}: {err}", path.display())))?; - let mut archive = ZipArchive::new(file) - .map_err(|err| RuntimeError::Io(format!("failed to read zip {}: {err}", path.display())))?; - let mut fallback = None; - for idx in 0..archive.len() { - let entry = archive.by_index(idx).map_err(|err| { - RuntimeError::Io(format!("failed to inspect zip {}: 
{err}", path.display())) - })?; - if entry.is_dir() { - continue; - } - let name = entry.name().to_owned(); - if name.starts_with("__MACOSX/") { - continue; - } - let lower = name.to_ascii_lowercase(); - if lower.ends_with(".vcf") - || lower.ends_with(".vcf.gz") - || lower.ends_with(".txt") - || lower.ends_with(".tsv") - || lower.ends_with(".csv") - { - return Ok(name); - } - if fallback.is_none() { - fallback = Some(name); - } - } - fallback.ok_or_else(|| { - RuntimeError::Unsupported(format!( - "zip archive {} does not contain a supported file", - path.display() - )) - }) -} - -fn looks_like_vcf_lines(lines: &[String]) -> bool { - lines.iter().any(|line| { - let trimmed = line.trim_start(); - trimmed.starts_with("##fileformat=VCF") || trimmed.starts_with("#CHROM\t") - }) -} - -fn looks_like_genotype_text(lines: &[String]) -> bool { - let mut checked = 0usize; - let mut valid = 0usize; - for line in lines { - let trimmed = line.trim(); - if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with("//") { - continue; - } - let fields = split_fields(trimmed); - checked += 1; - if matches_genotype_shape(&fields) { - valid += 1; - } - } - checked > 0 && valid * 10 >= checked * 7 -} - -fn split_fields(line: &str) -> Vec { - if line.contains('\t') { - return line - .split('\t') - .map(|field| field.trim().to_owned()) - .collect(); - } - if line.contains(',') { - return line - .split(',') - .map(|field| field.trim().trim_matches('"').to_owned()) - .collect(); - } - line.split_whitespace().map(str::to_owned).collect() -} - -fn matches_genotype_shape(fields: &[String]) -> bool { - if fields.len() < 4 { - return false; - } - let rsid_like = fields[0].starts_with("rs") || fields[0].starts_with('i'); - if !rsid_like { - return false; - } - let chr_idx = fields.iter().position(|field| is_valid_chromosome(field)); - let Some(chr_idx) = chr_idx else { - return false; - }; - for pos_idx in (chr_idx + 1)..fields.len() { - if fields[pos_idx].parse::().is_err() { - 
continue; - } - for field in fields.iter().skip(pos_idx + 1) { - if is_valid_genotype(field) { - return true; - } - } - if pos_idx + 2 < fields.len() - && is_valid_allele(&fields[pos_idx + 1]) - && is_valid_allele(&fields[pos_idx + 2]) - { - return true; - } - } - false -} - -fn is_valid_chromosome(value: &str) -> bool { - let trimmed = value.trim().trim_start_matches("chr"); - if let Ok(n) = trimmed.parse::() { - return (1..=26).contains(&n); - } - matches!( - trimmed.to_ascii_uppercase().as_str(), - "X" | "Y" | "M" | "MT" | "XY" - ) -} - -fn is_valid_genotype(value: &str) -> bool { - let trimmed = value.trim().to_ascii_uppercase(); - if trimmed.is_empty() || trimmed.len() > 4 { - return false; - } - trimmed - .chars() - .all(|ch| matches!(ch, 'A' | 'C' | 'G' | 'T' | 'I' | 'D' | '-' | '0')) -} - -fn is_valid_allele(value: &str) -> bool { - let trimmed = value.trim().to_ascii_uppercase(); - matches!( - trimmed.as_str(), - "A" | "C" | "G" | "T" | "I" | "D" | "-" | "0" - ) -} - -fn detect_source( - lower_name: &str, - sample_lines: &[String], - kind: DetectedKind, -) -> Option { - let header = sample_lines - .iter() - .filter(|line| line.starts_with('#') || line.starts_with("//")) - .map(|line| line.to_ascii_lowercase()) - .collect::>() - .join("\n"); - let combined = format!("{lower_name}\n{header}"); - let normalized = combined.replace(['_', '-', '.'], " "); - let mut evidence = Vec::new(); - let mut vendor = None; - let mut platform_version = None; - let mut confidence = DetectionConfidence::Unknown; - - if normalized.contains("genes for good") || normalized.contains("geneforgood") { - vendor = Some("Genes for Good".to_owned()); - confidence = DetectionConfidence::StrongHeuristic; - evidence.push("Genes for Good header".to_owned()); - if let Some(version) = extract_token_after_marker(&header, "genes for good ") { - platform_version = Some(version); - evidence.push("Genes for Good version header".to_owned()); - } - } else if normalized.contains("23andme") || 
normalized.contains("23&me") { - vendor = Some("23andMe".to_owned()); - confidence = DetectionConfidence::StrongHeuristic; - evidence.push("23andMe header/export name".to_owned()); - if normalized.contains(" v2 ") || lower_name.contains("/v2/") { - platform_version = Some("v2".to_owned()); - evidence.push("v2 token".to_owned()); - } else if normalized.contains(" v3 ") || lower_name.contains("/v3/") { - platform_version = Some("v3".to_owned()); - evidence.push("v3 token".to_owned()); - } else if normalized.contains(" v4 ") || lower_name.contains("/v4/") { - platform_version = Some("v4".to_owned()); - evidence.push("v4 token".to_owned()); - } else if normalized.contains(" v5 ") || lower_name.contains("/v5/") { - platform_version = Some("v5".to_owned()); - evidence.push("v5 token".to_owned()); - } - } else if normalized.contains("ancestrydna") || normalized.contains("ancestry com dna") { - vendor = Some("AncestryDNA".to_owned()); - confidence = DetectionConfidence::StrongHeuristic; - evidence.push("AncestryDNA header/export name".to_owned()); - if let Some(version) = extract_after_marker(&header, "array version:") { - platform_version = Some(canonicalize_ancestry_version(&version)); - evidence.push("AncestryDNA array version header".to_owned()); - } - } else if normalized.contains("family tree dna") - || normalized.contains("familytreedna") - || normalized.contains("ftdna") - { - vendor = Some("FamilyTreeDNA".to_owned()); - confidence = DetectionConfidence::StrongHeuristic; - evidence.push("FamilyTreeDNA header/export name".to_owned()); - } else if normalized.contains("dynamic dna") - || normalized.contains("dynamicdnalabs") - || normalized.contains("ddna laboratories") - || normalized.contains("ddna") - { - vendor = Some("Dynamic DNA".to_owned()); - confidence = DetectionConfidence::StrongHeuristic; - evidence.push("Dynamic DNA header".to_owned()); - if normalized.contains("gsav3 dtc") { - platform_version = Some("GSAv3-DTC".to_owned()); - evidence.push("GSAv3-DTC 
token".to_owned()); - } else if normalized.contains("gsav3") { - platform_version = Some("GSAv3".to_owned()); - evidence.push("GSAv3 token".to_owned()); - } - } else if normalized.contains("myheritage") { - vendor = Some("MyHeritage".to_owned()); - confidence = DetectionConfidence::StrongHeuristic; - evidence.push("MyHeritage header/export name".to_owned()); - } else if normalized.contains("sequencing com") && kind == DetectedKind::Vcf { - vendor = Some("Sequencing.com".to_owned()); - confidence = DetectionConfidence::WeakHeuristic; - evidence.push("sequencing.com header text".to_owned()); - } else if normalized.contains("carigenetics") || normalized.contains("cari genetics") { - vendor = Some("CariGenetics".to_owned()); - confidence = DetectionConfidence::StrongHeuristic; - evidence.push("CariGenetics path/header text".to_owned()); - } - - vendor.map(|vendor| SourceMetadata { - vendor: Some(vendor), - platform_version, - confidence, - evidence, - }) -} - -fn extract_after_marker(text: &str, marker: &str) -> Option { - text.lines().find_map(|line| { - let trimmed = line.trim(); - let lower = trimmed.to_ascii_lowercase(); - lower.find(marker).map(|idx| { - trimmed[idx + marker.len()..] 
- .trim() - .trim_end_matches('.') - .to_owned() - }) - }) -} - -fn extract_token_after_marker(text: &str, marker: &str) -> Option { - extract_after_marker(text, marker).map(|value| { - value - .split_whitespace() - .next() - .unwrap_or_default() - .trim_end_matches(':') - .to_owned() - }) -} - -fn canonicalize_ancestry_version(value: &str) -> String { - let trimmed = value.trim(); - if let Some(rest) = trimmed.strip_prefix('v') { - return format!("V{rest}"); - } - trimmed.to_owned() -} - -fn detect_assembly(lower_name: &str, sample_lines: &[String]) -> Option { - let header = sample_lines.join("\n").to_ascii_lowercase(); - let combined = format!("{lower_name}\n{header}"); - let looks_like_grch38 = combined.contains("build 38") - || combined.contains("grch38") - || combined.contains("hg38") - || combined.contains("gca_000001405.15") - || combined.contains("grch38_no_alt_analysis_set") - || combined.contains("##contig="); - - if looks_like_grch38 { - Some(Assembly::Grch38) - } else if combined.contains("build 37") - || combined.contains("grch37") - || combined.contains("hg19") - || combined.contains("assembly=b37") - || combined.contains("assembly=\"b37\"") - || combined.contains("human_g1k_v37") - || combined.contains("37.1") - { - Some(Assembly::Grch37) - } else { - None - } -} - -fn detect_vcf_phasing(lines: &[String]) -> Option { - let mut saw_slash = false; - for line in lines { - if line.starts_with('#') { - continue; - } - let fields: Vec<&str> = line.split('\t').collect(); - if fields.len() < 10 { - continue; - } - let gt = fields[9].split(':').next().unwrap_or_default().trim(); - if gt.contains('|') { - return Some(true); - } - if gt.contains('/') { - saw_slash = true; - } - } - saw_slash.then_some(false) -} - -fn detect_index( - path: &Path, - kind: DetectedKind, - options: &InspectOptions, -) -> (Option, Option) { - if let Some(index) = options - .input_index - .as_ref() - .or(options.reference_index.as_ref()) - { - return (Some(index.exists()), 
Some(index.clone())); - } - - match kind { - DetectedKind::AlignmentCram => { - let candidate = if path - .to_string_lossy() - .to_ascii_lowercase() - .ends_with(".cram") - { - let first = path.with_extension("cram.crai"); - if first.exists() { - Some(first) - } else { - Some(path.with_extension("crai")) - } - } else { - None - }; - match candidate { - Some(candidate) => (Some(candidate.exists()), Some(candidate)), - None => (Some(false), None), - } - } - DetectedKind::AlignmentBam => { - let first = path.with_extension("bam.bai"); - if first.exists() { - return (Some(true), Some(first)); - } - let second = path.with_extension("bai"); - (Some(second.exists()), Some(second)) - } - DetectedKind::ReferenceFasta => { - let candidate = if let Some(ext) = path.extension().and_then(|ext| ext.to_str()) { - path.with_extension(format!("{ext}.fai")) - } else { - path.with_extension("fai") - }; - (Some(candidate.exists()), Some(candidate)) - } - _ => (None, None), - } -} - -fn is_reference_path(path: &Path) -> bool { - let lower = path.to_string_lossy().to_ascii_lowercase(); - lower.ends_with(".fa") || lower.ends_with(".fasta") -} - -fn classify_confidence( - kind: DetectedKind, - sample_lines: &[String], - source: Option<&SourceMetadata>, -) -> DetectionConfidence { - match kind { - DetectedKind::Vcf if looks_like_vcf_lines(sample_lines) => { - DetectionConfidence::Authoritative - } - DetectedKind::AlignmentCram | DetectedKind::AlignmentBam | DetectedKind::ReferenceFasta => { - DetectionConfidence::Authoritative - } - DetectedKind::GenotypeText if source.is_some() => DetectionConfidence::StrongHeuristic, - DetectedKind::GenotypeText => DetectionConfidence::WeakHeuristic, - DetectedKind::Unknown => DetectionConfidence::Unknown, - DetectedKind::Vcf => DetectionConfidence::StrongHeuristic, - } -} - -fn render_container(value: FileContainer) -> &'static str { - match value { - FileContainer::Plain => "plain", - FileContainer::Zip => "zip", - } -} - -fn render_kind(value: 
DetectedKind) -> &'static str { - match value { - DetectedKind::GenotypeText => "genotype_text", - DetectedKind::Vcf => "vcf", - DetectedKind::AlignmentCram => "alignment_cram", - DetectedKind::AlignmentBam => "alignment_bam", - DetectedKind::ReferenceFasta => "reference_fasta", - DetectedKind::Unknown => "unknown", - } -} - -fn render_confidence(value: DetectionConfidence) -> &'static str { - match value { - DetectionConfidence::Authoritative => "authoritative", - DetectionConfidence::StrongHeuristic => "strong_heuristic", - DetectionConfidence::WeakHeuristic => "weak_heuristic", - DetectionConfidence::Unknown => "unknown", - } -} - -fn render_assembly(value: Option) -> &'static str { - match value { - Some(Assembly::Grch37) => "grch37", - Some(Assembly::Grch38) => "grch38", - None => "", - } -} - -fn render_bool(value: Option) -> &'static str { - match value { - Some(true) => "true", - Some(false) => "false", - None => "", - } -} - #[cfg(test)] mod tests { use super::*; - use std::io::Write as _; + use noodles::bgzf; + use std::io::{Cursor, Write as _}; use std::path::PathBuf; #[test] @@ -1140,6 +513,25 @@ mod tests { let missing = read_zip_sample_lines_from_bytes(&zip_bytes, "missing.vcf").unwrap_err(); assert!(missing.to_string().contains("failed to open zip entry")); + assert_eq!( + read_plain_sample_lines_from_bytes("sample.txt", b"rs1\t1\t10\tAG\n") + .unwrap() + .len(), + 1 + ); + assert!( + read_zip_sample_lines_from_bytes(b"not a zip", "sample.txt") + .unwrap_err() + .to_string() + .contains("failed to read zip bytes") + ); + assert!( + select_zip_entry_from_bytes(b"not a zip") + .unwrap_err() + .to_string() + .contains("failed to read zip bytes") + ); + let dir = std::env::temp_dir().join(format!("bioscript-inspect-unit-{}", std::process::id())); std::fs::create_dir_all(&dir).unwrap(); @@ -1223,6 +615,7 @@ mod tests { let bytes = writer.finish().unwrap().into_inner(); std::fs::write(&zip_path, &bytes).unwrap(); 
assert_eq!(select_zip_entry(&zip_path).unwrap(), "notes.bin"); + assert_eq!(select_zip_entry_from_bytes(&bytes).unwrap(), "notes.bin"); let zip_gz_path = dir.join("vcf-gz.zip"); let cursor = Cursor::new(Vec::new()); @@ -1253,6 +646,14 @@ mod tests { err.to_string() .contains("does not contain a supported file") ); + assert!( + select_zip_entry_from_bytes(&std::fs::read(&empty_zip_path).unwrap()) + .unwrap_err() + .to_string() + .contains("does not contain a supported file") + ); + let err = read_zip_sample_lines(&zip_gz_path, "missing.vcf").unwrap_err(); + assert!(err.to_string().contains("failed to open zip entry")); let source = detect_source( "dynamicdna.txt", diff --git a/rust/bioscript-formats/src/inspect/heuristics.rs b/rust/bioscript-formats/src/inspect/heuristics.rs new file mode 100644 index 0000000..84f396a --- /dev/null +++ b/rust/bioscript-formats/src/inspect/heuristics.rs @@ -0,0 +1,357 @@ +use std::path::{Path, PathBuf}; + +use bioscript_core::Assembly; + +use super::{DetectedKind, DetectionConfidence, InspectOptions, SourceMetadata}; + +pub(crate) fn looks_like_vcf_lines(lines: &[String]) -> bool { + lines.iter().any(|line| { + let trimmed = line.trim_start(); + trimmed.starts_with("##fileformat=VCF") || trimmed.starts_with("#CHROM\t") + }) +} + +pub(crate) fn looks_like_genotype_text(lines: &[String]) -> bool { + let mut checked = 0usize; + let mut valid = 0usize; + for line in lines { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with("//") { + continue; + } + let fields = split_fields(trimmed); + checked += 1; + if matches_genotype_shape(&fields) { + valid += 1; + } + } + checked > 0 && valid * 10 >= checked * 7 +} + +pub(crate) fn split_fields(line: &str) -> Vec { + if line.contains('\t') { + return line + .split('\t') + .map(|field| field.trim().to_owned()) + .collect(); + } + if line.contains(',') { + return line + .split(',') + .map(|field| field.trim().trim_matches('"').to_owned()) + 
.collect(); + } + line.split_whitespace().map(str::to_owned).collect() +} + +pub(crate) fn matches_genotype_shape(fields: &[String]) -> bool { + if fields.len() < 4 { + return false; + } + let rsid_like = fields[0].starts_with("rs") || fields[0].starts_with('i'); + if !rsid_like { + return false; + } + let chr_idx = fields.iter().position(|field| is_valid_chromosome(field)); + let Some(chr_idx) = chr_idx else { + return false; + }; + for pos_idx in (chr_idx + 1)..fields.len() { + if fields[pos_idx].parse::().is_err() { + continue; + } + for field in fields.iter().skip(pos_idx + 1) { + if is_valid_genotype(field) { + return true; + } + } + if pos_idx + 2 < fields.len() + && is_valid_allele(&fields[pos_idx + 1]) + && is_valid_allele(&fields[pos_idx + 2]) + { + return true; + } + } + false +} + +fn is_valid_chromosome(value: &str) -> bool { + let trimmed = value.trim().trim_start_matches("chr"); + if let Ok(n) = trimmed.parse::() { + return (1..=26).contains(&n); + } + matches!( + trimmed.to_ascii_uppercase().as_str(), + "X" | "Y" | "M" | "MT" | "XY" + ) +} + +pub(crate) fn is_valid_genotype(value: &str) -> bool { + let trimmed = value.trim().to_ascii_uppercase(); + if trimmed.is_empty() || trimmed.len() > 4 { + return false; + } + trimmed + .chars() + .all(|ch| matches!(ch, 'A' | 'C' | 'G' | 'T' | 'I' | 'D' | '-' | '0')) +} + +pub(crate) fn is_valid_allele(value: &str) -> bool { + let trimmed = value.trim().to_ascii_uppercase(); + matches!( + trimmed.as_str(), + "A" | "C" | "G" | "T" | "I" | "D" | "-" | "0" + ) +} + +pub(crate) fn detect_source( + lower_name: &str, + sample_lines: &[String], + kind: DetectedKind, +) -> Option { + let header = sample_lines + .iter() + .filter(|line| line.starts_with('#') || line.starts_with("//")) + .map(|line| line.to_ascii_lowercase()) + .collect::>() + .join("\n"); + let combined = format!("{lower_name}\n{header}"); + let normalized = combined.replace(['_', '-', '.'], " "); + let mut evidence = Vec::new(); + let mut vendor = None; 
+ let mut platform_version = None; + let mut confidence = DetectionConfidence::Unknown; + + if normalized.contains("genes for good") || normalized.contains("geneforgood") { + vendor = Some("Genes for Good".to_owned()); + confidence = DetectionConfidence::StrongHeuristic; + evidence.push("Genes for Good header".to_owned()); + if let Some(version) = extract_token_after_marker(&header, "genes for good ") { + platform_version = Some(version); + evidence.push("Genes for Good version header".to_owned()); + } + } else if normalized.contains("23andme") || normalized.contains("23&me") { + vendor = Some("23andMe".to_owned()); + confidence = DetectionConfidence::StrongHeuristic; + evidence.push("23andMe header/export name".to_owned()); + if normalized.contains(" v2 ") || lower_name.contains("/v2/") { + platform_version = Some("v2".to_owned()); + evidence.push("v2 token".to_owned()); + } else if normalized.contains(" v3 ") || lower_name.contains("/v3/") { + platform_version = Some("v3".to_owned()); + evidence.push("v3 token".to_owned()); + } else if normalized.contains(" v4 ") || lower_name.contains("/v4/") { + platform_version = Some("v4".to_owned()); + evidence.push("v4 token".to_owned()); + } else if normalized.contains(" v5 ") || lower_name.contains("/v5/") { + platform_version = Some("v5".to_owned()); + evidence.push("v5 token".to_owned()); + } + } else if normalized.contains("ancestrydna") || normalized.contains("ancestry com dna") { + vendor = Some("AncestryDNA".to_owned()); + confidence = DetectionConfidence::StrongHeuristic; + evidence.push("AncestryDNA header/export name".to_owned()); + if let Some(version) = extract_after_marker(&header, "array version:") { + platform_version = Some(canonicalize_ancestry_version(&version)); + evidence.push("AncestryDNA array version header".to_owned()); + } + } else if normalized.contains("family tree dna") + || normalized.contains("familytreedna") + || normalized.contains("ftdna") + { + vendor = Some("FamilyTreeDNA".to_owned()); + 
confidence = DetectionConfidence::StrongHeuristic; + evidence.push("FamilyTreeDNA header/export name".to_owned()); + } else if normalized.contains("dynamic dna") + || normalized.contains("dynamicdnalabs") + || normalized.contains("ddna laboratories") + || normalized.contains("ddna") + { + vendor = Some("Dynamic DNA".to_owned()); + confidence = DetectionConfidence::StrongHeuristic; + evidence.push("Dynamic DNA header".to_owned()); + if normalized.contains("gsav3 dtc") { + platform_version = Some("GSAv3-DTC".to_owned()); + evidence.push("GSAv3-DTC token".to_owned()); + } else if normalized.contains("gsav3") { + platform_version = Some("GSAv3".to_owned()); + evidence.push("GSAv3 token".to_owned()); + } + } else if normalized.contains("myheritage") { + vendor = Some("MyHeritage".to_owned()); + confidence = DetectionConfidence::StrongHeuristic; + evidence.push("MyHeritage header/export name".to_owned()); + } else if normalized.contains("sequencing com") && kind == DetectedKind::Vcf { + vendor = Some("Sequencing.com".to_owned()); + confidence = DetectionConfidence::WeakHeuristic; + evidence.push("sequencing.com header text".to_owned()); + } else if normalized.contains("carigenetics") || normalized.contains("cari genetics") { + vendor = Some("CariGenetics".to_owned()); + confidence = DetectionConfidence::StrongHeuristic; + evidence.push("CariGenetics path/header text".to_owned()); + } + + vendor.map(|vendor| SourceMetadata { + vendor: Some(vendor), + platform_version, + confidence, + evidence, + }) +} + +fn extract_after_marker(text: &str, marker: &str) -> Option { + text.lines().find_map(|line| { + let trimmed = line.trim(); + let lower = trimmed.to_ascii_lowercase(); + lower.find(marker).map(|idx| { + trimmed[idx + marker.len()..] 
+ .trim() + .trim_end_matches('.') + .to_owned() + }) + }) +} + +fn extract_token_after_marker(text: &str, marker: &str) -> Option { + extract_after_marker(text, marker).map(|value| { + value + .split_whitespace() + .next() + .unwrap_or_default() + .trim_end_matches(':') + .to_owned() + }) +} + +pub(crate) fn canonicalize_ancestry_version(value: &str) -> String { + let trimmed = value.trim(); + if let Some(rest) = trimmed.strip_prefix('v') { + return format!("V{rest}"); + } + trimmed.to_owned() +} + +pub(crate) fn detect_assembly(lower_name: &str, sample_lines: &[String]) -> Option { + let header = sample_lines.join("\n").to_ascii_lowercase(); + let combined = format!("{lower_name}\n{header}"); + let looks_like_grch38 = combined.contains("build 38") + || combined.contains("grch38") + || combined.contains("hg38") + || combined.contains("gca_000001405.15") + || combined.contains("grch38_no_alt_analysis_set") + || combined.contains("##contig="); + + if looks_like_grch38 { + Some(Assembly::Grch38) + } else if combined.contains("build 37") + || combined.contains("grch37") + || combined.contains("hg19") + || combined.contains("assembly=b37") + || combined.contains("assembly=\"b37\"") + || combined.contains("human_g1k_v37") + || combined.contains("37.1") + { + Some(Assembly::Grch37) + } else { + None + } +} + +pub(crate) fn detect_vcf_phasing(lines: &[String]) -> Option { + let mut saw_slash = false; + for line in lines { + if line.starts_with('#') { + continue; + } + let fields: Vec<&str> = line.split('\t').collect(); + if fields.len() < 10 { + continue; + } + let gt = fields[9].split(':').next().unwrap_or_default().trim(); + if gt.contains('|') { + return Some(true); + } + if gt.contains('/') { + saw_slash = true; + } + } + saw_slash.then_some(false) +} + +pub(crate) fn detect_index( + path: &Path, + kind: DetectedKind, + options: &InspectOptions, +) -> (Option, Option) { + if let Some(index) = options + .input_index + .as_ref() + .or(options.reference_index.as_ref()) + 
{ + return (Some(index.exists()), Some(index.clone())); + } + + match kind { + DetectedKind::AlignmentCram => { + let candidate = if path + .to_string_lossy() + .to_ascii_lowercase() + .ends_with(".cram") + { + let first = path.with_extension("cram.crai"); + if first.exists() { + Some(first) + } else { + Some(path.with_extension("crai")) + } + } else { + None + }; + match candidate { + Some(candidate) => (Some(candidate.exists()), Some(candidate)), + None => (Some(false), None), + } + } + DetectedKind::AlignmentBam => { + let first = path.with_extension("bam.bai"); + if first.exists() { + return (Some(true), Some(first)); + } + let second = path.with_extension("bai"); + (Some(second.exists()), Some(second)) + } + DetectedKind::ReferenceFasta => { + let candidate = if let Some(ext) = path.extension().and_then(|ext| ext.to_str()) { + path.with_extension(format!("{ext}.fai")) + } else { + path.with_extension("fai") + }; + (Some(candidate.exists()), Some(candidate)) + } + _ => (None, None), + } +} + +pub(crate) fn is_reference_path(path: &Path) -> bool { + let lower = path.to_string_lossy().to_ascii_lowercase(); + lower.ends_with(".fa") || lower.ends_with(".fasta") +} + +pub(crate) fn classify_confidence( + kind: DetectedKind, + sample_lines: &[String], + source: Option<&SourceMetadata>, +) -> DetectionConfidence { + match kind { + DetectedKind::Vcf if looks_like_vcf_lines(sample_lines) => { + DetectionConfidence::Authoritative + } + DetectedKind::AlignmentCram | DetectedKind::AlignmentBam | DetectedKind::ReferenceFasta => { + DetectionConfidence::Authoritative + } + DetectedKind::GenotypeText if source.is_some() => DetectionConfidence::StrongHeuristic, + DetectedKind::GenotypeText => DetectionConfidence::WeakHeuristic, + DetectedKind::Unknown => DetectionConfidence::Unknown, + DetectedKind::Vcf => DetectionConfidence::StrongHeuristic, + } +} diff --git a/rust/bioscript-formats/src/inspect/io.rs b/rust/bioscript-formats/src/inspect/io.rs new file mode 100644 index 
0000000..a6f06a2 --- /dev/null +++ b/rust/bioscript-formats/src/inspect/io.rs @@ -0,0 +1,192 @@ +use std::{ + fs::File, + io::{BufRead, BufReader, Cursor, Read}, + path::Path, +}; + +use bioscript_core::RuntimeError; +use noodles::bgzf; +use zip::ZipArchive; + +const MAX_ZIP_SAMPLE_ENTRY_BYTES: u64 = 128 * 1024 * 1024; + +pub(crate) fn read_plain_sample_lines_from_bytes( + lower_name: &str, + bytes: &[u8], +) -> Result, RuntimeError> { + if lower_name.ends_with(".vcf.gz") { + return read_sample_lines_from_reader(BufReader::new(bgzf::io::Reader::new(Cursor::new( + bytes, + )))); + } + read_sample_lines_from_reader(BufReader::new(Cursor::new(bytes))) +} + +pub(crate) fn read_zip_sample_lines_from_bytes( + bytes: &[u8], + selected_entry: &str, +) -> Result, RuntimeError> { + let mut archive = ZipArchive::new(Cursor::new(bytes)) + .map_err(|err| RuntimeError::Io(format!("failed to read zip bytes: {err}")))?; + let mut entry = archive.by_name(selected_entry).map_err(|err| { + RuntimeError::Io(format!( + "failed to open zip entry {selected_entry} from bytes: {err}" + )) + })?; + if selected_entry.to_ascii_lowercase().ends_with(".vcf.gz") { + let inner = read_entry_limited( + &mut entry, + MAX_ZIP_SAMPLE_ENTRY_BYTES, + &format!("compressed zip entry {selected_entry}"), + )?; + let reader = bgzf::io::Reader::new(Cursor::new(inner)); + return read_sample_lines_from_reader(BufReader::new(reader)); + } + read_sample_lines_from_reader(BufReader::new(entry)) +} + +pub(crate) fn select_zip_entry_from_bytes(bytes: &[u8]) -> Result { + let mut archive = ZipArchive::new(Cursor::new(bytes)) + .map_err(|err| RuntimeError::Io(format!("failed to read zip bytes: {err}")))?; + let mut fallback = None; + for idx in 0..archive.len() { + let entry = archive + .by_index(idx) + .map_err(|err| RuntimeError::Io(format!("failed to inspect zip bytes: {err}")))?; + if entry.is_dir() { + continue; + } + let name = entry.name().to_owned(); + if name.starts_with("__MACOSX/") { + continue; + } + let 
lower = name.to_ascii_lowercase(); + if lower.ends_with(".vcf") + || lower.ends_with(".vcf.gz") + || lower.ends_with(".txt") + || lower.ends_with(".tsv") + || lower.ends_with(".csv") + { + return Ok(name); + } + if fallback.is_none() { + fallback = Some(name); + } + } + fallback.ok_or_else(|| { + RuntimeError::Unsupported("zip archive does not contain a supported file".to_owned()) + }) +} + +pub(crate) fn read_plain_sample_lines(path: &Path) -> Result, RuntimeError> { + let lower = path.to_string_lossy().to_ascii_lowercase(); + let file = File::open(path) + .map_err(|err| RuntimeError::Io(format!("failed to open {}: {err}", path.display())))?; + if lower.ends_with(".vcf.gz") { + return read_sample_lines_from_reader(BufReader::new(bgzf::io::Reader::new(file))); + } + read_sample_lines_from_reader(BufReader::new(file)) +} + +pub(crate) fn read_zip_sample_lines( + path: &Path, + selected_entry: &str, +) -> Result, RuntimeError> { + let file = File::open(path) + .map_err(|err| RuntimeError::Io(format!("failed to open zip {}: {err}", path.display())))?; + let mut archive = ZipArchive::new(file) + .map_err(|err| RuntimeError::Io(format!("failed to read zip {}: {err}", path.display())))?; + let mut entry = archive.by_name(selected_entry).map_err(|err| { + RuntimeError::Io(format!( + "failed to open zip entry {selected_entry} in {}: {err}", + path.display() + )) + })?; + + if selected_entry.to_ascii_lowercase().ends_with(".vcf.gz") { + let bytes = read_entry_limited( + &mut entry, + MAX_ZIP_SAMPLE_ENTRY_BYTES, + &format!( + "compressed zip entry {selected_entry} in {}", + path.display() + ), + )?; + let reader = bgzf::io::Reader::new(Cursor::new(bytes)); + return read_sample_lines_from_reader(BufReader::new(reader)); + } + + read_sample_lines_from_reader(BufReader::new(entry)) +} + +pub(crate) fn read_entry_limited( + reader: &mut R, + max_bytes: u64, + label: &str, +) -> Result, RuntimeError> { + let mut bytes = Vec::new(); + reader + .take(max_bytes.saturating_add(1)) + 
.read_to_end(&mut bytes) + .map_err(|err| RuntimeError::Io(format!("failed to read {label}: {err}")))?; + if u64::try_from(bytes.len()).unwrap_or(u64::MAX) > max_bytes { + return Err(RuntimeError::InvalidArguments(format!( + "{label} exceeds decompressed limit of {max_bytes} bytes" + ))); + } + Ok(bytes) +} + +fn read_sample_lines_from_reader(mut reader: R) -> Result, RuntimeError> { + let mut out = Vec::new(); + let mut buf = String::new(); + for _ in 0..64 { + buf.clear(); + let bytes = reader + .read_line(&mut buf) + .map_err(|err| RuntimeError::Io(format!("failed to read sample lines: {err}")))?; + if bytes == 0 { + break; + } + out.push(buf.trim_end_matches(['\n', '\r']).to_owned()); + } + Ok(out) +} + +pub(crate) fn select_zip_entry(path: &Path) -> Result { + let file = File::open(path) + .map_err(|err| RuntimeError::Io(format!("failed to open zip {}: {err}", path.display())))?; + let mut archive = ZipArchive::new(file) + .map_err(|err| RuntimeError::Io(format!("failed to read zip {}: {err}", path.display())))?; + let mut fallback = None; + for idx in 0..archive.len() { + let entry = archive.by_index(idx).map_err(|err| { + RuntimeError::Io(format!("failed to inspect zip {}: {err}", path.display())) + })?; + if entry.is_dir() { + continue; + } + let name = entry.name().to_owned(); + if name.starts_with("__MACOSX/") { + continue; + } + let lower = name.to_ascii_lowercase(); + if lower.ends_with(".vcf") + || lower.ends_with(".vcf.gz") + || lower.ends_with(".txt") + || lower.ends_with(".tsv") + || lower.ends_with(".csv") + { + return Ok(name); + } + if fallback.is_none() { + fallback = Some(name); + } + } + fallback.ok_or_else(|| { + RuntimeError::Unsupported(format!( + "zip archive {} does not contain a supported file", + path.display() + )) + }) +} diff --git a/rust/bioscript-formats/src/inspect/render.rs b/rust/bioscript-formats/src/inspect/render.rs new file mode 100644 index 0000000..0cf700f --- /dev/null +++ b/rust/bioscript-formats/src/inspect/render.rs @@ 
-0,0 +1,102 @@ +use bioscript_core::Assembly; + +use super::{DetectedKind, DetectionConfidence, FileContainer, FileInspection}; + +impl FileInspection { + #[must_use] + pub fn render_text(&self) -> String { + let mut lines = Vec::new(); + lines.push(format!("path\t{}", self.path.display())); + lines.push(format!("container\t{}", render_container(self.container))); + lines.push(format!("kind\t{}", render_kind(self.detected_kind))); + lines.push(format!( + "confidence\t{}", + render_confidence(self.confidence) + )); + lines.push(format!("assembly\t{}", render_assembly(self.assembly))); + lines.push(format!("phased\t{}", render_bool(self.phased))); + lines.push(format!( + "selected_entry\t{}", + self.selected_entry.as_deref().unwrap_or("") + )); + lines.push(format!("has_index\t{}", render_bool(self.has_index))); + lines.push(format!( + "index_path\t{}", + self.index_path + .as_ref() + .map(|path| path.display().to_string()) + .unwrap_or_default() + )); + lines.push(format!( + "reference_matches\t{}", + render_bool(self.reference_matches) + )); + if let Some(source) = &self.source { + lines.push(format!( + "vendor\t{}", + source.vendor.as_deref().unwrap_or_default() + )); + lines.push(format!( + "platform_version\t{}", + source.platform_version.as_deref().unwrap_or_default() + )); + lines.push(format!( + "source_confidence\t{}", + render_confidence(source.confidence) + )); + lines.push(format!("source_evidence\t{}", source.evidence.join(" | "))); + } else { + lines.push("vendor\t".to_owned()); + lines.push("platform_version\t".to_owned()); + lines.push("source_confidence\t".to_owned()); + lines.push("source_evidence\t".to_owned()); + } + lines.push(format!("evidence\t{}", self.evidence.join(" | "))); + lines.push(format!("warnings\t{}", self.warnings.join(" | "))); + lines.push(format!("duration_ms\t{}", self.duration_ms)); + lines.join("\n") + } +} + +pub(crate) fn render_container(value: FileContainer) -> &'static str { + match value { + FileContainer::Plain => 
"plain", + FileContainer::Zip => "zip", + } +} + +pub(crate) fn render_kind(value: DetectedKind) -> &'static str { + match value { + DetectedKind::GenotypeText => "genotype_text", + DetectedKind::Vcf => "vcf", + DetectedKind::AlignmentCram => "alignment_cram", + DetectedKind::AlignmentBam => "alignment_bam", + DetectedKind::ReferenceFasta => "reference_fasta", + DetectedKind::Unknown => "unknown", + } +} + +pub(crate) fn render_confidence(value: DetectionConfidence) -> &'static str { + match value { + DetectionConfidence::Authoritative => "authoritative", + DetectionConfidence::StrongHeuristic => "strong_heuristic", + DetectionConfidence::WeakHeuristic => "weak_heuristic", + DetectionConfidence::Unknown => "unknown", + } +} + +pub(crate) fn render_assembly(value: Option) -> &'static str { + match value { + Some(Assembly::Grch37) => "grch37", + Some(Assembly::Grch38) => "grch38", + None => "", + } +} + +pub(crate) fn render_bool(value: Option) -> &'static str { + match value { + Some(true) => "true", + Some(false) => "false", + None => "", + } +} diff --git a/rust/bioscript-runtime/src/runtime.rs b/rust/bioscript-runtime/src/runtime.rs index d67d342..069a067 100644 --- a/rust/bioscript-runtime/src/runtime.rs +++ b/rust/bioscript-runtime/src/runtime.rs @@ -1,21 +1,41 @@ use std::{ - collections::{BTreeMap, HashMap}, + collections::BTreeMap, fs, - io::Read, path::{Component, Path, PathBuf}, - sync::{ - Arc, Mutex, - atomic::{AtomicU64, Ordering}, - }, + sync::Arc, time::{Duration, Instant}, }; -use bioscript_core::{GenomicLocus, RuntimeError, VariantKind, VariantSpec}; -use bioscript_formats::{GenotypeLoadOptions, GenotypeStore}; -use monty::{ - LimitedTracker, MontyException, MontyObject, MontyRun, NameLookupResult, PrintWriter, - ResourceLimits, RunProgress, +use bioscript_core::RuntimeError; +use monty::{LimitedTracker, MontyObject, MontyRun, NameLookupResult, PrintWriter, RunProgress}; + +mod args; +mod host_io; +mod methods; +mod objects; +mod state; +mod trace; 
+mod variants; + +#[cfg(test)] +use bioscript_core::VariantSpec; +use host_io::{deepest_existing_ancestor, host_read_text, host_write_text}; +use objects::bioscript_object; +#[cfg(test)] +use objects::{ + genotype_file_object, variant_object, variant_observation_object, variant_plan_object, +}; +pub use state::{RuntimeConfig, StageTiming}; +use state::{RuntimeState, monty_error}; +#[cfg(test)] +use trace::{ + ends_with_unescaped_backslash, extract_coordinate, extract_rsid, update_nesting_depth, }; +use trace::{host_trace, instrument_source, statement_context, trace_lookup_metadata}; +#[cfg(test)] +use variants::{dataclass_handle_id, dataclass_to_variant_spec, variant_specs_from_plan}; +#[cfg(test)] +use variants::{int_from_optional, string_from_optional, string_list_from_object, string_or_list}; type HostFunction = fn( &BioscriptRuntime, @@ -23,62 +43,6 @@ type HostFunction = fn( &[(MontyObject, MontyObject)], ) -> Result; -const MAX_HOST_TEXT_BYTES: u64 = 16 * 1024 * 1024; - -#[derive(Debug, Clone)] -pub struct RuntimeConfig { - pub limits: ResourceLimits, - pub loader: GenotypeLoadOptions, -} - -impl Default for RuntimeConfig { - fn default() -> Self { - let limits = ResourceLimits::new() - .max_duration(Duration::from_millis(100)) - .max_memory(8 * 1024 * 1024) - .max_allocations(200_000) - .gc_interval(1000) - .max_recursion_depth(Some(200)); - Self { - limits, - loader: GenotypeLoadOptions::default(), - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct StageTiming { - pub stage: String, - pub duration_ms: u128, - pub detail: String, -} - -fn monty_error(value: MontyException) -> RuntimeError { - RuntimeError::Monty(value.to_string()) -} - -struct RuntimeState { - next_handle: AtomicU64, - genotype_files: Mutex>, - trace_lines: Mutex>, - timings: Mutex>, -} - -impl RuntimeState { - fn new() -> Self { - Self { - next_handle: AtomicU64::new(1), - genotype_files: Mutex::new(HashMap::new()), - trace_lines: Mutex::new(Vec::new()), - timings: 
Mutex::new(Vec::new()), - } - } - - fn next_handle(&self) -> u64 { - self.next_handle.fetch_add(1, Ordering::Relaxed) - } -} - #[derive(Clone)] pub struct BioscriptRuntime { root: PathBuf, @@ -290,301 +254,6 @@ impl BioscriptRuntime { } } - fn method_load_genotypes( - &self, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], - ) -> Result { - let started = Instant::now(); - reject_kwargs(kwargs, "bioscript.load_genotypes")?; - if args.len() != 2 { - return Err(RuntimeError::InvalidArguments( - "bioscript.load_genotypes expects self and path".to_owned(), - )); - } - let path = self.resolve_existing_user_path(&expect_string_arg( - args, - 1, - "bioscript.load_genotypes", - )?)?; - let loader = self.resolved_loader_options()?; - let store = GenotypeStore::from_file_with_options(&path, &loader)?; - let handle = self.state.next_handle(); - self.state - .genotype_files - .lock() - .expect("genotype mutex poisoned") - .insert(handle, store); - self.record_timing( - "load_genotypes", - started.elapsed(), - format!("path={}", path.display()), - ); - Ok(genotype_file_object(handle)) - } - - fn resolved_loader_options(&self) -> Result { - let mut loader = self.config.loader.clone(); - loader.input_index = resolve_optional_loader_path(self, loader.input_index)?; - loader.reference_file = resolve_optional_loader_path(self, loader.reference_file)?; - loader.reference_index = resolve_optional_loader_path(self, loader.reference_index)?; - Ok(loader) - } - - fn method_genotype_get( - &self, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], - ) -> Result { - reject_kwargs(kwargs, "GenotypeFile.get")?; - if args.len() != 2 { - return Err(RuntimeError::InvalidArguments( - "GenotypeFile.get expects self and rsid".to_owned(), - )); - } - let handle = dataclass_handle_id(&args[0], "GenotypeFile")?; - let rsid = expect_string_arg(args, 1, "GenotypeFile.get")?; - let guard = self - .state - .genotype_files - .lock() - .expect("genotype mutex poisoned"); - let 
Some(store) = guard.get(&handle) else { - return Err(RuntimeError::InvalidArguments(format!( - "unknown genotype handle: {handle}" - ))); - }; - Ok(match store.get(&rsid)? { - Some(value) => MontyObject::String(value), - None => MontyObject::None, - }) - } - - fn method_variant( - &self, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], - ) -> Result { - if args.len() != 1 { - return Err(RuntimeError::InvalidArguments( - "bioscript.variant expects only self as a positional argument".to_owned(), - )); - } - let spec = variant_spec_from_kwargs(kwargs)?; - Ok(variant_object(&spec)) - } - - fn method_query_plan( - &self, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], - ) -> Result { - reject_kwargs(kwargs, "bioscript.query_plan")?; - if args.len() != 2 { - return Err(RuntimeError::InvalidArguments( - "bioscript.query_plan expects self and a list of variants".to_owned(), - )); - } - let variants = variant_specs_from_plan(&args[1])?; - Ok(variant_plan_object(&variants)) - } - - fn method_genotype_lookup_variant( - &self, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], - ) -> Result { - let started = Instant::now(); - reject_kwargs(kwargs, "GenotypeFile.lookup_variant")?; - if args.len() != 2 { - return Err(RuntimeError::InvalidArguments( - "GenotypeFile.lookup_variant expects self and variant".to_owned(), - )); - } - let handle = dataclass_handle_id(&args[0], "GenotypeFile")?; - let spec = dataclass_to_variant_spec(&args[1])?; - let guard = self - .state - .genotype_files - .lock() - .expect("genotype mutex poisoned"); - let Some(store) = guard.get(&handle) else { - return Err(RuntimeError::InvalidArguments(format!( - "unknown genotype handle: {handle}" - ))); - }; - let observation = store.lookup_variant(&spec)?; - self.record_timing( - "lookup_variant", - started.elapsed(), - format!("rsids={}", spec.rsids.join("|")), - ); - Ok(match observation.genotype { - Some(value) => MontyObject::String(value), - None => 
MontyObject::None, - }) - } - - fn method_genotype_lookup_variant_details( - &self, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], - ) -> Result { - let started = Instant::now(); - reject_kwargs(kwargs, "GenotypeFile.lookup_variant_details")?; - if args.len() != 2 { - return Err(RuntimeError::InvalidArguments( - "GenotypeFile.lookup_variant_details expects self and variant".to_owned(), - )); - } - let handle = dataclass_handle_id(&args[0], "GenotypeFile")?; - let spec = dataclass_to_variant_spec(&args[1])?; - let guard = self - .state - .genotype_files - .lock() - .expect("genotype mutex poisoned"); - let Some(store) = guard.get(&handle) else { - return Err(RuntimeError::InvalidArguments(format!( - "unknown genotype handle: {handle}" - ))); - }; - let observation = store.lookup_variant(&spec)?; - self.record_timing( - "lookup_variant_details", - started.elapsed(), - format!("rsids={}", spec.rsids.join("|")), - ); - Ok(variant_observation_object(&observation)) - } - - fn method_genotype_lookup_variants( - &self, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], - ) -> Result { - let started = Instant::now(); - reject_kwargs(kwargs, "GenotypeFile.lookup_variants")?; - if args.len() != 2 { - return Err(RuntimeError::InvalidArguments( - "GenotypeFile.lookup_variants expects self and a variant plan".to_owned(), - )); - } - let handle = dataclass_handle_id(&args[0], "GenotypeFile")?; - let specs = variant_specs_from_plan(&args[1])?; - let guard = self - .state - .genotype_files - .lock() - .expect("genotype mutex poisoned"); - let Some(store) = guard.get(&handle) else { - return Err(RuntimeError::InvalidArguments(format!( - "unknown genotype handle: {handle}" - ))); - }; - let observations = store.lookup_variants(&specs)?; - self.record_timing( - "lookup_variants", - started.elapsed(), - format!("count={}", specs.len()), - ); - Ok(MontyObject::List( - observations - .into_iter() - .map(|observation| match observation.genotype { - 
Some(value) => MontyObject::String(value), - None => MontyObject::None, - }) - .collect(), - )) - } - - fn method_genotype_lookup_variants_details( - &self, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], - ) -> Result { - let started = Instant::now(); - reject_kwargs(kwargs, "GenotypeFile.lookup_variants_details")?; - if args.len() != 2 { - return Err(RuntimeError::InvalidArguments( - "GenotypeFile.lookup_variants_details expects self and a variant plan".to_owned(), - )); - } - let handle = dataclass_handle_id(&args[0], "GenotypeFile")?; - let specs = variant_specs_from_plan(&args[1])?; - let guard = self - .state - .genotype_files - .lock() - .expect("genotype mutex poisoned"); - let Some(store) = guard.get(&handle) else { - return Err(RuntimeError::InvalidArguments(format!( - "unknown genotype handle: {handle}" - ))); - }; - let observations = store.lookup_variants(&specs)?; - self.record_timing( - "lookup_variants_details", - started.elapsed(), - format!("count={}", specs.len()), - ); - Ok(MontyObject::List( - observations - .iter() - .map(variant_observation_object) - .collect(), - )) - } - - fn method_write_tsv( - &self, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], - ) -> Result { - let started = Instant::now(); - reject_kwargs(kwargs, "bioscript.write_tsv")?; - if args.len() != 3 { - return Err(RuntimeError::InvalidArguments( - "bioscript.write_tsv expects self, path, rows".to_owned(), - )); - } - let path = - self.resolve_user_write_path(&expect_string_arg(args, 1, "bioscript.write_tsv")?)?; - let rows = expect_rows(&args[2])?; - if let Some(parent) = path.parent() { - fs::create_dir_all(parent).map_err(|err| { - RuntimeError::Io(format!( - "failed to create parent dir {}: {err}", - parent.display() - )) - })?; - } - let mut output = String::new(); - if let Some(first) = rows.first() { - let headers: Vec = first.keys().cloned().collect(); - output.push_str(&headers.join("\t")); - output.push('\n'); - for row in &rows { - 
let values: Vec = headers - .iter() - .map(|header| row.get(header).cloned().unwrap_or_default()) - .collect(); - output.push_str(&values.join("\t")); - output.push('\n'); - } - } - fs::write(&path, output).map_err(|err| { - RuntimeError::Io(format!("failed to write {}: {err}", path.display())) - })?; - self.record_timing( - "write_tsv", - started.elapsed(), - format!("path={} rows={}", path.display(), rows.len()), - ); - Ok(MontyObject::None) - } - fn record_timing(&self, stage: &str, duration: Duration, detail: String) { self.state .timings @@ -597,32 +266,6 @@ impl BioscriptRuntime { }); } - fn method_read_text( - &self, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], - ) -> Result { - if args.is_empty() { - return Err(RuntimeError::InvalidArguments( - "bioscript.read_text expects self and path".to_owned(), - )); - } - host_read_text(self, &args[1..], kwargs) - } - - fn method_write_text( - &self, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], - ) -> Result { - if args.is_empty() { - return Err(RuntimeError::InvalidArguments( - "bioscript.write_text expects self, path, text".to_owned(), - )); - } - host_write_text(self, &args[1..], kwargs) - } - fn resolve_user_path(&self, raw_path: &str) -> Result { let path = Path::new(raw_path); if path.is_absolute() { @@ -726,562 +369,6 @@ impl BioscriptRuntime { } } -fn trace_lookup_metadata(source: &str) -> (Option, Option) { - if let Some(rsid) = extract_rsid(source) { - let url = format!("https://www.ncbi.nlm.nih.gov/snp/{rsid}"); - return (Some(rsid), Some(url)); - } - - if let Some(coord) = extract_coordinate(source) { - let lower = source.to_ascii_lowercase(); - let host = if lower.contains("grch37") || lower.contains("hg19") { - "https://grch37.ensembl.org" - } else { - "https://www.ensembl.org" - }; - let url = format!("{host}/Homo_sapiens/Location/View?r={coord}"); - return (Some(coord), Some(url)); - } - - (None, None) -} - -fn statement_context(lines: &[&str], line_no: usize) -> 
String { - let Some(start_idx) = line_no.checked_sub(1) else { - return String::new(); - }; - let Some(first_line) = lines.get(start_idx) else { - return String::new(); - }; - - let mut out = String::from(first_line.trim()); - let mut depth = update_nesting_depth(0, first_line); - let mut current = start_idx + 1; - - while depth > 0 { - let Some(line) = lines.get(current) else { - break; - }; - if !out.is_empty() { - out.push(' '); - } - out.push_str(line.trim()); - depth = update_nesting_depth(depth, line); - current += 1; - } - - out -} - -fn extract_rsid(source: &str) -> Option { - let chars: Vec = source.chars().collect(); - let len = chars.len(); - let mut idx = 0; - while idx + 2 <= len { - if chars[idx] == 'r' - && chars.get(idx + 1) == Some(&'s') - && (idx == 0 || !chars[idx - 1].is_ascii_alphanumeric()) - { - let mut end = idx + 2; - while end < len && chars[end].is_ascii_digit() { - end += 1; - } - if end > idx + 2 { - return Some(chars[idx..end].iter().collect()); - } - } - idx += 1; - } - None -} - -fn extract_coordinate(source: &str) -> Option { - for token in source.split(|ch: char| { - ch.is_whitespace() || matches!(ch, '"' | '\'' | ',' | ')' | '(' | '[' | ']' | '{' | '}') - }) { - let cleaned = token.trim_matches(|ch: char| matches!(ch, ';')); - let normalized = cleaned.strip_prefix("chr").unwrap_or(cleaned); - if let Some((chrom, rest)) = normalized.split_once(':') - && !chrom.is_empty() - && chrom.chars().all(|ch| ch.is_ascii_alphanumeric()) - { - if let Some((start, end)) = rest.split_once('-') { - if start.chars().all(|ch| ch.is_ascii_digit()) - && end.chars().all(|ch| ch.is_ascii_digit()) - { - return Some(format!("{chrom}:{start}-{end}")); - } - } else if rest.chars().all(|ch| ch.is_ascii_digit()) { - return Some(format!("{chrom}:{rest}-{rest}")); - } - } - } - None -} - -fn bioscript_object() -> MontyObject { - MontyObject::Dataclass { - name: "Bioscript".to_owned(), - type_id: 1, - field_names: vec![], - attrs: vec![].into(), - frozen: true, 
- } -} - -fn genotype_file_object(handle_id: u64) -> MontyObject { - MontyObject::Dataclass { - name: "GenotypeFile".to_owned(), - type_id: 2, - field_names: vec!["handle_id".to_owned()], - attrs: vec![( - MontyObject::String("handle_id".to_owned()), - MontyObject::Int(handle_id as i64), - )] - .into(), - frozen: true, - } -} - -fn variant_object(spec: &VariantSpec) -> MontyObject { - let mut attrs = Vec::new(); - attrs.push(( - MontyObject::String("rsids".to_owned()), - MontyObject::List( - spec.rsids - .iter() - .cloned() - .map(MontyObject::String) - .collect(), - ), - )); - if let Some(locus) = &spec.grch37 { - attrs.push(( - MontyObject::String("grch37".to_owned()), - MontyObject::String(format!("{}:{}-{}", locus.chrom, locus.start, locus.end)), - )); - } - if let Some(locus) = &spec.grch38 { - attrs.push(( - MontyObject::String("grch38".to_owned()), - MontyObject::String(format!("{}:{}-{}", locus.chrom, locus.start, locus.end)), - )); - } - if let Some(reference) = &spec.reference { - attrs.push(( - MontyObject::String("reference".to_owned()), - MontyObject::String(reference.clone()), - )); - } - if let Some(alternate) = &spec.alternate { - attrs.push(( - MontyObject::String("alternate".to_owned()), - MontyObject::String(alternate.clone()), - )); - } - if let Some(kind) = spec.kind { - attrs.push(( - MontyObject::String("kind".to_owned()), - MontyObject::String(variant_kind_name(kind).to_owned()), - )); - } - if let Some(length) = spec.deletion_length { - attrs.push(( - MontyObject::String("deletion_length".to_owned()), - MontyObject::Int(length as i64), - )); - } - if !spec.motifs.is_empty() { - attrs.push(( - MontyObject::String("motifs".to_owned()), - MontyObject::List( - spec.motifs - .iter() - .cloned() - .map(MontyObject::String) - .collect(), - ), - )); - } - - MontyObject::Dataclass { - name: "Variant".to_owned(), - type_id: 3, - field_names: vec![ - "rsids".to_owned(), - "grch37".to_owned(), - "grch38".to_owned(), - "reference".to_owned(), - 
"alternate".to_owned(), - "kind".to_owned(), - "deletion_length".to_owned(), - "motifs".to_owned(), - ], - attrs: attrs.into(), - frozen: true, - } -} - -fn variant_plan_object(variants: &[VariantSpec]) -> MontyObject { - MontyObject::Dataclass { - name: "VariantPlan".to_owned(), - type_id: 4, - field_names: vec!["variants".to_owned()], - attrs: vec![( - MontyObject::String("variants".to_owned()), - MontyObject::List(variants.iter().map(variant_object).collect()), - )] - .into(), - frozen: true, - } -} - -fn variant_observation_object(observation: &bioscript_core::VariantObservation) -> MontyObject { - let mut attrs = vec![ - ( - MontyObject::String("backend".to_owned()), - MontyObject::String(observation.backend.clone()), - ), - ( - MontyObject::String("matched_rsid".to_owned()), - match &observation.matched_rsid { - Some(value) => MontyObject::String(value.clone()), - None => MontyObject::None, - }, - ), - ( - MontyObject::String("assembly".to_owned()), - match observation.assembly { - Some(assembly) => MontyObject::String(match assembly { - bioscript_core::Assembly::Grch37 => "grch37".to_owned(), - bioscript_core::Assembly::Grch38 => "grch38".to_owned(), - }), - None => MontyObject::None, - }, - ), - ( - MontyObject::String("genotype".to_owned()), - match &observation.genotype { - Some(value) => MontyObject::String(value.clone()), - None => MontyObject::None, - }, - ), - ( - MontyObject::String("ref_count".to_owned()), - observation.ref_count.map_or(MontyObject::None, |value| { - MontyObject::Int(i64::from(value)) - }), - ), - ( - MontyObject::String("alt_count".to_owned()), - observation.alt_count.map_or(MontyObject::None, |value| { - MontyObject::Int(i64::from(value)) - }), - ), - ( - MontyObject::String("depth".to_owned()), - observation.depth.map_or(MontyObject::None, |value| { - MontyObject::Int(i64::from(value)) - }), - ), - ( - MontyObject::String("decision".to_owned()), - match &observation.decision { - Some(value) => MontyObject::String(value.clone()), 
- None => MontyObject::None, - }, - ), - ( - MontyObject::String("raw_counts".to_owned()), - MontyObject::Dict( - observation - .raw_counts - .iter() - .map(|(base, count)| { - ( - MontyObject::String(base.clone()), - MontyObject::Int(i64::from(*count)), - ) - }) - .collect(), - ), - ), - ( - MontyObject::String("evidence".to_owned()), - MontyObject::List( - observation - .evidence - .iter() - .cloned() - .map(MontyObject::String) - .collect(), - ), - ), - ]; - - MontyObject::Dataclass { - name: "VariantObservation".to_owned(), - type_id: 5, - field_names: vec![ - "backend".to_owned(), - "matched_rsid".to_owned(), - "assembly".to_owned(), - "genotype".to_owned(), - "ref_count".to_owned(), - "alt_count".to_owned(), - "depth".to_owned(), - "decision".to_owned(), - "raw_counts".to_owned(), - "evidence".to_owned(), - ], - attrs: attrs.drain(..).collect(), - frozen: true, - } -} - -fn dataclass_handle_id(obj: &MontyObject, expected_name: &str) -> Result { - match obj { - MontyObject::Dataclass { name, attrs, .. } if name == expected_name => { - for (key, value) in attrs { - if matches!(key, MontyObject::String(text) if text == "handle_id") - && let MontyObject::Int(id) = value - { - return Ok(*id as u64); - } - } - Err(RuntimeError::InvalidArguments(format!( - "{expected_name} missing handle_id" - ))) - } - _ => Err(RuntimeError::InvalidArguments(format!( - "expected {expected_name} object" - ))), - } -} - -fn dataclass_to_variant_spec(obj: &MontyObject) -> Result { - let MontyObject::Dataclass { name, attrs, .. 
} = obj else { - return Err(RuntimeError::InvalidArguments( - "expected Variant object".to_owned(), - )); - }; - if name != "Variant" { - return Err(RuntimeError::InvalidArguments(format!( - "expected Variant object, got {name}" - ))); - } - - let mut spec = VariantSpec::default(); - for (key, value) in attrs { - let MontyObject::String(key) = key else { - continue; - }; - match key.as_str() { - "rsids" => spec.rsids = string_list_from_object(value)?, - "grch37" => { - spec.grch37 = string_from_optional(value)? - .map(|v| parse_locus_string(&v)) - .transpose()? - } - "grch38" => { - spec.grch38 = string_from_optional(value)? - .map(|v| parse_locus_string(&v)) - .transpose()? - } - "reference" => spec.reference = string_from_optional(value)?, - "alternate" => spec.alternate = string_from_optional(value)?, - "kind" => { - spec.kind = string_from_optional(value)? - .as_deref() - .map(parse_variant_kind) - .transpose()? - } - "deletion_length" => { - spec.deletion_length = int_from_optional(value)?.map(|v| v as usize) - } - "motifs" => spec.motifs = string_list_from_object(value)?, - _ => {} - } - } - Ok(spec) -} - -fn variant_specs_from_plan(obj: &MontyObject) -> Result, RuntimeError> { - match obj { - MontyObject::List(items) => items.iter().map(dataclass_to_variant_spec).collect(), - MontyObject::Dataclass { name, attrs, .. 
} if name == "VariantPlan" => { - for (key, value) in attrs { - if matches!(key, MontyObject::String(text) if text == "variants") { - return variant_specs_from_plan(value); - } - } - Err(RuntimeError::InvalidArguments( - "VariantPlan missing variants".to_owned(), - )) - } - _ => Err(RuntimeError::InvalidArguments( - "expected a list of Variant objects or a VariantPlan".to_owned(), - )), - } -} - -fn variant_spec_from_kwargs( - kwargs: &[(MontyObject, MontyObject)], -) -> Result { - let mut spec = VariantSpec::default(); - for (key, value) in kwargs { - let MontyObject::String(key) = key else { - return Err(RuntimeError::InvalidArguments( - "bioscript.variant keyword names must be strings".to_owned(), - )); - }; - match key.as_str() { - "rsid" | "rsids" => spec.rsids = string_or_list(value)?, - "grch37" => { - spec.grch37 = string_from_optional(value)? - .map(|v| parse_locus_string(&v)) - .transpose()? - } - "grch38" => { - spec.grch38 = string_from_optional(value)? - .map(|v| parse_locus_string(&v)) - .transpose()? - } - "ref" | "reference" => spec.reference = string_from_optional(value)?, - "alt" | "alternate" => spec.alternate = string_from_optional(value)?, - "kind" => { - spec.kind = string_from_optional(value)? - .as_deref() - .map(parse_variant_kind) - .transpose()? 
- } - "deletion_length" => { - spec.deletion_length = int_from_optional(value)?.map(|v| v as usize) - } - "motifs" => spec.motifs = string_or_list(value)?, - other => { - return Err(RuntimeError::InvalidArguments(format!( - "bioscript.variant does not accept keyword '{other}'" - ))); - } - } - } - Ok(spec) -} - -fn parse_locus_string(value: &str) -> Result { - let normalized = value.trim().strip_prefix("chr").unwrap_or(value.trim()); - let Some((chrom, rest)) = normalized.split_once(':') else { - return Err(RuntimeError::InvalidArguments(format!( - "invalid locus string: {value}" - ))); - }; - let (start, end) = if let Some((start, end)) = rest.split_once('-') { - (start, end) - } else { - (rest, rest) - }; - let start = start.parse::().map_err(|err| { - RuntimeError::InvalidArguments(format!("invalid locus start {value}: {err}")) - })?; - let end = end.parse::().map_err(|err| { - RuntimeError::InvalidArguments(format!("invalid locus end {value}: {err}")) - })?; - Ok(GenomicLocus { - chrom: chrom.to_owned(), - start, - end, - }) -} - -fn parse_variant_kind(value: &str) -> Result { - match value.trim().to_ascii_lowercase().as_str() { - "snp" => Ok(VariantKind::Snp), - "insertion" | "ins" => Ok(VariantKind::Insertion), - "deletion" | "del" => Ok(VariantKind::Deletion), - "indel" => Ok(VariantKind::Indel), - "other" => Ok(VariantKind::Other), - other => Err(RuntimeError::InvalidArguments(format!( - "invalid variant kind: {other}" - ))), - } -} - -fn variant_kind_name(kind: VariantKind) -> &'static str { - match kind { - VariantKind::Snp => "snp", - VariantKind::Insertion => "insertion", - VariantKind::Deletion => "deletion", - VariantKind::Indel => "indel", - VariantKind::Other => "other", - } -} - -fn string_or_list(value: &MontyObject) -> Result, RuntimeError> { - match value { - MontyObject::String(text) => Ok(vec![text.clone()]), - MontyObject::List(_) => string_list_from_object(value), - MontyObject::None => Ok(Vec::new()), - other => 
Err(RuntimeError::InvalidArguments(format!( - "expected string or list of strings, got {other:?}" - ))), - } -} - -fn string_list_from_object(value: &MontyObject) -> Result, RuntimeError> { - match value { - MontyObject::List(items) => { - let mut out = Vec::new(); - for item in items { - let MontyObject::String(text) = item else { - return Err(RuntimeError::InvalidArguments( - "expected list of strings".to_owned(), - )); - }; - out.push(text.clone()); - } - Ok(out) - } - MontyObject::None => Ok(Vec::new()), - other => Err(RuntimeError::InvalidArguments(format!( - "expected list of strings, got {other:?}" - ))), - } -} - -fn string_from_optional(value: &MontyObject) -> Result, RuntimeError> { - match value { - MontyObject::None => Ok(None), - MontyObject::String(text) => Ok(Some(text.clone())), - other => Err(RuntimeError::InvalidArguments(format!( - "expected optional string, got {other:?}" - ))), - } -} - -fn int_from_optional(value: &MontyObject) -> Result, RuntimeError> { - match value { - MontyObject::None => Ok(None), - MontyObject::Int(v) => Ok(Some(*v)), - other => Err(RuntimeError::InvalidArguments(format!( - "expected optional int, got {other:?}" - ))), - } -} - -fn reject_kwargs( - kwargs: &[(MontyObject, MontyObject)], - function_name: &str, -) -> Result<(), RuntimeError> { - if kwargs.is_empty() { - Ok(()) - } else { - Err(RuntimeError::InvalidArguments(format!( - "{function_name} does not accept keyword arguments" - ))) - } -} - fn resolve_optional_loader_path( runtime: &BioscriptRuntime, path: Option, @@ -1296,248 +383,6 @@ fn resolve_optional_loader_path( .transpose() } -fn expect_string_arg( - args: &[MontyObject], - index: usize, - function_name: &str, -) -> Result { - let Some(value) = args.get(index) else { - return Err(RuntimeError::InvalidArguments(format!( - "{function_name} missing argument at position {index}" - ))); - }; - match value { - MontyObject::String(text) => Ok(text.clone()), - other => Err(RuntimeError::InvalidArguments(format!( 
- "{function_name} expected str at position {index}, got {other:?}" - ))), - } -} - -fn expect_rows(value: &MontyObject) -> Result>, RuntimeError> { - let MontyObject::List(rows) = value else { - return Err(RuntimeError::InvalidArguments( - "write_tsv expects a list of dict rows".to_owned(), - )); - }; - - let mut out = Vec::new(); - for row in rows { - let MontyObject::Dict(dict) = row else { - return Err(RuntimeError::InvalidArguments( - "write_tsv row must be a dict".to_owned(), - )); - }; - let mut mapped = BTreeMap::new(); - for (key, value) in dict { - let MontyObject::String(key) = key else { - return Err(RuntimeError::InvalidArguments( - "write_tsv dict keys must be strings".to_owned(), - )); - }; - mapped.insert(key.clone(), stringify_value(value)); - } - out.push(mapped); - } - Ok(out) -} - -fn stringify_value(value: &MontyObject) -> String { - match value { - MontyObject::None => String::new(), - MontyObject::String(text) => text.clone(), - MontyObject::Int(v) => v.to_string(), - MontyObject::Bool(v) => v.to_string(), - other => format!("{other}"), - } -} - -fn host_read_text( - runtime: &BioscriptRuntime, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], -) -> Result { - reject_kwargs(kwargs, "read_text")?; - let path = runtime.resolve_existing_user_path(&expect_string_arg(args, 0, "read_text")?)?; - let content = read_text_limited(&path, MAX_HOST_TEXT_BYTES)?; - Ok(MontyObject::String(content)) -} - -fn host_write_text( - runtime: &BioscriptRuntime, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], -) -> Result { - reject_kwargs(kwargs, "write_text")?; - let path = runtime.resolve_user_write_path(&expect_string_arg(args, 0, "write_text")?)?; - let content = expect_string_arg(args, 1, "write_text")?; - if u64::try_from(content.len()).unwrap_or(u64::MAX) > MAX_HOST_TEXT_BYTES { - return Err(RuntimeError::InvalidArguments(format!( - "write_text content exceeds {MAX_HOST_TEXT_BYTES} bytes" - ))); - } - if let Some(parent) = 
path.parent() { - fs::create_dir_all(parent).map_err(|err| { - RuntimeError::Io(format!( - "failed to create parent dir {}: {err}", - parent.display() - )) - })?; - } - fs::write(&path, content) - .map_err(|err| RuntimeError::Io(format!("failed to write {}: {err}", path.display())))?; - Ok(MontyObject::None) -} - -fn deepest_existing_ancestor(path: &Path) -> &Path { - let mut current = path; - while !current.exists() { - let Some(parent) = current.parent() else { - break; - }; - current = parent; - } - current -} - -fn read_text_limited(path: &Path, max_bytes: u64) -> Result { - let mut file = fs::File::open(path) - .map_err(|err| RuntimeError::Io(format!("failed to read {}: {err}", path.display())))?; - let mut bytes = Vec::new(); - file.by_ref() - .take(max_bytes.saturating_add(1)) - .read_to_end(&mut bytes) - .map_err(|err| RuntimeError::Io(format!("failed to read {}: {err}", path.display())))?; - if u64::try_from(bytes.len()).unwrap_or(u64::MAX) > max_bytes { - return Err(RuntimeError::InvalidArguments(format!( - "read_text input {} exceeds {} bytes", - path.display(), - max_bytes - ))); - } - String::from_utf8(bytes).map_err(|err| { - RuntimeError::Io(format!( - "failed to decode {} as UTF-8: {err}", - path.display() - )) - }) -} - -fn host_trace( - runtime: &BioscriptRuntime, - args: &[MontyObject], - kwargs: &[(MontyObject, MontyObject)], -) -> Result { - reject_kwargs(kwargs, "__bioscript_trace__")?; - if let Some(MontyObject::Int(v)) = args.first() { - runtime - .state - .trace_lines - .lock() - .expect("trace mutex poisoned") - .push(*v as usize); - } - Ok(MontyObject::None) -} - -fn instrument_source(code: &str) -> String { - let mut out = Vec::new(); - let mut nesting_depth = 0usize; - let mut pending_backslash = false; - for (idx, line) in code.lines().enumerate() { - let line_no = idx + 1; - let trimmed = line.trim_start(); - - let in_continuation = nesting_depth > 0 || pending_backslash; - let should_trace = !in_continuation - && !trimmed.is_empty() 
- && !trimmed.starts_with('#') - && !trimmed.starts_with('@') - && !trimmed.starts_with('"') - && !trimmed.starts_with('\'') - && !trimmed.starts_with(']') - && !trimmed.starts_with(')') - && !trimmed.starts_with('}') - && !trimmed.starts_with(',') - && !trimmed.starts_with('+') - && !trimmed.starts_with('-') - && !trimmed.starts_with('*') - && !trimmed.starts_with('/') - && !trimmed.starts_with('%') - && !trimmed.starts_with("and ") - && !trimmed.starts_with("or ") - && !trimmed.starts_with("if ") - && !trimmed.starts_with("for ") - && !trimmed.starts_with("elif ") - && !trimmed.starts_with("else:") - && !trimmed.starts_with("except") - && !trimmed.starts_with("finally:") - && !trimmed.ends_with(':'); - - if should_trace { - let indent_len = line.len() - trimmed.len(); - let indent = &line[..indent_len]; - out.push(format!("{indent}__bioscript_trace__({line_no})")); - } - out.push(line.to_owned()); - - pending_backslash = ends_with_unescaped_backslash(line); - nesting_depth = update_nesting_depth(nesting_depth, line); - } - if code.ends_with('\n') { - out.join("\n") + "\n" - } else { - out.join("\n") - } -} - -fn ends_with_unescaped_backslash(line: &str) -> bool { - let trimmed = line.trim_end(); - if !trimmed.ends_with('\\') { - return false; - } - - let slash_count = trimmed.chars().rev().take_while(|ch| *ch == '\\').count(); - slash_count % 2 == 1 -} - -fn update_nesting_depth(mut depth: usize, line: &str) -> usize { - let mut chars = line.chars().peekable(); - let mut in_single = false; - let mut in_double = false; - - while let Some(ch) = chars.next() { - if in_single { - if ch == '\\' { - chars.next(); - } else if ch == '\'' { - in_single = false; - } - continue; - } - - if in_double { - if ch == '\\' { - chars.next(); - } else if ch == '"' { - in_double = false; - } - continue; - } - - match ch { - '#' => break, - '\'' => in_single = true, - '"' => in_double = true, - '(' | '[' | '{' => depth += 1, - ')' | ']' | '}' => depth = depth.saturating_sub(1), - _ 
=> {} - } - } - - depth -} - #[cfg(test)] mod tests { use super::*; diff --git a/rust/bioscript-runtime/src/runtime/args.rs b/rust/bioscript-runtime/src/runtime/args.rs new file mode 100644 index 0000000..e630695 --- /dev/null +++ b/rust/bioscript-runtime/src/runtime/args.rs @@ -0,0 +1,75 @@ +use std::collections::BTreeMap; + +use bioscript_core::RuntimeError; +use monty::MontyObject; + +pub(crate) fn reject_kwargs( + kwargs: &[(MontyObject, MontyObject)], + function_name: &str, +) -> Result<(), RuntimeError> { + if kwargs.is_empty() { + Ok(()) + } else { + Err(RuntimeError::InvalidArguments(format!( + "{function_name} does not accept keyword arguments" + ))) + } +} + +pub(crate) fn expect_string_arg( + args: &[MontyObject], + index: usize, + function_name: &str, +) -> Result { + let Some(value) = args.get(index) else { + return Err(RuntimeError::InvalidArguments(format!( + "{function_name} missing argument at position {index}" + ))); + }; + match value { + MontyObject::String(text) => Ok(text.clone()), + other => Err(RuntimeError::InvalidArguments(format!( + "{function_name} expected str at position {index}, got {other:?}" + ))), + } +} + +pub(crate) fn expect_rows( + value: &MontyObject, +) -> Result>, RuntimeError> { + let MontyObject::List(rows) = value else { + return Err(RuntimeError::InvalidArguments( + "write_tsv expects a list of dict rows".to_owned(), + )); + }; + + let mut out = Vec::new(); + for row in rows { + let MontyObject::Dict(dict) = row else { + return Err(RuntimeError::InvalidArguments( + "write_tsv row must be a dict".to_owned(), + )); + }; + let mut mapped = BTreeMap::new(); + for (key, value) in dict { + let MontyObject::String(key) = key else { + return Err(RuntimeError::InvalidArguments( + "write_tsv dict keys must be strings".to_owned(), + )); + }; + mapped.insert(key.clone(), stringify_value(value)); + } + out.push(mapped); + } + Ok(out) +} + +fn stringify_value(value: &MontyObject) -> String { + match value { + MontyObject::None => 
String::new(), + MontyObject::String(text) => text.clone(), + MontyObject::Int(v) => v.to_string(), + MontyObject::Bool(v) => v.to_string(), + other => format!("{other}"), + } +} diff --git a/rust/bioscript-runtime/src/runtime/host_io.rs b/rust/bioscript-runtime/src/runtime/host_io.rs new file mode 100644 index 0000000..0934a7a --- /dev/null +++ b/rust/bioscript-runtime/src/runtime/host_io.rs @@ -0,0 +1,79 @@ +use std::{fs, io::Read, path::Path}; + +use bioscript_core::RuntimeError; +use monty::MontyObject; + +use super::{BioscriptRuntime, args::expect_string_arg, args::reject_kwargs}; + +const MAX_HOST_TEXT_BYTES: u64 = 16 * 1024 * 1024; + +pub(crate) fn host_read_text( + runtime: &BioscriptRuntime, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], +) -> Result { + reject_kwargs(kwargs, "read_text")?; + let path = runtime.resolve_existing_user_path(&expect_string_arg(args, 0, "read_text")?)?; + let content = read_text_limited(&path, MAX_HOST_TEXT_BYTES)?; + Ok(MontyObject::String(content)) +} + +pub(crate) fn host_write_text( + runtime: &BioscriptRuntime, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], +) -> Result { + reject_kwargs(kwargs, "write_text")?; + let path = runtime.resolve_user_write_path(&expect_string_arg(args, 0, "write_text")?)?; + let content = expect_string_arg(args, 1, "write_text")?; + if u64::try_from(content.len()).unwrap_or(u64::MAX) > MAX_HOST_TEXT_BYTES { + return Err(RuntimeError::InvalidArguments(format!( + "write_text content exceeds {MAX_HOST_TEXT_BYTES} bytes" + ))); + } + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).map_err(|err| { + RuntimeError::Io(format!( + "failed to create parent dir {}: {err}", + parent.display() + )) + })?; + } + fs::write(&path, content) + .map_err(|err| RuntimeError::Io(format!("failed to write {}: {err}", path.display())))?; + Ok(MontyObject::None) +} + +pub(crate) fn deepest_existing_ancestor(path: &Path) -> &Path { + let mut current = path; + while 
!current.exists() { + let Some(parent) = current.parent() else { + break; + }; + current = parent; + } + current +} + +fn read_text_limited(path: &Path, max_bytes: u64) -> Result { + let mut file = fs::File::open(path) + .map_err(|err| RuntimeError::Io(format!("failed to read {}: {err}", path.display())))?; + let mut bytes = Vec::new(); + file.by_ref() + .take(max_bytes.saturating_add(1)) + .read_to_end(&mut bytes) + .map_err(|err| RuntimeError::Io(format!("failed to read {}: {err}", path.display())))?; + if u64::try_from(bytes.len()).unwrap_or(u64::MAX) > max_bytes { + return Err(RuntimeError::InvalidArguments(format!( + "read_text input {} exceeds {} bytes", + path.display(), + max_bytes + ))); + } + String::from_utf8(bytes).map_err(|err| { + RuntimeError::Io(format!( + "failed to decode {} as UTF-8: {err}", + path.display() + )) + }) +} diff --git a/rust/bioscript-runtime/src/runtime/methods.rs b/rust/bioscript-runtime/src/runtime/methods.rs new file mode 100644 index 0000000..98a7d4c --- /dev/null +++ b/rust/bioscript-runtime/src/runtime/methods.rs @@ -0,0 +1,342 @@ +use std::{fs, time::Instant}; + +use bioscript_core::RuntimeError; +use bioscript_formats::{GenotypeLoadOptions, GenotypeStore}; +use monty::MontyObject; + +use super::{ + BioscriptRuntime, + args::{expect_rows, expect_string_arg, reject_kwargs}, + host_io::{host_read_text, host_write_text}, + objects::{ + genotype_file_object, variant_object, variant_observation_object, variant_plan_object, + }, + resolve_optional_loader_path, + variants::{ + dataclass_handle_id, dataclass_to_variant_spec, variant_spec_from_kwargs, + variant_specs_from_plan, + }, +}; + +impl BioscriptRuntime { + pub(super) fn method_load_genotypes( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + let started = Instant::now(); + reject_kwargs(kwargs, "bioscript.load_genotypes")?; + if args.len() != 2 { + return Err(RuntimeError::InvalidArguments( + "bioscript.load_genotypes expects self 
and path".to_owned(), + )); + } + let path = self.resolve_existing_user_path(&expect_string_arg( + args, + 1, + "bioscript.load_genotypes", + )?)?; + let loader = self.resolved_loader_options()?; + let store = GenotypeStore::from_file_with_options(&path, &loader)?; + let handle = self.state.next_handle(); + self.state + .genotype_files + .lock() + .expect("genotype mutex poisoned") + .insert(handle, store); + self.record_timing( + "load_genotypes", + started.elapsed(), + format!("path={}", path.display()), + ); + Ok(genotype_file_object(handle)) + } + + pub(super) fn resolved_loader_options(&self) -> Result { + let mut loader = self.config.loader.clone(); + loader.input_index = resolve_optional_loader_path(self, loader.input_index)?; + loader.reference_file = resolve_optional_loader_path(self, loader.reference_file)?; + loader.reference_index = resolve_optional_loader_path(self, loader.reference_index)?; + Ok(loader) + } + + pub(super) fn method_genotype_get( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + reject_kwargs(kwargs, "GenotypeFile.get")?; + if args.len() != 2 { + return Err(RuntimeError::InvalidArguments( + "GenotypeFile.get expects self and rsid".to_owned(), + )); + } + let handle = dataclass_handle_id(&args[0], "GenotypeFile")?; + let rsid = expect_string_arg(args, 1, "GenotypeFile.get")?; + let guard = self + .state + .genotype_files + .lock() + .expect("genotype mutex poisoned"); + let Some(store) = guard.get(&handle) else { + return Err(RuntimeError::InvalidArguments(format!( + "unknown genotype handle: {handle}" + ))); + }; + Ok(match store.get(&rsid)? 
{ + Some(value) => MontyObject::String(value), + None => MontyObject::None, + }) + } + + pub(super) fn method_variant( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + if args.len() != 1 { + return Err(RuntimeError::InvalidArguments( + "bioscript.variant expects only self as a positional argument".to_owned(), + )); + } + let spec = variant_spec_from_kwargs(kwargs)?; + Ok(variant_object(&spec)) + } + + pub(super) fn method_query_plan( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + reject_kwargs(kwargs, "bioscript.query_plan")?; + if args.len() != 2 { + return Err(RuntimeError::InvalidArguments( + "bioscript.query_plan expects self and a list of variants".to_owned(), + )); + } + let variants = variant_specs_from_plan(&args[1])?; + Ok(variant_plan_object(&variants)) + } + + pub(super) fn method_genotype_lookup_variant( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + let started = Instant::now(); + reject_kwargs(kwargs, "GenotypeFile.lookup_variant")?; + if args.len() != 2 { + return Err(RuntimeError::InvalidArguments( + "GenotypeFile.lookup_variant expects self and variant".to_owned(), + )); + } + let handle = dataclass_handle_id(&args[0], "GenotypeFile")?; + let spec = dataclass_to_variant_spec(&args[1])?; + let guard = self + .state + .genotype_files + .lock() + .expect("genotype mutex poisoned"); + let Some(store) = guard.get(&handle) else { + return Err(RuntimeError::InvalidArguments(format!( + "unknown genotype handle: {handle}" + ))); + }; + let observation = store.lookup_variant(&spec)?; + self.record_timing( + "lookup_variant", + started.elapsed(), + format!("rsids={}", spec.rsids.join("|")), + ); + Ok(match observation.genotype { + Some(value) => MontyObject::String(value), + None => MontyObject::None, + }) + } + + pub(super) fn method_genotype_lookup_variant_details( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, 
MontyObject)], + ) -> Result { + let started = Instant::now(); + reject_kwargs(kwargs, "GenotypeFile.lookup_variant_details")?; + if args.len() != 2 { + return Err(RuntimeError::InvalidArguments( + "GenotypeFile.lookup_variant_details expects self and variant".to_owned(), + )); + } + let handle = dataclass_handle_id(&args[0], "GenotypeFile")?; + let spec = dataclass_to_variant_spec(&args[1])?; + let guard = self + .state + .genotype_files + .lock() + .expect("genotype mutex poisoned"); + let Some(store) = guard.get(&handle) else { + return Err(RuntimeError::InvalidArguments(format!( + "unknown genotype handle: {handle}" + ))); + }; + let observation = store.lookup_variant(&spec)?; + self.record_timing( + "lookup_variant_details", + started.elapsed(), + format!("rsids={}", spec.rsids.join("|")), + ); + Ok(variant_observation_object(&observation)) + } + + pub(super) fn method_genotype_lookup_variants( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + let started = Instant::now(); + reject_kwargs(kwargs, "GenotypeFile.lookup_variants")?; + if args.len() != 2 { + return Err(RuntimeError::InvalidArguments( + "GenotypeFile.lookup_variants expects self and a variant plan".to_owned(), + )); + } + let handle = dataclass_handle_id(&args[0], "GenotypeFile")?; + let specs = variant_specs_from_plan(&args[1])?; + let guard = self + .state + .genotype_files + .lock() + .expect("genotype mutex poisoned"); + let Some(store) = guard.get(&handle) else { + return Err(RuntimeError::InvalidArguments(format!( + "unknown genotype handle: {handle}" + ))); + }; + let observations = store.lookup_variants(&specs)?; + self.record_timing( + "lookup_variants", + started.elapsed(), + format!("count={}", specs.len()), + ); + Ok(MontyObject::List( + observations + .into_iter() + .map(|observation| match observation.genotype { + Some(value) => MontyObject::String(value), + None => MontyObject::None, + }) + .collect(), + )) + } + + pub(super) fn 
method_genotype_lookup_variants_details( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + let started = Instant::now(); + reject_kwargs(kwargs, "GenotypeFile.lookup_variants_details")?; + if args.len() != 2 { + return Err(RuntimeError::InvalidArguments( + "GenotypeFile.lookup_variants_details expects self and a variant plan".to_owned(), + )); + } + let handle = dataclass_handle_id(&args[0], "GenotypeFile")?; + let specs = variant_specs_from_plan(&args[1])?; + let guard = self + .state + .genotype_files + .lock() + .expect("genotype mutex poisoned"); + let Some(store) = guard.get(&handle) else { + return Err(RuntimeError::InvalidArguments(format!( + "unknown genotype handle: {handle}" + ))); + }; + let observations = store.lookup_variants(&specs)?; + self.record_timing( + "lookup_variants_details", + started.elapsed(), + format!("count={}", specs.len()), + ); + Ok(MontyObject::List( + observations + .iter() + .map(variant_observation_object) + .collect(), + )) + } + + pub(super) fn method_write_tsv( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + let started = Instant::now(); + reject_kwargs(kwargs, "bioscript.write_tsv")?; + if args.len() != 3 { + return Err(RuntimeError::InvalidArguments( + "bioscript.write_tsv expects self, path, rows".to_owned(), + )); + } + let path = + self.resolve_user_write_path(&expect_string_arg(args, 1, "bioscript.write_tsv")?)?; + let rows = expect_rows(&args[2])?; + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).map_err(|err| { + RuntimeError::Io(format!( + "failed to create parent dir {}: {err}", + parent.display() + )) + })?; + } + let mut output = String::new(); + if let Some(first) = rows.first() { + let headers: Vec = first.keys().cloned().collect(); + output.push_str(&headers.join("\t")); + output.push('\n'); + for row in &rows { + let values: Vec = headers + .iter() + .map(|header| 
row.get(header).cloned().unwrap_or_default()) + .collect(); + output.push_str(&values.join("\t")); + output.push('\n'); + } + } + fs::write(&path, output).map_err(|err| { + RuntimeError::Io(format!("failed to write {}: {err}", path.display())) + })?; + self.record_timing( + "write_tsv", + started.elapsed(), + format!("path={} rows={}", path.display(), rows.len()), + ); + Ok(MontyObject::None) + } + + pub(super) fn method_read_text( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + if args.is_empty() { + return Err(RuntimeError::InvalidArguments( + "bioscript.read_text expects self and path".to_owned(), + )); + } + host_read_text(self, &args[1..], kwargs) + } + + pub(super) fn method_write_text( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + if args.is_empty() { + return Err(RuntimeError::InvalidArguments( + "bioscript.write_text expects self, path, text".to_owned(), + )); + } + host_write_text(self, &args[1..], kwargs) + } +} diff --git a/rust/bioscript-runtime/src/runtime/objects.rs b/rust/bioscript-runtime/src/runtime/objects.rs new file mode 100644 index 0000000..818ccdb --- /dev/null +++ b/rust/bioscript-runtime/src/runtime/objects.rs @@ -0,0 +1,234 @@ +use bioscript_core::{VariantKind, VariantSpec}; +use monty::MontyObject; + +pub(crate) fn bioscript_object() -> MontyObject { + MontyObject::Dataclass { + name: "Bioscript".to_owned(), + type_id: 1, + field_names: vec![], + attrs: vec![].into(), + frozen: true, + } +} + +pub(crate) fn genotype_file_object(handle_id: u64) -> MontyObject { + MontyObject::Dataclass { + name: "GenotypeFile".to_owned(), + type_id: 2, + field_names: vec!["handle_id".to_owned()], + attrs: vec![( + MontyObject::String("handle_id".to_owned()), + MontyObject::Int(handle_id as i64), + )] + .into(), + frozen: true, + } +} + +pub(crate) fn variant_object(spec: &VariantSpec) -> MontyObject { + let mut attrs = Vec::new(); + attrs.push(( + 
MontyObject::String("rsids".to_owned()), + MontyObject::List( + spec.rsids + .iter() + .cloned() + .map(MontyObject::String) + .collect(), + ), + )); + if let Some(locus) = &spec.grch37 { + attrs.push(( + MontyObject::String("grch37".to_owned()), + MontyObject::String(format!("{}:{}-{}", locus.chrom, locus.start, locus.end)), + )); + } + if let Some(locus) = &spec.grch38 { + attrs.push(( + MontyObject::String("grch38".to_owned()), + MontyObject::String(format!("{}:{}-{}", locus.chrom, locus.start, locus.end)), + )); + } + if let Some(reference) = &spec.reference { + attrs.push(( + MontyObject::String("reference".to_owned()), + MontyObject::String(reference.clone()), + )); + } + if let Some(alternate) = &spec.alternate { + attrs.push(( + MontyObject::String("alternate".to_owned()), + MontyObject::String(alternate.clone()), + )); + } + if let Some(kind) = spec.kind { + attrs.push(( + MontyObject::String("kind".to_owned()), + MontyObject::String(variant_kind_name(kind).to_owned()), + )); + } + if let Some(length) = spec.deletion_length { + attrs.push(( + MontyObject::String("deletion_length".to_owned()), + MontyObject::Int(length as i64), + )); + } + if !spec.motifs.is_empty() { + attrs.push(( + MontyObject::String("motifs".to_owned()), + MontyObject::List( + spec.motifs + .iter() + .cloned() + .map(MontyObject::String) + .collect(), + ), + )); + } + + MontyObject::Dataclass { + name: "Variant".to_owned(), + type_id: 3, + field_names: vec![ + "rsids".to_owned(), + "grch37".to_owned(), + "grch38".to_owned(), + "reference".to_owned(), + "alternate".to_owned(), + "kind".to_owned(), + "deletion_length".to_owned(), + "motifs".to_owned(), + ], + attrs: attrs.into(), + frozen: true, + } +} + +pub(crate) fn variant_plan_object(variants: &[VariantSpec]) -> MontyObject { + MontyObject::Dataclass { + name: "VariantPlan".to_owned(), + type_id: 4, + field_names: vec!["variants".to_owned()], + attrs: vec![( + MontyObject::String("variants".to_owned()), + 
MontyObject::List(variants.iter().map(variant_object).collect()), + )] + .into(), + frozen: true, + } +} + +pub(crate) fn variant_observation_object( + observation: &bioscript_core::VariantObservation, +) -> MontyObject { + let mut attrs = vec![ + ( + MontyObject::String("backend".to_owned()), + MontyObject::String(observation.backend.clone()), + ), + ( + MontyObject::String("matched_rsid".to_owned()), + match &observation.matched_rsid { + Some(value) => MontyObject::String(value.clone()), + None => MontyObject::None, + }, + ), + ( + MontyObject::String("assembly".to_owned()), + match observation.assembly { + Some(assembly) => MontyObject::String(match assembly { + bioscript_core::Assembly::Grch37 => "grch37".to_owned(), + bioscript_core::Assembly::Grch38 => "grch38".to_owned(), + }), + None => MontyObject::None, + }, + ), + ( + MontyObject::String("genotype".to_owned()), + match &observation.genotype { + Some(value) => MontyObject::String(value.clone()), + None => MontyObject::None, + }, + ), + ( + MontyObject::String("ref_count".to_owned()), + observation.ref_count.map_or(MontyObject::None, |value| { + MontyObject::Int(i64::from(value)) + }), + ), + ( + MontyObject::String("alt_count".to_owned()), + observation.alt_count.map_or(MontyObject::None, |value| { + MontyObject::Int(i64::from(value)) + }), + ), + ( + MontyObject::String("depth".to_owned()), + observation.depth.map_or(MontyObject::None, |value| { + MontyObject::Int(i64::from(value)) + }), + ), + ( + MontyObject::String("decision".to_owned()), + match &observation.decision { + Some(value) => MontyObject::String(value.clone()), + None => MontyObject::None, + }, + ), + ( + MontyObject::String("raw_counts".to_owned()), + MontyObject::Dict( + observation + .raw_counts + .iter() + .map(|(base, count)| { + ( + MontyObject::String(base.clone()), + MontyObject::Int(i64::from(*count)), + ) + }) + .collect(), + ), + ), + ( + MontyObject::String("evidence".to_owned()), + MontyObject::List( + observation + .evidence + 
.iter() + .cloned() + .map(MontyObject::String) + .collect(), + ), + ), + ]; + + MontyObject::Dataclass { + name: "VariantObservation".to_owned(), + type_id: 5, + field_names: vec![ + "backend".to_owned(), + "matched_rsid".to_owned(), + "assembly".to_owned(), + "genotype".to_owned(), + "ref_count".to_owned(), + "alt_count".to_owned(), + "depth".to_owned(), + "decision".to_owned(), + "raw_counts".to_owned(), + "evidence".to_owned(), + ], + attrs: attrs.drain(..).collect(), + frozen: true, + } +} + +pub(crate) fn variant_kind_name(kind: VariantKind) -> &'static str { + match kind { + VariantKind::Snp => "snp", + VariantKind::Insertion => "insertion", + VariantKind::Deletion => "deletion", + VariantKind::Indel => "indel", + VariantKind::Other => "other", + } +} diff --git a/rust/bioscript-runtime/src/runtime/state.rs b/rust/bioscript-runtime/src/runtime/state.rs new file mode 100644 index 0000000..3b7d2cc --- /dev/null +++ b/rust/bioscript-runtime/src/runtime/state.rs @@ -0,0 +1,67 @@ +use std::{ + collections::HashMap, + sync::{ + Mutex, + atomic::{AtomicU64, Ordering}, + }, + time::Duration, +}; + +use bioscript_formats::{GenotypeLoadOptions, GenotypeStore}; +use monty::{MontyException, ResourceLimits}; + +use bioscript_core::RuntimeError; + +#[derive(Debug, Clone)] +pub struct RuntimeConfig { + pub limits: ResourceLimits, + pub loader: GenotypeLoadOptions, +} + +impl Default for RuntimeConfig { + fn default() -> Self { + let limits = ResourceLimits::new() + .max_duration(Duration::from_millis(100)) + .max_memory(8 * 1024 * 1024) + .max_allocations(200_000) + .gc_interval(1000) + .max_recursion_depth(Some(200)); + Self { + limits, + loader: GenotypeLoadOptions::default(), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct StageTiming { + pub stage: String, + pub duration_ms: u128, + pub detail: String, +} + +pub(crate) fn monty_error(value: MontyException) -> RuntimeError { + RuntimeError::Monty(value.to_string()) +} + +pub(crate) struct RuntimeState { 
+ pub(crate) next_handle: AtomicU64, + pub(crate) genotype_files: Mutex>, + pub(crate) trace_lines: Mutex>, + pub(crate) timings: Mutex>, +} + +impl RuntimeState { + pub(crate) fn new() -> Self { + Self { + next_handle: AtomicU64::new(1), + genotype_files: Mutex::new(HashMap::new()), + trace_lines: Mutex::new(Vec::new()), + timings: Mutex::new(Vec::new()), + } + } + + pub(crate) fn next_handle(&self) -> u64 { + self.next_handle.fetch_add(1, Ordering::Relaxed) + } +} diff --git a/rust/bioscript-runtime/src/runtime/trace.rs b/rust/bioscript-runtime/src/runtime/trace.rs new file mode 100644 index 0000000..69e7d29 --- /dev/null +++ b/rust/bioscript-runtime/src/runtime/trace.rs @@ -0,0 +1,212 @@ +use bioscript_core::RuntimeError; +use monty::MontyObject; + +use super::{BioscriptRuntime, args::reject_kwargs}; + +pub(crate) fn trace_lookup_metadata(source: &str) -> (Option, Option) { + if let Some(rsid) = extract_rsid(source) { + let url = format!("https://www.ncbi.nlm.nih.gov/snp/{rsid}"); + return (Some(rsid), Some(url)); + } + + if let Some(coord) = extract_coordinate(source) { + let lower = source.to_ascii_lowercase(); + let host = if lower.contains("grch37") || lower.contains("hg19") { + "https://grch37.ensembl.org" + } else { + "https://www.ensembl.org" + }; + let url = format!("{host}/Homo_sapiens/Location/View?r={coord}"); + return (Some(coord), Some(url)); + } + + (None, None) +} + +pub(crate) fn statement_context(lines: &[&str], line_no: usize) -> String { + let Some(start_idx) = line_no.checked_sub(1) else { + return String::new(); + }; + let Some(first_line) = lines.get(start_idx) else { + return String::new(); + }; + + let mut out = String::from(first_line.trim()); + let mut depth = update_nesting_depth(0, first_line); + let mut current = start_idx + 1; + + while depth > 0 { + let Some(line) = lines.get(current) else { + break; + }; + if !out.is_empty() { + out.push(' '); + } + out.push_str(line.trim()); + depth = update_nesting_depth(depth, line); + current 
+= 1; + } + + out +} + +pub(crate) fn extract_rsid(source: &str) -> Option { + let chars: Vec = source.chars().collect(); + let len = chars.len(); + let mut idx = 0; + while idx + 2 <= len { + if chars[idx] == 'r' + && chars.get(idx + 1) == Some(&'s') + && (idx == 0 || !chars[idx - 1].is_ascii_alphanumeric()) + { + let mut end = idx + 2; + while end < len && chars[end].is_ascii_digit() { + end += 1; + } + if end > idx + 2 { + return Some(chars[idx..end].iter().collect()); + } + } + idx += 1; + } + None +} + +pub(crate) fn extract_coordinate(source: &str) -> Option { + for token in source.split(|ch: char| { + ch.is_whitespace() || matches!(ch, '"' | '\'' | ',' | ')' | '(' | '[' | ']' | '{' | '}') + }) { + let cleaned = token.trim_matches(|ch: char| matches!(ch, ';')); + let normalized = cleaned.strip_prefix("chr").unwrap_or(cleaned); + if let Some((chrom, rest)) = normalized.split_once(':') + && !chrom.is_empty() + && chrom.chars().all(|ch| ch.is_ascii_alphanumeric()) + { + if let Some((start, end)) = rest.split_once('-') { + if start.chars().all(|ch| ch.is_ascii_digit()) + && end.chars().all(|ch| ch.is_ascii_digit()) + { + return Some(format!("{chrom}:{start}-{end}")); + } + } else if rest.chars().all(|ch| ch.is_ascii_digit()) { + return Some(format!("{chrom}:{rest}-{rest}")); + } + } + } + None +} + +pub(crate) fn host_trace( + runtime: &BioscriptRuntime, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], +) -> Result { + reject_kwargs(kwargs, "__bioscript_trace__")?; + if let Some(MontyObject::Int(v)) = args.first() { + runtime + .state + .trace_lines + .lock() + .expect("trace mutex poisoned") + .push(*v as usize); + } + Ok(MontyObject::None) +} + +pub(crate) fn instrument_source(code: &str) -> String { + let mut out = Vec::new(); + let mut nesting_depth = 0usize; + let mut pending_backslash = false; + for (idx, line) in code.lines().enumerate() { + let line_no = idx + 1; + let trimmed = line.trim_start(); + + let in_continuation = nesting_depth > 
0 || pending_backslash; + let should_trace = !in_continuation + && !trimmed.is_empty() + && !trimmed.starts_with('#') + && !trimmed.starts_with('@') + && !trimmed.starts_with('"') + && !trimmed.starts_with('\'') + && !trimmed.starts_with(']') + && !trimmed.starts_with(')') + && !trimmed.starts_with('}') + && !trimmed.starts_with(',') + && !trimmed.starts_with('+') + && !trimmed.starts_with('-') + && !trimmed.starts_with('*') + && !trimmed.starts_with('/') + && !trimmed.starts_with('%') + && !trimmed.starts_with("and ") + && !trimmed.starts_with("or ") + && !trimmed.starts_with("if ") + && !trimmed.starts_with("for ") + && !trimmed.starts_with("elif ") + && !trimmed.starts_with("else:") + && !trimmed.starts_with("except") + && !trimmed.starts_with("finally:") + && !trimmed.ends_with(':'); + + if should_trace { + let indent_len = line.len() - trimmed.len(); + let indent = &line[..indent_len]; + out.push(format!("{indent}__bioscript_trace__({line_no})")); + } + out.push(line.to_owned()); + + pending_backslash = ends_with_unescaped_backslash(line); + nesting_depth = update_nesting_depth(nesting_depth, line); + } + if code.ends_with('\n') { + out.join("\n") + "\n" + } else { + out.join("\n") + } +} + +pub(crate) fn ends_with_unescaped_backslash(line: &str) -> bool { + let trimmed = line.trim_end(); + if !trimmed.ends_with('\\') { + return false; + } + + let slash_count = trimmed.chars().rev().take_while(|ch| *ch == '\\').count(); + slash_count % 2 == 1 +} + +pub(crate) fn update_nesting_depth(mut depth: usize, line: &str) -> usize { + let mut chars = line.chars().peekable(); + let mut in_single = false; + let mut in_double = false; + + while let Some(ch) = chars.next() { + if in_single { + if ch == '\\' { + chars.next(); + } else if ch == '\'' { + in_single = false; + } + continue; + } + + if in_double { + if ch == '\\' { + chars.next(); + } else if ch == '"' { + in_double = false; + } + continue; + } + + match ch { + '#' => break, + '\'' => in_single = true, + '"' => 
in_double = true, + '(' | '[' | '{' => depth += 1, + ')' | ']' | '}' => depth = depth.saturating_sub(1), + _ => {} + } + } + + depth +} diff --git a/rust/bioscript-runtime/src/runtime/variants.rs b/rust/bioscript-runtime/src/runtime/variants.rs new file mode 100644 index 0000000..5eee041 --- /dev/null +++ b/rust/bioscript-runtime/src/runtime/variants.rs @@ -0,0 +1,225 @@ +use bioscript_core::{GenomicLocus, RuntimeError, VariantKind, VariantSpec}; +use monty::MontyObject; + +pub(crate) fn dataclass_handle_id( + obj: &MontyObject, + expected_name: &str, +) -> Result { + match obj { + MontyObject::Dataclass { name, attrs, .. } if name == expected_name => { + for (key, value) in attrs { + if matches!(key, MontyObject::String(text) if text == "handle_id") + && let MontyObject::Int(id) = value + { + return Ok(*id as u64); + } + } + Err(RuntimeError::InvalidArguments(format!( + "{expected_name} missing handle_id" + ))) + } + _ => Err(RuntimeError::InvalidArguments(format!( + "expected {expected_name} object" + ))), + } +} + +pub(crate) fn dataclass_to_variant_spec(obj: &MontyObject) -> Result { + let MontyObject::Dataclass { name, attrs, .. } = obj else { + return Err(RuntimeError::InvalidArguments( + "expected Variant object".to_owned(), + )); + }; + if name != "Variant" { + return Err(RuntimeError::InvalidArguments(format!( + "expected Variant object, got {name}" + ))); + } + + let mut spec = VariantSpec::default(); + for (key, value) in attrs { + let MontyObject::String(key) = key else { + continue; + }; + match key.as_str() { + "rsids" => spec.rsids = string_list_from_object(value)?, + "grch37" => { + spec.grch37 = string_from_optional(value)? + .map(|v| parse_locus_string(&v)) + .transpose()? + } + "grch38" => { + spec.grch38 = string_from_optional(value)? + .map(|v| parse_locus_string(&v)) + .transpose()? 
+ } + "reference" => spec.reference = string_from_optional(value)?, + "alternate" => spec.alternate = string_from_optional(value)?, + "kind" => { + spec.kind = string_from_optional(value)? + .as_deref() + .map(parse_variant_kind) + .transpose()? + } + "deletion_length" => { + spec.deletion_length = int_from_optional(value)?.map(|v| v as usize) + } + "motifs" => spec.motifs = string_list_from_object(value)?, + _ => {} + } + } + Ok(spec) +} + +pub(crate) fn variant_specs_from_plan(obj: &MontyObject) -> Result, RuntimeError> { + match obj { + MontyObject::List(items) => items.iter().map(dataclass_to_variant_spec).collect(), + MontyObject::Dataclass { name, attrs, .. } if name == "VariantPlan" => { + for (key, value) in attrs { + if matches!(key, MontyObject::String(text) if text == "variants") { + return variant_specs_from_plan(value); + } + } + Err(RuntimeError::InvalidArguments( + "VariantPlan missing variants".to_owned(), + )) + } + _ => Err(RuntimeError::InvalidArguments( + "expected a list of Variant objects or a VariantPlan".to_owned(), + )), + } +} + +pub(crate) fn variant_spec_from_kwargs( + kwargs: &[(MontyObject, MontyObject)], +) -> Result { + let mut spec = VariantSpec::default(); + for (key, value) in kwargs { + let MontyObject::String(key) = key else { + return Err(RuntimeError::InvalidArguments( + "bioscript.variant keyword names must be strings".to_owned(), + )); + }; + match key.as_str() { + "rsid" | "rsids" => spec.rsids = string_or_list(value)?, + "grch37" => { + spec.grch37 = string_from_optional(value)? + .map(|v| parse_locus_string(&v)) + .transpose()? + } + "grch38" => { + spec.grch38 = string_from_optional(value)? + .map(|v| parse_locus_string(&v)) + .transpose()? + } + "ref" | "reference" => spec.reference = string_from_optional(value)?, + "alt" | "alternate" => spec.alternate = string_from_optional(value)?, + "kind" => { + spec.kind = string_from_optional(value)? + .as_deref() + .map(parse_variant_kind) + .transpose()? 
+ } + "deletion_length" => { + spec.deletion_length = int_from_optional(value)?.map(|v| v as usize) + } + "motifs" => spec.motifs = string_or_list(value)?, + other => { + return Err(RuntimeError::InvalidArguments(format!( + "bioscript.variant does not accept keyword '{other}'" + ))); + } + } + } + Ok(spec) +} + +fn parse_locus_string(value: &str) -> Result { + let normalized = value.trim().strip_prefix("chr").unwrap_or(value.trim()); + let Some((chrom, rest)) = normalized.split_once(':') else { + return Err(RuntimeError::InvalidArguments(format!( + "invalid locus string: {value}" + ))); + }; + let (start, end) = if let Some((start, end)) = rest.split_once('-') { + (start, end) + } else { + (rest, rest) + }; + let start = start.parse::().map_err(|err| { + RuntimeError::InvalidArguments(format!("invalid locus start {value}: {err}")) + })?; + let end = end.parse::().map_err(|err| { + RuntimeError::InvalidArguments(format!("invalid locus end {value}: {err}")) + })?; + Ok(GenomicLocus { + chrom: chrom.to_owned(), + start, + end, + }) +} + +fn parse_variant_kind(value: &str) -> Result { + match value.trim().to_ascii_lowercase().as_str() { + "snp" => Ok(VariantKind::Snp), + "insertion" | "ins" => Ok(VariantKind::Insertion), + "deletion" | "del" => Ok(VariantKind::Deletion), + "indel" => Ok(VariantKind::Indel), + "other" => Ok(VariantKind::Other), + other => Err(RuntimeError::InvalidArguments(format!( + "invalid variant kind: {other}" + ))), + } +} + +pub(crate) fn string_or_list(value: &MontyObject) -> Result, RuntimeError> { + match value { + MontyObject::String(text) => Ok(vec![text.clone()]), + MontyObject::List(_) => string_list_from_object(value), + MontyObject::None => Ok(Vec::new()), + other => Err(RuntimeError::InvalidArguments(format!( + "expected string or list of strings, got {other:?}" + ))), + } +} + +pub(crate) fn string_list_from_object(value: &MontyObject) -> Result, RuntimeError> { + match value { + MontyObject::List(items) => { + let mut out = 
Vec::new(); + for item in items { + let MontyObject::String(text) = item else { + return Err(RuntimeError::InvalidArguments( + "expected list of strings".to_owned(), + )); + }; + out.push(text.clone()); + } + Ok(out) + } + MontyObject::None => Ok(Vec::new()), + other => Err(RuntimeError::InvalidArguments(format!( + "expected list of strings, got {other:?}" + ))), + } +} + +pub(crate) fn string_from_optional(value: &MontyObject) -> Result, RuntimeError> { + match value { + MontyObject::None => Ok(None), + MontyObject::String(text) => Ok(Some(text.clone())), + other => Err(RuntimeError::InvalidArguments(format!( + "expected optional string, got {other:?}" + ))), + } +} + +pub(crate) fn int_from_optional(value: &MontyObject) -> Result, RuntimeError> { + match value { + MontyObject::None => Ok(None), + MontyObject::Int(v) => Ok(Some(*v)), + other => Err(RuntimeError::InvalidArguments(format!( + "expected optional int, got {other:?}" + ))), + } +} diff --git a/rust/bioscript-runtime/tests/resources_coverage.rs b/rust/bioscript-runtime/tests/resources_coverage.rs index dec19d5..087ab4b 100644 --- a/rust/bioscript-runtime/tests/resources_coverage.rs +++ b/rust/bioscript-runtime/tests/resources_coverage.rs @@ -1,6 +1,7 @@ use std::{ fs, path::PathBuf, + sync::atomic::{AtomicUsize, Ordering}, time::{Duration, SystemTime, UNIX_EPOCH}, }; @@ -8,13 +9,16 @@ use bioscript_formats::GenotypeLoadOptions; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig}; use monty::ResourceLimits; +static TEMP_COUNTER: AtomicUsize = AtomicUsize::new(0); + fn temp_dir(label: &str) -> PathBuf { let nanos = SystemTime::now() .duration_since(UNIX_EPOCH) .expect("clock drift") .as_nanos(); + let counter = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); let dir = std::env::temp_dir().join(format!( - "bioscript-runtime-coverage-{label}-{}-{nanos}", + "bioscript-runtime-coverage-{label}-{}-{nanos}-{counter}", std::process::id() )); fs::create_dir_all(&dir).unwrap(); diff --git 
a/rust/bioscript-runtime/tests/security.rs b/rust/bioscript-runtime/tests/security.rs index afbaecd..63f0513 100644 --- a/rust/bioscript-runtime/tests/security.rs +++ b/rust/bioscript-runtime/tests/security.rs @@ -1,6 +1,7 @@ use std::{ fs, path::PathBuf, + sync::atomic::{AtomicUsize, Ordering}, time::{Duration, SystemTime, UNIX_EPOCH}, }; @@ -8,13 +9,16 @@ use bioscript_formats::GenotypeLoadOptions; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig}; use monty::{MontyObject, ResourceLimits}; +static TEMP_COUNTER: AtomicUsize = AtomicUsize::new(0); + fn temp_dir(label: &str) -> PathBuf { let nanos = SystemTime::now() .duration_since(UNIX_EPOCH) .expect("clock drift") .as_nanos(); + let counter = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); let dir = std::env::temp_dir().join(format!( - "bioscript-runtime-{label}-{}-{nanos}", + "bioscript-runtime-{label}-{}-{nanos}-{counter}", std::process::id() )); fs::create_dir_all(&dir).unwrap(); diff --git a/rust/bioscript-schema/src/validator.rs b/rust/bioscript-schema/src/validator.rs index 5f11620..3e6e7f8 100644 --- a/rust/bioscript-schema/src/validator.rs +++ b/rust/bioscript-schema/src/validator.rs @@ -1,13 +1,25 @@ use std::{ - collections::BTreeSet, fmt::{self, Write as _}, - fs, path::{Path, PathBuf}, }; -use bioscript_core::{GenomicLocus, VariantKind, VariantSpec}; -use serde_yaml::{Mapping, Value}; -use url::Url; +use bioscript_core::VariantSpec; +use serde_yaml::Value; + +mod common; +mod panel; +mod spec; +mod variant; + +use common::{ + collect_yaml_files, load_yaml, render_single_manifest_errors, required_non_empty_string, + scalar_at, seq_of_strings, validate_schema_and_identity, +}; +use panel::{parse_downloads, parse_panel_members, validate_panel_root}; +use spec::variant_spec_from_root; +use variant::{ + validate_alleles, validate_coordinates, validate_identifiers, validate_variant_root, +}; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Severity { @@ -292,38 +304,6 @@ fn validate_manifest_path( 
}) } -fn collect_yaml_files(path: &Path) -> Result, String> { - if path.is_file() { - return Ok(vec![path.to_path_buf()]); - } - - let mut files = Vec::new(); - collect_yaml_files_recursive(path, &mut files)?; - files.sort(); - Ok(files) -} - -fn collect_yaml_files_recursive(path: &Path, files: &mut Vec) -> Result<(), String> { - let entries = fs::read_dir(path) - .map_err(|err| format!("failed to read directory {}: {err}", path.display()))?; - for entry in entries { - let entry = entry.map_err(|err| format!("failed to read directory entry: {err}"))?; - let entry_path = entry.path(); - if entry_path.is_dir() { - collect_yaml_files_recursive(&entry_path, files)?; - continue; - } - if entry_path.extension().is_some_and(|extension| { - ["yaml", "yml"] - .iter() - .any(|item| extension.eq_ignore_ascii_case(item)) - }) { - files.push(entry_path); - } - } - Ok(()) -} - fn validate_variant_file(path: &Path) -> Result { let value = load_yaml(path)?; let Some(schema) = scalar_at(&value, &["schema"]) else { @@ -377,1126 +357,3 @@ fn validate_panel_file(path: &Path) -> Result { issues, }) } - -fn validate_variant_root(root: &Value, issues: &mut Vec) { - validate_schema_and_identity( - root, - "bioscript:variant:1.0", - Some("bioscript:variant"), - issues, - ); - validate_optional_strings(root, &["name", "label", "gene", "summary"], issues); - validate_tags(root, issues); - validate_identifiers(root, issues); - validate_coordinates(root, issues); - validate_alleles(root, issues); - validate_findings(root, issues); - validate_provenance(root, issues); - - let has_identifiers = value_at(root, &["identifiers"]) - .and_then(Value::as_mapping) - .is_some_and(|mapping| !mapping.is_empty()); - let has_coordinates = ["grch37", "grch38"] - .iter() - .any(|assembly| value_at(root, &["coordinates", assembly]).is_some()); - if !has_identifiers && !has_coordinates { - issues.push(Issue { - severity: Severity::Error, - path: "identifiers/coordinates".to_owned(), - message: "expected at 
least one identifier block or one coordinate block".to_owned(), - }); - } -} - -fn validate_panel_root(root: &Value, issues: &mut Vec) { - validate_schema_and_identity(root, "bioscript:panel:1.0", None, issues); - validate_optional_strings(root, &["name", "label", "summary"], issues); - validate_tags(root, issues); - validate_permissions(root, issues); - validate_downloads(root, issues); - validate_panel_members(root, issues); -} - -fn validate_schema_and_identity( - root: &Value, - canonical_schema: &str, - legacy_schema: Option<&str>, - issues: &mut Vec, -) { - let schema = scalar_at(root, &["schema"]); - let valid_schema = schema - .as_deref() - .is_some_and(|value| value == canonical_schema || legacy_schema == Some(value)); - if !valid_schema { - issues.push(Issue { - severity: Severity::Error, - path: "schema".to_owned(), - message: format!("expected schema to be '{canonical_schema}'"), - }); - } - if let Some(legacy_schema) = legacy_schema - && matches!(schema.as_deref(), Some(value) if value == legacy_schema) - { - issues.push(Issue { - severity: Severity::Warning, - path: "schema".to_owned(), - message: format!("legacy schema value '{legacy_schema}'; prefer '{canonical_schema}'"), - }); - } - require_const(root, &["version"], "1.0", issues); - match scalar_at(root, &["name"]) { - Some(name) if !name.trim().is_empty() => {} - Some(_) => issues.push(Issue { - severity: Severity::Error, - path: "name".to_owned(), - message: "empty string".to_owned(), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: "name".to_owned(), - message: "missing required field".to_owned(), - }), - } - if value_at(root, &["variant_id"]).is_some() { - issues.push(Issue { - severity: Severity::Warning, - path: "variant_id".to_owned(), - message: "variant_id is legacy; prefer name".to_owned(), - }); - } -} - -fn validate_optional_strings(root: &Value, fields: &[&str], issues: &mut Vec) { - for field in fields { - if let Some(value) = value_at(root, &[*field]) { - 
match value.as_str() { - Some(text) if !text.trim().is_empty() => {} - Some(_) => issues.push(Issue { - severity: Severity::Warning, - path: (*field).to_owned(), - message: "empty string".to_owned(), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: (*field).to_owned(), - message: "expected string".to_owned(), - }), - } - } - } -} - -fn validate_tags(root: &Value, issues: &mut Vec) { - let Some(value) = value_at(root, &["tags"]) else { - return; - }; - let Some(items) = value.as_sequence() else { - issues.push(Issue { - severity: Severity::Error, - path: "tags".to_owned(), - message: "expected a sequence of strings".to_owned(), - }); - return; - }; - - for (idx, item) in items.iter().enumerate() { - let Some(tag) = item.as_str() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("tags[{idx}]"), - message: "expected string".to_owned(), - }); - continue; - }; - if tag.trim().is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: format!("tags[{idx}]"), - message: "empty tag string".to_owned(), - }); - } - } -} - -fn validate_identifiers(root: &Value, issues: &mut Vec) { - for field in ["rsids", "aliases"] { - let Some(values) = value_at(root, &["identifiers", field]) else { - continue; - }; - let Some(items) = values.as_sequence() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("identifiers.{field}"), - message: "expected a sequence of strings".to_owned(), - }); - continue; - }; - let mut seen = BTreeSet::new(); - for (idx, item) in items.iter().enumerate() { - let Some(value) = item.as_str() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("identifiers.{field}[{idx}]"), - message: "expected string".to_owned(), - }); - continue; - }; - if !is_rsid(value) { - issues.push(Issue { - severity: Severity::Error, - path: format!("identifiers.{field}[{idx}]"), - message: format!("expected rsid like rs123, found '{value}'"), - }); - } - if 
!seen.insert(value.to_owned()) { - issues.push(Issue { - severity: Severity::Warning, - path: format!("identifiers.{field}[{idx}]"), - message: format!("duplicate identifier '{value}'"), - }); - } - } - } -} - -fn validate_coordinates(root: &Value, issues: &mut Vec) { - for assembly in ["grch37", "grch38"] { - let Some(coord) = mapping_at(root, &["coordinates", assembly]) else { - continue; - }; - - let Some(chrom) = coord - .get(Value::String("chrom".to_owned())) - .and_then(Value::as_str) - else { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.chrom"), - message: "missing chrom".to_owned(), - }); - continue; - }; - if !is_allowed_chromosome(chrom) { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.chrom"), - message: format!("invalid chromosome '{chrom}'; expected 1-22, X, Y, or MT"), - }); - } - - let has_pos = coord.contains_key(Value::String("pos".to_owned())); - let has_start = coord.contains_key(Value::String("start".to_owned())); - let has_end = coord.contains_key(Value::String("end".to_owned())); - if has_pos && (has_start || has_end) { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}"), - message: "use either pos or start/end, not both".to_owned(), - }); - continue; - } - if !(has_pos || has_start && has_end) { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}"), - message: "expected either pos or start/end".to_owned(), - }); - continue; - } - - if has_pos { - validate_coordinate_pos(coord, assembly, issues); - } else { - validate_coordinate_range(coord, assembly, issues); - } - } -} - -fn validate_coordinate_pos(coord: &Mapping, assembly: &str, issues: &mut Vec) { - if let Some(pos) = i64_at_mapping(coord, "pos") { - if pos < 1 { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.pos"), - message: "expected integer >= 1".to_owned(), - }); - } - } 
else { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.pos"), - message: "expected integer".to_owned(), - }); - } -} - -fn validate_coordinate_range(coord: &Mapping, assembly: &str, issues: &mut Vec) { - let start = i64_at_mapping(coord, "start"); - let end = i64_at_mapping(coord, "end"); - match (start, end) { - (Some(start), Some(end)) => validate_coordinate_range_values(start, end, assembly, issues), - _ => issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}"), - message: "expected integer start/end".to_owned(), - }), - } -} - -fn validate_coordinate_range_values(start: i64, end: i64, assembly: &str, issues: &mut Vec) { - if start < 1 { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.start"), - message: "expected integer >= 1".to_owned(), - }); - } - if end < 1 { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.end"), - message: "expected integer >= 1".to_owned(), - }); - } - if end < start { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.end"), - message: "expected end >= start".to_owned(), - }); - } - if start == end { - issues.push(Issue { - severity: Severity::Warning, - path: format!("coordinates.{assembly}"), - message: "single-position coordinate uses start/end; prefer pos".to_owned(), - }); - } -} - -fn validate_alleles(root: &Value, issues: &mut Vec) { - require_path(root, &["alleles"], issues); - require_path(root, &["alleles", "kind"], issues); - require_path(root, &["alleles", "ref"], issues); - require_path(root, &["alleles", "alts"], issues); - - let Some(kind) = scalar_at(root, &["alleles", "kind"]) else { - return; - }; - if !matches!(kind.as_str(), "snv" | "deletion" | "insertion" | "indel") { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.kind".to_owned(), - message: "expected one of snv, deletion, insertion, 
indel".to_owned(), - }); - } - - if value_at(root, &["alleles", "canonical_alt"]).is_some() { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.canonical_alt".to_owned(), - message: "canonical_alt is not part of the current schema".to_owned(), - }); - } - - let Some(reference) = scalar_at(root, &["alleles", "ref"]) else { - return; - }; - if reference.trim().is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.ref".to_owned(), - message: "empty string".to_owned(), - }); - } - - let Some(alts_value) = value_at(root, &["alleles", "alts"]) else { - return; - }; - let Some(alts_seq) = alts_value.as_sequence() else { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.alts".to_owned(), - message: "expected a non-empty sequence of strings".to_owned(), - }); - return; - }; - if alts_seq.is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.alts".to_owned(), - message: "expected at least one alternate allele".to_owned(), - }); - return; - } - - let mut alts = Vec::new(); - for (idx, item) in alts_seq.iter().enumerate() { - let Some(alt) = item.as_str() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("alleles.alts[{idx}]"), - message: "expected string".to_owned(), - }); - continue; - }; - if alt.trim().is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: format!("alleles.alts[{idx}]"), - message: "empty string".to_owned(), - }); - continue; - } - alts.push(alt.to_owned()); - } - validate_symbolic_alleles(&reference, &alts, issues); - validate_snv_alleles(&kind, &reference, &alts, issues); -} - -fn validate_symbolic_alleles(reference: &str, alts: &[String], issues: &mut Vec) { - if reference == "I" || reference == "D" { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.ref".to_owned(), - message: "symbolic I/D alleles are not allowed in stored YAML; use biological alleles" - .to_owned(), - }); - } - for (idx, alt) in 
alts.iter().enumerate() { - if alt == "I" || alt == "D" { - issues.push(Issue { - severity: Severity::Error, - path: format!("alleles.alts[{idx}]"), - message: - "symbolic I/D alleles are not allowed in stored YAML; use biological alleles" - .to_owned(), - }); - } - } -} - -fn validate_snv_alleles(kind: &str, reference: &str, alts: &[String], issues: &mut Vec) { - if kind != "snv" { - return; - } - if !is_base_allele(reference) { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.ref".to_owned(), - message: "snv ref must be one of A/C/G/T".to_owned(), - }); - } - for (idx, alt) in alts.iter().enumerate() { - if !is_base_allele(alt) { - issues.push(Issue { - severity: Severity::Error, - path: format!("alleles.alts[{idx}]"), - message: "snv alt must be one of A/C/G/T".to_owned(), - }); - } - } -} - -fn validate_findings(root: &Value, issues: &mut Vec) { - let alts = seq_of_strings(root, &["alleles", "alts"]).unwrap_or_default(); - let Some(findings) = value_at(root, &["findings"]).and_then(Value::as_sequence) else { - return; - }; - - for (idx, finding) in findings.iter().enumerate() { - let Some(mapping) = finding.as_mapping() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("findings[{idx}]"), - message: "expected mapping".to_owned(), - }); - continue; - }; - - let Some(schema) = mapping - .get(Value::String("schema".to_owned())) - .and_then(Value::as_str) - else { - issues.push(Issue { - severity: Severity::Error, - path: format!("findings[{idx}].schema"), - message: "missing schema".to_owned(), - }); - continue; - }; - if schema.trim().is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: format!("findings[{idx}].schema"), - message: "empty string".to_owned(), - }); - } - if let Some(alt) = mapping - .get(Value::String("alt".to_owned())) - .and_then(Value::as_str) - && alt != "*" - && !alts.iter().any(|item| item == alt) - { - issues.push(Issue { - severity: Severity::Error, - path: 
format!("findings[{idx}].alt"), - message: format!("finding alt '{alt}' is not present in alleles.alts {alts:?}"), - }); - } - let has_summary = mapping - .get(Value::String("summary".to_owned())) - .and_then(Value::as_str) - .is_some_and(|value| !value.trim().is_empty()); - let has_notes = mapping - .get(Value::String("notes".to_owned())) - .and_then(Value::as_str) - .is_some_and(|value| !value.trim().is_empty()); - if !has_summary && !has_notes { - issues.push(Issue { - severity: Severity::Warning, - path: format!("findings[{idx}]"), - message: "finding has neither summary nor notes".to_owned(), - }); - } - } -} - -fn validate_provenance(root: &Value, issues: &mut Vec) { - let Some(sources) = value_at(root, &["provenance", "sources"]).and_then(Value::as_sequence) - else { - return; - }; - for (idx, source) in sources.iter().enumerate() { - let Some(mapping) = source.as_mapping() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("provenance.sources[{idx}]"), - message: "expected mapping".to_owned(), - }); - continue; - }; - for field in ["kind", "label", "url"] { - match mapping - .get(Value::String(field.to_owned())) - .and_then(Value::as_str) - { - Some(text) if !text.trim().is_empty() => {} - Some(_) => issues.push(Issue { - severity: Severity::Error, - path: format!("provenance.sources[{idx}].{field}"), - message: "empty string".to_owned(), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: format!("provenance.sources[{idx}].{field}"), - message: "missing required field".to_owned(), - }), - } - } - if let Some(url) = mapping - .get(Value::String("url".to_owned())) - .and_then(Value::as_str) - { - validate_url_string( - url, - &format!("provenance.sources[{idx}].url"), - false, - issues, - ); - } - } -} - -fn validate_permissions(root: &Value, issues: &mut Vec) { - let Some(domains) = value_at(root, &["permissions", "domains"]) else { - return; - }; - let Some(items) = domains.as_sequence() else { - 
issues.push(Issue { - severity: Severity::Error, - path: "permissions.domains".to_owned(), - message: "expected a sequence of origins".to_owned(), - }); - return; - }; - let mut seen = BTreeSet::new(); - for (idx, item) in items.iter().enumerate() { - let Some(value) = item.as_str() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("permissions.domains[{idx}]"), - message: "expected string".to_owned(), - }); - continue; - }; - match normalize_origin(value) { - Ok(origin) => { - if !seen.insert(origin.clone()) { - issues.push(Issue { - severity: Severity::Warning, - path: format!("permissions.domains[{idx}]"), - message: format!("duplicate origin '{origin}'"), - }); - } - } - Err(message) => issues.push(Issue { - severity: Severity::Error, - path: format!("permissions.domains[{idx}]"), - message, - }), - } - } -} - -fn validate_downloads(root: &Value, issues: &mut Vec) { - let allowed_origins: BTreeSet = seq_of_strings(root, &["permissions", "domains"]) - .unwrap_or_default() - .into_iter() - .filter_map(|domain| normalize_origin(&domain).ok()) - .collect(); - let Some(downloads) = value_at(root, &["downloads"]).and_then(Value::as_sequence) else { - return; - }; - let mut ids = BTreeSet::new(); - for (idx, item) in downloads.iter().enumerate() { - let Some(mapping) = item.as_mapping() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}]"), - message: "expected mapping".to_owned(), - }); - continue; - }; - for field in ["id", "url", "sha256", "version"] { - match mapping - .get(Value::String(field.to_owned())) - .and_then(Value::as_str) - { - Some(text) if !text.trim().is_empty() => {} - Some(_) => issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}].{field}"), - message: "empty string".to_owned(), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}].{field}"), - message: "missing required field".to_owned(), - }), - } - } - - if 
let Some(id) = mapping - .get(Value::String("id".to_owned())) - .and_then(Value::as_str) - && !ids.insert(id.to_owned()) - { - issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}].id"), - message: format!("duplicate download id '{id}'"), - }); - } - if let Some(sha) = mapping - .get(Value::String("sha256".to_owned())) - .and_then(Value::as_str) - && !is_sha256(sha) - { - issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}].sha256"), - message: "expected 64 lowercase hex characters".to_owned(), - }); - } - if let Some(url) = mapping - .get(Value::String("url".to_owned())) - .and_then(Value::as_str) - { - match normalize_download_url(url) { - Ok(origin) => { - if !allowed_origins.is_empty() && !allowed_origins.contains(&origin) { - issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}].url"), - message: format!( - "download origin '{origin}' is not listed in permissions.domains" - ), - }); - } - } - Err(message) => issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}].url"), - message, - }), - } - } - } -} - -fn validate_panel_members(root: &Value, issues: &mut Vec) { - let Some(members) = value_at(root, &["members"]).and_then(Value::as_sequence) else { - issues.push(Issue { - severity: Severity::Error, - path: "members".to_owned(), - message: "missing required field".to_owned(), - }); - return; - }; - if members.is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: "members".to_owned(), - message: "expected at least one member".to_owned(), - }); - return; - } - - let download_ids = panel_download_ids(root); - - for (idx, item) in members.iter().enumerate() { - let Some(mapping) = item.as_mapping() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}]"), - message: "expected mapping".to_owned(), - }); - continue; - }; - validate_panel_member(idx, mapping, &download_ids, issues); - } -} - -fn 
panel_download_ids(root: &Value) -> BTreeSet { - value_at(root, &["downloads"]) - .and_then(Value::as_sequence) - .into_iter() - .flatten() - .filter_map(|item| { - item.as_mapping()? - .get(Value::String("id".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned) - }) - .collect() -} - -fn validate_panel_member( - idx: usize, - mapping: &Mapping, - download_ids: &BTreeSet, - issues: &mut Vec, -) { - let kind = mapping - .get(Value::String("kind".to_owned())) - .and_then(Value::as_str); - match kind { - Some("variant") => {} - Some(other) => issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].kind"), - message: format!( - "unsupported member kind '{other}'; panel support is currently variant-only" - ), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].kind"), - message: "missing required field".to_owned(), - }), - } - - let path_value = mapping - .get(Value::String("path".to_owned())) - .and_then(Value::as_str); - let download_value = mapping - .get(Value::String("download".to_owned())) - .and_then(Value::as_str); - if path_value.is_some() == download_value.is_some() { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}]"), - message: "expected exactly one of path or download".to_owned(), - }); - } - validate_panel_member_path(idx, path_value, issues); - validate_panel_member_download(idx, download_value, download_ids, issues); - validate_panel_member_metadata(idx, mapping, issues); -} - -fn validate_panel_member_path(idx: usize, path_value: Option<&str>, issues: &mut Vec) { - if let Some(path) = path_value - && path.trim().is_empty() - { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].path"), - message: "empty string".to_owned(), - }); - } -} - -fn validate_panel_member_download( - idx: usize, - download_value: Option<&str>, - download_ids: &BTreeSet, - issues: &mut Vec, -) { - let Some(download) = download_value 
else { - return; - }; - if download.trim().is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].download"), - message: "empty string".to_owned(), - }); - } else if !download_ids.contains(download) { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].download"), - message: format!("unknown download id '{download}'"), - }); - } -} - -fn validate_panel_member_metadata(idx: usize, mapping: &Mapping, issues: &mut Vec) { - if let Some(version) = mapping - .get(Value::String("version".to_owned())) - .and_then(Value::as_str) - && version.trim().is_empty() - { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].version"), - message: "empty string".to_owned(), - }); - } - if let Some(sha) = mapping - .get(Value::String("sha256".to_owned())) - .and_then(Value::as_str) - && !is_sha256(sha) - { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].sha256"), - message: "expected 64 lowercase hex characters".to_owned(), - }); - } -} - -fn variant_spec_from_root(root: &Value) -> Result { - let rsids = seq_of_strings(root, &["identifiers", "rsids"]).unwrap_or_default(); - let grch37 = locus_from_root(root, "grch37")?; - let grch38 = locus_from_root(root, "grch38")?; - let reference = scalar_at(root, &["alleles", "ref"]); - let alternate = preferred_alternate_from_root(root); - let deletion_length = value_at(root, &["alleles", "deletion_length"]) - .and_then(Value::as_u64) - .and_then(|value| usize::try_from(value).ok()); - let motifs = seq_of_strings(root, &["alleles", "motifs"]).unwrap_or_default(); - let kind = scalar_at(root, &["alleles", "kind"]).map(|kind| match kind.as_str() { - "snv" => VariantKind::Snp, - "deletion" => VariantKind::Deletion, - "insertion" => VariantKind::Insertion, - "indel" => VariantKind::Indel, - _ => VariantKind::Other, - }); - - Ok(VariantSpec { - rsids, - grch37, - grch38, - reference, - alternate, - kind, - 
deletion_length, - motifs, - }) -} - -fn preferred_alternate_from_root(root: &Value) -> Option { - let alts = seq_of_strings(root, &["alleles", "alts"])?; - if let Some(finding_alt) = first_specific_finding_alt(root) - && alts.iter().any(|alt| alt == &finding_alt) - { - return Some(finding_alt); - } - alts.first().cloned() -} - -fn first_specific_finding_alt(root: &Value) -> Option { - let findings = value_at(root, &["findings"])?.as_sequence()?; - for finding in findings { - let Some(alt) = finding - .as_mapping() - .and_then(|mapping| mapping.get(Value::String("alt".to_owned()))) - .and_then(Value::as_str) - .map(str::trim) - else { - continue; - }; - if !alt.is_empty() && alt != "*" { - return Some(alt.to_owned()); - } - } - None -} - -fn locus_from_root(root: &Value, assembly: &str) -> Result, String> { - let Some(mapping) = mapping_at(root, &["coordinates", assembly]) else { - return Ok(None); - }; - let chrom = mapping - .get(Value::String("chrom".to_owned())) - .and_then(Value::as_str) - .ok_or_else(|| format!("coordinates.{assembly}.chrom missing"))?; - let (start, end) = if let Some(pos) = i64_at_mapping(mapping, "pos") { - (pos, pos) - } else { - let start = i64_at_mapping(mapping, "start") - .ok_or_else(|| format!("coordinates.{assembly}.start missing"))?; - let end = i64_at_mapping(mapping, "end") - .ok_or_else(|| format!("coordinates.{assembly}.end missing"))?; - (start, end) - }; - Ok(Some(GenomicLocus { - chrom: chrom.to_owned(), - start, - end, - })) -} - -fn parse_downloads(root: &Value) -> Result, String> { - let mut downloads = Vec::new(); - let Some(items) = value_at(root, &["downloads"]).and_then(Value::as_sequence) else { - return Ok(downloads); - }; - for (idx, item) in items.iter().enumerate() { - let Some(mapping) = item.as_mapping() else { - return Err(format!("downloads[{idx}] must be a mapping")); - }; - let id = mapping_required_string(mapping, "id", idx, "downloads")?; - let url = mapping_required_string(mapping, "url", idx, 
"downloads")?; - let sha256 = mapping_required_string(mapping, "sha256", idx, "downloads")?; - let version = mapping_required_string(mapping, "version", idx, "downloads")?; - let origin = normalize_download_url(&url)?; - downloads.push(Download { - id, - url, - origin, - sha256, - version, - }); - } - Ok(downloads) -} - -fn parse_panel_members(root: &Value) -> Result, String> { - let mut members = Vec::new(); - let Some(items) = value_at(root, &["members"]).and_then(Value::as_sequence) else { - return Ok(members); - }; - for (idx, item) in items.iter().enumerate() { - let Some(mapping) = item.as_mapping() else { - return Err(format!("members[{idx}] must be a mapping")); - }; - members.push(PanelMember { - kind: mapping_required_string(mapping, "kind", idx, "members")?, - path: mapping - .get(Value::String("path".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - download: mapping - .get(Value::String("download".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - sha256: mapping - .get(Value::String("sha256".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - version: mapping - .get(Value::String("version".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - }); - } - Ok(members) -} - -fn mapping_required_string( - mapping: &Mapping, - field: &str, - idx: usize, - parent: &str, -) -> Result { - mapping - .get(Value::String(field.to_owned())) - .and_then(Value::as_str) - .filter(|value| !value.trim().is_empty()) - .map(ToOwned::to_owned) - .ok_or_else(|| format!("{parent}[{idx}].{field} missing or empty")) -} - -fn validate_url_string( - value: &str, - path: &str, - require_origin_only: bool, - issues: &mut Vec, -) { - let normalized = if require_origin_only { - normalize_origin(value) - } else { - normalize_download_url(value) - }; - if let Err(message) = normalized { - issues.push(Issue { - severity: Severity::Error, - path: path.to_owned(), - message, - }); - } -} - -fn normalize_origin(value: 
&str) -> Result { - let url = Url::parse(value).map_err(|err| format!("invalid URL: {err}"))?; - if !matches!(url.scheme(), "http" | "https") { - return Err("expected http or https origin".to_owned()); - } - if url.host_str().is_none() { - return Err("origin is missing host".to_owned()); - } - if url.path() != "/" || url.query().is_some() || url.fragment().is_some() { - return Err("expected origin only, without path, query, or fragment".to_owned()); - } - let mut origin = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); - if let Some(port) = url.port() { - let _ = write!(origin, ":{port}"); - } - Ok(origin) -} - -fn normalize_download_url(value: &str) -> Result { - let url = Url::parse(value).map_err(|err| format!("invalid URL: {err}"))?; - if !matches!(url.scheme(), "http" | "https") { - return Err("expected http or https URL".to_owned()); - } - if url.host_str().is_none() { - return Err("URL is missing host".to_owned()); - } - let mut origin = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); - if let Some(port) = url.port() { - let _ = write!(origin, ":{port}"); - } - Ok(origin) -} - -fn is_allowed_chromosome(value: &str) -> bool { - matches!(value, "X" | "Y" | "MT") - || value - .parse::() - .is_ok_and(|chrom| (1..=22).contains(&chrom)) -} - -fn is_base_allele(value: &str) -> bool { - matches!(value, "A" | "C" | "G" | "T") -} - -fn is_rsid(value: &str) -> bool { - value.starts_with("rs") && value[2..].chars().all(|ch| ch.is_ascii_digit()) -} - -fn is_sha256(value: &str) -> bool { - value.len() == 64 - && value - .chars() - .all(|ch| ch.is_ascii_hexdigit() && !ch.is_ascii_uppercase()) -} - -fn i64_at_mapping(mapping: &Mapping, key: &str) -> Option { - mapping - .get(Value::String(key.to_owned())) - .and_then(Value::as_i64) -} - -fn required_non_empty_string(root: &Value, path: &[&str]) -> Result { - scalar_at(root, path) - .filter(|value| !value.trim().is_empty()) - .ok_or_else(|| format!("{} missing or empty", 
path.join("."))) -} - -fn render_single_manifest_errors(path: &Path, issues: &[Issue]) -> String { - let mut out = format!("invalid manifest {}:\n", path.display()); - for issue in issues { - let _ = writeln!( - out, - " - [{}] {}: {}", - issue.severity, issue.path, issue.message - ); - } - out -} - -fn load_yaml(path: &Path) -> Result { - let text = fs::read_to_string(path) - .map_err(|err| format!("failed to read {}: {err}", path.display()))?; - serde_yaml::from_str(&text) - .map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) -} - -fn require_const(root: &Value, path: &[&str], expected: &str, issues: &mut Vec) { - match scalar_at(root, path) { - Some(actual) if actual == expected => {} - Some(actual) => issues.push(Issue { - severity: Severity::Error, - path: path.join("."), - message: format!("expected '{expected}', found '{actual}'"), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: path.join("."), - message: "missing required field".to_owned(), - }), - } -} - -fn require_path(root: &Value, path: &[&str], issues: &mut Vec) { - if value_at(root, path).is_none() { - issues.push(Issue { - severity: Severity::Error, - path: path.join("."), - message: "missing required field".to_owned(), - }); - } -} - -fn value_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a Value> { - let mut current = root; - for key in path { - let mapping = current.as_mapping()?; - current = mapping.get(Value::String((*key).to_owned()))?; - } - Some(current) -} - -fn mapping_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a Mapping> { - value_at(root, path)?.as_mapping() -} - -fn scalar_at(root: &Value, path: &[&str]) -> Option { - value_at(root, path).and_then(|value| match value { - Value::String(text) => Some(text.clone()), - Value::Number(number) => Some(number.to_string()), - _ => None, - }) -} - -fn seq_of_strings(root: &Value, path: &[&str]) -> Option> { - value_at(root, path)?.as_sequence().map(|items| { - items - .iter() - 
.filter_map(|item| item.as_str().map(ToOwned::to_owned)) - .collect() - }) -} diff --git a/rust/bioscript-schema/src/validator/common.rs b/rust/bioscript-schema/src/validator/common.rs new file mode 100644 index 0000000..d36b3e3 --- /dev/null +++ b/rust/bioscript-schema/src/validator/common.rs @@ -0,0 +1,304 @@ +use std::{ + fmt::Write as _, + fs, + path::{Path, PathBuf}, +}; + +use serde_yaml::{Mapping, Value}; +use url::Url; + +use super::{Issue, Severity}; + +pub(crate) fn collect_yaml_files(path: &Path) -> Result, String> { + if path.is_file() { + return Ok(vec![path.to_path_buf()]); + } + + let mut files = Vec::new(); + collect_yaml_files_recursive(path, &mut files)?; + files.sort(); + Ok(files) +} + +fn collect_yaml_files_recursive(path: &Path, files: &mut Vec) -> Result<(), String> { + let entries = fs::read_dir(path) + .map_err(|err| format!("failed to read directory {}: {err}", path.display()))?; + for entry in entries { + let entry = entry.map_err(|err| format!("failed to read directory entry: {err}"))?; + let entry_path = entry.path(); + if entry_path.is_dir() { + collect_yaml_files_recursive(&entry_path, files)?; + continue; + } + if entry_path.extension().is_some_and(|extension| { + ["yaml", "yml"] + .iter() + .any(|item| extension.eq_ignore_ascii_case(item)) + }) { + files.push(entry_path); + } + } + Ok(()) +} + +pub(crate) fn validate_schema_and_identity( + root: &Value, + canonical_schema: &str, + legacy_schema: Option<&str>, + issues: &mut Vec, +) { + let schema = scalar_at(root, &["schema"]); + let valid_schema = schema + .as_deref() + .is_some_and(|value| value == canonical_schema || legacy_schema == Some(value)); + if !valid_schema { + issues.push(Issue { + severity: Severity::Error, + path: "schema".to_owned(), + message: format!("expected schema to be '{canonical_schema}'"), + }); + } + if let Some(legacy_schema) = legacy_schema + && matches!(schema.as_deref(), Some(value) if value == legacy_schema) + { + issues.push(Issue { + severity: 
Severity::Warning, + path: "schema".to_owned(), + message: format!("legacy schema value '{legacy_schema}'; prefer '{canonical_schema}'"), + }); + } + require_const(root, &["version"], "1.0", issues); + match scalar_at(root, &["name"]) { + Some(name) if !name.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: "name".to_owned(), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: "name".to_owned(), + message: "missing required field".to_owned(), + }), + } + if value_at(root, &["variant_id"]).is_some() { + issues.push(Issue { + severity: Severity::Warning, + path: "variant_id".to_owned(), + message: "variant_id is legacy; prefer name".to_owned(), + }); + } +} + +pub(crate) fn validate_optional_strings(root: &Value, fields: &[&str], issues: &mut Vec) { + for field in fields { + if let Some(value) = value_at(root, &[*field]) { + match value.as_str() { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Warning, + path: (*field).to_owned(), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: (*field).to_owned(), + message: "expected string".to_owned(), + }), + } + } + } +} + +pub(crate) fn validate_tags(root: &Value, issues: &mut Vec) { + let Some(value) = value_at(root, &["tags"]) else { + return; + }; + let Some(items) = value.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: "tags".to_owned(), + message: "expected a sequence of strings".to_owned(), + }); + return; + }; + + for (idx, item) in items.iter().enumerate() { + let Some(tag) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("tags[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + if tag.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("tags[{idx}]"), + message: "empty tag 
string".to_owned(), + }); + } + } +} + +pub(crate) fn mapping_required_string( + mapping: &Mapping, + field: &str, + idx: usize, + parent: &str, +) -> Result { + mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + .filter(|value| !value.trim().is_empty()) + .map(ToOwned::to_owned) + .ok_or_else(|| format!("{parent}[{idx}].{field} missing or empty")) +} + +pub(crate) fn validate_url_string( + value: &str, + path: &str, + require_origin_only: bool, + issues: &mut Vec, +) { + let normalized = if require_origin_only { + normalize_origin(value) + } else { + normalize_download_url(value) + }; + if let Err(message) = normalized { + issues.push(Issue { + severity: Severity::Error, + path: path.to_owned(), + message, + }); + } +} + +pub(crate) fn normalize_origin(value: &str) -> Result { + let url = Url::parse(value).map_err(|err| format!("invalid URL: {err}"))?; + if !matches!(url.scheme(), "http" | "https") { + return Err("expected http or https origin".to_owned()); + } + if url.host_str().is_none() { + return Err("origin is missing host".to_owned()); + } + if url.path() != "/" || url.query().is_some() || url.fragment().is_some() { + return Err("expected origin only, without path, query, or fragment".to_owned()); + } + let mut origin = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); + if let Some(port) = url.port() { + let _ = write!(origin, ":{port}"); + } + Ok(origin) +} + +pub(crate) fn normalize_download_url(value: &str) -> Result { + let url = Url::parse(value).map_err(|err| format!("invalid URL: {err}"))?; + if !matches!(url.scheme(), "http" | "https") { + return Err("expected http or https URL".to_owned()); + } + if url.host_str().is_none() { + return Err("URL is missing host".to_owned()); + } + let mut origin = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); + if let Some(port) = url.port() { + let _ = write!(origin, ":{port}"); + } + Ok(origin) +} + +pub(crate) fn is_sha256(value: &str) -> bool 
{ + value.len() == 64 + && value + .chars() + .all(|ch| ch.is_ascii_hexdigit() && !ch.is_ascii_uppercase()) +} + +pub(crate) fn i64_at_mapping(mapping: &Mapping, key: &str) -> Option { + mapping + .get(Value::String(key.to_owned())) + .and_then(Value::as_i64) +} + +pub(crate) fn required_non_empty_string(root: &Value, path: &[&str]) -> Result { + scalar_at(root, path) + .filter(|value| !value.trim().is_empty()) + .ok_or_else(|| format!("{} missing or empty", path.join("."))) +} + +pub(crate) fn render_single_manifest_errors(path: &Path, issues: &[Issue]) -> String { + let mut out = format!("invalid manifest {}:\n", path.display()); + for issue in issues { + let _ = writeln!( + out, + " - [{}] {}: {}", + issue.severity, issue.path, issue.message + ); + } + out +} + +pub(crate) fn load_yaml(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read {}: {err}", path.display()))?; + serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) +} + +pub(crate) fn require_const(root: &Value, path: &[&str], expected: &str, issues: &mut Vec) { + match scalar_at(root, path) { + Some(actual) if actual == expected => {} + Some(actual) => issues.push(Issue { + severity: Severity::Error, + path: path.join("."), + message: format!("expected '{expected}', found '{actual}'"), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: path.join("."), + message: "missing required field".to_owned(), + }), + } +} + +pub(crate) fn require_path(root: &Value, path: &[&str], issues: &mut Vec) { + if value_at(root, path).is_none() { + issues.push(Issue { + severity: Severity::Error, + path: path.join("."), + message: "missing required field".to_owned(), + }); + } +} + +pub(crate) fn value_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a Value> { + let mut current = root; + for key in path { + let mapping = current.as_mapping()?; + current = 
mapping.get(Value::String((*key).to_owned()))?; + } + Some(current) +} + +pub(crate) fn mapping_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a Mapping> { + value_at(root, path)?.as_mapping() +} + +pub(crate) fn scalar_at(root: &Value, path: &[&str]) -> Option { + value_at(root, path).and_then(|value| match value { + Value::String(text) => Some(text.clone()), + Value::Number(number) => Some(number.to_string()), + _ => None, + }) +} + +pub(crate) fn seq_of_strings(root: &Value, path: &[&str]) -> Option> { + value_at(root, path)?.as_sequence().map(|items| { + items + .iter() + .filter_map(|item| item.as_str().map(ToOwned::to_owned)) + .collect() + }) +} diff --git a/rust/bioscript-schema/src/validator/panel.rs b/rust/bioscript-schema/src/validator/panel.rs new file mode 100644 index 0000000..867aaa4 --- /dev/null +++ b/rust/bioscript-schema/src/validator/panel.rs @@ -0,0 +1,356 @@ +use std::collections::BTreeSet; + +use serde_yaml::{Mapping, Value}; + +use super::{ + Download, Issue, PanelMember, Severity, + common::{ + is_sha256, mapping_required_string, normalize_download_url, normalize_origin, + seq_of_strings, validate_optional_strings, validate_schema_and_identity, validate_tags, + value_at, + }, +}; + +pub(crate) fn validate_panel_root(root: &Value, issues: &mut Vec) { + validate_schema_and_identity(root, "bioscript:panel:1.0", None, issues); + validate_optional_strings(root, &["name", "label", "summary"], issues); + validate_tags(root, issues); + validate_permissions(root, issues); + validate_downloads(root, issues); + validate_panel_members(root, issues); +} + +fn validate_permissions(root: &Value, issues: &mut Vec) { + let Some(domains) = value_at(root, &["permissions", "domains"]) else { + return; + }; + let Some(items) = domains.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: "permissions.domains".to_owned(), + message: "expected a sequence of origins".to_owned(), + }); + return; + }; + let mut seen = BTreeSet::new(); + 
for (idx, item) in items.iter().enumerate() { + let Some(value) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("permissions.domains[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + match normalize_origin(value) { + Ok(origin) => { + if !seen.insert(origin.clone()) { + issues.push(Issue { + severity: Severity::Warning, + path: format!("permissions.domains[{idx}]"), + message: format!("duplicate origin '{origin}'"), + }); + } + } + Err(message) => issues.push(Issue { + severity: Severity::Error, + path: format!("permissions.domains[{idx}]"), + message, + }), + } + } +} + +fn validate_downloads(root: &Value, issues: &mut Vec) { + let allowed_origins: BTreeSet = seq_of_strings(root, &["permissions", "domains"]) + .unwrap_or_default() + .into_iter() + .filter_map(|domain| normalize_origin(&domain).ok()) + .collect(); + let Some(downloads) = value_at(root, &["downloads"]).and_then(Value::as_sequence) else { + return; + }; + let mut ids = BTreeSet::new(); + for (idx, item) in downloads.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + for field in ["id", "url", "sha256", "version"] { + match mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].{field}"), + message: "missing required field".to_owned(), + }), + } + } + + if let Some(id) = mapping + .get(Value::String("id".to_owned())) + .and_then(Value::as_str) + && !ids.insert(id.to_owned()) + { + issues.push(Issue { + severity: Severity::Error, + path: 
format!("downloads[{idx}].id"), + message: format!("duplicate download id '{id}'"), + }); + } + if let Some(sha) = mapping + .get(Value::String("sha256".to_owned())) + .and_then(Value::as_str) + && !is_sha256(sha) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].sha256"), + message: "expected 64 lowercase hex characters".to_owned(), + }); + } + if let Some(url) = mapping + .get(Value::String("url".to_owned())) + .and_then(Value::as_str) + { + match normalize_download_url(url) { + Ok(origin) => { + if !allowed_origins.is_empty() && !allowed_origins.contains(&origin) { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].url"), + message: format!( + "download origin '{origin}' is not listed in permissions.domains" + ), + }); + } + } + Err(message) => issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].url"), + message, + }), + } + } + } +} + +fn validate_panel_members(root: &Value, issues: &mut Vec) { + let Some(members) = value_at(root, &["members"]).and_then(Value::as_sequence) else { + issues.push(Issue { + severity: Severity::Error, + path: "members".to_owned(), + message: "missing required field".to_owned(), + }); + return; + }; + if members.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "members".to_owned(), + message: "expected at least one member".to_owned(), + }); + return; + } + + let download_ids = panel_download_ids(root); + + for (idx, item) in members.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + validate_panel_member(idx, mapping, &download_ids, issues); + } +} + +fn panel_download_ids(root: &Value) -> BTreeSet { + value_at(root, &["downloads"]) + .and_then(Value::as_sequence) + .into_iter() + .flatten() + .filter_map(|item| { + item.as_mapping()? 
+ .get(Value::String("id".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned) + }) + .collect() +} + +fn validate_panel_member( + idx: usize, + mapping: &Mapping, + download_ids: &BTreeSet, + issues: &mut Vec, +) { + let kind = mapping + .get(Value::String("kind".to_owned())) + .and_then(Value::as_str); + match kind { + Some("variant") => {} + Some(other) => issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].kind"), + message: format!( + "unsupported member kind '{other}'; panel support is currently variant-only" + ), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].kind"), + message: "missing required field".to_owned(), + }), + } + + let path_value = mapping + .get(Value::String("path".to_owned())) + .and_then(Value::as_str); + let download_value = mapping + .get(Value::String("download".to_owned())) + .and_then(Value::as_str); + if path_value.is_some() == download_value.is_some() { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}]"), + message: "expected exactly one of path or download".to_owned(), + }); + } + validate_panel_member_path(idx, path_value, issues); + validate_panel_member_download(idx, download_value, download_ids, issues); + validate_panel_member_metadata(idx, mapping, issues); +} + +fn validate_panel_member_path(idx: usize, path_value: Option<&str>, issues: &mut Vec) { + if let Some(path) = path_value + && path.trim().is_empty() + { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].path"), + message: "empty string".to_owned(), + }); + } +} + +fn validate_panel_member_download( + idx: usize, + download_value: Option<&str>, + download_ids: &BTreeSet, + issues: &mut Vec, +) { + let Some(download) = download_value else { + return; + }; + if download.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].download"), + message: "empty 
string".to_owned(), + }); + } else if !download_ids.contains(download) { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].download"), + message: format!("unknown download id '{download}'"), + }); + } +} + +fn validate_panel_member_metadata(idx: usize, mapping: &Mapping, issues: &mut Vec) { + if let Some(version) = mapping + .get(Value::String("version".to_owned())) + .and_then(Value::as_str) + && version.trim().is_empty() + { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].version"), + message: "empty string".to_owned(), + }); + } + if let Some(sha) = mapping + .get(Value::String("sha256".to_owned())) + .and_then(Value::as_str) + && !is_sha256(sha) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].sha256"), + message: "expected 64 lowercase hex characters".to_owned(), + }); + } +} + +pub(crate) fn parse_downloads(root: &Value) -> Result, String> { + let mut downloads = Vec::new(); + let Some(items) = value_at(root, &["downloads"]).and_then(Value::as_sequence) else { + return Ok(downloads); + }; + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!("downloads[{idx}] must be a mapping")); + }; + let id = mapping_required_string(mapping, "id", idx, "downloads")?; + let url = mapping_required_string(mapping, "url", idx, "downloads")?; + let sha256 = mapping_required_string(mapping, "sha256", idx, "downloads")?; + let version = mapping_required_string(mapping, "version", idx, "downloads")?; + let origin = normalize_download_url(&url)?; + downloads.push(Download { + id, + url, + origin, + sha256, + version, + }); + } + Ok(downloads) +} + +pub(crate) fn parse_panel_members(root: &Value) -> Result, String> { + let mut members = Vec::new(); + let Some(items) = value_at(root, &["members"]).and_then(Value::as_sequence) else { + return Ok(members); + }; + for (idx, item) in items.iter().enumerate() { + let 
Some(mapping) = item.as_mapping() else { + return Err(format!("members[{idx}] must be a mapping")); + }; + members.push(PanelMember { + kind: mapping_required_string(mapping, "kind", idx, "members")?, + path: mapping + .get(Value::String("path".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + download: mapping + .get(Value::String("download".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + sha256: mapping + .get(Value::String("sha256".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + version: mapping + .get(Value::String("version".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + }); + } + Ok(members) +} diff --git a/rust/bioscript-schema/src/validator/spec.rs b/rust/bioscript-schema/src/validator/spec.rs new file mode 100644 index 0000000..0abc368 --- /dev/null +++ b/rust/bioscript-schema/src/validator/spec.rs @@ -0,0 +1,86 @@ +use bioscript_core::{GenomicLocus, VariantKind, VariantSpec}; +use serde_yaml::Value; + +use super::common::{i64_at_mapping, mapping_at, scalar_at, seq_of_strings, value_at}; + +pub(crate) fn variant_spec_from_root(root: &Value) -> Result { + let rsids = seq_of_strings(root, &["identifiers", "rsids"]).unwrap_or_default(); + let grch37 = locus_from_root(root, "grch37")?; + let grch38 = locus_from_root(root, "grch38")?; + let reference = scalar_at(root, &["alleles", "ref"]); + let alternate = preferred_alternate_from_root(root); + let deletion_length = value_at(root, &["alleles", "deletion_length"]) + .and_then(Value::as_u64) + .and_then(|value| usize::try_from(value).ok()); + let motifs = seq_of_strings(root, &["alleles", "motifs"]).unwrap_or_default(); + let kind = scalar_at(root, &["alleles", "kind"]).map(|kind| match kind.as_str() { + "snv" => VariantKind::Snp, + "deletion" => VariantKind::Deletion, + "insertion" => VariantKind::Insertion, + "indel" => VariantKind::Indel, + _ => VariantKind::Other, + }); + + Ok(VariantSpec { + rsids, + grch37, + grch38, + 
reference, + alternate, + kind, + deletion_length, + motifs, + }) +} + +fn preferred_alternate_from_root(root: &Value) -> Option { + let alts = seq_of_strings(root, &["alleles", "alts"])?; + if let Some(finding_alt) = first_specific_finding_alt(root) + && alts.iter().any(|alt| alt == &finding_alt) + { + return Some(finding_alt); + } + alts.first().cloned() +} + +fn first_specific_finding_alt(root: &Value) -> Option { + let findings = value_at(root, &["findings"])?.as_sequence()?; + for finding in findings { + let Some(alt) = finding + .as_mapping() + .and_then(|mapping| mapping.get(Value::String("alt".to_owned()))) + .and_then(Value::as_str) + .map(str::trim) + else { + continue; + }; + if !alt.is_empty() && alt != "*" { + return Some(alt.to_owned()); + } + } + None +} + +fn locus_from_root(root: &Value, assembly: &str) -> Result, String> { + let Some(mapping) = mapping_at(root, &["coordinates", assembly]) else { + return Ok(None); + }; + let chrom = mapping + .get(Value::String("chrom".to_owned())) + .and_then(Value::as_str) + .ok_or_else(|| format!("coordinates.{assembly}.chrom missing"))?; + let (start, end) = if let Some(pos) = i64_at_mapping(mapping, "pos") { + (pos, pos) + } else { + let start = i64_at_mapping(mapping, "start") + .ok_or_else(|| format!("coordinates.{assembly}.start missing"))?; + let end = i64_at_mapping(mapping, "end") + .ok_or_else(|| format!("coordinates.{assembly}.end missing"))?; + (start, end) + }; + Ok(Some(GenomicLocus { + chrom: chrom.to_owned(), + start, + end, + })) +} diff --git a/rust/bioscript-schema/src/validator/variant.rs b/rust/bioscript-schema/src/validator/variant.rs new file mode 100644 index 0000000..4a5ced5 --- /dev/null +++ b/rust/bioscript-schema/src/validator/variant.rs @@ -0,0 +1,447 @@ +use std::collections::BTreeSet; + +use serde_yaml::{Mapping, Value}; + +use super::{ + Issue, Severity, + common::{ + i64_at_mapping, mapping_at, require_path, scalar_at, seq_of_strings, + validate_optional_strings, 
validate_schema_and_identity, validate_tags, + validate_url_string, value_at, + }, +}; + +pub(crate) fn validate_variant_root(root: &Value, issues: &mut Vec) { + validate_schema_and_identity( + root, + "bioscript:variant:1.0", + Some("bioscript:variant"), + issues, + ); + validate_optional_strings(root, &["name", "label", "gene", "summary"], issues); + validate_tags(root, issues); + validate_identifiers(root, issues); + validate_coordinates(root, issues); + validate_alleles(root, issues); + validate_findings(root, issues); + validate_provenance(root, issues); + + let has_identifiers = value_at(root, &["identifiers"]) + .and_then(Value::as_mapping) + .is_some_and(|mapping| !mapping.is_empty()); + let has_coordinates = ["grch37", "grch38"] + .iter() + .any(|assembly| value_at(root, &["coordinates", assembly]).is_some()); + if !has_identifiers && !has_coordinates { + issues.push(Issue { + severity: Severity::Error, + path: "identifiers/coordinates".to_owned(), + message: "expected at least one identifier block or one coordinate block".to_owned(), + }); + } +} + +pub(crate) fn validate_identifiers(root: &Value, issues: &mut Vec) { + for field in ["rsids", "aliases"] { + let Some(values) = value_at(root, &["identifiers", field]) else { + continue; + }; + let Some(items) = values.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("identifiers.{field}"), + message: "expected a sequence of strings".to_owned(), + }); + continue; + }; + let mut seen = BTreeSet::new(); + for (idx, item) in items.iter().enumerate() { + let Some(value) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("identifiers.{field}[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + if !is_rsid(value) { + issues.push(Issue { + severity: Severity::Error, + path: format!("identifiers.{field}[{idx}]"), + message: format!("expected rsid like rs123, found '{value}'"), + }); + } + if 
!seen.insert(value.to_owned()) { + issues.push(Issue { + severity: Severity::Warning, + path: format!("identifiers.{field}[{idx}]"), + message: format!("duplicate identifier '{value}'"), + }); + } + } + } +} + +pub(crate) fn validate_coordinates(root: &Value, issues: &mut Vec) { + for assembly in ["grch37", "grch38"] { + let Some(coord) = mapping_at(root, &["coordinates", assembly]) else { + continue; + }; + + let Some(chrom) = coord + .get(Value::String("chrom".to_owned())) + .and_then(Value::as_str) + else { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.chrom"), + message: "missing chrom".to_owned(), + }); + continue; + }; + if !is_allowed_chromosome(chrom) { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.chrom"), + message: format!("invalid chromosome '{chrom}'; expected 1-22, X, Y, or MT"), + }); + } + + let has_pos = coord.contains_key(Value::String("pos".to_owned())); + let has_start = coord.contains_key(Value::String("start".to_owned())); + let has_end = coord.contains_key(Value::String("end".to_owned())); + if has_pos && (has_start || has_end) { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}"), + message: "use either pos or start/end, not both".to_owned(), + }); + continue; + } + if !(has_pos || has_start && has_end) { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}"), + message: "expected either pos or start/end".to_owned(), + }); + continue; + } + + if has_pos { + validate_coordinate_pos(coord, assembly, issues); + } else { + validate_coordinate_range(coord, assembly, issues); + } + } +} + +fn validate_coordinate_pos(coord: &Mapping, assembly: &str, issues: &mut Vec) { + if let Some(pos) = i64_at_mapping(coord, "pos") { + if pos < 1 { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.pos"), + message: "expected integer >= 1".to_owned(), + }); 
+ } + } else { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.pos"), + message: "expected integer".to_owned(), + }); + } +} + +fn validate_coordinate_range(coord: &Mapping, assembly: &str, issues: &mut Vec) { + let start = i64_at_mapping(coord, "start"); + let end = i64_at_mapping(coord, "end"); + match (start, end) { + (Some(start), Some(end)) => validate_coordinate_range_values(start, end, assembly, issues), + _ => issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}"), + message: "expected integer start/end".to_owned(), + }), + } +} + +fn validate_coordinate_range_values(start: i64, end: i64, assembly: &str, issues: &mut Vec) { + if start < 1 { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.start"), + message: "expected integer >= 1".to_owned(), + }); + } + if end < 1 { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.end"), + message: "expected integer >= 1".to_owned(), + }); + } + if end < start { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.end"), + message: "expected end >= start".to_owned(), + }); + } + if start == end { + issues.push(Issue { + severity: Severity::Warning, + path: format!("coordinates.{assembly}"), + message: "single-position coordinate uses start/end; prefer pos".to_owned(), + }); + } +} + +pub(crate) fn validate_alleles(root: &Value, issues: &mut Vec) { + require_path(root, &["alleles"], issues); + require_path(root, &["alleles", "kind"], issues); + require_path(root, &["alleles", "ref"], issues); + require_path(root, &["alleles", "alts"], issues); + + let Some(kind) = scalar_at(root, &["alleles", "kind"]) else { + return; + }; + if !matches!(kind.as_str(), "snv" | "deletion" | "insertion" | "indel") { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.kind".to_owned(), + message: "expected one of snv, deletion, 
insertion, indel".to_owned(), + }); + } + + if value_at(root, &["alleles", "canonical_alt"]).is_some() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.canonical_alt".to_owned(), + message: "canonical_alt is not part of the current schema".to_owned(), + }); + } + + let Some(reference) = scalar_at(root, &["alleles", "ref"]) else { + return; + }; + if reference.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.ref".to_owned(), + message: "empty string".to_owned(), + }); + } + + let Some(alts_value) = value_at(root, &["alleles", "alts"]) else { + return; + }; + let Some(alts_seq) = alts_value.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.alts".to_owned(), + message: "expected a non-empty sequence of strings".to_owned(), + }); + return; + }; + if alts_seq.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.alts".to_owned(), + message: "expected at least one alternate allele".to_owned(), + }); + return; + } + + let mut alts = Vec::new(); + for (idx, item) in alts_seq.iter().enumerate() { + let Some(alt) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + if alt.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: "empty string".to_owned(), + }); + continue; + } + alts.push(alt.to_owned()); + } + validate_symbolic_alleles(&reference, &alts, issues); + validate_snv_alleles(&kind, &reference, &alts, issues); +} + +fn validate_symbolic_alleles(reference: &str, alts: &[String], issues: &mut Vec) { + if reference == "I" || reference == "D" { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.ref".to_owned(), + message: "symbolic I/D alleles are not allowed in stored YAML; use biological alleles" + .to_owned(), + }); + } + for (idx, alt) 
in alts.iter().enumerate() { + if alt == "I" || alt == "D" { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: + "symbolic I/D alleles are not allowed in stored YAML; use biological alleles" + .to_owned(), + }); + } + } +} + +fn validate_snv_alleles(kind: &str, reference: &str, alts: &[String], issues: &mut Vec) { + if kind != "snv" { + return; + } + if !is_base_allele(reference) { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.ref".to_owned(), + message: "snv ref must be one of A/C/G/T".to_owned(), + }); + } + for (idx, alt) in alts.iter().enumerate() { + if !is_base_allele(alt) { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: "snv alt must be one of A/C/G/T".to_owned(), + }); + } + } +} + +fn validate_findings(root: &Value, issues: &mut Vec) { + let alts = seq_of_strings(root, &["alleles", "alts"]).unwrap_or_default(); + let Some(findings) = value_at(root, &["findings"]).and_then(Value::as_sequence) else { + return; + }; + + for (idx, finding) in findings.iter().enumerate() { + let Some(mapping) = finding.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + + let Some(schema) = mapping + .get(Value::String("schema".to_owned())) + .and_then(Value::as_str) + else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].schema"), + message: "missing schema".to_owned(), + }); + continue; + }; + if schema.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].schema"), + message: "empty string".to_owned(), + }); + } + if let Some(alt) = mapping + .get(Value::String("alt".to_owned())) + .and_then(Value::as_str) + && alt != "*" + && !alts.iter().any(|item| item == alt) + { + issues.push(Issue { + severity: Severity::Error, + path: 
format!("findings[{idx}].alt"), + message: format!("finding alt '{alt}' is not present in alleles.alts {alts:?}"), + }); + } + let has_summary = mapping + .get(Value::String("summary".to_owned())) + .and_then(Value::as_str) + .is_some_and(|value| !value.trim().is_empty()); + let has_notes = mapping + .get(Value::String("notes".to_owned())) + .and_then(Value::as_str) + .is_some_and(|value| !value.trim().is_empty()); + if !has_summary && !has_notes { + issues.push(Issue { + severity: Severity::Warning, + path: format!("findings[{idx}]"), + message: "finding has neither summary nor notes".to_owned(), + }); + } + } +} + +fn validate_provenance(root: &Value, issues: &mut Vec) { + let Some(sources) = value_at(root, &["provenance", "sources"]).and_then(Value::as_sequence) + else { + return; + }; + for (idx, source) in sources.iter().enumerate() { + let Some(mapping) = source.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + for field in ["kind", "label", "url"] { + match mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}].{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}].{field}"), + message: "missing required field".to_owned(), + }), + } + } + if let Some(url) = mapping + .get(Value::String("url".to_owned())) + .and_then(Value::as_str) + { + validate_url_string( + url, + &format!("provenance.sources[{idx}].url"), + false, + issues, + ); + } + } +} + +fn is_allowed_chromosome(value: &str) -> bool { + matches!(value, "X" | "Y" | "MT") + || value + .parse::() + .is_ok_and(|chrom| (1..=22).contains(&chrom)) +} + +fn is_base_allele(value: &str) -> bool { + 
matches!(value, "A" | "C" | "G" | "T") +} + +fn is_rsid(value: &str) -> bool { + value.starts_with("rs") && value[2..].chars().all(|ch| ch.is_ascii_digit()) +}