diff --git a/.gitmodules b/.gitmodules index 41b2bd7..d37d0a8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "monty"] path = monty - url = git@github.com:pydantic/monty.git + url = git@github.com:madhavajay/monty.git [submodule "noodles"] path = noodles url = git@github.com:madhavajay/noodles.git diff --git a/monty b/monty index 811ca2c..3c7b875 160000 --- a/monty +++ b/monty @@ -1 +1 @@ -Subproject commit 811ca2c2193154a6376d6bbc7246f51700eb429a +Subproject commit 3c7b8752ebd0e734572757a62dac4b5474ab0605 diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 86cd271..c2dbe5b 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -149,6 +149,7 @@ version = "0.1.0" dependencies = [ "bioscript-core", "bioscript-formats", + "getrandom 0.3.4", "monty", ] @@ -169,13 +170,19 @@ version = "0.1.0" dependencies = [ "bioscript-core", "bioscript-formats", + "bioscript-runtime", "bioscript-schema", "console_error_panic_hook", + "getrandom 0.3.4", "js-sys", + "monty", "noodles", "serde", "serde_json", + "serde_yaml", + "sha2", "wasm-bindgen", + "zip", ] [[package]] @@ -648,9 +655,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", + "js-sys", "libc", "r-efi", "wasip2", + "wasm-bindgen", ] [[package]] diff --git a/rust/bioscript-cli/src/cli_bootstrap.rs b/rust/bioscript-cli/src/cli_bootstrap.rs index b85e5a5..b0cf393 100644 --- a/rust/bioscript-cli/src/cli_bootstrap.rs +++ b/rust/bioscript-cli/src/cli_bootstrap.rs @@ -369,6 +369,7 @@ fn run_cli_script( RuntimeConfig { limits: options.limits, loader: options.loader, + ..RuntimeConfig::default() }, ) .map_err(|err| err.to_string())?; diff --git a/rust/bioscript-cli/src/report_execution.rs b/rust/bioscript-cli/src/report_execution.rs index 8bda914..b40578f 100644 --- a/rust/bioscript-cli/src/report_execution.rs +++ b/rust/bioscript-cli/src/report_execution.rs @@ -205,6 +205,7 @@ fn 
run_bioscript_analysis_script( RuntimeConfig { limits, loader: loader.clone(), + ..RuntimeConfig::default() }, ) .map_err(|err| err.to_string())?; diff --git a/rust/bioscript-ffi/src/run_file.rs b/rust/bioscript-ffi/src/run_file.rs index fe32e61..3142be1 100644 --- a/rust/bioscript-ffi/src/run_file.rs +++ b/rust/bioscript-ffi/src/run_file.rs @@ -35,8 +35,15 @@ pub fn run_file_request(request: RunFileRequest) -> Result, tabix::Index>, - label: &str, - variant: &VariantSpec, - locus: &bioscript_core::GenomicLocus, - assembly: Option, -) -> Result { - let query_pos = if matches!( - variant.kind, - Some(VariantKind::Deletion | VariantKind::Insertion | VariantKind::Indel) - ) { - locus.start.saturating_sub(1) - } else { - locus.start - }; - let locus_label = format!("{}:{query_pos}", locus.chrom); - let Some(seq_name) = resolve_vcf_chrom_name(indexed.index(), &locus.chrom) else { - return Ok(VariantObservation { - backend: "vcf".to_owned(), - matched_rsid: variant.rsids.first().cloned(), - assembly, - evidence: vec![format!( - "{label}: tabix index has no contig matching {} (tried chr-prefixed and bare forms)", - locus.chrom - )], - ..VariantObservation::default() - }); - }; - let pos_usize = usize::try_from(query_pos).map_err(|err| { - RuntimeError::Io(format!( - "{label}: invalid VCF position {query_pos} for {locus_label}: {err}" - )) - })?; - let position = Position::try_from(pos_usize).map_err(|err| { - RuntimeError::Io(format!( - "{label}: invalid VCF position {query_pos} for {locus_label}: {err}" - )) - })?; - let region = Region::new(seq_name.as_str(), position..=position); - let query = indexed.query(®ion).map_err(|err| { - RuntimeError::Io(format!("{label}: tabix query for {locus_label}: {err}")) - })?; - - let mut saw_any = false; - for record_result in query { - let record = record_result - .map_err(|err| RuntimeError::Io(format!("{label}: tabix record iter: {err}")))?; - let line: &str = record.as_ref(); - let Some(row) = parse_vcf_record(line)? 
else { - continue; - }; - saw_any = true; - if vcf_row_matches_variant(&row, variant, assembly) { - return Ok(VariantObservation { - backend: "vcf".to_owned(), - matched_rsid: variant.rsids.first().cloned().or_else(|| row.rsid.clone()), - assembly, - genotype: Some(row.genotype.clone()), - evidence: vec![format!("{label}: resolved by indexed locus {locus_label}")], - ..VariantObservation::default() - }); - } - } - - let evidence = if saw_any { - vec![format!( - "{label}: {locus_label} present but no record matched {}", - describe_query(variant) - )] - } else { - vec![format!("{label}: no VCF record at {locus_label}")] - }; - Ok(VariantObservation { - backend: "vcf".to_owned(), - matched_rsid: variant.rsids.first().cloned(), - assembly, - evidence, - ..VariantObservation::default() - }) -} - -fn resolve_vcf_chrom_name(index: &tabix::Index, user_chrom: &str) -> Option { - let header = index.header()?; - let names = header.reference_sequence_names(); - - let trimmed = user_chrom.trim(); - let stripped = trimmed.strip_prefix("chr").unwrap_or(trimmed); - - let candidates = [ - trimmed.to_owned(), - stripped.to_owned(), - format!("chr{stripped}"), - ]; - for cand in &candidates { - if names.contains(cand.as_bytes()) { - return Some(cand.clone()); - } - } - - let target = stripped.to_ascii_lowercase(); - for name in names { - let as_str = std::str::from_utf8(name.as_ref()).ok()?; - let as_stripped = as_str.strip_prefix("chr").unwrap_or(as_str); - if as_stripped.eq_ignore_ascii_case(&target) { - return Some(as_str.to_owned()); - } - } - None -} diff --git a/rust/bioscript-formats/src/genotype/vcf/reader.rs b/rust/bioscript-formats/src/genotype/vcf/reader.rs index 3bc2038..6244b14 100644 --- a/rust/bioscript-formats/src/genotype/vcf/reader.rs +++ b/rust/bioscript-formats/src/genotype/vcf/reader.rs @@ -5,9 +5,9 @@ use noodles::core::{Position, Region}; use noodles::csi::{self, BinningIndex}; use noodles::tabix; -use bioscript_core::{Assembly, GenomicLocus, RuntimeError, 
VariantObservation}; +use bioscript_core::{Assembly, GenomicLocus, RuntimeError, VariantObservation, VariantSpec}; -use super::parse_vcf_record; +use super::{parse_vcf_record, vcf_row_matches_variant}; /// Observe a SNP at `locus` over an already-built tabix-indexed bgzipped VCF /// reader. Caller builds `csi::io::IndexedReader::new(reader, tabix_index)` @@ -109,6 +109,97 @@ where }) } +pub fn observe_vcf_variant_with_reader( + indexed: &mut csi::io::IndexedReader, tabix::Index>, + label: &str, + locus: &GenomicLocus, + variant: &VariantSpec, + matched_rsid: Option, + assembly: Option, +) -> Result +where + R: Read + Seek, +{ + let locus_label = format!("{}:{}-{}", locus.chrom, locus.start, locus.end); + + let Some(seq_name) = resolve_vcf_chrom_name(indexed.index(), &locus.chrom) else { + return Ok(VariantObservation { + backend: "vcf".to_owned(), + matched_rsid, + assembly, + evidence: vec![format!( + "{label}: tabix index has no contig matching {} (tried chr-prefixed and bare forms)", + locus.chrom + )], + ..VariantObservation::default() + }); + }; + + let start = locus.start.saturating_sub(1).max(1); + let end = locus.end.max(locus.start).max(start); + let start_position = Position::try_from(usize::try_from(start).map_err(|err| { + RuntimeError::Io(format!( + "{label}: invalid VCF start position {start} for {locus_label}: {err}" + )) + })?) + .map_err(|err| { + RuntimeError::Io(format!( + "{label}: invalid VCF start position {start} for {locus_label}: {err}" + )) + })?; + let end_position = Position::try_from(usize::try_from(end).map_err(|err| { + RuntimeError::Io(format!( + "{label}: invalid VCF end position {end} for {locus_label}: {err}" + )) + })?) 
+ .map_err(|err| { + RuntimeError::Io(format!( + "{label}: invalid VCF end position {end} for {locus_label}: {err}" + )) + })?; + let region = Region::new(seq_name.as_str(), start_position..=end_position); + + let query = indexed.query(®ion).map_err(|err| { + RuntimeError::Io(format!("{label}: tabix query for {locus_label}: {err}")) + })?; + + let mut saw_any = false; + for record_result in query { + let record = record_result + .map_err(|err| RuntimeError::Io(format!("{label}: tabix record iter: {err}")))?; + let line: &str = record.as_ref(); + let Some(row) = parse_vcf_record(line)? else { + continue; + }; + saw_any = true; + if vcf_row_matches_variant(&row, variant, assembly) { + return Ok(VariantObservation { + backend: "vcf".to_owned(), + matched_rsid: matched_rsid.or_else(|| row.rsid.clone()), + assembly, + genotype: Some(row.genotype.clone()), + evidence: vec![format!("{label}: resolved by indexed locus {locus_label}")], + ..VariantObservation::default() + }); + } + } + + let evidence = if saw_any { + vec![format!( + "{label}: indexed region {locus_label} had records, but none matched query" + )] + } else { + vec![format!("{label}: no VCF record at {locus_label}")] + }; + Ok(VariantObservation { + backend: "vcf".to_owned(), + matched_rsid, + assembly, + evidence, + ..VariantObservation::default() + }) +} + fn resolve_vcf_chrom_name(index: &tabix::Index, user_chrom: &str) -> Option { let header = index.header()?; let names = header.reference_sequence_names(); diff --git a/rust/bioscript-runtime/Cargo.toml b/rust/bioscript-runtime/Cargo.toml index 652cf81..e93c2a5 100644 --- a/rust/bioscript-runtime/Cargo.toml +++ b/rust/bioscript-runtime/Cargo.toml @@ -6,6 +6,7 @@ edition = "2024" [dependencies] bioscript-core = { path = "../bioscript-core" } bioscript-formats = { path = "../bioscript-formats" } +getrandom = { version = "0.3", features = ["wasm_js"] } monty = { path = "../../monty/crates/monty" } [lints.clippy] diff --git 
a/rust/bioscript-runtime/src/runtime.rs b/rust/bioscript-runtime/src/runtime.rs index 069a067..a5e0e11 100644 --- a/rust/bioscript-runtime/src/runtime.rs +++ b/rust/bioscript-runtime/src/runtime.rs @@ -3,7 +3,7 @@ use std::{ fs, path::{Component, Path, PathBuf}, sync::Arc, - time::{Duration, Instant}, + time::Duration, }; use bioscript_core::RuntimeError; @@ -14,6 +14,7 @@ mod host_io; mod methods; mod objects; mod state; +mod timing; mod trace; mod variants; @@ -27,6 +28,7 @@ use objects::{ }; pub use state::{RuntimeConfig, StageTiming}; use state::{RuntimeState, monty_error}; +use timing::RuntimeInstant; #[cfg(test)] use trace::{ ends_with_unescaped_backslash, extract_coordinate, extract_rsid, update_nesting_depth, @@ -61,12 +63,18 @@ impl BioscriptRuntime { config: RuntimeConfig, ) -> Result { let root = root.into(); - let canonical_root = root.canonicalize().map_err(|err| { - RuntimeError::Io(format!( - "failed to canonicalize bioscript root {}: {err}", - root.display() - )) - })?; + let has_virtual_files = + !config.virtual_text_files.is_empty() || !config.virtual_binary_files.is_empty(); + let canonical_root = if has_virtual_files { + root + } else { + root.canonicalize().map_err(|err| { + RuntimeError::Io(format!( + "failed to canonicalize bioscript root {}: {err}", + root.display() + )) + })? 
+ }; let mut functions = BTreeMap::new(); functions.insert("read_text", host_read_text as HostFunction); @@ -97,14 +105,18 @@ impl BioscriptRuntime { trace_report_path: Option<&Path>, mut extra_inputs: Vec<(&str, MontyObject)>, ) -> Result { - let run_started = Instant::now(); + let run_started = RuntimeInstant::now(); let script_path = script_path.as_ref(); - let code = fs::read_to_string(script_path).map_err(|err| { - RuntimeError::Io(format!( - "failed to read script {}: {err}", - script_path.display() - )) - })?; + let code = if let Some(content) = self.read_virtual_text_file(script_path) { + content + } else { + fs::read_to_string(script_path).map_err(|err| { + RuntimeError::Io(format!( + "failed to read script {}: {err}", + script_path.display() + )) + })? + }; let instrumented = instrument_source(&code); self.state .trace_lines @@ -288,6 +300,9 @@ impl BioscriptRuntime { fn resolve_existing_user_path(&self, raw_path: &str) -> Result { let path = self.resolve_user_path(raw_path)?; + if self.virtual_file_exists(raw_path) { + return Ok(path); + } let canonical = path.canonicalize().map_err(|err| { RuntimeError::Io(format!("failed to resolve {}: {err}", path.display())) })?; @@ -297,6 +312,9 @@ impl BioscriptRuntime { fn resolve_user_write_path(&self, raw_path: &str) -> Result { let path = self.resolve_user_path(raw_path)?; + if self.uses_virtual_files() { + return Ok(path); + } if path.exists() { let canonical = path.canonicalize().map_err(|err| { RuntimeError::Io(format!("failed to resolve {}: {err}", path.display())) @@ -317,6 +335,72 @@ impl BioscriptRuntime { Ok(path) } + #[must_use] + pub fn virtual_written_text_files(&self) -> BTreeMap { + self.state + .virtual_written_text_files + .lock() + .expect("virtual file mutex poisoned") + .clone() + } + + pub(crate) fn read_virtual_text_file(&self, path: &Path) -> Option { + let key = self.virtual_key(path); + self.config + .virtual_text_files + .get(&key) + .cloned() + .or_else(|| { + self.state + 
.virtual_written_text_files + .lock() + .expect("virtual file mutex poisoned") + .get(&key) + .cloned() + }) + } + + pub(crate) fn write_virtual_text_file(&self, path: &Path, contents: String) -> bool { + if !self.uses_virtual_files() { + return false; + } + let key = self.virtual_key(path); + self.state + .virtual_written_text_files + .lock() + .expect("virtual file mutex poisoned") + .insert(key, contents); + true + } + + pub(crate) fn read_virtual_binary_file(&self, path: &Path) -> Option> { + let key = self.virtual_key(path); + self.config.virtual_binary_files.get(&key).cloned() + } + + fn uses_virtual_files(&self) -> bool { + !self.config.virtual_text_files.is_empty() || !self.config.virtual_binary_files.is_empty() + } + + fn virtual_file_exists(&self, raw_path: &str) -> bool { + self.config.virtual_text_files.contains_key(raw_path) + || self.config.virtual_binary_files.contains_key(raw_path) + || self + .state + .virtual_written_text_files + .lock() + .expect("virtual file mutex poisoned") + .contains_key(raw_path) + } + + fn virtual_key(&self, path: &Path) -> String { + path.strip_prefix(&self.root) + .unwrap_or(path) + .display() + .to_string() + .replace('\\', "/") + } + fn ensure_under_root(&self, path: &Path, raw_path: &str) -> Result<(), RuntimeError> { if path.starts_with(&self.root) { Ok(()) diff --git a/rust/bioscript-runtime/src/runtime/host_io.rs b/rust/bioscript-runtime/src/runtime/host_io.rs index 0934a7a..dc8e0d6 100644 --- a/rust/bioscript-runtime/src/runtime/host_io.rs +++ b/rust/bioscript-runtime/src/runtime/host_io.rs @@ -14,6 +14,9 @@ pub(crate) fn host_read_text( ) -> Result { reject_kwargs(kwargs, "read_text")?; let path = runtime.resolve_existing_user_path(&expect_string_arg(args, 0, "read_text")?)?; + if let Some(content) = runtime.read_virtual_text_file(&path) { + return Ok(MontyObject::String(content)); + } let content = read_text_limited(&path, MAX_HOST_TEXT_BYTES)?; Ok(MontyObject::String(content)) } @@ -31,6 +34,9 @@ pub(crate) fn 
host_write_text( "write_text content exceeds {MAX_HOST_TEXT_BYTES} bytes" ))); } + if runtime.write_virtual_text_file(&path, content.clone()) { + return Ok(MontyObject::None); + } if let Some(parent) = path.parent() { fs::create_dir_all(parent).map_err(|err| { RuntimeError::Io(format!( diff --git a/rust/bioscript-runtime/src/runtime/methods.rs b/rust/bioscript-runtime/src/runtime/methods.rs index 98a7d4c..89279ca 100644 --- a/rust/bioscript-runtime/src/runtime/methods.rs +++ b/rust/bioscript-runtime/src/runtime/methods.rs @@ -1,4 +1,4 @@ -use std::{fs, time::Instant}; +use std::fs; use bioscript_core::RuntimeError; use bioscript_formats::{GenotypeLoadOptions, GenotypeStore}; @@ -12,6 +12,7 @@ use super::{ genotype_file_object, variant_object, variant_observation_object, variant_plan_object, }, resolve_optional_loader_path, + timing::RuntimeInstant, variants::{ dataclass_handle_id, dataclass_to_variant_spec, variant_spec_from_kwargs, variant_specs_from_plan, @@ -24,7 +25,7 @@ impl BioscriptRuntime { args: &[MontyObject], kwargs: &[(MontyObject, MontyObject)], ) -> Result { - let started = Instant::now(); + let started = RuntimeInstant::now(); reject_kwargs(kwargs, "bioscript.load_genotypes")?; if args.len() != 2 { return Err(RuntimeError::InvalidArguments( @@ -37,7 +38,16 @@ impl BioscriptRuntime { "bioscript.load_genotypes", )?)?; let loader = self.resolved_loader_options()?; - let store = GenotypeStore::from_file_with_options(&path, &loader)?; + let store = if let Some(bytes) = self.read_virtual_binary_file(&path) { + GenotypeStore::from_bytes( + path.file_name() + .and_then(|value| value.to_str()) + .unwrap_or("input"), + &bytes, + )? + } else { + GenotypeStore::from_file_with_options(&path, &loader)? 
+ }; let handle = self.state.next_handle(); self.state .genotype_files @@ -123,7 +133,7 @@ impl BioscriptRuntime { args: &[MontyObject], kwargs: &[(MontyObject, MontyObject)], ) -> Result { - let started = Instant::now(); + let started = RuntimeInstant::now(); reject_kwargs(kwargs, "GenotypeFile.lookup_variant")?; if args.len() != 2 { return Err(RuntimeError::InvalidArguments( @@ -159,7 +169,7 @@ impl BioscriptRuntime { args: &[MontyObject], kwargs: &[(MontyObject, MontyObject)], ) -> Result { - let started = Instant::now(); + let started = RuntimeInstant::now(); reject_kwargs(kwargs, "GenotypeFile.lookup_variant_details")?; if args.len() != 2 { return Err(RuntimeError::InvalidArguments( @@ -192,7 +202,7 @@ impl BioscriptRuntime { args: &[MontyObject], kwargs: &[(MontyObject, MontyObject)], ) -> Result { - let started = Instant::now(); + let started = RuntimeInstant::now(); reject_kwargs(kwargs, "GenotypeFile.lookup_variants")?; if args.len() != 2 { return Err(RuntimeError::InvalidArguments( @@ -233,7 +243,7 @@ impl BioscriptRuntime { args: &[MontyObject], kwargs: &[(MontyObject, MontyObject)], ) -> Result { - let started = Instant::now(); + let started = RuntimeInstant::now(); reject_kwargs(kwargs, "GenotypeFile.lookup_variants_details")?; if args.len() != 2 { return Err(RuntimeError::InvalidArguments( @@ -271,7 +281,7 @@ impl BioscriptRuntime { args: &[MontyObject], kwargs: &[(MontyObject, MontyObject)], ) -> Result { - let started = Instant::now(); + let started = RuntimeInstant::now(); reject_kwargs(kwargs, "bioscript.write_tsv")?; if args.len() != 3 { return Err(RuntimeError::InvalidArguments( @@ -281,14 +291,6 @@ impl BioscriptRuntime { let path = self.resolve_user_write_path(&expect_string_arg(args, 1, "bioscript.write_tsv")?)?; let rows = expect_rows(&args[2])?; - if let Some(parent) = path.parent() { - fs::create_dir_all(parent).map_err(|err| { - RuntimeError::Io(format!( - "failed to create parent dir {}: {err}", - parent.display() - )) - })?; - } let mut 
output = String::new(); if let Some(first) = rows.first() { let headers: Vec = first.keys().cloned().collect(); @@ -303,6 +305,22 @@ impl BioscriptRuntime { output.push('\n'); } } + if self.write_virtual_text_file(&path, output.clone()) { + self.record_timing( + "write_tsv", + started.elapsed(), + format!("path={} rows={}", path.display(), rows.len()), + ); + return Ok(MontyObject::None); + } + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).map_err(|err| { + RuntimeError::Io(format!( + "failed to create parent dir {}: {err}", + parent.display() + )) + })?; + } fs::write(&path, output).map_err(|err| { RuntimeError::Io(format!("failed to write {}: {err}", path.display())) })?; diff --git a/rust/bioscript-runtime/src/runtime/state.rs b/rust/bioscript-runtime/src/runtime/state.rs index 3b7d2cc..76f8854 100644 --- a/rust/bioscript-runtime/src/runtime/state.rs +++ b/rust/bioscript-runtime/src/runtime/state.rs @@ -1,5 +1,5 @@ use std::{ - collections::HashMap, + collections::{BTreeMap, HashMap}, sync::{ Mutex, atomic::{AtomicU64, Ordering}, @@ -16,6 +16,8 @@ use bioscript_core::RuntimeError; pub struct RuntimeConfig { pub limits: ResourceLimits, pub loader: GenotypeLoadOptions, + pub virtual_binary_files: BTreeMap>, + pub virtual_text_files: BTreeMap, } impl Default for RuntimeConfig { @@ -29,6 +31,8 @@ impl Default for RuntimeConfig { Self { limits, loader: GenotypeLoadOptions::default(), + virtual_binary_files: BTreeMap::new(), + virtual_text_files: BTreeMap::new(), } } } @@ -49,6 +53,7 @@ pub(crate) struct RuntimeState { pub(crate) genotype_files: Mutex>, pub(crate) trace_lines: Mutex>, pub(crate) timings: Mutex>, + pub(crate) virtual_written_text_files: Mutex>, } impl RuntimeState { @@ -58,6 +63,7 @@ impl RuntimeState { genotype_files: Mutex::new(HashMap::new()), trace_lines: Mutex::new(Vec::new()), timings: Mutex::new(Vec::new()), + virtual_written_text_files: Mutex::new(BTreeMap::new()), } } diff --git 
a/rust/bioscript-runtime/src/runtime/timing.rs b/rust/bioscript-runtime/src/runtime/timing.rs new file mode 100644 index 0000000..8e0b2e4 --- /dev/null +++ b/rust/bioscript-runtime/src/runtime/timing.rs @@ -0,0 +1,29 @@ +use std::time::Duration; + +#[cfg(not(target_arch = "wasm32"))] +pub(crate) struct RuntimeInstant(std::time::Instant); + +#[cfg(not(target_arch = "wasm32"))] +impl RuntimeInstant { + pub(crate) fn now() -> Self { + Self(std::time::Instant::now()) + } + + pub(crate) fn elapsed(&self) -> Duration { + self.0.elapsed() + } +} + +#[cfg(target_arch = "wasm32")] +pub(crate) struct RuntimeInstant; + +#[cfg(target_arch = "wasm32")] +impl RuntimeInstant { + pub(crate) fn now() -> Self { + Self + } + + pub(crate) fn elapsed(&self) -> Duration { + Duration::ZERO + } +} diff --git a/rust/bioscript-runtime/tests/resources.rs b/rust/bioscript-runtime/tests/resources.rs index c9f9b9b..cc8337b 100644 --- a/rust/bioscript-runtime/tests/resources.rs +++ b/rust/bioscript-runtime/tests/resources.rs @@ -4,7 +4,6 @@ use std::{ time::{Duration, SystemTime, UNIX_EPOCH}, }; -use bioscript_formats::GenotypeLoadOptions; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig}; use monty::ResourceLimits; @@ -30,7 +29,7 @@ fn run_script(code: &str, limits: ResourceLimits) -> Result<(), String> { &dir, RuntimeConfig { limits, - loader: GenotypeLoadOptions::default(), + ..RuntimeConfig::default() }, ) .unwrap(); diff --git a/rust/bioscript-runtime/tests/resources_coverage.rs b/rust/bioscript-runtime/tests/resources_coverage.rs index 087ab4b..2d92996 100644 --- a/rust/bioscript-runtime/tests/resources_coverage.rs +++ b/rust/bioscript-runtime/tests/resources_coverage.rs @@ -5,7 +5,6 @@ use std::{ time::{Duration, SystemTime, UNIX_EPOCH}, }; -use bioscript_formats::GenotypeLoadOptions; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig}; use monty::ResourceLimits; @@ -34,7 +33,7 @@ fn run_script(code: &str, limits: ResourceLimits) -> Result<(), String> { &dir, RuntimeConfig { 
limits, - loader: GenotypeLoadOptions::default(), + ..RuntimeConfig::default() }, ) .unwrap(); diff --git a/rust/bioscript-schema/src/lib.rs b/rust/bioscript-schema/src/lib.rs index 3da1f4c..b2d70c1 100644 --- a/rust/bioscript-schema/src/lib.rs +++ b/rust/bioscript-schema/src/lib.rs @@ -7,7 +7,8 @@ pub use remote_resource::{ pub use validator::{ AssayManifest, Download, FileReport, Issue, PanelInterpretation, PanelInterpretationLogic, PanelInterpretationLogicSource, PanelManifest, PanelMember, Permissions, Severity, - ValidationReport, VariantManifest, load_assay_manifest, load_panel_manifest, - load_variant_manifest, load_variant_manifest_text, load_variant_manifest_text_for_lookup, - validate_assays_path, validate_panels_path, validate_variants_path, + ValidationReport, VariantManifest, load_assay_manifest, load_assay_manifest_text, + load_panel_manifest, load_panel_manifest_text, load_variant_manifest, + load_variant_manifest_text, load_variant_manifest_text_for_lookup, validate_assays_path, + validate_panels_path, validate_variants_path, }; diff --git a/rust/bioscript-schema/src/validator_load.rs b/rust/bioscript-schema/src/validator_load.rs index d34013e..776314f 100644 --- a/rust/bioscript-schema/src/validator_load.rs +++ b/rust/bioscript-schema/src/validator_load.rs @@ -143,24 +143,40 @@ fn variant_manifest_from_root(path: &Path, value: &Value) -> Result Result { let value = load_yaml(path)?; + panel_manifest_from_root(path, &value) +} + +/// Load a panel manifest from YAML text. +/// +/// # Errors +/// +/// Returns an error when the text cannot be parsed or converted into a valid +/// panel manifest shape. 
+pub fn load_panel_manifest_text(name: &str, text: &str) -> Result { + let value: Value = + serde_yaml::from_str(text).map_err(|err| format!("failed to parse YAML {name}: {err}"))?; + panel_manifest_from_root(Path::new(name), &value) +} + +fn panel_manifest_from_root(path: &Path, value: &Value) -> Result { let mut issues = Vec::new(); - validate_panel_root(&value, &mut issues); + validate_panel_root(value, &mut issues); if issues.iter().any(|issue| issue.severity == Severity::Error) { return Err(render_single_manifest_errors(path, &issues)); } let permissions = Permissions { - domains: seq_of_strings(&value, &["permissions", "domains"]).unwrap_or_default(), + domains: seq_of_strings(value, &["permissions", "domains"]).unwrap_or_default(), }; - let downloads = parse_downloads(&value)?; - let members = parse_panel_members(&value)?; - let interpretations = parse_panel_interpretations(&value)?; + let downloads = parse_downloads(value)?; + let members = parse_panel_members(value)?; + let interpretations = parse_panel_interpretations(value)?; Ok(PanelManifest { path: path.to_path_buf(), - name: required_non_empty_string(&value, &["name"])?, - label: scalar_at(&value, &["label"]), - tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), + name: required_non_empty_string(value, &["name"])?, + label: scalar_at(value, &["label"]), + tags: seq_of_strings(value, &["tags"]).unwrap_or_default(), permissions, downloads, members, @@ -182,18 +198,34 @@ pub fn load_panel_manifest(path: &Path) -> Result { /// valid assay manifest shape. pub fn load_assay_manifest(path: &Path) -> Result { let value = load_yaml(path)?; + assay_manifest_from_root(path, &value) +} + +/// Load an assay manifest from YAML text. +/// +/// # Errors +/// +/// Returns an error when the text cannot be parsed or converted into a valid +/// assay manifest shape. 
+pub fn load_assay_manifest_text(name: &str, text: &str) -> Result { + let value: Value = + serde_yaml::from_str(text).map_err(|err| format!("failed to parse YAML {name}: {err}"))?; + assay_manifest_from_root(Path::new(name), &value) +} + +fn assay_manifest_from_root(path: &Path, value: &Value) -> Result { let mut issues = Vec::new(); - validate_assay_root(&value, &mut issues); + validate_assay_root(value, &mut issues); if issues.iter().any(|issue| issue.severity == Severity::Error) { return Err(render_single_manifest_errors(path, &issues)); } Ok(AssayManifest { path: path.to_path_buf(), - name: required_non_empty_string(&value, &["name"])?, - tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), - members: parse_panel_members(&value)?, - interpretations: parse_panel_interpretations(&value)?, + name: required_non_empty_string(value, &["name"])?, + tags: seq_of_strings(value, &["tags"]).unwrap_or_default(), + members: parse_panel_members(value)?, + interpretations: parse_panel_interpretations(value)?, }) } diff --git a/rust/bioscript-wasm/Cargo.toml b/rust/bioscript-wasm/Cargo.toml index 57625d2..3dcee73 100644 --- a/rust/bioscript-wasm/Cargo.toml +++ b/rust/bioscript-wasm/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "bioscript-wasm" version = "0.1.0" -edition = "2021" +edition = "2024" [lib] crate-type = ["cdylib", "rlib"] @@ -9,12 +9,18 @@ crate-type = ["cdylib", "rlib"] [dependencies] bioscript-core = { path = "../bioscript-core" } bioscript-formats = { path = "../bioscript-formats" } +bioscript-runtime = { path = "../bioscript-runtime" } bioscript-schema = { path = "../bioscript-schema" } +getrandom = { version = "0.3", features = ["wasm_js"] } +monty = { path = "../../monty/crates/monty" } noodles = { version = "0.109.0", features = ["bgzf", "cram", "csi", "fasta", "tabix"] } wasm-bindgen = "0.2" js-sys = "0.3" serde = { version = "1", features = ["derive"] } serde_json = "1" +serde_yaml = "0.9" +sha2 = "0.10" +zip = { version = "2.2.0", default-features 
= false, features = ["deflate"] } console_error_panic_hook = { version = "0.1", optional = true } [features] diff --git a/rust/bioscript-wasm/src/inspect_api.rs b/rust/bioscript-wasm/src/inspect_api.rs index cbe67d6..2508d2c 100644 --- a/rust/bioscript-wasm/src/inspect_api.rs +++ b/rust/bioscript-wasm/src/inspect_api.rs @@ -13,6 +13,8 @@ struct InspectOptionsJs { input_index: Option, reference_file: Option, reference_index: Option, + #[serde(default, rename = "detectSex")] + detect_sex: bool, } /// Classify bytes as a known genomic file. Mirrors `bioscript-formats::inspect::inspect_bytes`. @@ -32,6 +34,7 @@ pub fn inspect_bytes( input_index: options_js.input_index.map(PathBuf::from), reference_file: options_js.reference_file.map(PathBuf::from), reference_index: options_js.reference_index.map(PathBuf::from), + detect_sex: options_js.detect_sex, }; let inspection = inspect_bytes_rs(name, bytes, &options) diff --git a/rust/bioscript-wasm/src/lib.rs b/rust/bioscript-wasm/src/lib.rs index b438a72..bf00f1b 100644 --- a/rust/bioscript-wasm/src/lib.rs +++ b/rust/bioscript-wasm/src/lib.rs @@ -17,6 +17,8 @@ mod inspect_api; mod js_reader; mod lookup_api; +mod package_api; +mod report_api; mod variant_yaml; pub use inspect_api::{inspect_bytes, resolve_remote_resource_text}; @@ -24,6 +26,10 @@ pub use lookup_api::{ lookup_cram_variants, lookup_genotype_bytes_rsids, lookup_genotype_bytes_variants, lookup_vcf_variants, }; +pub use package_api::{ + resolve_package_release_text, resolve_package_zip_bytes, verify_package_artifact_sha256, +}; +pub use report_api::run_package_report_bytes; pub use variant_yaml::compile_variant_yaml_text; #[wasm_bindgen::prelude::wasm_bindgen(start)] diff --git a/rust/bioscript-wasm/src/package_api.rs b/rust/bioscript-wasm/src/package_api.rs new file mode 100644 index 0000000..6d81ada --- /dev/null +++ b/rust/bioscript-wasm/src/package_api.rs @@ -0,0 +1,437 @@ +use std::{ + collections::BTreeSet, + io::{Cursor, Read}, + path::{Path, PathBuf}, +}; + 
+use bioscript_schema::{ + resolve_remote_resource_text, RemoteResourceKind, RemoteResourceResolution, +}; +use serde::Serialize; +use sha2::{Digest, Sha256}; +use wasm_bindgen::prelude::*; + +const PACKAGE_DESCRIPTOR: &str = "manifest.yaml"; +const LEGACY_PACKAGE_DESCRIPTOR: &str = "bioscript-package.yaml"; +const MAX_PACKAGE_FILES: usize = 1000; +const MAX_PACKAGE_FILE_BYTES: u64 = 16 * 1024 * 1024; +const MAX_PACKAGE_TOTAL_BYTES: u64 = 64 * 1024 * 1024; + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +struct PackageFileJs { + path: String, + contents: String, + source_url: String, +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +struct PackageResourceJs { + path: String, + contents: String, + resolution: RemoteResourceResolution, +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +struct PackageResolutionJs { + entrypoint: String, + files: Vec, + name: Option, + resources: Vec, +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +struct PackageReleaseJs { + artifact_sha256: Option, + artifact_size_bytes: Option, + artifact_url: String, + entrypoint: Option, + name: Option, + title: String, + version: Option, +} + +struct PackageDescriptor { + entrypoint: PathBuf, + name: Option, +} + +struct ExtractedPackageFile { + path: PathBuf, + contents: String, +} + +/// Resolve a BioScript package zip from bytes. +/// +/// This mirrors the CLI package importer enough for browser/mobile callers: +/// path safety, package size limits, descriptor/entrypoint discovery, and +/// resource classification all stay in Rust. 
+#[wasm_bindgen(js_name = resolvePackageZipBytes)] +pub fn resolve_package_zip_bytes( + source_url: &str, + name: &str, + bytes: &[u8], +) -> Result { + let files = extract_package_zip(name, bytes) + .map_err(|err| JsError::new(&format!("resolve package zip failed: {err}")))?; + let descriptor = load_package_descriptor(&files) + .map_err(|err| JsError::new(&format!("resolve package descriptor failed: {err}")))?; + let entrypoint = descriptor.entrypoint.display().to_string(); + let entry_file = files + .iter() + .find(|file| file.path == descriptor.entrypoint) + .ok_or_else(|| JsError::new(&format!("package entrypoint not found: {entrypoint}")))?; + let entry_resolution = resolve_remote_resource_text( + &package_member_url(source_url, &descriptor.entrypoint), + &entrypoint, + &entry_file.contents, + ) + .map_err(|err| JsError::new(&format!("resolve package entrypoint failed: {err}")))?; + match entry_resolution.kind { + RemoteResourceKind::Assay + | RemoteResourceKind::Panel + | RemoteResourceKind::Python + | RemoteResourceKind::Variant => {} + _ => { + return Err(JsError::new(&format!( + "package entrypoint has unsupported resource kind: {:?}", + entry_resolution.kind + ))); + } + } + + let mut resources = Vec::new(); + for file in &files { + if !is_resource_file(&file.path) { + continue; + } + let path = file.path.display().to_string(); + let resolution = resolve_remote_resource_text( + &package_member_url(source_url, &file.path), + &path, + &file.contents, + ) + .map_err(|err| JsError::new(&format!("resolve package member {path} failed: {err}")))?; + if matches!( + resolution.kind, + RemoteResourceKind::Assay + | RemoteResourceKind::Panel + | RemoteResourceKind::Python + | RemoteResourceKind::Variant + ) { + resources.push(PackageResourceJs { + path, + contents: file.contents.clone(), + resolution, + }); + } + } + + let files_js = files + .iter() + .map(|file| PackageFileJs { + path: file.path.display().to_string(), + contents: file.contents.clone(), + 
source_url: package_member_url(source_url, &file.path), + }) + .collect(); + + serde_json::to_string(&PackageResolutionJs { + entrypoint, + files: files_js, + name: descriptor.name, + resources, + }) + .map_err(|err| JsError::new(&format!("failed to encode package response: {err}"))) +} + +/// Resolve a BioScript package release YAML into the package zip artifact URL. +#[wasm_bindgen(js_name = resolvePackageReleaseText)] +pub fn resolve_package_release_text( + source_url: &str, + name: &str, + text: &str, +) -> Result { + let value: serde_yaml::Value = serde_yaml::from_str(text) + .map_err(|err| JsError::new(&format!("failed to parse package release {name}: {err}")))?; + let schema = yaml_string(&value, "schema"); + if schema.as_deref() != Some("bioscript:package-release:1.0") { + return Err(JsError::new(&format!( + "{name} is not a bioscript:package-release:1.0 manifest" + ))); + } + let artifact = value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("artifact".to_owned()))) + .and_then(serde_yaml::Value::as_mapping) + .ok_or_else(|| JsError::new(&format!("package release {name} is missing artifact")))?; + let artifact_path = artifact + .get(serde_yaml::Value::String("path".to_owned())) + .and_then(serde_yaml::Value::as_str); + let artifact_url = artifact + .get(serde_yaml::Value::String("url".to_owned())) + .and_then(serde_yaml::Value::as_str); + let artifact_url = if let Some(url) = artifact_url { + url.to_owned() + } else if let Some(relative) = artifact_path { + join_url(source_url, relative) + } else { + return Err(JsError::new(&format!( + "package release {name} artifact needs path or url" + ))); + }; + let artifact_sha256 = artifact + .get(serde_yaml::Value::String("sha256".to_owned())) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned); + let artifact_size_bytes = artifact + .get(serde_yaml::Value::String("size_bytes".to_owned())) + .and_then(serde_yaml::Value::as_u64); + let title = scalar_at(&value, "label") + 
.or_else(|| scalar_at(&value, "title")) + .or_else(|| scalar_at(&value, "name")) + .unwrap_or_else(|| name.to_owned()); + let release = PackageReleaseJs { + artifact_sha256, + artifact_size_bytes, + artifact_url, + entrypoint: scalar_at(&value, "entrypoint"), + name: scalar_at(&value, "name"), + title, + version: scalar_at(&value, "package_version").or_else(|| scalar_at(&value, "version")), + }; + serde_json::to_string(&release) + .map_err(|err| JsError::new(&format!("failed to encode package release: {err}"))) +} + +/// Verify package artifact bytes against a package-release sha256 value. +#[wasm_bindgen(js_name = verifyPackageArtifactSha256)] +pub fn verify_package_artifact_sha256( + name: &str, + bytes: &[u8], + expected: &str, +) -> Result<(), JsError> { + let actual = sha256_hex(bytes); + if actual != expected { + return Err(JsError::new(&format!( + "package artifact sha256 mismatch for {name}: expected {expected}, got {actual}" + ))); + } + Ok(()) +} + +fn extract_package_zip(name: &str, bytes: &[u8]) -> Result, String> { + let mut archive = zip::ZipArchive::new(Cursor::new(bytes)) + .map_err(|err| format!("failed to read package zip {name}: {err}"))?; + if archive.len() > MAX_PACKAGE_FILES { + return Err(format!( + "package has too many entries: {} > {MAX_PACKAGE_FILES}", + archive.len() + )); + } + + let mut seen = BTreeSet::new(); + let mut total_size = 0_u64; + let mut files = Vec::new(); + for idx in 0..archive.len() { + let mut entry = archive + .by_index(idx) + .map_err(|err| format!("failed to read package zip entry {idx}: {err}"))?; + let Some(enclosed) = entry.enclosed_name() else { + return Err(format!( + "package zip entry has unsafe path: {}", + entry.name() + )); + }; + let relative = checked_relative_package_path(&enclosed.to_string_lossy())?; + if entry.is_dir() { + continue; + } + if entry + .unix_mode() + .is_some_and(|mode| mode & 0o170_000 == 0o120_000) + { + return Err(format!("package zip entry is a symlink: {}", entry.name())); + } + if 
!is_allowed_package_file(&relative) { + return Err(format!( + "package zip entry has unsupported extension: {}", + relative.display() + )); + } + if !seen.insert(relative.clone()) { + return Err(format!( + "package zip contains duplicate path: {}", + relative.display() + )); + } + let size = entry.size(); + if size > MAX_PACKAGE_FILE_BYTES { + return Err(format!( + "package file {} exceeds {} bytes", + relative.display(), + MAX_PACKAGE_FILE_BYTES + )); + } + total_size = total_size.saturating_add(size); + if total_size > MAX_PACKAGE_TOTAL_BYTES { + return Err(format!( + "package contents exceed {MAX_PACKAGE_TOTAL_BYTES} bytes" + )); + } + let mut contents = String::new(); + entry + .read_to_string(&mut contents) + .map_err(|err| format!("failed to read package file {}: {err}", relative.display()))?; + files.push(ExtractedPackageFile { + path: relative, + contents, + }); + } + Ok(files) +} + +fn load_package_descriptor(files: &[ExtractedPackageFile]) -> Result { + for name in [PACKAGE_DESCRIPTOR, LEGACY_PACKAGE_DESCRIPTOR] { + if let Some(file) = files.iter().find(|file| file.path == Path::new(name)) { + let value: serde_yaml::Value = serde_yaml::from_str(&file.contents) + .map_err(|err| format!("failed to parse package descriptor {name}: {err}"))?; + let schema = value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("schema".to_owned()))) + .and_then(serde_yaml::Value::as_str) + .ok_or_else(|| format!("package descriptor {name} is missing schema"))?; + if matches!( + schema, + "bioscript:panel:1.0" + | "bioscript:assay:1.0" + | "bioscript:variant:1.0" + | "bioscript:variant" + ) { + let package_name = value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("name".to_owned()))) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned); + return Ok(PackageDescriptor { + entrypoint: PathBuf::from(name), + name: package_name, + }); + } + if schema != "bioscript:package:1.0" { + return Err(format!( + "package 
descriptor {name} has unsupported schema '{schema}'" + )); + } + let entrypoint = value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("entrypoint".to_owned()))) + .and_then(serde_yaml::Value::as_str) + .ok_or_else(|| format!("package descriptor {name} is missing entrypoint"))?; + let name = value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("name".to_owned()))) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned); + return Ok(PackageDescriptor { + entrypoint: checked_relative_package_path(entrypoint)?, + name, + }); + } + } + for candidate in ["panel.yaml", "assay.yaml", "variant.yaml"] { + if files.iter().any(|file| file.path == Path::new(candidate)) { + return Ok(PackageDescriptor { + entrypoint: PathBuf::from(candidate), + name: None, + }); + } + } + Err(format!( + "package does not contain {PACKAGE_DESCRIPTOR}, {LEGACY_PACKAGE_DESCRIPTOR}, panel.yaml, assay.yaml, or variant.yaml" + )) +} + +fn checked_relative_package_path(raw: &str) -> Result { + let path = Path::new(raw); + if path.is_absolute() { + return Err(format!("package path must be relative: {raw}")); + } + let mut out = PathBuf::new(); + for component in path.components() { + match component { + std::path::Component::Normal(part) => out.push(part), + std::path::Component::CurDir => {} + std::path::Component::ParentDir + | std::path::Component::RootDir + | std::path::Component::Prefix(_) => { + return Err(format!("package path escapes package root: {raw}")); + } + } + } + if out.as_os_str().is_empty() { + return Err("package path is empty".to_owned()); + } + Ok(out) +} + +fn is_allowed_package_file(path: &Path) -> bool { + path.file_name() + .and_then(|name| name.to_str()) + .is_some_and(|name| name == PACKAGE_DESCRIPTOR || name == LEGACY_PACKAGE_DESCRIPTOR) + || path + .extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| { + matches!( + ext.to_ascii_lowercase().as_str(), + "yaml" | "yml" | "py" | "md" | "txt" | 
"tsv" | "json" | "jsonl" + ) + }) +} + +fn is_resource_file(path: &Path) -> bool { + path.extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| matches!(ext.to_ascii_lowercase().as_str(), "yaml" | "yml" | "py")) +} + +fn package_member_url(source_url: &str, path: &Path) -> String { + format!( + "{}/{}", + source_url.trim_end_matches('/'), + path.display().to_string().replace('\\', "/") + ) +} + +fn yaml_string(value: &serde_yaml::Value, key: &str) -> Option { + value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String(key.to_owned()))) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) +} + +fn scalar_at(value: &serde_yaml::Value, key: &str) -> Option { + yaml_string(value, key) +} + +fn join_url(base_url: &str, relative: &str) -> String { + if relative.starts_with("https://") || relative.starts_with("http://") { + return relative.to_owned(); + } + let base = base_url.split('?').next().unwrap_or(base_url); + match base.rsplit_once('/') { + Some((prefix, _)) => format!("{prefix}/{relative}"), + None => relative.to_owned(), + } +} + +fn sha256_hex(bytes: &[u8]) -> String { + let mut digest = Sha256::new(); + digest.update(bytes); + format!("{:x}", digest.finalize()) +} diff --git a/rust/bioscript-wasm/src/report_api.rs b/rust/bioscript-wasm/src/report_api.rs new file mode 100644 index 0000000..682ef41 --- /dev/null +++ b/rust/bioscript-wasm/src/report_api.rs @@ -0,0 +1,167 @@ +use std::{ + collections::BTreeMap, + fmt::Write as _, + path::{Path, PathBuf}, + time::Duration, +}; + +use bioscript_core::{Assembly, OBSERVATION_TSV_HEADERS, VariantObservation}; +use bioscript_formats::{ + GenotypeLoadOptions, GenotypeStore, InferredSex, InspectOptions, SexDetectionConfidence, + SexInference, inspect_bytes as inspect_bytes_rs, +}; +use bioscript_runtime::{BioscriptRuntime, RuntimeConfig}; +use bioscript_schema::{ + AssayManifest, PanelInterpretation, PanelManifest, VariantManifest, load_assay_manifest_text, + 
load_panel_manifest_text, load_variant_manifest_text,
};
use monty::{MontyObject, ResourceLimits};
use serde::{Deserialize, Serialize};
use wasm_bindgen::prelude::*;

#[path = "report_render.rs"]
mod report_render;
#[path = "report_helpers.rs"]
mod report_helpers;
#[path = "report_workspace.rs"]
mod report_workspace;

use report_helpers::*;
use report_render::{app_report_json, match_app_findings, render_app_html_document, AppReportJsonInput};
use report_workspace::PackageWorkspace;

// Share the CLI report pipeline source directly so wasm output stays
// byte-compatible with the CLI without a new crate boundary.
include!("../../bioscript-cli/src/report_matching.rs");
include!("../../bioscript-cli/src/report_html_sections.rs");
include!("../../bioscript-cli/src/report_html_analysis.rs");
include!("../../bioscript-cli/src/report_html_provenance.rs");
include!("../../bioscript-cli/src/report_html_observations.rs");
include!("../../bioscript-cli/src/report_html_pgx.rs");
include!("../../bioscript-cli/src/report_html_helpers.rs");

/// One in-memory package member supplied by the JS caller.
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct PackageFileInput {
    path: String,
    contents: String,
    #[serde(default)]
    source_url: Option<String>,
}

#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct ReportOptionsInput {
    #[serde(default = "default_analysis_max_duration_ms")]
    analysis_max_duration_ms: u64,
    #[serde(default)]
    detect_sex: bool,
    #[serde(default)]
    filters: Vec<String>,
}

// Manual impl instead of `#[derive(Default)]`: the derive would zero
// `analysis_max_duration_ms` when no options JSON is supplied, while serde's
// field default yields 30 000 when parsing `{}` — both paths must agree.
impl Default for ReportOptionsInput {
    fn default() -> Self {
        Self {
            analysis_max_duration_ms: default_analysis_max_duration_ms(),
            detect_sex: false,
            filters: Vec::new(),
        }
    }
}

/// A single produced report artifact (TSV/JSONL/HTML) returned as text.
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct ReportArtifactOutput {
    name: String,
    path: String,
    mime_type: String,
    text: String,
}

#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct ReportRunOutput {
    artifacts: Vec<ReportArtifactOutput>,
    duration_ms: u128,
    text_output: String,
}

/// Run a full package report (observations, analyses, findings, HTML) over
/// raw genotype bytes, entirely in wasm.
///
/// * `manifest_path` — entrypoint manifest inside the package.
/// * `package_files_json` — JSON array of `PackageFileInput`.
/// * `input_name`/`input_bytes` — the participant's genotype file.
/// * `options_json` — optional JSON `ReportOptionsInput`.
///
/// Returns a JSON-encoded `ReportRunOutput`; all failures surface as `JsError`.
#[wasm_bindgen(js_name = runPackageReportBytes)]
pub fn run_package_report_bytes(
    manifest_path: &str,
    package_files_json: &str,
    input_name: &str,
    input_bytes: &[u8],
    options_json: Option<String>,
) -> Result<String, JsError> {
    // js_sys::Date is the wall clock available in both browser and node.
    let started_ms = js_sys::Date::now();
    let package_files: Vec<PackageFileInput> = serde_json::from_str(package_files_json)
        .map_err(|err| JsError::new(&format!("invalid package files JSON: {err}")))?;
    let options = match options_json {
        Some(text) if !text.is_empty() => serde_json::from_str(&text)
            .map_err(|err| JsError::new(&format!("invalid report options JSON: {err}")))?,
        _ => ReportOptionsInput::default(),
    };
    let workspace = PackageWorkspace::new(package_files)?;
    let participant_id = participant_id_from_name(input_name);
    let assay_id = app_assay_id(Path::new(manifest_path))?;
    let manifest_metadata = workspace.report_manifest_metadata(manifest_path)?;
    let findings = workspace.load_manifest_findings(manifest_path)?;
    let provenance = workspace.load_manifest_provenance_links(manifest_path)?;
    let inspect_options = InspectOptions {
        input_index: None,
        reference_file: None,
        reference_index: None,
        detect_sex: options.detect_sex,
    };
    let input_inspection = inspect_bytes_rs(input_name, input_bytes, &inspect_options)
        .map_err(|err| JsError::new(&format!("inspect input failed: {err:?}")))?;
    // Struct-update init instead of default-then-mutate.
    let loader = GenotypeLoadOptions {
        assembly: input_inspection.assembly,
        inferred_sex: input_inspection
            .inferred_sex
            .as_ref()
            .map(|inference| inference.sex),
        ..GenotypeLoadOptions::default()
    };
    let store = GenotypeStore::from_bytes(input_name, input_bytes)
        .map_err(|err| JsError::new(&format!("load genotypes failed: {err:?}")))?;
    let rows =
        workspace.run_manifest_rows(manifest_path, &store, &participant_id, &options.filters)?;
    let observations = rows
        .iter()
        .map(|row| {
            workspace.app_observation_from_manifest_row(
                row,
                &assay_id,
                input_inspection.inferred_sex.as_ref(),
                input_inspection.assembly,
            )
        })
        .collect::<Result<Vec<_>, _>>()?;
    let analyses = workspace.run_manifest_analyses(
        manifest_path,
        input_name,
        input_bytes,
        &participant_id,
        &loader,
        &options,
    )?;
    let matched_findings = match_app_findings(&findings, &observations, &analyses);
    let reports = vec![app_report_json(AppReportJsonInput {
        assay_id: &assay_id,
        participant_id: &participant_id,
        input_file_name: input_name,
        observations: &observations,
        analyses: &analyses,
        findings: &matched_findings,
        provenance: &provenance,
        input_inspection: Some(&input_inspection),
        manifest_metadata: &manifest_metadata,
    })];
    let observations_tsv = render_app_observations_tsv(&observations)?;
    let analysis_jsonl = render_jsonl(&analyses)?;
    let reports_jsonl = render_jsonl(&reports)?;
    let html = render_app_html_document(&observations, &reports)?;
    // Fixed manifest of the artifacts produced below (no interpolation needed).
    let text_output =
        "observations: observations.tsv\nanalysis: analysis.jsonl\nreports: reports.jsonl\nhtml: index.html\n"
            .to_owned();
    serde_json::to_string(&ReportRunOutput {
        artifacts: vec![
            artifact("observations.tsv", "text/tab-separated-values", observations_tsv),
            artifact("analysis.jsonl", "application/jsonl", analysis_jsonl),
            artifact("reports.jsonl", "application/jsonl", reports_jsonl),
            artifact("index.html", "text/html", html),
        ],
        duration_ms: (js_sys::Date::now() - started_ms).max(0.0) as u128,
        text_output,
    })
    .map_err(|err| JsError::new(&format!("failed to encode report output: {err}")))
}
diff --git a/rust/bioscript-wasm/src/report_helpers.rs b/rust/bioscript-wasm/src/report_helpers.rs new file mode 100644 index 0000000..dc91706 --- /dev/null +++ b/rust/bioscript-wasm/src/report_helpers.rs @@ -0,0 +1,365 @@ +use super::*;

/// Build one output artifact; `name` doubles as the relative path.
pub(super) fn artifact(name: &str, mime_type: &str, text: String) -> ReportArtifactOutput {
    ReportArtifactOutput {
        name: name.to_owned(),
        path: name.to_owned(),
        mime_type: mime_type.to_owned(),
        text,
    }
}

/// Flatten one variant observation into a string-keyed row for TSV output.
pub(super) fn variant_row(
    path: &str,
    name: &str,
    tags: &[String],
    observation: &VariantObservation,
    participant_id: &str,
) -> BTreeMap<String, String> {
    let mut row = BTreeMap::new();
    row.insert("kind".to_owned(), "variant".to_owned());
    row.insert("name".to_owned(), name.to_owned());
    row.insert("path".to_owned(),
path.to_owned());
    row.insert("tags".to_owned(), tags.join(","));
    row.insert("backend".to_owned(), observation.backend.clone());
    row.insert("participant_id".to_owned(), participant_id.to_owned());
    row.insert(
        "matched_rsid".to_owned(),
        observation.matched_rsid.clone().unwrap_or_default(),
    );
    row.insert(
        "assembly".to_owned(),
        observation.assembly.map(assembly_row_value).unwrap_or_default(),
    );
    row.insert(
        "genotype".to_owned(),
        observation.genotype.clone().unwrap_or_default(),
    );
    // Numeric fields render as empty strings when absent.
    row.insert(
        "ref_count".to_owned(),
        observation.ref_count.map_or_else(String::new, |value| value.to_string()),
    );
    row.insert(
        "alt_count".to_owned(),
        observation.alt_count.map_or_else(String::new, |value| value.to_string()),
    );
    row.insert(
        "depth".to_owned(),
        observation.depth.map_or_else(String::new, |value| value.to_string()),
    );
    row.insert(
        "raw_counts".to_owned(),
        serde_json::to_string(&observation.raw_counts).unwrap_or_default(),
    );
    row.insert("evidence".to_owned(), observation.evidence.join(" | "));
    row
}

/// Render observations as TSV using the canonical header order.
pub(super) fn render_app_observations_tsv(
    observations: &[serde_json::Value],
) -> Result<String, JsError> {
    let mut out = OBSERVATION_TSV_HEADERS.join("\t");
    out.push('\n');
    for observation in observations {
        let line = OBSERVATION_TSV_HEADERS
            .iter()
            .map(|header| json_field_as_tsv(observation.get(*header)))
            .collect::<Vec<_>>()
            .join("\t");
        out.push_str(&line);
        out.push('\n');
    }
    Ok(out)
}

/// Serialize each row as one JSON line.
pub(super) fn render_jsonl(rows: &[serde_json::Value]) -> Result<String, JsError> {
    let mut out = String::new();
    for row in rows {
        out.push_str(&serde_json::to_string(row).map_err(|err| JsError::new(&err.to_string()))?);
        out.push('\n');
    }
    Ok(out)
}

/// TSV-safe rendering of one JSON field: null/missing -> empty, strings as-is,
/// everything else JSON-encoded; tabs/newlines replaced with spaces.
pub(super) fn json_field_as_tsv(value: Option<&serde_json::Value>) -> String {
    match value {
        Some(serde_json::Value::Null) | None => String::new(),
        Some(serde_json::Value::String(value)) => value.replace(['\t', '\n'], " "),
        Some(value) => value.to_string().replace(['\t', '\n'], " "),
    }
}

/// Normalise a package-relative path (forward slashes, no `.`); rejects `..`,
/// roots, and prefixes.
pub(super) fn normalize_package_path(path: &str) -> Result<String, JsError> {
    let mut out = PathBuf::new();
    for component in Path::new(path).components() {
        match component {
            std::path::Component::Normal(value) => out.push(value),
            std::path::Component::CurDir => {}
            _ => return Err(JsError::new(&format!("unsafe package path: {path}"))),
        }
    }
    Ok(out.display().to_string().replace('\\', "/"))
}

/// serde default for `ReportOptionsInput::analysis_max_duration_ms`.
pub(super) fn default_analysis_max_duration_ms() -> u64 {
    30_000
}

/// Derive a participant id from the input file name (stem, whitespace -> `_`).
pub(super) fn participant_id_from_name(path: &str) -> String {
    Path::new(path)
        .file_stem()
        .and_then(|value| value.to_str())
        .unwrap_or(path)
        .replace([' ', '\t', '\n'], "_")
}

/// Derive an assay id from the manifest file stem.
pub(super) fn app_assay_id(path: &Path) -> Result<String, JsError> {
    path.file_stem()
        .and_then(|value| value.to_str())
        .map(ToOwned::to_owned)
        .ok_or_else(|| JsError::new(&format!("failed to derive assay id from {}", path.display())))
}

/// All filters must match; unknown filter keys (or missing `=`) match nothing.
pub(super) fn matches_filters(manifest: &VariantManifest, path: &str, filters: &[String]) -> bool {
    filters.iter().all(|filter| match filter.split_once('=') {
        Some(("kind", value)) => value == "variant",
        Some(("name", value)) => manifest.name.contains(value),
        Some(("path", value)) => path.contains(value),
        Some(("tag", value)) => manifest.tags.iter().any(|tag| tag == value),
        Some(_) | None => false,
    })
}

/// Parse analysis output text into (rows, headers) for `tsv`, `json`, or
/// `jsonl`; JSON objects may wrap rows under a top-level `rows` key.
pub(super) fn parse_analysis_output_text(
    text: &str,
    format: &str,
) -> Result<(Vec<serde_json::Value>, Vec<String>), JsError> {
    match format {
        "tsv" => Ok(parse_analysis_tsv(text)),
        "json" => {
            let value: serde_json::Value = serde_json::from_str(text)
                .map_err(|err| JsError::new(&format!("failed to parse analysis JSON: {err}")))?;
            let rows = match value {
                serde_json::Value::Array(rows) => rows,
                serde_json::Value::Object(mut object) => object
                    .remove("rows")
                    .and_then(|rows| rows.as_array().cloned())
                    .unwrap_or_else(|| vec![serde_json::Value::Object(object)]),
                other => vec![other],
            };
            // Headers come from the first object-shaped row.
            let row_headers = rows
                .iter()
                .find_map(|row| row.as_object())
                .map(|object| object.keys().cloned().collect())
                .unwrap_or_default();
            Ok((rows, row_headers))
        }
        "jsonl" => {
            let mut rows: Vec<serde_json::Value> = Vec::new();
            for line in text.lines().filter(|line| !line.trim().is_empty()) {
                rows.push(serde_json::from_str(line).map_err(|err| {
                    JsError::new(&format!("failed to parse analysis JSONL: {err}"))
                })?);
            }
            let row_headers = rows
                .iter()
                .find_map(|row| row.as_object())
                .map(|object| object.keys().cloned().collect())
                .unwrap_or_default();
            Ok((rows, row_headers))
        }
        other => Err(JsError::new(&format!(
            "unsupported analysis output_format '{other}'"
        ))),
    }
}

/// TSV -> (rows-as-objects, headers); first line is the header row.
fn parse_analysis_tsv(text: &str) -> (Vec<serde_json::Value>, Vec<String>) {
    let mut lines = text.lines();
    let headers = lines
        .next()
        .map(|line| line.split('\t').map(ToOwned::to_owned).collect::<Vec<_>>())
        .unwrap_or_default();
    let rows = lines
        .filter(|line| !line.trim().is_empty())
        .map(|line| {
            let fields = line.split('\t').collect::<Vec<_>>();
            let object = headers
                .iter()
                .enumerate()
                .map(|(index, header)| {
                    (
                        header.clone(),
                        serde_json::Value::String(
                            fields.get(index).copied().unwrap_or_default().to_owned(),
                        ),
                    )
                })
                .collect();
            serde_json::Value::Object(object)
        })
        .collect();
    (rows, headers)
}

pub(super) fn yaml_to_json(value: serde_yaml::Value) -> Result<serde_json::Value, JsError> {
    serde_json::to_value(value)
        .map_err(|err| JsError::new(&format!("failed to convert YAML to JSON: {err}")))
}

/// Gather `provenance.sources[]` and the top-level `source`, keyed by URL;
/// first entry per URL wins.
pub(super) fn collect_manifest_provenance_entries(
    value: &serde_yaml::Value,
    links: &mut BTreeMap<String, serde_json::Value>,
) -> Result<(), JsError> {
    if let Some(sources) = value
        .get("provenance")
        .and_then(|provenance| provenance.get("sources"))
        .and_then(serde_yaml::Value::as_sequence)
    {
        for source in sources {
            let json = yaml_to_json(source.clone())?;
            if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) {
                links.entry(url.to_owned()).or_insert(json);
            }
        }
    }
    if let Some(source) = value.get("source") {
        let json = yaml_to_json(source.clone())?;
        if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) {
            links.entry(url.to_owned()).or_insert(json);
        }
    }
    Ok(())
}

/// Debug view of a `FileInspection` for embedding in the report JSON.
pub(super) fn input_inspection_json(
    inspection: &bioscript_formats::FileInspection,
) -> serde_json::Value {
    serde_json::json!({
        "container": match inspection.container {
            bioscript_formats::FileContainer::Plain => "plain",
            bioscript_formats::FileContainer::Zip => "zip",
        },
        "format": match inspection.detected_kind {
            bioscript_formats::DetectedKind::GenotypeText => "genotype_text",
            bioscript_formats::DetectedKind::Vcf => "vcf",
            bioscript_formats::DetectedKind::AlignmentCram => "alignment_cram",
            bioscript_formats::DetectedKind::AlignmentBam => "alignment_bam",
            bioscript_formats::DetectedKind::ReferenceFasta => "reference_fasta",
            bioscript_formats::DetectedKind::Unknown => "unknown",
        },
        "format_confidence": match inspection.confidence {
            bioscript_formats::DetectionConfidence::Authoritative => "authoritative",
            bioscript_formats::DetectionConfidence::StrongHeuristic => "strong_heuristic",
            bioscript_formats::DetectionConfidence::WeakHeuristic => "weak_heuristic",
            bioscript_formats::DetectionConfidence::Unknown => "unknown",
        },
        "assembly": inspection.assembly.map(|assembly| match assembly {
            Assembly::Grch37 => "grch37",
            Assembly::Grch38 => "grch38",
        }),
        "selected_entry": inspection.selected_entry,
        "source": inspection.source.as_ref().map(|source| serde_json::json!({
            "vendor": source.vendor,
            "platform_version": source.platform_version,
            "evidence": source.evidence,
        })),
        "inferred_sex": inspection.inferred_sex.as_ref().map(|sex| serde_json::json!({
            "sex": inferred_sex_name(sex.sex),
            "confidence": sex_detection_confidence_name(sex.confidence),
            "method": sex.method,
            "evidence": sex.evidence,
        })),
        "evidence": inspection.evidence,
        "warnings": inspection.warnings,
        "duration_ms": inspection.duration_ms,
    })
}

/// String at a top-level YAML key (via `Value::get`, so non-mappings yield None).
pub(super) fn yaml_string(value: &serde_yaml::Value, key: &str) -> Option<String> {
    value
        .get(key)
        .and_then(serde_yaml::Value::as_str)
        .map(ToOwned::to_owned)
}

/// String sequence at a top-level YAML key, as JSON string values.
pub(super) fn yaml_string_sequence(value: &serde_yaml::Value, key: &str) -> Vec<serde_json::Value> {
    value
        .get(key)
        .and_then(serde_yaml::Value::as_sequence)
        .map(|items| {
            items
                .iter()
                .filter_map(serde_yaml::Value::as_str)
                .map(serde_json::Value::from)
                .collect()
        })
        .unwrap_or_default()
}

/// String value at a key of a YAML mapping.
pub(super) fn yaml_mapping_string(mapping: &serde_yaml::Mapping, key: &str) -> Option<String> {
    mapping
        .get(serde_yaml::Value::String(key.to_owned()))
        .and_then(serde_yaml::Value::as_str)
        .map(ToOwned::to_owned)
}

/// Best primary-source link for a variant manifest: prefer a dbSNP link built
/// from the first rsid, otherwise the first provenance link, else null.
pub(super) fn variant_primary_source_from_yaml(value: &serde_yaml::Value) -> serde_json::Value {
    let mut links = BTreeMap::<String, serde_json::Value>::new();
    let _ = collect_manifest_provenance_entries(value, &mut links);
    if let Some(rsid) = value
        .get("identifiers")
        .and_then(|identifiers| identifiers.get("rsids"))
        .and_then(serde_yaml::Value::as_sequence)
        .and_then(|items| items.iter().find_map(serde_yaml::Value::as_str))
    {
        return serde_json::json!({
            "kind": "database",
            "label": "dbSNP / NCBI SNP",
            "url": format!("https://www.ncbi.nlm.nih.gov/snp/{rsid}"),
            "fields": ["identifiers.rsids"],
        });
    }
    links.into_values().next().unwrap_or(serde_json::Value::Null)
}

/// Normalise a display genotype to (genotype, call class).
///
/// Only single-base ref/alt alleles are classified; a confident-male sex
/// chromosome with a doubled allele is reported hemizygous ("0"/"1"),
/// otherwise two alleles classify as hom_ref / het / hom_alt.
pub(super) fn normalize_app_genotype(
    display: &str,
    ref_allele: &str,
    alt_allele: &str,
    chrom: &str,
    inferred_sex: Option<&SexInference>,
) -> (String, String) {
    if display.is_empty() {
        return ("./.".to_owned(), "unknown".to_owned());
    }
    let alleles: Vec<char> = display.chars().filter(char::is_ascii_alphabetic).collect();
    if ref_allele.len() != 1 || alt_allele.len() != 1 {
        return (display.to_owned(), "unknown".to_owned());
    }
    let ref_ch = ref_allele.chars().next().unwrap_or_default();
    let alt_ch = alt_allele.chars().next().unwrap_or_default();
    if is_confident_male_sex_chromosome(chrom, inferred_sex)
        && alleles.len() == 2
        && alleles[0] == alleles[1]
    {
        let allele = alleles[0];
        if allele == ref_ch {
            return ("0".to_owned(), "hem_ref".to_owned());
        }
        if allele == alt_ch {
            return ("1".to_owned(), "hem_alt".to_owned());
        }
    }
    if alleles.len() != 2 {
        return (display.to_owned(), "unknown".to_owned());
    }
    let alt_count = alleles.iter().filter(|allele| **allele == alt_ch).count();
    let ref_count = alleles.iter().filter(|allele| **allele == ref_ch).count();
    match (ref_count, alt_count) {
        (2, 0) => ("0/0".to_owned(), "hom_ref".to_owned()),
        (1, 1) => ("0/1".to_owned(), "het".to_owned()),
        (0, 2) => ("1/1".to_owned(), "hom_alt".to_owned()),
        _ => (display.to_owned(), "unknown".to_owned()),
    }
}

/// True when `chrom` is X/Y (or 23/24) AND sex inference says male with at
/// least medium confidence.
fn is_confident_male_sex_chromosome(chrom: &str, inferred_sex: Option<&SexInference>) -> bool {
    matches!(
        chrom.trim().trim_start_matches("chr").to_ascii_uppercase().as_str(),
        "X" | "Y" | "23" | "24"
    ) && inferred_sex.is_some_and(|sex| {
        sex.sex == InferredSex::Male
            && matches!(
                sex.confidence,
                SexDetectionConfidence::High | SexDetectionConfidence::Medium
            )
    })
}

pub(super) fn assembly_row_value(assembly: Assembly) -> String {
    match assembly {
        Assembly::Grch37 => "grch37".to_owned(),
        Assembly::Grch38 => "grch38".to_owned(),
    }
}

fn inferred_sex_name(value: InferredSex) -> &'static str {
    match value {
        InferredSex::Male => "male",
        InferredSex::Female => "female",
        InferredSex::Unknown => "unknown",
    }
}

fn sex_detection_confidence_name(value: SexDetectionConfidence) -> &'static str {
    match value {
        SexDetectionConfidence::High => "high",
        SexDetectionConfidence::Medium => "medium",
        SexDetectionConfidence::Low => "low",
    }
}

/// Parse an optional string field as u32; unparsable values become None.
pub(super) fn parse_optional_u32(value: Option<&String>) -> Option<u32> {
    value.and_then(|value| value.parse::<u32>().ok())
}
diff --git a/rust/bioscript-wasm/src/report_render.rs b/rust/bioscript-wasm/src/report_render.rs new file mode 100644 index 0000000..dc8f914 --- /dev/null +++ b/rust/bioscript-wasm/src/report_render.rs @@ -0,0 +1,146 @@ +use super::*;
/// Borrowed inputs for assembling one report JSON document.
#[derive(Clone, Copy)]
pub(super) struct AppReportJsonInput<'a> {
    pub(super) assay_id: &'a str,
    pub(super) participant_id: &'a str,
    pub(super) input_file_name: &'a str,
    pub(super) observations: &'a [serde_json::Value],
    pub(super) analyses: &'a [serde_json::Value],
    pub(super) findings: &'a [serde_json::Value],
    pub(super) provenance: &'a [serde_json::Value],
    pub(super) input_inspection: Option<&'a bioscript_formats::FileInspection>,
    pub(super) manifest_metadata: &'a serde_json::Value,
}

/// Assemble the `bioscript:report:1.0` JSON document.
///
/// `report_status` is "complete" when every observation has
/// `call_status == "called"`, otherwise "partial".
pub(super) fn app_report_json(input: AppReportJsonInput<'_>) -> serde_json::Value {
    let called = input
        .observations
        .iter()
        .filter(|item| {
            item.get("call_status").and_then(serde_json::Value::as_str) == Some("called")
        })
        .count();
    serde_json::json!({
        "schema": "bioscript:report:1.0",
        "version": "1.0",
        "participant_id": input.participant_id,
        "assay_id": input.assay_id,
        "assay_version": "1.0",
        "manifest": input.manifest_metadata,
        "input": {
            "file_name": input.input_file_name,
            "file_path": input.input_file_name,
            "debug": input.input_inspection.map(input_inspection_json),
        },
        "report_status": if called == input.observations.len() { "complete" } else { "partial" },
        "derived_from": input.observations.iter().filter_map(|item| item.get("variant_key").cloned()).collect::<Vec<_>>(),
        "analyses": input.analyses,
        "findings": input.findings,
        "provenance": input.provenance,
        "metrics": {
            "n_sites_tested": input.observations.len(),
            "n_sites_called": called,
            "n_sites_missing": input.observations.len().saturating_sub(called),
            "n_analyses": input.analyses.len(),
            "n_findings_matched": input.findings.len(),
        }
    })
}

/// Match findings against observations and analyses.
///
/// Findings with an `effects` array are matched per effect (the matched copy
/// drops `effects` and records `matched_effect`); findings without one are
/// matched directly. An observation match is preferred over an analysis
/// match; duplicates are suppressed via `app_finding_dedupe_key`.
pub(super) fn match_app_findings(
    findings: &[serde_json::Value],
    observations: &[serde_json::Value],
    analyses: &[serde_json::Value],
) -> Vec<serde_json::Value> {
    let mut matched = Vec::new();
    let mut seen = std::collections::BTreeSet::new();
    // Observation match wins; fall back to analysis match.
    let annotate = |target: &serde_json::Value| {
        if let Some(observation) = app_finding_match_observation(target, observations) {
            Some(("matched_observation", app_finding_observation_context(observation)))
        } else {
            app_finding_match_analysis(target, analyses)
                .map(|analysis| ("matched_analysis", analysis))
        }
    };
    let mut first_time = |item: &serde_json::Value| seen.insert(app_finding_dedupe_key(item));
    for finding in findings {
        if let Some(effects) = finding.get("effects").and_then(serde_json::Value::as_array) {
            for effect in effects {
                let Some((key, value)) = annotate(effect) else {
                    continue;
                };
                let mut item = finding.clone();
                if let Some(object) = item.as_object_mut() {
                    object.remove("effects");
                    object.insert("matched".to_owned(), serde_json::Value::Bool(true));
                    object.insert("matched_effect".to_owned(), effect.clone());
                    object.insert(key.to_owned(), value);
                }
                if first_time(&item) {
                    matched.push(item);
                }
            }
        } else if let Some((key, value)) = annotate(finding) {
            let mut item = finding.clone();
            if let Some(object) = item.as_object_mut() {
                object.insert("matched".to_owned(), serde_json::Value::Bool(true));
                object.insert(key.to_owned(), value);
            }
            if first_time(&item) {
                matched.push(item);
            }
        }
    }
    matched
}

pub(super) fn render_app_html_document(
    observations: &[serde_json::Value],
    reports: &[serde_json::Value],
+) -> Result { + let mut out = String::from( + r##"BioScript report
"##, + ); + let label_findings = collect_report_findings(reports, "bioscript:pgx-label:1.0"); + let summary_findings = collect_report_findings(reports, "bioscript:pgx-summary:1.0"); + let analysis_outputs = collect_report_analyses(reports); + let participants = collect_report_participants(reports); + render_report_manifest_header(&mut out, reports); + let _ = write!(out, "
{} observation(s), {} analysis output(s), {} PGx label finding(s), {} PGx summary finding(s)
", observations.len(), analysis_outputs.len(), label_findings.len(), summary_findings.len()); + render_participant_filter(&mut out, &participants); + out.push_str(""); + out.push_str("

Input

"); + render_input_debug(&mut out, reports, participants.len() > 1); + out.push_str("

Observations

"); + render_observation_table(&mut out, observations, participants.len() > 1); + out.push_str("

Analysis

"); + render_analysis_tables(&mut out, &analysis_outputs, observations, participants.len() > 1); + out.push_str("

PGx

"); + render_pgx_table(&mut out, &label_findings, &summary_findings); + out.push_str("

Provenance

"); + render_provenance_links(&mut out, reports); + out.push_str("

Source

"); + render_report_source_section(&mut out, reports); + out.push_str("

Raw Reports JSON

Show raw report JSON"); + for report in reports { + let text = serde_json::to_string_pretty(report).map_err(|err| JsError::new(&err.to_string()))?; + let _ = write!(out, "
{}
", html_escape(&text)); + } + out.push_str("
"); + Ok(out) +} diff --git a/rust/bioscript-wasm/src/report_workspace.rs b/rust/bioscript-wasm/src/report_workspace.rs new file mode 100644 index 0000000..66d5c7e --- /dev/null +++ b/rust/bioscript-wasm/src/report_workspace.rs @@ -0,0 +1,500 @@ +use super::*; + +pub(super) struct PackageWorkspace { + files: BTreeMap, +} + +impl PackageWorkspace { + pub(super) fn new(files: Vec) -> Result { + let mut map = BTreeMap::new(); + for file in files { + let _ = file.source_url; + map.insert(normalize_package_path(&file.path)?, file.contents); + } + Ok(Self { files: map }) + } + + fn text(&self, path: &str) -> Result<&str, JsError> { + let normalized = normalize_package_path(path)?; + self.files + .get(&normalized) + .map(String::as_str) + .ok_or_else(|| JsError::new(&format!("package file not found: {normalized}"))) + } + + fn yaml(&self, path: &str) -> Result { + serde_yaml::from_str(self.text(path)?) + .map_err(|err| JsError::new(&format!("failed to parse YAML {path}: {err}"))) + } + + fn schema(&self, path: &str) -> Result { + self.yaml(path)? + .get("schema") + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) + .ok_or_else(|| JsError::new(&format!("{path} is missing schema"))) + } + + fn resolve(&self, base: &str, relative: &str) -> Result { + let base = Path::new(base).parent().unwrap_or_else(|| Path::new("")); + normalize_package_path(&base.join(relative).display().to_string()) + } + + fn load_variant(&self, path: &str) -> Result { + load_variant_manifest_text(path, self.text(path)?) + .map_err(|err| JsError::new(&format!("load variant {path}: {err}"))) + } + + fn load_panel(&self, path: &str) -> Result { + load_panel_manifest_text(path, self.text(path)?) + .map_err(|err| JsError::new(&format!("load panel {path}: {err}"))) + } + + fn load_assay(&self, path: &str) -> Result { + load_assay_manifest_text(path, self.text(path)?) 
+ .map_err(|err| JsError::new(&format!("load assay {path}: {err}"))) + } + + pub(super) fn run_manifest_rows( + &self, + manifest_path: &str, + store: &GenotypeStore, + participant_id: &str, + filters: &[String], + ) -> Result>, JsError> { + match self.schema(manifest_path)?.as_str() { + "bioscript:variant:1.0" | "bioscript:variant" => { + let manifest = self.load_variant(manifest_path)?; + let observation = store + .lookup_variant(&manifest.spec) + .map_err(|err| JsError::new(&format!("lookup {}: {err:?}", manifest.name)))?; + Ok(vec![variant_row( + manifest_path, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )]) + } + "bioscript:panel:1.0" => self.run_panel_rows(manifest_path, store, participant_id, filters), + "bioscript:assay:1.0" => self.run_assay_rows(manifest_path, store, participant_id, filters), + other => Err(JsError::new(&format!("unsupported manifest schema '{other}'"))), + } + } + + fn run_panel_rows( + &self, + manifest_path: &str, + store: &GenotypeStore, + participant_id: &str, + filters: &[String], + ) -> Result>, JsError> { + let panel = self.load_panel(manifest_path)?; + let mut rows_by_member: Vec>> = vec![Vec::new(); panel.members.len()]; + let mut variants = Vec::<(usize, String, VariantManifest)>::new(); + for (index, member) in panel.members.iter().enumerate() { + let Some(path) = &member.path else { + return Err(JsError::new("remote panel members are not executable yet")); + }; + let resolved = self.resolve(manifest_path, path)?; + if member.kind == "variant" { + let variant = self.load_variant(&resolved)?; + if matches_filters(&variant, &resolved, filters) { + variants.push((index, resolved, variant)); + } + } else if member.kind == "assay" { + rows_by_member[index] = self.run_assay_rows(&resolved, store, participant_id, filters)?; + } + } + let specs = variants + .iter() + .map(|(_, _, manifest)| manifest.spec.clone()) + .collect::>(); + let observations = store + .lookup_variants(&specs) + .map_err(|err| 
JsError::new(&format!("panel lookup failed: {err:?}")))?; + for ((member_index, resolved, manifest), observation) in variants.into_iter().zip(observations) { + rows_by_member[member_index].push(variant_row( + &resolved, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )); + } + Ok(rows_by_member.into_iter().flatten().collect()) + } + + fn run_assay_rows( + &self, + manifest_path: &str, + store: &GenotypeStore, + participant_id: &str, + filters: &[String], + ) -> Result>, JsError> { + let assay = self.load_assay(manifest_path)?; + let mut variants = Vec::<(String, VariantManifest)>::new(); + for member in &assay.members { + if member.kind != "variant" { + continue; + } + let Some(path) = &member.path else { + continue; + }; + let resolved = self.resolve(manifest_path, path)?; + let variant = self.load_variant(&resolved)?; + if matches_filters(&variant, &resolved, filters) { + variants.push((resolved, variant)); + } + } + let specs = variants + .iter() + .map(|(_, manifest)| manifest.spec.clone()) + .collect::>(); + let observations = store + .lookup_variants(&specs) + .map_err(|err| JsError::new(&format!("assay lookup failed: {err:?}")))?; + Ok(variants + .into_iter() + .zip(observations) + .map(|((resolved, manifest), observation)| { + variant_row( + &resolved, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + ) + }) + .collect()) + } + + pub(super) fn run_manifest_analyses( + &self, + manifest_path: &str, + input_name: &str, + input_bytes: &[u8], + participant_id: &str, + loader: &GenotypeLoadOptions, + options: &ReportOptionsInput, + ) -> Result, JsError> { + match self.schema(manifest_path)?.as_str() { + "bioscript:panel:1.0" => { + let panel = self.load_panel(manifest_path)?; + let mut analyses = self.run_interpretations( + manifest_path, + &panel.name, + &panel.interpretations, + input_name, + input_bytes, + participant_id, + loader, + options, + )?; + for member in &panel.members { + if member.kind != "assay" { + 
continue; + } + let Some(path) = &member.path else { + continue; + }; + let resolved = self.resolve(manifest_path, path)?; + analyses.extend(self.run_manifest_analyses( + &resolved, + input_name, + input_bytes, + participant_id, + loader, + options, + )?); + } + Ok(analyses) + } + "bioscript:assay:1.0" => { + let assay = self.load_assay(manifest_path)?; + self.run_interpretations( + manifest_path, + &assay.name, + &assay.interpretations, + input_name, + input_bytes, + participant_id, + loader, + options, + ) + } + _ => Ok(Vec::new()), + } + } + + #[allow(clippy::too_many_arguments)] + fn run_interpretations( + &self, + manifest_path: &str, + manifest_name: &str, + interpretations: &[PanelInterpretation], + input_name: &str, + input_bytes: &[u8], + participant_id: &str, + loader: &GenotypeLoadOptions, + options: &ReportOptionsInput, + ) -> Result, JsError> { + let mut outputs = Vec::new(); + for interpretation in interpretations { + if interpretation.kind != "bioscript" { + return Err(JsError::new(&format!( + "analysis '{}' uses unsupported kind '{}'", + interpretation.id, interpretation.kind + ))); + } + let script_path = self.resolve(manifest_path, &interpretation.path)?; + let output_file = format!( + "analysis/{participant_id}/{}.{}", + interpretation.id, + interpretation.output_format.as_deref().unwrap_or("json") + ); + let mut virtual_text_files = self.files.clone(); + let mut virtual_binary_files = BTreeMap::new(); + virtual_binary_files.insert(input_name.to_owned(), input_bytes.to_vec()); + let limits = ResourceLimits::new() + .max_duration(Duration::from_millis(options.analysis_max_duration_ms)) + .max_memory(16 * 1024 * 1024) + .max_allocations(400_000) + .gc_interval(1000) + .max_recursion_depth(Some(200)); + let runtime = BioscriptRuntime::with_config( + PathBuf::new(), + RuntimeConfig { + limits, + loader: loader.clone(), + virtual_binary_files, + virtual_text_files: std::mem::take(&mut virtual_text_files), + }, + ) + .map_err(|err| 
JsError::new(&format!("create analysis runtime failed: {err:?}")))?; + runtime + .run_file( + &script_path, + None, + vec![ + ("input_file", MontyObject::String(input_name.to_owned())), + ("output_file", MontyObject::String(output_file.clone())), + ("participant_id", MontyObject::String(participant_id.to_owned())), + ], + ) + .map_err(|err| JsError::new(&format!("analysis {} failed: {err:?}", interpretation.id)))?; + let written = runtime.virtual_written_text_files(); + let text = written + .get(&output_file) + .ok_or_else(|| JsError::new(&format!("analysis {} did not write {output_file}", interpretation.id)))?; + let format = interpretation + .output_format + .as_deref() + .unwrap_or("json") + .to_ascii_lowercase(); + let (rows, row_headers) = parse_analysis_output_text(text, &format)?; + outputs.push(serde_json::json!({ + "schema": "bioscript:analysis-output:1.0", + "version": "1.0", + "participant_id": participant_id, + "assay_id": manifest_name, + "analysis_id": interpretation.id, + "analysis_label": interpretation.label, + "kind": interpretation.kind, + "output_format": format, + "manifest_path": manifest_path, + "script_path": script_path, + "output_file": output_file, + "derived_from": interpretation.derived_from, + "emits": interpretation.emits.iter().map(|emit| serde_json::json!({ + "key": emit.key, + "label": emit.label, + "value_type": emit.value_type, + "format": emit.format, + })).collect::>(), + "logic": interpretation.logic.as_ref().map(|logic| serde_json::json!({ + "description": logic.description, + "source": logic.source.as_ref().map(|source| serde_json::json!({ + "name": source.name, + "url": source.url, + })), + })), + "row_headers": row_headers, + "rows": rows, + })); + } + Ok(outputs) + } + + pub(super) fn app_observation_from_manifest_row( + &self, + row: &BTreeMap, + assay_id: &str, + inferred_sex: Option<&SexInference>, + fallback_assembly: Option, + ) -> Result { + let row_path = row.get("path").cloned().unwrap_or_default(); + let manifest 
= self.load_variant(&row_path)?; + let value = self.yaml(&row_path)?; + let gene = yaml_string(&value, "gene").unwrap_or_default(); + let ref_allele = manifest.spec.reference.clone().unwrap_or_default(); + let alt_allele = manifest.spec.alternate.clone().unwrap_or_default(); + let genotype_display = row.get("genotype").cloned().unwrap_or_default(); + let assembly = row + .get("assembly") + .filter(|value| !value.is_empty()) + .cloned() + .or_else(|| fallback_assembly.map(assembly_row_value)) + .unwrap_or_default(); + let locus = if assembly.eq_ignore_ascii_case("grch37") { + manifest.spec.grch37.as_ref() + } else { + manifest.spec.grch38.as_ref().or(manifest.spec.grch37.as_ref()) + }; + let chrom = locus.map_or(String::new(), |locus| locus.chrom.clone()); + let (genotype, zygosity) = + normalize_app_genotype(&genotype_display, &ref_allele, &alt_allele, &chrom, inferred_sex); + let outcome = if genotype == "./." { + "no_call" + } else if zygosity == "hom_ref" || zygosity == "hem_ref" { + "reference" + } else if zygosity == "het" || zygosity == "hom_alt" || zygosity == "hem_alt" { + "variant" + } else { + "unknown" + }; + Ok(serde_json::json!({ + "participant_id": row.get("participant_id").cloned().unwrap_or_default(), + "assay_id": assay_id, + "assay_version": "1.0", + "variant_key": manifest.name, + "variant_path": row_path, + "rsid": row.get("matched_rsid").filter(|value| !value.is_empty()).cloned().or_else(|| manifest.spec.rsids.first().cloned()), + "gene": gene, + "assembly": if assembly.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(assembly.to_uppercase()) }, + "chrom": chrom, + "pos_start": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.start)), + "pos_end": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.end)), + "ref": ref_allele, + "alt": alt_allele, + "kind": manifest.spec.kind.map_or("unknown".to_owned(), |kind| format!("{kind:?}").to_lowercase()), + "match_status": if 
row.get("matched_rsid").is_some_and(|value| !value.is_empty()) || !genotype_display.is_empty() { "found" } else { "not_found" }, + "coverage_status": "covered", + "call_status": if genotype == "./." { "no_call" } else { "called" }, + "genotype": genotype, + "genotype_display": genotype_display, + "zygosity": zygosity, + "ref_count": parse_optional_u32(row.get("ref_count")), + "alt_count": parse_optional_u32(row.get("alt_count")), + "depth": parse_optional_u32(row.get("depth")), + "genotype_quality": serde_json::Value::Null, + "allele_balance": serde_json::Value::Null, + "outcome": outcome, + "evidence_type": "genotype_file", + "evidence_raw": row.get("evidence").cloned().unwrap_or_default(), + "source": variant_primary_source_from_yaml(&value), + "facets": serde_json::Value::Null, + })) + } + + pub(super) fn report_manifest_metadata(&self, path: &str) -> Result { + let value = self.yaml(path)?; + let members = value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + .map(|items| { + items + .iter() + .filter_map(serde_yaml::Value::as_mapping) + .map(|mapping| { + serde_json::json!({ + "kind": yaml_mapping_string(mapping, "kind"), + "path": yaml_mapping_string(mapping, "path"), + "version": yaml_mapping_string(mapping, "version"), + }) + }) + .collect::>() + }) + .unwrap_or_default(); + Ok(serde_json::json!({ + "schema": yaml_string(&value, "schema"), + "version": yaml_string(&value, "version"), + "name": yaml_string(&value, "name"), + "label": yaml_string(&value, "label").or_else(|| yaml_string(&value, "name")), + "tags": yaml_string_sequence(&value, "tags"), + "members": members, + })) + } + + pub(super) fn load_manifest_findings(&self, path: &str) -> Result, JsError> { + let value = self.yaml(path)?; + let schema = yaml_string(&value, "schema").unwrap_or_default(); + let mut findings = Vec::new(); + if matches!( + schema.as_str(), + "bioscript:variant:1.0" | "bioscript:variant" | "bioscript:assay:1.0" | "bioscript:panel:1.0" | 
"bioscript:pgx-findings:1.0" + ) { + if let Some(items) = value.get("findings").and_then(serde_yaml::Value::as_sequence) { + for item in items { + let json_item = yaml_to_json(item.clone())?; + if let Some(include) = json_item.get("include").and_then(serde_json::Value::as_str) { + let include_path = self.resolve(path, include)?; + let mut included = self.load_manifest_findings(&include_path)?; + let inherited_binding = json_item.get("binding").cloned(); + for included_item in &mut included { + if inherited_binding.is_some() + && included_item.get("binding").is_none() + && included_item.get("effects").is_none() + { + if let Some(object) = included_item.as_object_mut() { + object.insert( + "binding".to_owned(), + inherited_binding.clone().unwrap_or(serde_json::Value::Null), + ); + } + } + } + findings.extend(included); + } else { + findings.push(json_item); + } + } + } + } + if matches!(schema.as_str(), "bioscript:assay:1.0" | "bioscript:panel:1.0") { + if let Some(items) = value.get("members").and_then(serde_yaml::Value::as_sequence) { + for member in items { + let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { continue }; + if !matches!(kind, "variant" | "assay") { continue; } + let Some(member_path) = member.get("path").and_then(serde_yaml::Value::as_str) else { continue }; + let resolved = self.resolve(path, member_path)?; + findings.extend(self.load_manifest_findings(&resolved)?); + } + } + } + Ok(findings) + } + + pub(super) fn load_manifest_provenance_links(&self, path: &str) -> Result, JsError> { + let value = self.yaml(path)?; + let schema = yaml_string(&value, "schema").unwrap_or_default(); + let mut links = BTreeMap::::new(); + collect_manifest_provenance_entries(&value, &mut links)?; + if matches!(schema.as_str(), "bioscript:assay:1.0" | "bioscript:panel:1.0") { + if let Some(items) = value.get("members").and_then(serde_yaml::Value::as_sequence) { + for member in items { + let Some(kind) = 
member.get("kind").and_then(serde_yaml::Value::as_str) else { continue }; + if !matches!(kind, "variant" | "assay") { continue; } + let Some(member_path) = member.get("path").and_then(serde_yaml::Value::as_str) else { continue }; + let resolved = self.resolve(path, member_path)?; + for item in self.load_manifest_provenance_links(&resolved)? { + if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(item); + } + } + } + } + } + Ok(links.into_values().collect()) + } +}