Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/variant-schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ coordinates:
grch37:
chrom: "12"
pos: 112241766
assembly_ref: "G"
grch38:
chrom: "12"
pos: 111803962
assembly_ref: "G"

alleles:
kind: "snv"
Expand All @@ -57,6 +59,12 @@ At least one of these must also exist:
- `identifiers`
- `coordinates`

`coordinates.<assembly>.assembly_ref` is optional for SNVs. Use it when an
older assembly's reference base differs from the canonical `alleles.ref` used
by the catalogue. VCF inputs are matched against the assembly reference base
and then translated back to the canonical `alleles.ref` / `alleles.alts`
representation before findings are evaluated.

## Top-Level `tags`

Optional free-form classification tags for filtering and compatibility hints.
Expand Down
2 changes: 2 additions & 0 deletions docs/variant-schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@ coordinates:
grch37:
chrom: "1"
pos: 12345
assembly_ref: "A"
grch38:
chrom: "1"
start: 12345
end: 12346
assembly_ref: "G"

alleles:
kind: "snv"
Expand Down
2 changes: 1 addition & 1 deletion rust/bioscript-cli/src/cli_bootstrap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ fn run_cli() -> Result<(), String> {
Ok(())
}

const USAGE: &str = "usage: bioscript <script.py|manifest.yaml|package.yaml|package.zip|https://.../package.yaml|https://.../package.zip> [--root <dir>] [--input-file <path>] [--output-file <path>] [--participant-id <id>] [--trace-report <path>] [--timing-report <path>] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index <path>] [--reference-file <path>] [--reference-index <path>] [--auto-index] [--cache-dir <path>] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report <manifest.yaml|package.yaml|package.zip|https://.../package.yaml|https://.../package.zip> --input-file <path> [--input-file <path>...] --output-dir <dir> [--html] [--open] [--root <dir>] [--input-format auto|text|zip|vcf|cram] [--detect-sex] [--sample-sex male|female|unknown] [--analysis-max-duration-ms N]\n bioscript review <manifest.yaml|package.yaml|package.zip> --cases <cases.yaml> --output-dir <dir> [--html] [--root <dir>] [--filter key=value]\n bioscript import-package <package.yaml|package.zip|https://.../package.yaml|https://.../package.zip> [--root <dir>] [--output-dir <dir>]\n bioscript validate-variants <path> [--report <file>]\n bioscript validate-panels <path> [--report <file>]\n bioscript validate-assays <path> [--report <file>]\n bioscript prepare [--root <dir>] [--input-file <path>] [--reference-file <path>] [--input-format auto|text|zip|vcf|cram] [--cache-dir <path>]\n bioscript inspect <path> [--input-index <path>] [--reference-file <path>] [--reference-index <path>] [--detect-sex]";
const USAGE: &str = "usage: bioscript <script.py|manifest.yaml|package.yaml|package.zip|https://.../package.yaml|https://.../package.zip> [--root <dir>] [--input-file <path>] [--output-file <path>] [--participant-id <id>] [--trace-report <path>] [--timing-report <path>] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index <path>] [--reference-file <path>] [--reference-index <path>] [--auto-index] [--cache-dir <path>] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report <manifest.yaml|package.yaml|package.zip|https://.../package.yaml|https://.../package.zip> --input-file <path> [--input-file <path>...] --output-dir <dir> [--html] [--open] [--root <dir>] [--input-format auto|text|zip|vcf|cram] [--input-index <path>] [--reference-file <path>] [--reference-index <path>] [--allow-md5-mismatch] [--detect-sex] [--sample-sex male|female|unknown] [--analysis-max-duration-ms N]\n bioscript review <manifest.yaml|package.yaml|package.zip> --cases <cases.yaml> --output-dir <dir> [--html] [--root <dir>] [--filter key=value]\n bioscript import-package <package.yaml|package.zip|https://.../package.yaml|https://.../package.zip> [--root <dir>] [--output-dir <dir>]\n bioscript validate-variants <path> [--report <file>]\n bioscript validate-panels <path> [--report <file>]\n bioscript validate-assays <path> [--report <file>]\n bioscript prepare [--root <dir>] [--input-file <path>] [--reference-file <path>] [--input-format auto|text|zip|vcf|cram] [--cache-dir <path>]\n bioscript inspect <path> [--input-index <path>] [--reference-file <path>] [--reference-index <path>] [--detect-sex]";

struct CliOptions {
script_path: Option<PathBuf>,
Expand Down
1 change: 1 addition & 0 deletions rust/bioscript-cli/src/report_options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ impl AppReportCliState {
self.loader.reference_index =
Some(PathBuf::from(next_arg(iter, "--reference-index")?));
}
"--allow-md5-mismatch" => self.loader.allow_reference_md5_mismatch = true,
value if value.starts_with('-') => return Err(format!("unexpected argument: {value}")),
value => self.consume_path(value),
}
Expand Down
11 changes: 11 additions & 0 deletions rust/bioscript-core/src/variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ pub struct VariantSpec {
pub rsids: Vec<String>,
pub grch37: Option<GenomicLocus>,
pub grch38: Option<GenomicLocus>,
pub grch37_assembly_ref: Option<String>,
pub grch38_assembly_ref: Option<String>,
pub reference: Option<String>,
pub alternate: Option<String>,
pub kind: Option<VariantKind>,
Expand Down Expand Up @@ -58,6 +60,15 @@ impl VariantSpec {
pub fn has_coordinates(&self) -> bool {
self.grch37.is_some() || self.grch38.is_some()
}

#[must_use]
pub fn assembly_reference(&self, assembly: Option<Assembly>) -> Option<&str> {
match assembly {
Some(Assembly::Grch37) => self.grch37_assembly_ref.as_deref(),
Some(Assembly::Grch38) => self.grch38_assembly_ref.as_deref(),
None => None,
}
}
}

#[cfg(test)]
Expand Down
20 changes: 20 additions & 0 deletions rust/bioscript-formats/src/alignment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,26 @@ where
cram_stream::for_each_cram_record_with_reader_inner(reader, label, locus, true, on_record)
}

pub(crate) fn for_each_cram_record_with_reader_allow_md5_mismatch<R, F>(
reader: &mut cram::io::indexed_reader::IndexedReader<R>,
label: &str,
locus: &GenomicLocus,
allow_reference_md5_mismatch: bool,
on_record: F,
) -> Result<(), RuntimeError>
where
R: Read + Seek,
F: FnMut(AlignmentRecord) -> Result<bool, RuntimeError>,
{
cram_stream::for_each_cram_record_with_reader_inner(
reader,
label,
locus,
allow_reference_md5_mismatch,
on_record,
)
}

/// Iterate raw CRAM records intersecting `locus`, streaming from an
/// already-built CRAM `IndexedReader`. The raw variant preserves the
/// `cram::Record` handle so callers can pull base+quality at a specific
Expand Down
35 changes: 17 additions & 18 deletions rust/bioscript-formats/src/alignment/cram_stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ where
let reference_sequence_repository = reader.reference_sequence_repository().clone();

let mut stop = false;
let mut callback_err: Option<RuntimeError> = None;

for (index, slice_result) in container.slices().enumerate() {
let slice = slice_result.map_err(|err| {
Expand All @@ -256,38 +257,36 @@ where
))
})?;

let records = slice.records(
let decode_result = slice.records_while(
reference_sequence_repository.clone(),
header,
&compression_header,
&core_data_src,
&external_data_srcs,
!allow_reference_md5_mismatch,
|record| {
Ok(handle_decoded_cram_record(
label,
record,
interval,
locus_end,
&mut stop,
&mut callback_err,
on_record,
))
},
);

match records {
Ok(records) => {
let mut callback_err: Option<RuntimeError> = None;
for record in &records {
if !handle_decoded_cram_record(
label,
record,
interval,
locus_end,
&mut stop,
&mut callback_err,
on_record,
) {
break;
}
}
match decode_result {
Ok(()) => {
if let Some(err) = callback_err {
return Err(err);
}
}
Err(err) if allow_reference_md5_mismatch && is_reference_md5_mismatch(&err) => {
eprintln!(
"[bioscript] warning: CRAM reference MD5 mismatch for {label} slice landmark {landmark} — \
this noodles version cannot retry without checksum validation. \
decoding continued with checksum validation disabled for this slice. \
Details: {err}"
);
}
Expand Down
70 changes: 70 additions & 0 deletions rust/bioscript-formats/src/genotype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,8 @@ mod tests {
rsids: vec!["rs1".to_owned()],
grch37: Some(locus("1", 10, 10)),
grch38: Some(locus("2", 20, 20)),
grch37_assembly_ref: None,
grch38_assembly_ref: None,
reference: Some("A".to_owned()),
alternate: Some("G".to_owned()),
kind: Some(VariantKind::Snp),
Expand Down Expand Up @@ -831,6 +833,22 @@ mod tests {
..VariantSpec::default()
};
assert!(vcf_row_matches_variant(&row, &snp, Some(Assembly::Grch38)));
let assembly_ref_snp = VariantSpec {
grch37: Some(locus("1", 10, 10)),
grch37_assembly_ref: Some("G".to_owned()),
reference: Some("A".to_owned()),
alternate: Some("G".to_owned()),
kind: Some(VariantKind::Snp),
..VariantSpec::default()
};
let inverted_row = parse_vcf_record("1\t10\trs10\tG\tA\t.\tPASS\t.\tGT\t1/1")
.unwrap()
.unwrap();
assert!(vcf_row_matches_variant(
&inverted_row,
&assembly_ref_snp,
Some(Assembly::Grch37)
));
let deletion_row = parse_vcf_record("1\t9\trsdel\tATC\tA\t.\tPASS\t.\tGT\t0/1")
.unwrap()
.unwrap();
Expand Down Expand Up @@ -880,6 +898,58 @@ mod tests {
));
}

#[test]
fn vcf_scan_translates_assembly_ref_snv_rows_and_missing_reference() {
let dir = temp_dir("vcf-assembly-ref");
let vcf_path = dir.join("sample.grch37.vcf");
fs::write(
&vcf_path,
"##fileformat=VCFv4.3\n\
##reference=GRCh37\n\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\
1\t10\trs10\tT\tC,A\t.\tPASS\t.\tGT\t1/1\n",
)
.unwrap();

let backend = VcfBackend {
path: vcf_path,
options: GenotypeLoadOptions {
assembly: Some(Assembly::Grch37),
..GenotypeLoadOptions::default()
},
};
let present = VariantSpec {
grch37: Some(locus("1", 10, 10)),
grch37_assembly_ref: Some("T".to_owned()),
reference: Some("C".to_owned()),
alternate: Some("T".to_owned()),
kind: Some(VariantKind::Snp),
..VariantSpec::default()
};
let missing = VariantSpec {
grch37: Some(locus("1", 20, 20)),
grch37_assembly_ref: Some("T".to_owned()),
reference: Some("C".to_owned()),
alternate: Some("T".to_owned()),
kind: Some(VariantKind::Snp),
..VariantSpec::default()
};

let observations = scan_vcf_variants(&backend, &[present, missing]).unwrap();
assert_eq!(observations[0].genotype.as_deref(), Some("CC"));
assert!(
observations[0].evidence[0].contains("resolved by locus"),
"{:?}",
observations[0].evidence
);
assert_eq!(observations[1].genotype.as_deref(), Some("TT"));
assert!(
observations[1].evidence[0].contains("imputed reference genotype"),
"{:?}",
observations[1].evidence
);
}

#[test]
fn genotype_private_helpers_cover_indel_record_classification() {
let record = AlignmentRecord {
Expand Down
54 changes: 2 additions & 52 deletions rust/bioscript-formats/src/genotype/cram_backend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,7 @@ use std::{

use noodles::core::Position;
use noodles::cram;
use noodles::sam::alignment::{
Record as _,
record::{Cigar as _, QualityScores as _, Sequence as _, cigar::op::Kind as CigarOpKind},
};
use noodles::sam::alignment::Record as _;

use bioscript_core::{Assembly, GenomicLocus, RuntimeError, VariantSpec};

Expand Down Expand Up @@ -392,54 +389,7 @@ fn cram_base_quality_at_reference_position(
target_position: Position,
reference_base: u8,
) -> Result<Option<(u8, u8)>, RuntimeError> {
let Some(alignment_start) = record.alignment_start() else {
return Ok(None);
};
let alignment_start = alignment_start
.map_err(|err| RuntimeError::Io(format!("failed to read CRAM alignment start: {err}")))?;
let mut reference_position = usize::from(alignment_start);
let target = usize::from(target_position);
let mut read_position = 0usize;
let sequence = record.sequence();
let qualities = record.quality_scores();

for op in record.cigar().iter() {
let op = op.map_err(|err| RuntimeError::Io(format!("failed to read CRAM CIGAR: {err}")))?;
match op.kind() {
CigarOpKind::Match | CigarOpKind::SequenceMatch | CigarOpKind::SequenceMismatch => {
for offset in 0..op.len() {
if reference_position + offset == target {
let base = sequence
.get(read_position + offset)
.unwrap_or(reference_base);
let quality = qualities
.iter()
.nth(read_position + offset)
.transpose()
.map_err(|err| {
RuntimeError::Io(format!("failed to read CRAM base quality: {err}"))
})?
.unwrap_or(0);
return Ok(Some((base, quality)));
}
}
reference_position += op.len();
read_position += op.len();
}
CigarOpKind::Insertion | CigarOpKind::SoftClip => {
read_position += op.len();
}
CigarOpKind::Deletion | CigarOpKind::Skip => {
if target >= reference_position && target < reference_position + op.len() {
return Ok(None);
}
reference_position += op.len();
}
CigarOpKind::HardClip | CigarOpKind::Pad => {}
}
}

Ok(None)
Ok(record.base_quality_at_reference_position(target_position, reference_base))
}

pub(crate) fn normalize_pileup_base(base: u8) -> Option<char> {
Expand Down
43 changes: 34 additions & 9 deletions rust/bioscript-formats/src/genotype/vcf/matching.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ pub fn imputed_reference_observation(
) -> Option<VariantObservation> {
let genotype = match variant.kind {
None | Some(VariantKind::Snp) => {
let reference = first_single_base_allele(variant.reference.as_deref())?;
let reference = reference_for_assembly(variant, assembly)?;
first_single_base_allele(variant.alternate.as_deref())?;
reference_genotype_for_locus(reference, locus, inferred_sex)
}
Expand Down Expand Up @@ -77,6 +77,16 @@ fn reference_genotype_for_locus(
}
}

pub(crate) fn reference_for_assembly(
variant: &VariantSpec,
assembly: Option<Assembly>,
) -> Option<char> {
let value = variant
.assembly_reference(assembly)
.or(variant.reference.as_deref())?;
first_single_base_allele(Some(value))
}

fn is_sex_chromosome(chrom: &str) -> bool {
matches!(
chrom
Expand Down Expand Up @@ -108,16 +118,13 @@ pub(crate) fn vcf_row_matches_variant(

match variant.kind.unwrap_or(VariantKind::Other) {
VariantKind::Snp => {
let expected_reference = variant
.assembly_reference(assembly)
.or(variant.reference.as_deref());
row.position == locus.start
&& variant
.reference
.as_ref()
&& expected_reference
.is_none_or(|reference| reference.eq_ignore_ascii_case(&row.reference))
&& variant.alternate.as_ref().is_none_or(|alternate| {
row.alternates
.iter()
.any(|candidate| candidate.eq_ignore_ascii_case(alternate))
})
&& snp_row_has_catalog_allele(row, variant)
}
VariantKind::Deletion => {
let expected_len = variant.deletion_length.unwrap_or(0);
Expand All @@ -134,3 +141,21 @@ pub(crate) fn vcf_row_matches_variant(
VariantKind::Other => row.position == locus.start,
}
}

fn snp_row_has_catalog_allele(row: &ParsedVcfRow, variant: &VariantSpec) -> bool {
let Some(alternate) = variant.alternate.as_ref() else {
return true;
};
if row
.alternates
.iter()
.any(|candidate| candidate.eq_ignore_ascii_case(alternate))
{
return true;
}
variant.reference.as_ref().is_some_and(|reference| {
row.alternates
.iter()
.any(|candidate| candidate.eq_ignore_ascii_case(reference))
})
}
Loading
Loading