From e248911bd7b458c7dc4448ef00c2f7adc846db61 Mon Sep 17 00:00:00 2001 From: Brent Pedersen Date: Tue, 23 Apr 2024 19:39:52 +0200 Subject: [PATCH] working info expressions --- README.md | 29 +++++++++++++++++++++-------- src/header.rs | 29 +++++++++++++++++------------ src/vcfexpr.rs | 9 +++------ 3 files changed, 41 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 277dcfa..ea83399 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,22 @@ vcfexpr filter \ input.vcf ``` +--- + +add a new info field (`af_copy`) and set it. +``` +$ cat pre.lua +header:add_info({ID="af_copy", Number=1, Description="adding a single field", Type="Float"}) +``` +then run with: +``` +vcfexpr filter -p pre.lua -e 'return variant:format("AD")[1][2] > 0' \ + -s 'af_copy=return variant:info("AF", 0)' \ + input.vcf > output.vcf +``` + + + # Attributes / Functions ```lua @@ -85,6 +101,10 @@ allele.allele -> integer e.g. 0 for "0" allele header.samples (set/get) -> vec -- TODO: allow setting samples before iteration. +-- these header:add_* are available only in the prelude. currently only Number=1 is supported. +header:add_info({Type="Integer", Number=1, Description="asdf", ID="new field"}) +header:add_format({Type="Integer", Number=1, Description="xyz", ID="new format field"}) + sample = variant:sample("NA12878") sample.DP -- any fields in the row are available. special case for GT. use pprint to see structure: @@ -134,14 +154,7 @@ Options: # TODO -+ set fields with --info-set '$name|$lua_expression`. for example: -``` - -n 'popmax_AF|math.max(variant:info("AF_afr"), variant:info("AF_ami"), \ - variant:info("AF_amr"), variant:info("AF_ami"), variant:info("AF_eas"), variant:info("AF_nfe"), variant:info("AF_sas"))' -``` -which would create a new field `popmax_AF=$expression` at each row (note that this would need to handle missing values) -this would also use `header:add_info({Type="Float", Number=1, Description="max of subset of pops", ID="popmax_AF"})` - ++ Currently --info-expressions can only be used when output is VCF. Update to support template output as well. So we need the header to translate. + add a functional lib such as [Moses](https://github.com/Yonaba/Moses) or [Lume](https://github.com/rxi/lume) which have `map`/`filter` and other functions. (The user can add these on their own with `--lua`). + write a class to simplify accessing CSQ fields. diff --git a/src/header.rs b/src/header.rs index 8dc6dc1..a63bd9a 100644 --- a/src/header.rs +++ b/src/header.rs @@ -10,15 +10,13 @@ fn handle_hash_get<'a>( ) -> Result<&'a str, mlua::Error> { match tbl.get(key) { Some(x) => Ok(x), - None => { - Err(mlua::Error::ExternalError(Arc::new(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - format!( - "must specify {} in the argument to add_{}. got {:?}", - key, func, tbl - ), - )))) - } + None => Err(mlua::Error::ExternalError(Arc::new(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!( + "must specify {} in the argument to add_{}. got {:?}", + key, func, tbl + ), + )))), } } @@ -80,7 +78,7 @@ pub(crate) fn register_header(lua: &Lua) -> mlua::Result<()> { |_lua, (ud, tbl): (AnyUserData, HashMap)| { let this = ud.borrow_mut::()?; let c_str = std::ffi::CString::new(format!( - r#"##INFO="#, + r#"##INFO="#, handle_hash_get(&tbl, "ID", "info")?, handle_hash_get(&tbl, "Number", "info")?, handle_hash_get(&tbl, "Type", "info")?, @@ -95,7 +93,14 @@ pub(crate) fn register_header(lua: &Lua) -> mlua::Result<()> { std::io::Error::last_os_error(), ))); } - _ = unsafe { rust_htslib::htslib::bcf_hdr_sync(this.inner) }; + let ret = unsafe { rust_htslib::htslib::bcf_hdr_sync(this.inner) }; + if ret != 0 { + log::warn!( + "Error syncing header after adding INFO field for {:?}: {}", + tbl, + ret + ); + } Ok(()) }, ); @@ -199,7 +204,7 @@ mod tests { scope.create_any_userdata_ref_mut(&mut header_view)?, )?; let result: String = exp.call(())?; - let expected = "##fileformat=VCFv4.2\n##FILTER=\n##contig=\n##INFO=\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample1\tSample2\n"; + let expected = "##fileformat=VCFv4.2\n##FILTER=\n##contig=\n##INFO=\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample1\tSample2\n"; assert_eq!( result, expected, diff --git a/src/vcfexpr.rs b/src/vcfexpr.rs index d6c7861..b0445b0 100644 --- a/src/vcfexpr.rs +++ b/src/vcfexpr.rs @@ -1,4 +1,3 @@ -use log::debug; use mlua::Lua; use rust_htslib::bcf::{ self, @@ -173,8 +172,6 @@ impl<'lua> VCFExpr<'lua> { let mut wtr = bcf::Writer::from_path(&output, &header, !output.ends_with(".gz"), format)?; _ = wtr.set_threads(2); - //let header_t = unsafe { rust_htslib::htslib::bcf_hdr_dup(reader.header().inner) }; - //hv = bcf::header::HeaderView::new(header_t); wtr } else { bcf::Writer::from_stdout(&header, true, bcf::Format::Vcf)? @@ -216,7 +213,7 @@ impl<'lua> VCFExpr<'lua> { .expect("invalid info expression should have name=$expression"); let t = hv .info_type(name_exp.0.as_bytes()) - .unwrap_or_else(|_| panic!("info field {} not found", name_exp.0)); + .unwrap_or_else(|_| panic!("ERROR: info field '{}' not found. Make sure it was added to the header in prelude if needed.", name_exp.0)); ( InfoFormat::Info(name_exp.0.to_string()), ( @@ -330,7 +327,7 @@ impl<'lua> VCFExpr<'lua> { let mut record = variant.take(); for (stag, value) in info_results { let tag = stag.as_bytes(); - debug!("Setting info field: {}: {:?}", stag, value); + //debug!("Setting info field: {}: {:?}", stag, value); let result = match value { InfoFormatValue::Bool(b) => { if b { @@ -339,7 +336,7 @@ impl<'lua> VCFExpr<'lua> { record.clear_info_flag(tag) } } - InfoFormatValue::Float(f) => record.push_info_float(tag, &[f]), + InfoFormatValue::Float(f) => record.push_info_float(b"af_copy", &[f]), InfoFormatValue::Integer(i) => record.push_info_integer(tag, &[i]), InfoFormatValue::String(s) => record.push_info_string(tag, &[s.as_bytes()]), };