From 2f41d5c071af2212a259ce7da712c04ca7b0e0b1 Mon Sep 17 00:00:00 2001 From: vasudeva8 Date: Tue, 26 Nov 2024 17:13:55 +0000 Subject: [PATCH] update to work with vcf 4.4 prefixed phasing info --- htslib/vcf.h | 108 ++++++++++++++++++++++++++++++++++++++++++ test/test.pl | 10 ++++ test/vcf44_1.expected | 27 +++++++++++ test/vcf44_1.vcf | 26 ++++++++++ vcf.c | 36 ++++++++++++-- 5 files changed, 203 insertions(+), 4 deletions(-) create mode 100644 test/vcf44_1.expected create mode 100644 test/vcf44_1.vcf diff --git a/htslib/vcf.h b/htslib/vcf.h index 9a36cab05..d0e5eb02c 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -1501,6 +1501,34 @@ static inline int bcf_float_is_vector_end(float f) return u.i==bcf_float_vector_end ? 1 : 0; } +typedef enum bcf_version {v41 = 1, v42, v43, v44} bcf_version; +/** + * bcf_get_version - get the version as bcf_version enumeration + * @param hdr - bcf header, to get version + * @param ipver - pointer to return version + * Returns 0 on success and -1 on failure + */ +static inline int bcf_get_version(const bcf_hdr_t *hdr, bcf_version *ver) +{ + const char *version = NULL; + + if (!hdr || !ver) { + return -1; + } + + version = bcf_hdr_get_version(hdr); + if (!strcmp("VCFv4.1", version)) { + *ver = v41; + } else if (!strcmp("VCFv4.2", version)) { + *ver = v42; + } else if (!strcmp("VCFv4.3", version)) { + *ver = v43; + } else { + *ver = v44; + } + return 0; +} + static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) { uint32_t e = 0; @@ -1528,6 +1556,86 @@ static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) return e == 0 ? 0 : -1; } +/** + * bcf_format_gt1 - formats GT information information on a string + * @param hdr - bcf header, to get version + * @param fmt - pointer to bcf format data + * @param isample - position of interested sample in data + * @param str - pointer to output string + * Returns 0 on success and -1 on failure + * This method is extended from bcf_format_gt to output phasing information + * in accordance with v4.4 format, which supports explicit / prefixed phasing + * for 1st allele. + * Explicit / prefixed phasing for 1st allele is used only when it is a must to + * correctly express phasing. + */ +static inline int bcf_format_gt1(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str) +{ + uint32_t e = 0; + bcf_version ver = v42; + int ploidy = 1, anyunphased = 0; + int32_t val0 = 0; + kstring_t tmp1 = KS_INITIALIZE, tmp2 = KS_INITIALIZE; + + if (bcf_get_version(hdr, &ver)) { + hts_log_error("Failed to get version information"); + return -1; + } + #define BRANCH(type_t, convert, missing, vector_end) { \ + uint8_t *ptr = fmt->p + isample*fmt->size; \ + int i; \ + for (i=0; in; i++, ptr += sizeof(type_t)) \ + { \ + type_t val = convert(ptr); \ + if ( val == vector_end ) break; \ + if (!i) { val0 = val; } \ + if (i) { \ + e |= kputc("/|"[val & 1], &tmp1) < 0; \ + anyunphased |= !(val & 1); \ + } \ + if (!(val >> 1)) e |= kputc('.', &tmp1) < 0; \ + else e |= kputw((val >> 1) - 1, &tmp1) < 0; \ + } \ + if (i == 0) e |= kputc('.', &tmp1) < 0; \ + ploidy = i; \ + } + switch (fmt->type) { + case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break; + case BCF_BT_NULL: e |= kputc('.', &tmp1) < 0; break; + default: hts_log_error("Unexpected type %d", fmt->type); return -2; + } + #undef BRANCH + + if (ver >= v44) { //output which supports prefixed phasing + /* update 1st allele's phasing if required and append rest to it. + use prefixed phasing only when it is a must. i.e. without which the + inferred value will be incorrect */ + if (val0 & 1) { + /* 1st one is phased, if ploidy is > 1 and an unphased allele exists + need to specify explicitly */ + e |= (ploidy > 1 && anyunphased) ? + (kputc('|', &tmp2) < 0) : + 0; + } else { + /* 1st allele is unphased, if ploidy is = 1 or allele is '.' or + ploidy > 1 and no other unphased allele exist, need to specify + explicitly */ + e |= ((ploidy <= 1) || (ploidy > 1 && !anyunphased)) ? + (kputc('/', &tmp2) < 0) : + 0; + } + e |= kputsn(tmp1.s, tmp1.l, &tmp2) < 0; //append rest with updated one + ks_free(&tmp1); + tmp1 = tmp2; + } + //updated v44 string or 'test-vcf-sweep.out'); run_test('test_vcf_various',$opts); +run_test('test_vcf_44', $opts); run_test('test_bcf_sr_sort',$opts); run_test('test_bcf_sr_no_index',$opts); run_test('test_bcf_sr_range', $opts); @@ -1159,6 +1160,15 @@ sub test_vcf_various cmd => "$$opts{path}/test_view $$opts{path}/modhdr.vcf.gz chr22:1-2"); } +sub test_vcf_44 +{ + my ($opts, %args) = @_; + + # vcf4.4 with implicit and explicit phasing info combinations + test_cmd($opts, %args, out => "vcf44_1.expected", + cmd => "$$opts{bin}/htsfile -c $$opts{path}/vcf44_1.vcf"); +} + sub write_multiblock_bgzf { my ($name, $frags) = @_; diff --git a/test/vcf44_1.expected b/test/vcf44_1.expected new file mode 100644 index 000000000..c696f09b5 --- /dev/null +++ b/test/vcf44_1.expected @@ -0,0 +1,27 @@ +##fileformat=VCFv4.4 +##FILTER= +##contig= +##FORMAT= +##failue="test file on explicit and implicit phasing markers in 4.4" +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097 +1 61462 rs56992750 T A 100 PASS . GT 0|0|1 0/1 +1 61480 rs56992751 T A 100 PASS . GT 0 /0|1 +1 61481 rs56992752 T A 100 PASS . GT /0 |0/1 +1 61482 rs56992752 T A 100 PASS . GT /0 /1 +1 61483 rs56992752 T A 100 PASS . GT 0 1 +1 61484 rs56992752 T A 100 PASS . GT 0 /1 +1 61485 rs56992752 T A 100 PASS . GT 0 1 +1 61486 rs56992752 T A 100 PASS . GT 0 1 +1 61487 rs56992752 T A 100 PASS . GT 0 1 +1 61488 rs56992752 T A 100 PASS . GT 0 /1 +1 61489 rs56992752 T A 100 PASS . GT /0 1 +1 61490 rs56992752 T A 100 PASS . GT /0 1 +1 61491 rs56992752 T A 100 PASS . GT /0 /1 +1 61492 rs56992752 T A 100 PASS . GT /0|0 1/0 +1 61493 rs56992752 T A 100 PASS . GT 0|0 |1/0 +1 61494 rs56992752 T A 100 PASS . GT /0|0 1/0 +1 61495 rs56992752 T A 100 PASS . GT 0|0 |1/0 +1 61496 rs56992752 T A 100 PASS . GT . . +1 61497 rs56992752 T A 100 PASS . GT ./1 .|1 +1 61498 rs56992752 T A 100 PASS . GT 1/. 1|. +1 61499 rs56992752 T A 100 PASS . GT ./. .|. diff --git a/test/vcf44_1.vcf b/test/vcf44_1.vcf new file mode 100644 index 000000000..ed726e7f5 --- /dev/null +++ b/test/vcf44_1.vcf @@ -0,0 +1,26 @@ +##fileformat=VCFv4.4 +##contig= +##FORMAT= +##failue="test file on explicit and implicit phasing markers in 4.4" +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097 +1 61462 rs56992750 T A 100 PASS . GT 0|0|1 0/1 +1 61480 rs56992751 T A 100 PASS . GT 0 /0|1 +1 61481 rs56992752 T A 100 PASS . GT /0 |0/1 +1 61482 rs56992752 T A 100 PASS . GT /0 /1 +1 61483 rs56992752 T A 100 PASS . GT 0 1 +1 61484 rs56992752 T A 100 PASS . GT 0 /1 +1 61485 rs56992752 T A 100 PASS . GT 0 |1 +1 61486 rs56992752 T A 100 PASS . GT |0 1 +1 61487 rs56992752 T A 100 PASS . GT |0 |1 +1 61488 rs56992752 T A 100 PASS . GT |0 /1 +1 61489 rs56992752 T A 100 PASS . GT /0 1 +1 61490 rs56992752 T A 100 PASS . GT /0 |1 +1 61491 rs56992752 T A 100 PASS . GT /0 /1 +1 61492 rs56992752 T A 100 PASS . GT /0|0 /1/0 +1 61493 rs56992752 T A 100 PASS . GT |0|0 |1/0 +1 61494 rs56992752 T A 100 PASS . GT /0|0 1/0 +1 61495 rs56992752 T A 100 PASS . GT 0|0 |1/0 +1 61496 rs56992752 T A 100 PASS . GT . . +1 61497 rs56992752 T A 100 PASS . GT ./1 .|1 +1 61498 rs56992752 T A 100 PASS . GT 1/. 1|. +1 61499 rs56992752 T A 100 PASS . GT ./. .|. diff --git a/vcf.c b/vcf.c index f22f4450d..1c0e4b922 100644 --- a/vcf.c +++ b/vcf.c @@ -3061,8 +3061,14 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const char *t = q + 1; int m = 0; // m: sample id const int nsamples = bcf_hdr_nsamples(h); - + bcf_version ver = v42; const char *end = s->s + s->l; + + if (bcf_get_version(h, &ver)) { + hts_log_error("Failed to get version information"); + return -1; + } + while ( tis_gt) { // Genotypes. - // ([|/])+... where is [0-9]+ or ".". + //([/|])?)([|/])+... where is [0-9]+ or ".". int32_t is_phased = 0; uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m); uint32_t unreadable = 0; uint32_t max = 0; - int overflow = 0; + int overflow = 0, ploidy = 0, anyunphased = 0, \ + phasingprfx = 0; + + /* with prefixed phasing, it is explicitly given for 1st one + with non-prefixed, set based on ploidy and phasing of other + alleles. */ + if (ver >= v44 && (*t == '|' || *t == '/')) { + // cache prefix and phasing status + is_phased = *t++ == '|'; + phasingprfx = 1; + } + for (l = 0;; ++t) { + ploidy++; if (*t == '.') { ++t, x[l++] = is_phased; } else { @@ -3125,9 +3143,19 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, if (max < val) max = val; x[l++] = (val + 1) << 1 | is_phased; } + anyunphased |= (ploidy != 1) && !is_phased; is_phased = (*t == '|'); if (*t != '|' && *t != '/') break; } + if (ver >= v44 && !phasingprfx) { + /* no explicit phasing for 1st allele, set based on + other alleles and ploidy */ + if (ploidy == 1) { //implicitly phased + x[0]|= 1; + } else { //set by other unphased alleles + x[0] |= anyunphased ? 0 : 1; + } + } // Possibly check max against v->n_allele instead? if (overflow || max > (INT32_MAX >> 1) - 1) { hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); @@ -4187,7 +4215,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) if (!first) kputc_(':', s); first = 0; if (gt_i == i) { - bcf_format_gt(f,j,s); + bcf_format_gt1(h, f,j,s); break; } else if (f->n == 1)