From ac4b981f54446ae8119721629f6526ef909da405 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 7 Oct 2024 11:47:10 +0100 Subject: [PATCH] Fix CRAM embed_ref=2 with seqs overlapping ref end. If the sequences align off the end of the reference and we are creating consensus on the fly, then the consensus generated also steps beyond the reference length. Although this longer reference is embedded, it is trimmed back by the CRAM decoder which validates against the declared reference length in SQ LN, leading to Ns appearing in the decoder. Therefore we now validate in the encoder too, which also needed refs_from_header updates to work when not given any externals (such as .fai). --- cram/cram_encode.c | 4 +++- cram/cram_io.c | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 5d22db54d..1977ac5e1 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1761,7 +1761,6 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { c->ref_start = ref_start+1; c->ref_end = ref_end+1; c->ref_free = 1; - return 0; err: @@ -1997,6 +1996,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { fd->no_ref_counter -= (fd->no_ref_counter > 0); pthread_mutex_unlock(&fd->ref_lock); } + + if (c->ref_end > fd->refs->ref_id[c->ref_id]->length) + c->ref_end = fd->refs->ref_id[c->ref_id]->length; } // Iterate through records creating the cram blocks for some diff --git a/cram/cram_io.c b/cram/cram_io.c index 7f7ffca49..01cf7bdbf 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -2734,7 +2734,7 @@ int refs2id(refs_t *r, sam_hdr_t *hdr) { * Returns 0 on success * -1 on failure */ -static int refs_from_header(cram_fd *fd) { + int refs_from_header(cram_fd *fd) { if (!fd) return -1; @@ -2787,10 +2787,11 @@ static int refs_from_header(cram_fd *fd) { /* Initialise likely filename if known */ if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) { - if ((tag = sam_hrecs_find_key(ty, "M5", NULL))) { + if ((tag = sam_hrecs_find_key(ty, "M5", NULL))) r->ref_id[j]->fn = string_dup(r->pool, tag->str+3); - //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn); - } + + if ((tag = sam_hrecs_find_key(ty, "LN", NULL))) + r->ref_id[j]->length = strtol(tag->str+3, NULL, 0); } k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);