From 84f646a10234e64494237373df07d98b06c2d39f Mon Sep 17 00:00:00 2001 From: Thomas Krannich <krannich479@googlemail.com> Date: Wed, 16 Mar 2022 16:05:09 +0100 Subject: [PATCH] improved conflict solving for duplicated PG IDs when merging SAM headers --- Makefile | 2 +- src/popins2_merge_and_set_mate.h | 81 ++++++++++++++++++++++++-------- 2 files changed, 62 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 9f4ba8e..98afc86 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ TOOLS=-DSAMTOOLS=\"$(SAMTOOLS)\" -DBWA=\"$(BWA)\" -DSICKLE=\"$(SICKLE)\" -DVELVE # Date and version number from git DATE := on $(shell git log --pretty=format:"%cd" --date=iso | cut -f 1,2 -d " " | head -n 1) -VERSION := 0.0.5-snake-$(shell git log --pretty=format:"%h" --date=iso | head -n 1) +VERSION := 0.0.7-snake-$(shell git log --pretty=format:"%h" --date=iso | head -n 1) CXXFLAGS += -DDATE=\""$(DATE)"\" -DVERSION=\""$(VERSION)"\" # Compiler flags diff --git a/src/popins2_merge_and_set_mate.h b/src/popins2_merge_and_set_mate.h index 7205ff2..9efd71b 100644 --- a/src/popins2_merge_and_set_mate.h +++ b/src/popins2_merge_and_set_mate.h @@ -100,6 +100,22 @@ inline void readRecordAndCorrectRIds(BamAlignmentRecord & record, BamFileIn & st // ========================================================================== +inline CharString iterSuffix(const CharString &PG_ID, const std::unordered_set<char*> &ids, const unsigned i){ + CharString ret; + append(ret, PG_ID); + CharString suf = ".mH"; + append(ret, suf); + CharString iter = i; + append(ret, iter); + + if (ids.find(toCString(ret)) == ids.end()) + return ret; + + return iterSuffix(PG_ID, ids, i+1); +} + +// ========================================================================== + inline void mergeHeaders(BamHeader & header, FormattedFileContext<BamFileOut, Owner<> >::Type & context, BamFileIn & stream1, BamFileIn & stream2){ StringSet<CharString> contigNames; @@ -107,36 +123,61 @@ inline void mergeHeaders(BamHeader & header, FormattedFileContext<BamFileOut, Ow String<int32_t> contigLengths; // Read and append the two headers. Remove duplicate entries. - readHeader(header, stream1); + // @Thomas: The current (v0.0.5-snake) approach does not catch duplications in the header of stream1. + // This is problematic if e.g. alignments were computed with BWA version <0.7.16a because of a known bug with the RG field back then. + BamHeader header1; + readHeader(header1, stream1); BamHeader header2; readHeader(header2, stream2); std::unordered_set<char*> program_ids; - for (unsigned i = 0; i < length(header2); ++i){ - - if (header2[i].type != BAM_HEADER_FIRST){ + // Append header1 to header (reference parameter) and remove duplicates in PG ID fields + for (unsigned i = 0; i < length(header1); ++i){ + if (header1[i].type == BAM_HEADER_PROGRAM){ + CharString PG_ID; + bool keyFound = getTagValue(PG_ID, "ID", header1[i]); + // if PG ID was seen already + if ( program_ids.find(toCString(PG_ID)) != program_ids.end() ){ + BamHeaderRecord hrecord = header1[i]; + // find new unique PG ID + PG_ID = iterSuffix(PG_ID, program_ids, 1); + setTagValue("ID", PG_ID, hrecord); + appendValue(header, hrecord); + program_ids.insert(toCString(PG_ID)); + continue; + } + else{ + program_ids.insert(toCString(PG_ID)); + } + } + appendValue(header, header1[i]); + } - if (header2[i].type == BAM_HEADER_PROGRAM){ - CharString PG_ID; - bool keyFound = getTagValue(PG_ID, "ID", header2[i]); + // Append header2 to header (reference parameter), remove duplicates in PG ID fields and skip VN line of header2 + for (unsigned i = 0; i < length(header2); ++i){ - if ( program_ids.find(toCString(PG_ID)) != program_ids.end() ){ // if PG ID was seen already - BamHeaderRecord hrecord = header2[i]; - CharString suf = ".mH"; // mH = "mergeHeader" to trace in SAM/BAM files - append(PG_ID, suf); - setTagValue("ID", PG_ID, hrecord); - appendValue(header, hrecord); - continue; - } - else{ - program_ids.insert(toCString(PG_ID)); - } + if (header2[i].type == BAM_HEADER_FIRST) + continue; + if (header2[i].type == BAM_HEADER_PROGRAM){ + CharString PG_ID; + bool keyFound = getTagValue(PG_ID, "ID", header2[i]); + // if PG ID was seen already + if ( program_ids.find(toCString(PG_ID)) != program_ids.end() ){ + BamHeaderRecord hrecord = header2[i]; + // find new unique PG ID + PG_ID = iterSuffix(PG_ID, program_ids, 1); + setTagValue("ID", PG_ID, hrecord); + appendValue(header, hrecord); + program_ids.insert(toCString(PG_ID)); + continue; + } + else{ + program_ids.insert(toCString(PG_ID)); } - - appendValue(header, header2[i]); } + appendValue(header, header2[i]); } std::stable_sort(begin(header, Standard()), end(header, Standard()), BamHeaderRecordTypeLess()); -- GitLab