diff --git a/Makefile b/Makefile index 93406250ec14234ffaf7b2f0b436b54f3a10d201..98afc864c400ec0ce47c399f243fc1ffbdde0432 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ TOOLS=-DSAMTOOLS=\"$(SAMTOOLS)\" -DBWA=\"$(BWA)\" -DSICKLE=\"$(SICKLE)\" -DVELVE # Date and version number from git DATE := on $(shell git log --pretty=format:"%cd" --date=iso | cut -f 1,2 -d " " | head -n 1) -VERSION := 0.0.6-snake-$(shell git log --pretty=format:"%h" --date=iso | head -n 1) +VERSION := 0.0.7-snake-$(shell git log --pretty=format:"%h" --date=iso | head -n 1) CXXFLAGS += -DDATE=\""$(DATE)"\" -DVERSION=\""$(VERSION)"\" # Compiler flags diff --git a/src/popins2_merge_and_set_mate.h b/src/popins2_merge_and_set_mate.h index 588e8c41751a37943e3fe9101729c0d8d90a1aff..2e9b7eb98f1313e359949de9dd4bfc000e567b3d 100644 --- a/src/popins2_merge_and_set_mate.h +++ b/src/popins2_merge_and_set_mate.h @@ -1,5 +1,9 @@ #include <sstream> #include <cerrno> +#include <stdlib.h> + +#include <algorithm> +#include <iterator> #include <seqan/file.h> #include <seqan/sequence.h> @@ -100,20 +104,110 @@ inline void readRecordAndCorrectRIds(BamAlignmentRecord & record, BamFileIn & st // ========================================================================== +inline std::string iterSuffix(const std::string &S_PG_ID, const std::unordered_set<std::string> &ids, const unsigned i){ + std::string ret; + ret.append(S_PG_ID); + ret.append(".mH"); + ret.append(std::to_string(i)); + + if (ids.find(ret) == ids.end()) + return ret; + + return iterSuffix(S_PG_ID, ids, i+1); +} + +// ========================================================================== +inline void printSet(std::unordered_set<std::string> const &s){ + std::copy(s.begin(),s.end(),std::ostream_iterator<std::string>(std::cout, " ")); +} +// ========================================================================== + inline void mergeHeaders(BamHeader & header, FormattedFileContext<BamFileOut, Owner<> >::Type & context, BamFileIn & stream1, BamFileIn & stream2){ + StringSet<CharString> contigNames; NameStoreCache<StringSet<CharString> > nameStoreCache; String<int32_t> contigLengths; // Read and append the two headers. Remove duplicate entries. - readHeader(header, stream1); + // @Thomas: The current (v0.0.5-snake) approach does not catch duplications in the header of stream1. + // This is problematic if e.g. alignments were computed with BWA version <0.7.16a because of a known bug with the RG field back then. + BamHeader header1; + readHeader(header1, stream1); BamHeader header2; readHeader(header2, stream2); - for (unsigned i = 0; i < length(header2); ++i) - { - if (header2[i].type != BAM_HEADER_FIRST) - appendValue(header, header2[i]); + + std::unordered_set<std::string> program_ids; + + // Append header1 to header (reference parameter) and remove duplicates in PG ID fields + for (unsigned i = 0; i < length(header1); ++i){ + if (header1[i].type == BAM_HEADER_PROGRAM){ + //std::cout << "[header1] =========================================" << std::endl; + //std::cout << "[list...] ["; printSet(program_ids); std::cout << "]" << std::endl; + CharString PG_ID; + bool keyFound = getTagValue(PG_ID, "ID", header1[i]); + const char* C_PG_ID = toCString(PG_ID); + std::string S_PG_ID(C_PG_ID); + //std::cout << "[header1] " << S_PG_ID << std::endl; + //std::cout << "[list...] ["; printSet(program_ids); std::cout << "]" << std::endl; + // if PG ID was seen already + if ( program_ids.find(S_PG_ID) != program_ids.end() ){ + //std::cout << "[header1] " << S_PG_ID << " was seen in list already." << std::endl; + BamHeaderRecord hrecord = header1[i]; + // find new unique PG ID + std::string S_PG_ID_NEW = iterSuffix(S_PG_ID, program_ids, 1); + //std::cout << "[header1] " << S_PG_ID_NEW << " is the new PG_ID." << std::endl; + setTagValue("ID", S_PG_ID_NEW, hrecord); + appendValue(header, hrecord); + program_ids.insert(toCString(S_PG_ID_NEW)); + //std::cout << "[header1] " << S_PG_ID_NEW << " added to list." << std::endl; + continue; + } + else{ + //std::cout << "[header1] " << S_PG_ID << " not seen in list yet." << std::endl; + program_ids.insert(S_PG_ID); + //std::cout << "[header1] " << S_PG_ID << " added to list." << std::endl; + } + } + appendValue(header, header1[i]); + } + + // Append header2 to header (reference parameter), remove duplicates in PG ID fields and skip VN line of header2 + for (unsigned i = 0; i < length(header2); ++i){ + + if (header2[i].type == BAM_HEADER_FIRST) + continue; + + if (header2[i].type == BAM_HEADER_PROGRAM){ + //std::cout << "[header2] =========================================" << std::endl; + //std::cout << "[list...] ["; printSet(program_ids); std::cout << "]" << std::endl; + CharString PG_ID; + bool keyFound = getTagValue(PG_ID, "ID", header2[i]); + const char* C_PG_ID = toCString(PG_ID); + std::string S_PG_ID(C_PG_ID); + //std::cout << "[header2] " << S_PG_ID << std::endl; + //std::cout << "[list...] ["; printSet(program_ids); std::cout << "]" << std::endl; + // if PG ID was seen already + if ( program_ids.find(S_PG_ID) != program_ids.end() ){ + //std::cout << "[header2] " << S_PG_ID << " was seen in list already." << std::endl; + BamHeaderRecord hrecord = header2[i]; + // find new unique PG ID + std::string S_PG_ID_NEW = iterSuffix(S_PG_ID, program_ids, 1); + //std::cout << "[header2] " << S_PG_ID_NEW << " is the new PG_ID." << std::endl; + setTagValue("ID", S_PG_ID_NEW, hrecord); + appendValue(header, hrecord); + program_ids.insert(toCString(S_PG_ID_NEW)); + //std::cout << "[header2] " << S_PG_ID_NEW << " added to list." << std::endl; + continue; + } + else{ + //std::cout << "[header2] " << S_PG_ID << " not seen in list yet." << std::endl; + program_ids.insert(S_PG_ID); + //std::cout << "[header2] " << S_PG_ID << " added to list." << std::endl; + } + } + appendValue(header, header2[i]); } + std::stable_sort(begin(header, Standard()), end(header, Standard()), BamHeaderRecordTypeLess()); // Fill sequence names into nameStoreCache.