From b55253e6226225fe6421430effc840f60607902f Mon Sep 17 00:00:00 2001 From: Birte Kehr <birte.kehr@bihealth.de> Date: Thu, 1 Aug 2024 11:55:15 +0200 Subject: [PATCH] Adding option for setting the minimum base qualities of reads to be cropped for unmapped assembly. --- src/argument_parsing.h | 11 ++++++++++- src/crop_unmapped.h | 16 +++++++++------- src/popins2_crop_unmapped.h | 6 +++--- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/argument_parsing.h b/src/argument_parsing.h index 0114cf4..d2ef294 100644 --- a/src/argument_parsing.h +++ b/src/argument_parsing.h @@ -37,6 +37,7 @@ struct CropUnmappedOptions { int humanSeqs; float alignment_score_factor; + unsigned min_qual; bool printSampleInfo; @@ -49,6 +50,7 @@ struct CropUnmappedOptions { sampleID(""), humanSeqs(maxValue<int>()), alignment_score_factor(0.67f), + min_qual(20), printSampleInfo(true) {} }; @@ -254,6 +256,8 @@ bool getOptionValues(CropUnmappedOptions & options, ArgumentParser const & parse getOptionValue(options.humanSeqs, parser, "filter"); if (isSet(parser, "alignment-score-factor")) getOptionValue(options.alignment_score_factor, parser, "alignment-score-factor"); + if (isSet(parser, "min-qual")) + getOptionValue(options.min_qual, parser, "min-qual"); if (isSet(parser, "noSampleInfo")) options.printSampleInfo = false; @@ -490,6 +494,7 @@ bool getOptionValues(GenotypingOptions &options, seqan::ArgumentParser &parser){ void setHiddenOptions(ArgumentParser & parser, bool hide, CropUnmappedOptions &){ hideOption(parser, "alignment-score-factor", hide); + hideOption(parser, "min-qual", hide); } void setHiddenOptions(seqan::ArgumentParser & /*parser*/, bool /*hide*/, MergeSetMateOptions &){ @@ -582,6 +587,7 @@ void setupParser(ArgumentParser & parser, CropUnmappedOptions & options){ "as high-quality aligned even if their alignment quality is low. Recommended for non-human reference " "sequences after remapping for contamination removal.", ArgParseArgument::INTEGER, "INT")); addOption(parser, ArgParseOption("c", "alignment-score-factor", "A read is considered low quality if its alignment score (AS tag) is below FLOAT*read length", seqan::ArgParseArgument::DOUBLE, "FLOAT")); + addOption(parser, ArgParseOption("q", "min-qual", "Minimum average quality value in windows for read quality trimming.", seqan::ArgParseArgument::INTEGER, "INT")); // Set valid and default values. setValidValues(parser, "adapters", "HiSeq HiSeqX"); @@ -595,10 +601,13 @@ void setupParser(ArgumentParser & parser, CropUnmappedOptions & options){ setDefaultValue(parser, "prefix", "\'.\'"); setDefaultValue(parser, "sample", "retrieval from BAM file header"); setDefaultValue(parser, "alignment-score-factor", options.alignment_score_factor); + setDefaultValue(parser, "min-qual", options.min_qual); setDefaultValue(parser, "noSampleInfo", "false"); setMinValue(parser, "alignment-score-factor", "0.0"); setMaxValue(parser, "alignment-score-factor", "1.0"); + setMinValue(parser, "min-qual", "0"); + setMaxValue(parser, "min-qual", "42"); // Hide some options from default help. setHiddenOptions(parser, true, options); @@ -1416,4 +1425,4 @@ seqan::ArgumentParser::ParseResult parseCommandLine(TOptions &options, int argc, } -#endif /*ARGUMENT_PARSING_H_*/ \ No newline at end of file +#endif /*ARGUMENT_PARSING_H_*/ diff --git a/src/crop_unmapped.h b/src/crop_unmapped.h index 9150e5f..e72e257 100644 --- a/src/crop_unmapped.h +++ b/src/crop_unmapped.h @@ -349,7 +349,7 @@ findOtherReads(BamFileOut & matesStream, // Function crop_unmapped() // ========================================================================== -template<typename TAdapterTag> +template<typename TAdapterTag, typename TSize_> int crop_unmapped(double & avgCov, unsigned & totalReads, @@ -358,7 +358,8 @@ crop_unmapped(double & avgCov, CharString const & mappingBam, int humanSeqs, TAdapterTag tag, - float as_factor) + float as_factor, + TSize_ qualThresh) { typedef __int32 TPos; typedef std::map<CharString, Pair<CharString> > TFastqMap; // Reads to go into fastq files. @@ -427,14 +428,14 @@ crop_unmapped(double & avgCov, // Check the read's unmapped flag. if (hasFlagUnmapped(record)) { - if (removeLowQuality(record, 20) != 1 && removeAdapter(record, indexUniversal, indexTruSeqs, 30, tag) != 2) + if (removeLowQuality(record, qualThresh) != 1 && removeAdapter(record, indexUniversal, indexTruSeqs, 30, tag) != 2) appendFastqRecord(fastqFirstStream, fastqSecondStream, firstReads, secondReads, record); } // Check for low mapping quality. else if (hasLowMappingQuality(record, humanSeqs, as_factor)) { - if (removeLowQuality(record, 20) != 1 && removeAdapter(record, indexUniversal, indexTruSeqs, 30, tag) != 2) + if (removeLowQuality(record, qualThresh) != 1 && removeAdapter(record, indexUniversal, indexTruSeqs, 30, tag) != 2) { if (appendFastqRecord(fastqFirstStream, fastqSecondStream, firstReads, secondReads, record) == 0) otherReads[Pair<TPos>(record.rNextId, record.pNext)] = Pair<CharString, bool>(record.qName, hasFlagFirst(record)); @@ -476,18 +477,19 @@ crop_unmapped(double & avgCov, } -template<typename TAdapterTag> +template<typename TAdapterTag, typename TSize_> int crop_unmapped(Triple<CharString> & fastqFiles, CharString & matesBam, CharString const & mappingBam, int humanSeqs, TAdapterTag tag, - float as_factor) + float as_factor, + TSize_ qualThresh) { double cov; unsigned totalReads; - return crop_unmapped(cov, totalReads, fastqFiles, matesBam, mappingBam, humanSeqs, tag, as_factor); + return crop_unmapped(cov, totalReads, fastqFiles, matesBam, mappingBam, humanSeqs, tag, as_factor, qualThresh); } #endif // #ifndef NOVINS_CROP_UNMAPPED_H_ diff --git a/src/popins2_crop_unmapped.h b/src/popins2_crop_unmapped.h index 3526cb2..cd8b024 100644 --- a/src/popins2_crop_unmapped.h +++ b/src/popins2_crop_unmapped.h @@ -86,17 +86,17 @@ int popins2_crop_unmapped(int argc, char const ** argv){ // Crop unmapped reads and reads with unreliable mappings from the input bam file. if (options.adapters == "HiSeqX") { - if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, HiSeqXAdapters(), as_factor) != 0) + if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, HiSeqXAdapters(), as_factor, options.min_qual) != 0) return 7; } else if (options.adapters == "HiSeq") { - if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, HiSeqAdapters(), as_factor) != 0) + if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, HiSeqAdapters(), as_factor, options.min_qual) != 0) return 7; } else { - if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, NoAdapters(), as_factor) != 0) + if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, NoAdapters(), as_factor, options.min_qual) != 0) return 7; } -- GitLab