From 9f64fc39dabe3d4bf36f0221eee97998db8ff91d Mon Sep 17 00:00:00 2001 From: Birte Kehr <birte.kehr@ukr.de> Date: Fri, 9 Aug 2024 16:39:09 +0200 Subject: [PATCH] Making minimum length requirement of reads in crop_unmapped after base quality trimming a parameter. --- src/argument_parsing.h | 10 +++++++++- src/crop_unmapped.h | 16 +++++++++------- src/popins2_crop_unmapped.h | 6 +++--- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/argument_parsing.h b/src/argument_parsing.h index d2ef294..10ea678 100644 --- a/src/argument_parsing.h +++ b/src/argument_parsing.h @@ -38,6 +38,7 @@ struct CropUnmappedOptions { float alignment_score_factor; unsigned min_qual; + unsigned min_read_len; bool printSampleInfo; @@ -51,6 +52,7 @@ struct CropUnmappedOptions { humanSeqs(maxValue<int>()), alignment_score_factor(0.67f), min_qual(20), + min_read_len(30), printSampleInfo(true) {} }; @@ -257,7 +259,9 @@ bool getOptionValues(CropUnmappedOptions & options, ArgumentParser const & parse if (isSet(parser, "alignment-score-factor")) getOptionValue(options.alignment_score_factor, parser, "alignment-score-factor"); if (isSet(parser, "min-qual")) - getOptionValue(options.min_qual, parser, "min-qual"); + getOptionValue(options.min_qual, parser, "min-qual"); + if (isSet(parser, "min-read-len")) + getOptionValue(options.min_read_len, parser, "min-read-len"); if (isSet(parser, "noSampleInfo")) options.printSampleInfo = false; @@ -495,6 +499,7 @@ bool getOptionValues(GenotypingOptions &options, seqan::ArgumentParser &parser){ void setHiddenOptions(ArgumentParser & parser, bool hide, CropUnmappedOptions &){ hideOption(parser, "alignment-score-factor", hide); hideOption(parser, "min-qual", hide); + hideOption(parser, "min-read-len", hide); } void setHiddenOptions(seqan::ArgumentParser & /*parser*/, bool /*hide*/, MergeSetMateOptions &){ @@ -588,6 +593,7 @@ void setupParser(ArgumentParser & parser, CropUnmappedOptions & options){ "sequences after remapping for contamination removal.", ArgParseArgument::INTEGER, "INT")); addOption(parser, ArgParseOption("c", "alignment-score-factor", "A read is considered low quality if its alignment score (AS tag) is below FLOAT*read length", seqan::ArgParseArgument::DOUBLE, "FLOAT")); addOption(parser, ArgParseOption("q", "min-qual", "Minimum average quality value in windows for read quality trimming.", seqan::ArgParseArgument::INTEGER, "INT")); + addOption(parser, ArgParseOption("l", "min-read-len", "Minimum read length to keep the read after quality trimming.", seqan::ArgParseArgument::INTEGER, "INT")); // Set valid and default values. setValidValues(parser, "adapters", "HiSeq HiSeqX"); @@ -602,12 +608,14 @@ void setupParser(ArgumentParser & parser, CropUnmappedOptions & options){ setDefaultValue(parser, "sample", "retrieval from BAM file header"); setDefaultValue(parser, "alignment-score-factor", options.alignment_score_factor); setDefaultValue(parser, "min-qual", options.min_qual); + setDefaultValue(parser, "min-read-len", options.min_read_len); setDefaultValue(parser, "noSampleInfo", "false"); setMinValue(parser, "alignment-score-factor", "0.0"); setMaxValue(parser, "alignment-score-factor", "1.0"); setMinValue(parser, "min-qual", "0"); setMaxValue(parser, "min-qual", "42"); + setMinValue(parser, "min-read-len", "1"); // Hide some options from default help. setHiddenOptions(parser, true, options); diff --git a/src/crop_unmapped.h b/src/crop_unmapped.h index e72e257..a038892 100644 --- a/src/crop_unmapped.h +++ b/src/crop_unmapped.h @@ -74,7 +74,7 @@ hasLowMappingQuality(BamAlignmentRecord & record, int humanSeqs, float as_factor template<typename TSize_> inline bool -removeLowQuality(BamAlignmentRecord & record, TSize_ qualThresh) +removeLowQuality(BamAlignmentRecord & record, TSize_ qualThresh, TSize_ minReadLen) { typedef Iterator<CharString, Rooted>::Type TIter; typedef Size<CharString>::Type TSize; @@ -129,7 +129,7 @@ removeLowQuality(BamAlignmentRecord & record, TSize_ qualThresh) windowQual += *windowBegin -33; } - if (length(record.seq) < 30) return 1; + if (length(record.seq) < minReadLen) return 1; return 0; } @@ -359,7 +359,8 @@ crop_unmapped(double & avgCov, int humanSeqs, TAdapterTag tag, float as_factor, - TSize_ qualThresh) + TSize_ qualThresh, + TSize_ minReadLen) { typedef __int32 TPos; typedef std::map<CharString, Pair<CharString> > TFastqMap; // Reads to go into fastq files. @@ -428,14 +429,14 @@ crop_unmapped(double & avgCov, // Check the read's unmapped flag. if (hasFlagUnmapped(record)) { - if (removeLowQuality(record, qualThresh) != 1 && removeAdapter(record, indexUniversal, indexTruSeqs, 30, tag) != 2) + if (removeLowQuality(record, qualThresh, minReadLen) != 1 && removeAdapter(record, indexUniversal, indexTruSeqs, 30, tag) != 2) appendFastqRecord(fastqFirstStream, fastqSecondStream, firstReads, secondReads, record); } // Check for low mapping quality. else if (hasLowMappingQuality(record, humanSeqs, as_factor)) { - if (removeLowQuality(record, qualThresh) != 1 && removeAdapter(record, indexUniversal, indexTruSeqs, 30, tag) != 2) + if (removeLowQuality(record, qualThresh, minReadLen) != 1 && removeAdapter(record, indexUniversal, indexTruSeqs, 30, tag) != 2) { if (appendFastqRecord(fastqFirstStream, fastqSecondStream, firstReads, secondReads, record) == 0) otherReads[Pair<TPos>(record.rNextId, record.pNext)] = Pair<CharString, bool>(record.qName, hasFlagFirst(record)); @@ -485,11 +486,12 @@ crop_unmapped(Triple<CharString> & fastqFiles, int humanSeqs, TAdapterTag tag, float as_factor, - TSize_ qualThresh) + TSize_ qualThresh, + TSize_ minReadLen) { double cov; unsigned totalReads; - return crop_unmapped(cov, totalReads, fastqFiles, matesBam, mappingBam, humanSeqs, tag, as_factor, qualThresh); + return crop_unmapped(cov, totalReads, fastqFiles, matesBam, mappingBam, humanSeqs, tag, as_factor, qualThresh, minReadLen); } #endif // #ifndef NOVINS_CROP_UNMAPPED_H_ diff --git a/src/popins2_crop_unmapped.h b/src/popins2_crop_unmapped.h index cd8b024..4409b46 100644 --- a/src/popins2_crop_unmapped.h +++ b/src/popins2_crop_unmapped.h @@ -86,17 +86,17 @@ int popins2_crop_unmapped(int argc, char const ** argv){ // Crop unmapped reads and reads with unreliable mappings from the input bam file. if (options.adapters == "HiSeqX") { - if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, HiSeqXAdapters(), as_factor, options.min_qual) != 0) + if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, HiSeqXAdapters(), as_factor, options.min_qual, options.min_read_len) != 0) return 7; } else if (options.adapters == "HiSeq") { - if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, HiSeqAdapters(), as_factor, options.min_qual) != 0) + if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, HiSeqAdapters(), as_factor, options.min_qual, options.min_read_len) != 0) return 7; } else { - if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, NoAdapters(), as_factor, options.min_qual) != 0) + if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, NoAdapters(), as_factor, options.min_qual, options.min_read_len) != 0) return 7; } -- GitLab