From b55253e6226225fe6421430effc840f60607902f Mon Sep 17 00:00:00 2001
From: Birte Kehr <birte.kehr@bihealth.de>
Date: Thu, 1 Aug 2024 11:55:15 +0200
Subject: [PATCH] Adding option for setting the minimum base qualities of reads
 to be cropped for unmapped assembly.

---
 src/argument_parsing.h      | 11 ++++++++++-
 src/crop_unmapped.h         | 16 +++++++++-------
 src/popins2_crop_unmapped.h |  6 +++---
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/src/argument_parsing.h b/src/argument_parsing.h
index 0114cf4..d2ef294 100644
--- a/src/argument_parsing.h
+++ b/src/argument_parsing.h
@@ -37,6 +37,7 @@ struct CropUnmappedOptions {
     int humanSeqs;
 
     float alignment_score_factor;
+    unsigned min_qual;
 
     bool printSampleInfo;
 
@@ -49,6 +50,7 @@ struct CropUnmappedOptions {
         sampleID(""),
         humanSeqs(maxValue<int>()),
         alignment_score_factor(0.67f),
+	min_qual(20),
         printSampleInfo(true)
     {}
 };
@@ -254,6 +256,8 @@ bool getOptionValues(CropUnmappedOptions & options, ArgumentParser const & parse
         getOptionValue(options.humanSeqs, parser, "filter");
     if (isSet(parser, "alignment-score-factor"))
         getOptionValue(options.alignment_score_factor, parser, "alignment-score-factor");
+    if (isSet(parser, "min-qual"))
+	getOptionValue(options.min_qual, parser, "min-qual");
     if (isSet(parser, "noSampleInfo"))
         options.printSampleInfo = false;
 
@@ -490,6 +494,7 @@ bool getOptionValues(GenotypingOptions &options, seqan::ArgumentParser &parser){
 
 void setHiddenOptions(ArgumentParser & parser, bool hide, CropUnmappedOptions &){
     hideOption(parser, "alignment-score-factor", hide);
+    hideOption(parser, "min-qual", hide);
 }
 
 void setHiddenOptions(seqan::ArgumentParser & /*parser*/, bool /*hide*/, MergeSetMateOptions &){
@@ -582,6 +587,7 @@ void setupParser(ArgumentParser & parser, CropUnmappedOptions & options){
             "as high-quality aligned even if their alignment quality is low. Recommended for non-human reference "
             "sequences after remapping for contamination removal.", ArgParseArgument::INTEGER, "INT"));
     addOption(parser, ArgParseOption("c", "alignment-score-factor", "A read is considered low quality if its alignment score (AS tag) is below FLOAT*read length", seqan::ArgParseArgument::DOUBLE, "FLOAT"));
+    addOption(parser, ArgParseOption("q", "min-qual", "Minimum average quality value in windows for read quality trimming.", seqan::ArgParseArgument::INTEGER, "INT"));
 
     // Set valid and default values.
     setValidValues(parser, "adapters", "HiSeq HiSeqX");
@@ -595,10 +601,13 @@ void setupParser(ArgumentParser & parser, CropUnmappedOptions & options){
     setDefaultValue(parser, "prefix", "\'.\'");
     setDefaultValue(parser, "sample", "retrieval from BAM file header");
     setDefaultValue(parser, "alignment-score-factor", options.alignment_score_factor);
+    setDefaultValue(parser, "min-qual", options.min_qual);
     setDefaultValue(parser, "noSampleInfo", "false");
 
     setMinValue(parser, "alignment-score-factor", "0.0");
     setMaxValue(parser, "alignment-score-factor", "1.0");
+    setMinValue(parser, "min-qual", "0");
+    setMaxValue(parser, "min-qual", "42");
 
     // Hide some options from default help.
     setHiddenOptions(parser, true, options);
@@ -1416,4 +1425,4 @@ seqan::ArgumentParser::ParseResult parseCommandLine(TOptions &options, int argc,
 }
 
 
-#endif /*ARGUMENT_PARSING_H_*/
\ No newline at end of file
+#endif /*ARGUMENT_PARSING_H_*/
diff --git a/src/crop_unmapped.h b/src/crop_unmapped.h
index 9150e5f..e72e257 100644
--- a/src/crop_unmapped.h
+++ b/src/crop_unmapped.h
@@ -349,7 +349,7 @@ findOtherReads(BamFileOut & matesStream,
 // Function crop_unmapped()
 // ==========================================================================
 
-template<typename TAdapterTag>
+template<typename TAdapterTag, typename TSize_>
 int
 crop_unmapped(double & avgCov,
 	unsigned & totalReads,
@@ -358,7 +358,8 @@ crop_unmapped(double & avgCov,
         CharString const & mappingBam,
         int humanSeqs,
         TAdapterTag tag,
-        float as_factor)
+        float as_factor,
+	TSize_ qualThresh)
 {
     typedef __int32 TPos;
     typedef std::map<CharString, Pair<CharString> > TFastqMap; // Reads to go into fastq files.
@@ -427,14 +428,14 @@ crop_unmapped(double & avgCov,
         // Check the read's unmapped flag.
         if (hasFlagUnmapped(record))
         {
-            if (removeLowQuality(record, 20) != 1 && removeAdapter(record, indexUniversal, indexTruSeqs, 30, tag) != 2)
+            if (removeLowQuality(record, qualThresh) != 1 && removeAdapter(record, indexUniversal, indexTruSeqs, 30, tag) != 2)
                 appendFastqRecord(fastqFirstStream, fastqSecondStream, firstReads, secondReads, record);
         }
 
         // Check for low mapping quality.
         else if (hasLowMappingQuality(record, humanSeqs, as_factor))
         {
-            if (removeLowQuality(record, 20) != 1 && removeAdapter(record, indexUniversal, indexTruSeqs, 30, tag) != 2)
+            if (removeLowQuality(record, qualThresh) != 1 && removeAdapter(record, indexUniversal, indexTruSeqs, 30, tag) != 2)
             {
                 if (appendFastqRecord(fastqFirstStream, fastqSecondStream, firstReads, secondReads, record) == 0)
                     otherReads[Pair<TPos>(record.rNextId, record.pNext)] = Pair<CharString, bool>(record.qName, hasFlagFirst(record));
@@ -476,18 +477,19 @@ crop_unmapped(double & avgCov,
 }
 
 
-template<typename TAdapterTag>
+template<typename TAdapterTag, typename TSize_>
 int
 crop_unmapped(Triple<CharString> & fastqFiles,
         CharString & matesBam,
         CharString const & mappingBam,
         int humanSeqs,
         TAdapterTag tag,
-        float as_factor)
+        float as_factor,
+	TSize_ qualThresh)
 {
     double cov;
     unsigned totalReads;
-    return crop_unmapped(cov, totalReads, fastqFiles, matesBam, mappingBam, humanSeqs, tag, as_factor);
+    return crop_unmapped(cov, totalReads, fastqFiles, matesBam, mappingBam, humanSeqs, tag, as_factor, qualThresh);
 }
 
 #endif // #ifndef NOVINS_CROP_UNMAPPED_H_
diff --git a/src/popins2_crop_unmapped.h b/src/popins2_crop_unmapped.h
index 3526cb2..cd8b024 100644
--- a/src/popins2_crop_unmapped.h
+++ b/src/popins2_crop_unmapped.h
@@ -86,17 +86,17 @@ int popins2_crop_unmapped(int argc, char const ** argv){
     // Crop unmapped reads and reads with unreliable mappings from the input bam file.
     if (options.adapters == "HiSeqX")
     {
-        if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, HiSeqXAdapters(), as_factor) != 0)
+        if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, HiSeqXAdapters(), as_factor, options.min_qual) != 0)
             return 7;
     }
     else if (options.adapters == "HiSeq")
     {
-        if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, HiSeqAdapters(), as_factor) != 0)
+        if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, HiSeqAdapters(), as_factor, options.min_qual) != 0)
             return 7;
     }
     else
     {
-        if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, NoAdapters(), as_factor) != 0)
+        if (crop_unmapped(info.avg_cov, info.total_reads, fastqFiles, matesBam, options.mappingFile, options.humanSeqs, NoAdapters(), as_factor, options.min_qual) != 0)
             return 7;
     }
 
-- 
GitLab