From 811a2af7c2f365fa27c411c250ead6036b09d172 Mon Sep 17 00:00:00 2001 From: Birte Kehr <birte.kehr@bihealth.de> Date: Thu, 18 Aug 2022 18:44:21 +0200 Subject: [PATCH] Cleaning popins4snake code, in particular the argument parser. Introducing a merge-locations command by splitting this task from place-refalign. --- .gitmodules | 3 + external/bifrost | 1 + src/argument_parsing.h | 435 +++++++++++------- src/crop_unmapped.h | 2 +- src/location.h | 2 - src/popins2_crop_unmapped.h | 18 +- src/popins2_find_locations.h | 80 ++++ src/{popins_genotype.h => popins2_genotype.h} | 8 +- src/popins2_merge.h | 2 - src/popins2_merge_and_set_mate.h | 12 +- src/popins2_merge_locations.h | 70 +++ src/{popins_place.h => popins2_place.h} | 49 +- src/popins4snake.cpp | 12 +- src/popins_find_locations.h | 312 ------------- src/variant_caller.h | 2 +- 15 files changed, 447 insertions(+), 561 deletions(-) create mode 100644 .gitmodules create mode 160000 external/bifrost create mode 100644 src/popins2_find_locations.h rename src/{popins_genotype.h => popins2_genotype.h} (98%) create mode 100644 src/popins2_merge_locations.h rename src/{popins_place.h => popins2_place.h} (87%) delete mode 100644 src/popins_find_locations.h diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..84c9b08 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "external/bifrost"] + path = external/bifrost + url = https://github.com/pmelsted/bifrost.git diff --git a/external/bifrost b/external/bifrost new file mode 160000 index 0000000..5594b5e --- /dev/null +++ b/external/bifrost @@ -0,0 +1 @@ +Subproject commit 5594b5e96b7ce8c89ee9361c6c49d43c852b81ae diff --git a/src/argument_parsing.h b/src/argument_parsing.h index bbd2c61..2579147 100644 --- a/src/argument_parsing.h +++ b/src/argument_parsing.h @@ -23,44 +23,30 @@ using namespace std; struct CropUnmappedOptions { - CharString outputFile; + CharString matesBam; CharString pe1; CharString pe2; CharString se; CharString mappingFile; - CharString matepairFile; - CharString referenceFile; CharString prefix; CharString sampleID; - unsigned kmerLength; CharString adapters; int humanSeqs; - unsigned threads; - CharString memory; - float alignment_score_factor; - bool printSampleInfo; - CropUnmappedOptions () : - outputFile("mates.bam"), + matesBam("mates.bam"), pe1("paired.1.fastq"), pe2("paired.2.fastq"), se("single.fastq"), - matepairFile(""), - referenceFile(""), prefix("."), sampleID(""), - kmerLength(47), humanSeqs(maxValue<int>()), - threads(1), - memory("768M"), - alignment_score_factor(0.67f), - printSampleInfo(false) + alignment_score_factor(0.67f) {} }; @@ -68,8 +54,6 @@ struct MergeSetMateOptions { CharString prefix; CharString sampleID; - unsigned nonContigSeqs; - CharString nonRefBam; CharString remappedBam; @@ -78,7 +62,6 @@ struct MergeSetMateOptions { MergeSetMateOptions(): prefix("."), sampleID(""), - nonContigSeqs(0), mergedBam("merged.bam") {} }; @@ -146,15 +129,24 @@ struct FindLocationsOptions { CharString non_ref_bam; int maxInsertSize; - bool deleteNonRefNew; FindLocationsOptions() : prefix("."), sampleID(""), referenceFile("genome.fa"), non_ref_bam("non_ref.bam"), - maxInsertSize(800), - deleteNonRefNew(false) + maxInsertSize(800) + {} +}; + + +struct MergeLocationsOptions { + CharString prefix; + CharString locationsFile; + unsigned maxInsertSize; + + MergeLocationsOptions() : + prefix("."), locationsFile("locations.txt"), maxInsertSize(800) {} }; @@ -240,8 +232,8 @@ bool getOptionValues(CropUnmappedOptions & options, ArgumentParser const & parse getArgumentValue(options.mappingFile, parser, 0); - if (isSet(parser, "output")) - getOptionValue(options.outputFile, parser, "output"); + if (isSet(parser, "mates")) + getOptionValue(options.matesBam, parser, "mates"); if (isSet(parser, "paired1")) getOptionValue(options.pe1, parser, "paired1"); if (isSet(parser, "paired2")) @@ -253,24 +245,12 @@ bool getOptionValues(CropUnmappedOptions & options, ArgumentParser const & parse getOptionValue(options.prefix, parser, "prefix"); if (isSet(parser, "sample")) getOptionValue(options.sampleID, parser, "sample"); - if (isSet(parser, "matePair")) - getOptionValue(options.matepairFile, parser, "matePair"); if (isSet(parser, "adapters")) getOptionValue(options.adapters, parser, "adapters"); - if (isSet(parser, "reference")) - getOptionValue(options.referenceFile, parser, "reference"); if (isSet(parser, "filter")) getOptionValue(options.humanSeqs, parser, "filter"); - if (isSet(parser, "kmerLength")) - getOptionValue(options.kmerLength, parser, "kmerLength"); - if (isSet(parser, "threads")) - getOptionValue(options.threads, parser, "threads"); - if (isSet(parser, "memory")) - getOptionValue(options.memory, parser, "memory"); if (isSet(parser, "alignment-score-factor")) getOptionValue(options.alignment_score_factor, parser, "alignment-score-factor"); - if (isSet(parser, "printSampleInfo")) - options.printSampleInfo = true; return true; } @@ -284,8 +264,6 @@ bool getOptionValues(MergeSetMateOptions & options, ArgumentParser const & parse getOptionValue(options.sampleID, parser, "sample"); if (isSet(parser, "prefix")) getOptionValue(options.prefix, parser, "prefix"); - if (isSet(parser, "non-contig-seqs")) - getOptionValue(options.nonContigSeqs, parser, "non-contig-seqs"); if (isSet(parser, "mergedBam")) getOptionValue(options.mergedBam, parser, "mergedBam"); @@ -377,12 +355,20 @@ bool getOptionValues(FindLocationsOptions &options, seqan::ArgumentParser &parse getOptionValue(options.non_ref_bam, parser, "non-ref"); if (isSet(parser, "maxInsertSize")) getOptionValue(options.maxInsertSize, parser, "maxInsertSize"); - if (isSet(parser, "delNonRefNew")) - options.deleteNonRefNew = true; return true; } +bool getOptionValues(MergeLocationsOptions &options, seqan::ArgumentParser &parser){ + if (isSet(parser, "prefix")) + getOptionValue(options.prefix, parser, "prefix"); + if (isSet(parser, "locations")) + getOptionValue(options.locationsFile, parser, "locations"); + if (isSet(parser, "maxInsertSize")) + getOptionValue(options.maxInsertSize, parser, "maxInsertSize"); + + return true; +} bool getOptionValues(PlacingOptions<RefAlign> &options, seqan::ArgumentParser &parser){ if (isSet(parser, "prefix")) @@ -498,8 +484,6 @@ bool getOptionValues(GenotypingOptions &options, seqan::ArgumentParser &parser){ // ========================= void setHiddenOptions(ArgumentParser & parser, bool hide, CropUnmappedOptions &){ - hideOption(parser, "matePair", hide); - hideOption(parser, "kmerLength", hide); hideOption(parser, "alignment-score-factor", hide); } @@ -521,6 +505,10 @@ void setHiddenOptions(seqan::ArgumentParser & /*parser*/, bool /*hide*/, FindLoc // TODO } +void setHiddenOptions(seqan::ArgumentParser & /*parser*/, bool /*hide*/, MergeLocationsOptions &){ + // Nothing to be done. +} + void setHiddenOptions(seqan::ArgumentParser & parser, bool hide, PlacingOptions<RefAlign> &){ hideOption(parser, "groupDist", hide); @@ -558,43 +546,40 @@ void setHiddenOptions(seqan::ArgumentParser &parser, bool hide, GenotypingOption void setupParser(ArgumentParser & parser, CropUnmappedOptions & options){ - setShortDescription(parser, "Crop unmapped reads."); + setShortDescription(parser, "Extract unmapped and poorly aligned reads from a BAM file."); setVersion(parser, VERSION); setDate(parser, DATE); // Define usage line and long description. addUsageLine(parser, "[\\fIOPTIONS\\fP] \\fIBAM_FILE\\fP"); - addDescription(parser, "Finds reads without high-quality alignment in the \\fIBAM FILE\\fP"); + addDescription(parser, "Extracts reads without high-quality alignment from the \\fIBAM_FILE\\fP. Writes read pairs " + "where both reads in the pair have no high-quality alignment to a pair of FASTQ files. Writes reads " + "without high-quality alignment but with an aligned mate to a single FASTQ file and the aligned mates to a " + "BAM file. All output files are written to a sample directory created in the directory specified with the " + "--prefix option."); // Require a bam file as argument. addArgument(parser, ArgParseArgument(ArgParseArgument::INPUT_FILE, "BAM_FILE")); // Setup the options. addSection(parser, "Input/output options"); - addOption(parser, ArgParseOption("o", "output", "File name for the output BAM file.", ArgParseArgument::OUTPUT_FILE, "BAM FILE")); + addOption(parser, ArgParseOption("p", "prefix", "Path to a working directory for creating sample directories.", ArgParseArgument::STRING, "PATH")); + addOption(parser, ArgParseOption("s", "sample", "An ID for the sample.", ArgParseArgument::STRING, "SAMPLE_ID")); addOption(parser, ArgParseOption("pe1", "paired1", "File name for the first reads file.", ArgParseArgument::INPUT_FILE, "FASTQ FILE")); addOption(parser, ArgParseOption("pe2", "paired2", "File name for the second reads file.", ArgParseArgument::INPUT_FILE, "FASTQ FILE")); addOption(parser, ArgParseOption("se", "single", "File name for the single reads file.", ArgParseArgument::INPUT_FILE, "FASTQ FILE")); - addOption(parser, ArgParseOption("p", "prefix", "Path to the sample directories.", ArgParseArgument::STRING, "PATH")); - addOption(parser, ArgParseOption("s", "sample", "An ID for the sample.", ArgParseArgument::STRING, "SAMPLE_ID")); - addOption(parser, ArgParseOption("mp", "matePair", "(Currently only available for Velvet.)", ArgParseArgument::INPUT_FILE, "BAM FILE")); - addOption(parser, ArgParseOption("i", "printSampleInfo", "Prints sample information sheet if set.")); + addOption(parser, ArgParseOption("m", "mates", "File name for the aligned mates BAM file.", ArgParseArgument::OUTPUT_FILE, "BAM FILE")); addSection(parser, "Algorithm options"); addOption(parser, ArgParseOption("a", "adapters", "Enable adapter removal for Illumina reads. Default: \\fIno adapter removal\\fP.", ArgParseArgument::STRING, "STR")); - addOption(parser, ArgParseOption("r", "reference", "Remap reads to this reference before assembly. Default: \\fIno remapping\\fP.", ArgParseArgument::INPUT_FILE, "FASTA_FILE")); - addOption(parser, ArgParseOption("f", "filter", "Treat reads aligned to all but the first INT reference sequences after remapping as high-quality aligned even if their alignment quality is low. " - "Recommended for non-human reference sequences.", ArgParseArgument::INTEGER, "INT")); - addOption(parser, ArgParseOption("k", "kmerLength", "The k-mer size if the velvet assembler is used.", ArgParseArgument::INTEGER, "INT")); - addOption(parser, ArgParseOption("c", "alignment-score-factor", "A record is considered low quality if the alignment score (AS) is below FLOAT*read length", seqan::ArgParseArgument::DOUBLE, "FLOAT")); - addSection(parser, "Compute resource options"); - addOption(parser, ArgParseOption("t", "threads", "Number of threads to use for BWA and samtools sort.", ArgParseArgument::INTEGER, "INT")); - addOption(parser, ArgParseOption("m", "memory", "Maximum memory per thread for samtools sort; suffix K/M/G recognized.", ArgParseArgument::STRING, "STR")); + addOption(parser, ArgParseOption("f", "filter", "Treat reads aligned to all but the first INT reference sequences " + "as high-quality aligned even if their alignment quality is low. Recommended for non-human reference " + "sequences after remapping for contamination removal.", ArgParseArgument::INTEGER, "INT")); + addOption(parser, ArgParseOption("c", "alignment-score-factor", "A read is considered low quality if its alignment score (AS tag) is below FLOAT*read length", seqan::ArgParseArgument::DOUBLE, "FLOAT")); // Set valid and default values. setValidValues(parser, "adapters", "HiSeq HiSeqX"); - setValidValues(parser, "reference", "fa fna fasta gz"); - setDefaultValue(parser, "output", options.outputFile); + setDefaultValue(parser, "mates", options.matesBam); setDefaultValue(parser, "paired1", options.pe1); setValidValues(parser, "paired1", "fq fastq gz"); setDefaultValue(parser, "paired2", options.pe2); @@ -603,13 +588,8 @@ void setupParser(ArgumentParser & parser, CropUnmappedOptions & options){ setValidValues(parser, "single", "fq fastq gz"); setDefaultValue(parser, "prefix", "\'.\'"); setDefaultValue(parser, "sample", "retrieval from BAM file header"); - setDefaultValue(parser, "kmerLength", options.kmerLength); - setDefaultValue(parser, "threads", options.threads); - setDefaultValue(parser, "memory", options.memory); setDefaultValue(parser, "alignment-score-factor", options.alignment_score_factor); - setDefaultValue(parser, "printSampleInfo", "false"); - setMinValue(parser, "threads", "1"); setMinValue(parser, "alignment-score-factor", "0.0"); setMaxValue(parser, "alignment-score-factor", "1.0"); @@ -617,58 +597,59 @@ void setupParser(ArgumentParser & parser, CropUnmappedOptions & options){ setHiddenOptions(parser, true, options); } - void setupParser(ArgumentParser & parser, MergeSetMateOptions & options){ - setShortDescription(parser, "Find the other reads in pairs for remapped reads."); // TODO: are the "other reads" the "mates"? + setShortDescription(parser, "Merges two name-sorted BAM files of the same sample."); setVersion(parser, VERSION); setDate(parser, DATE); // Define usage line and long description. - addUsageLine(parser, "[\\fIOPTIONS\\fP] \\fINONREF_BAM\\fP \\fIREMAPPED_BAM\\fP"); // TODO: check after parser is done - addDescription(parser, "tbd."); + addUsageLine(parser, "[\\fIOPTIONS\\fP] \\fIBAM_FILE_1\\fP \\fIBAM_FILE_2\\fP"); + addDescription(parser, "Merges two BAM files named PATH/SAMPLE_ID/\\fIBAM_FILE_1\\fP and " + "PATH/SAMPLE_ID/\\fIBAM_FILE_2\\fP originating from the same sample, where PATH and SAMPLE_ID can be " + "configered with the options --prefix and --sample. Adds information about the other read in pair for " + "those pairs that were separated in the two input files. For example, the two BAM files can be a file " + "containing the aligned mates of unaligned reads and another file containing the unaligned reads after " + "alignment to supercontigs. "); // Required arguments. - addArgument(parser, ArgParseArgument(ArgParseArgument::INPUT_FILE, "NONREF_BAM")); - addArgument(parser, ArgParseArgument(ArgParseArgument::INPUT_FILE, "REMAPPED_BAM")); + addArgument(parser, ArgParseArgument(ArgParseArgument::INPUT_FILE, "BAM_FILE_1")); + addArgument(parser, ArgParseArgument(ArgParseArgument::INPUT_FILE, "BAM_FILE_2")); // Setup the options. addSection(parser, "Input/output options"); addOption(parser, ArgParseOption("p", "prefix", "Path to the sample directories.", ArgParseArgument::STRING, "PATH")); addOption(parser, ArgParseOption("s", "sample", "An ID for the sample.", ArgParseArgument::STRING, "SAMPLE_ID")); - addOption(parser, ArgParseOption("o", "mergedBam", "Output BAMFILE name.", ArgParseArgument::STRING, "BAM FILE")); - - addSection(parser, "Algorithm options"); - addOption(parser, ArgParseOption("n", "non-contig-seqs", "Magic INT .", ArgParseArgument::INTEGER, "INT")); - + addOption(parser, ArgParseOption("o", "mergedBam", "Output BAM file name.", ArgParseArgument::STRING, "BAM FILE")); // Set valid and default values. setDefaultValue(parser, "prefix", "\'.\'"); - setDefaultValue(parser, "sample", "retrieval from NONREF_BAM file header"); + setDefaultValue(parser, "sample", "retrieval from BAM_FILE_1 file header"); setDefaultValue(parser, "mergedBam", options.mergedBam); - setDefaultValue(parser, "non-contig-seqs", options.nonContigSeqs); // Setup hidden options setHiddenOptions(parser, true, options); } - - /** * Function to handle the input parsing for the popins2 merge module * @param parser is a seqan argument parser instance * @param options is a struct to store the input arguments for the merge module */ void setupParser(seqan::ArgumentParser &parser, MergeOptions &options){ - // Setup meta-information - seqan::setShortDescription(parser, "Build or read a colored and compacted de Bruijn Graph (CCDBG) and generate supercontigs."); + seqan::setShortDescription(parser, "Merges sets of contigs into supercontigs using a colored compacted de Bruijn Graph."); seqan::setVersion(parser, VERSION); seqan::setDate(parser, DATE); - seqan::addUsageLine(parser, "\\--input-{seq|ref}-files DIR or --input-graph-file GFA --input-colors-file BFG_COLORS [OPTIONS] \\fP "); + + // Define usage line and long description. + seqan::addUsageLine(parser, "\\--input-{seq|ref}-files DIR or --input-graph-file GFA --input-colors-file BFG_COLORS [OPTIONS]\\fP "); + addDescription(parser, "Reads assembled contigs from all sample directories, builds and simplifies a colored " + "compacted de Bruijn graph (using Bifrost), and writes a FASTA file of supercontigs inferred from graph " + "traversal. Options largely match those of Bifrost."); // Setup options seqan::addSection(parser, "I/O options"); - seqan::addOption(parser, seqan::ArgParseOption("s", "input-seq-files", "Path to the sample directories.", seqan::ArgParseArgument::STRING, "DIR")); - seqan::addOption(parser, seqan::ArgParseOption("r", "input-ref-files", "Path to the sample directories. (no abundance filter)", seqan::ArgParseArgument::STRING, "DIR")); + seqan::addOption(parser, seqan::ArgParseOption("s", "input-seq-files", "Path to the sample directories (k-mers with exactly 1 occurence are discarded).", seqan::ArgParseArgument::STRING, "DIR")); + seqan::addOption(parser, seqan::ArgParseOption("r", "input-ref-files", "Path to the sample directories (no k-mer abundance filter).", seqan::ArgParseArgument::STRING, "DIR")); seqan::addOption(parser, seqan::ArgParseOption("y", "input-graph-file", "Source file with dBG", seqan::ArgParseArgument::STRING, "GFA")); seqan::addOption(parser, seqan::ArgParseOption("z", "input-colors-file", "Source file with dBG colors", seqan::ArgParseArgument::STRING, "BFG_COLORS")); seqan::addOption(parser, seqan::ArgParseOption("p", "outputfile-prefix", "Specify a prefix for the output files.", seqan::ArgParseArgument::STRING, "STRING")); @@ -711,15 +692,19 @@ void setupParser(seqan::ArgumentParser &parser, MergeOptions &options){ setHiddenOptions(parser, true, options); } - void setupParser(seqan::ArgumentParser &parser, FindLocationsOptions &options){ - setShortDescription(parser, "Find insertion locations per sample."); + setShortDescription(parser, "Finds insertion locations of (super-)contigs per sample."); setVersion(parser, VERSION); setDate(parser, DATE); // Define usage line and long description. addUsageLine(parser, "[\\fIOPTIONS\\fP] \\fISAMPLE_ID\\fP"); - addDescription(parser, "todo."); + addDescription(parser, "Anchoring read pairs found in the file \\fInon_ref_new.bam\\fP in the directory of the " + "sample \\fISAMPLE_ID\\fP are iterated and clustered (if they are within a distance specified using the " + "--maxInsertSize option) into candidate insertion locations for the (super-)contigs. The candidate " + "locations are sorted by (super-)contig, scored with an anchoring score and written to " + "\\fIlocations.txt\\fP in the sample directory. Expects the two files \\fInon_ref_new.bam\\fP and " + "\\fInon_ref.bam\\fP to be present in the sample directory."); addArgument(parser, ArgParseArgument(ArgParseArgument::STRING, "SAMPLE_ID")); @@ -727,11 +712,10 @@ void setupParser(seqan::ArgumentParser &parser, FindLocationsOptions &options){ addSection(parser, "Input/output options"); addOption(parser, ArgParseOption("p", "prefix", "Path to the sample directories.", ArgParseArgument::STRING, "PATH")); addOption(parser, ArgParseOption("r", "reference", "Name of reference genome file.", ArgParseArgument::INPUT_FILE, "FASTA_FILE")); - addOption(parser, ArgParseOption("n", "non-ref", "Non-reference reads alignment file name.", ArgParseArgument::INPUT_FILE, "BAM FILE")); + addOption(parser, ArgParseOption("n", "non-ref", "Name of file containing aligned mates of unaligned reads.", ArgParseArgument::INPUT_FILE, "BAM FILE")); addSection(parser, "Algorithm options"); addOption(parser, ArgParseOption("e", "maxInsertSize", "The maximum expected insert size of the read pairs.", ArgParseArgument::INTEGER, "INT")); - addOption(parser, ArgParseOption("d", "delNonRefNew", "Delete the non_ref_new.bam file after writing locations.")); // Set valid values. setValidValues(parser, "reference", "fa fna fasta"); @@ -741,36 +725,68 @@ void setupParser(seqan::ArgumentParser &parser, FindLocationsOptions &options){ setDefaultValue(parser, "reference", options.referenceFile); setDefaultValue(parser, "non-ref", options.non_ref_bam); setDefaultValue(parser, "maxInsertSize", options.maxInsertSize); - setDefaultValue(parser, "delNonRefNew", "false"); // Hide some options from default help. setHiddenOptions(parser, true, options); } -void setupParser(seqan::ArgumentParser &parser, PlacingOptions<RefAlign> &options){ - setShortDescription(parser, "Contig placing by alignment of contig ends to reference genome."); +void setupParser(seqan::ArgumentParser &parser, MergeLocationsOptions &options){ + setShortDescription(parser, "Merges insertion locations from all samples into one file."); setVersion(parser, VERSION); setDate(parser, DATE); addUsageLine(parser, "[\\fIOPTIONS\\fP]"); - addDescription(parser, "This is step 1/3 of contig placing. The contig locations in the sample directories are " - "merged into one file of locations. Next, prefixes/suffixes of contigs are aligned to the merged locations " - "on the reference genome and VCF records are written if the alignment is successful. Locations of contigs " - "that do not align to the reference genome are written to additional output files \\fIlocations_unplaced.txt\\fP " - "in the sample directories. Further, groups of contigs that can be placed at the same position and whose " - "prefixes/suffixes align to each other are written to another output file; only a single VCF record is " - "written per group."); + addDescription(parser, "The \\fIlocations.txt\\fP in all sample directories are merged into one file of locations. " + "The anchoring score is recomputed from all samples together. If the number of samples exceeds 500, this " + "step is performed in two passes, first in batches of 500 each writing a temporary file and then combining " + "the temporary files of all batches into the final output file. The maximum number of samples currently " + "supported is 250,000."); // Setup the options. addSection(parser, "Input/output options"); addOption(parser, ArgParseOption("p", "prefix", "Path to the sample directories.", ArgParseArgument::STRING, "PATH")); addOption(parser, ArgParseOption("l", "locations", "Name of merged locations file.", ArgParseArgument::OUTPUT_FILE, "FILE")); + + addSection(parser, "Algorithm options"); + addOption(parser, ArgParseOption("", "maxInsertSize", "The maximum expected insert size of the read pairs.", ArgParseArgument::INTEGER, "INT")); + + // Set default values. + setDefaultValue(parser, "prefix", "\'.\'"); + setDefaultValue(parser, "locations", options.locationsFile); + setDefaultValue(parser, "maxInsertSize", options.maxInsertSize); + + // Hide some options from default help. + setHiddenOptions(parser, true, options); +} + +void setupParser(seqan::ArgumentParser &parser, PlacingOptions<RefAlign> &options){ + setShortDescription(parser, "Finds positions of (super-)contigs by aligning contig ends to the reference genome."); + setVersion(parser, VERSION); + setDate(parser, DATE); + + addUsageLine(parser, "[\\fIOPTIONS\\fP]"); + + addDescription(parser, "This is step 1/3 of contig placing. Aligns prefixes/suffixes of (super-)contigs to the " + "locations on the reference genome given in the merged locations file specified using the --locations " + "option. It skips locations that lie on OTHER chromosomes, exceed a maximum length, are supported by too " + "few anchoring read pairs or by a too low anchoring score. If the alignment is successful, VCF records are " + "written to the insertions output file specified using the --insertions option. Locations of contigs that " + "do not align to the reference genome are written to additional output files " + "\\fIlocations_unplaced.txt\\fP in the sample directories. Further, groups of contigs that can be placed " + "at the same position and whose prefixes/suffixes align to each other are written to an output specified " + "using the --groups option; only a single VCF record is written per group."); + + // Setup the options. + addSection(parser, "Input/output options"); + addOption(parser, ArgParseOption("p", "prefix", "Path to the sample directories.", ArgParseArgument::STRING, "PATH")); addOption(parser, ArgParseOption("i", "insertions", "Name of VCF output file.", ArgParseArgument::OUTPUT_FILE, "VCF_FILE")); + addOption(parser, ArgParseOption("l", "locations", "Name of merged locations file.", ArgParseArgument::INPUT_FILE, "FILE")); addOption(parser, ArgParseOption("c", "contigs", "Name of supercontigs file.", ArgParseArgument::INPUT_FILE, "FASTA_FILE")); addOption(parser, ArgParseOption("r", "reference", "Name of reference genome file.", ArgParseArgument::INPUT_FILE, "FASTA_FILE")); - addOption(parser, ArgParseOption("g", "groups", "Name of file containing groups of contigs that can be placed at the same position and whose prefixes/suffixes align.", ArgParseArgument::OUTPUT_FILE, "FILE")); + addOption(parser, ArgParseOption("g", "groups", "Name of file containing groups of contigs that can be placed at " + "the same position and whose prefixes/suffixes align.", ArgParseArgument::OUTPUT_FILE, "FILE")); addSection(parser, "Algorithm options"); addOption(parser, ArgParseOption("", "minScore", "Minimal anchoring score for a location.", ArgParseArgument::DOUBLE, "FLOAT")); @@ -806,15 +822,15 @@ void setupParser(seqan::ArgumentParser &parser, PlacingOptions<RefAlign> &option void setupParser(seqan::ArgumentParser &parser, PlacingOptions<SplitAlign> &options){ - setShortDescription(parser, "Contig placing by split-read alignment."); + setShortDescription(parser, "Find positions of (super-)contigs by split-read alignment (per sample)."); setVersion(parser, VERSION); setDate(parser, DATE); addUsageLine(parser, "[\\fIOPTIONS\\fP] \\fISAMPLE_ID\\fP"); - addDescription(parser, "This is step 2/3 of contig placing. All locations in a sample's " - "\\fIlocations_unplaced.txt\\fP are split-read aligned and the results are written to a file " - "\\fIlocations_placed.txt\\fP in the sample directory."); + addDescription(parser, "This is step 2/3 of contig placing. All locations in the file " + "\\fIlocations_unplaced.txt\\fP in the directory of the sample \\fISAMPLE_ID\\fP are split-read aligned " + "and the results are written to a file \\fIlocations_placed.txt\\fP in the sample directory."); addArgument(parser, ArgParseArgument(ArgParseArgument::STRING, "SAMPLE_ID")); @@ -846,14 +862,15 @@ void setupParser(seqan::ArgumentParser &parser, PlacingOptions<SplitAlign> &opti void setupParser(seqan::ArgumentParser &parser, PlacingOptions<SplitCombine> &options){ - setShortDescription(parser, "Combining breakpoint positions from split-read alignment."); + setShortDescription(parser, "Combines (super-)contig positions found by split-read alignment from all samples."); setVersion(parser, VERSION); setDate(parser, DATE); addUsageLine(parser, "[\\fIOPTIONS\\fP]"); addDescription(parser, "This is step 3/3 of contig placing. The results from split-read alignment (the " - "\\fIlocations_placed.txt\\fP files) of all samples are combined and appended to the VCF output file."); + "\\fIlocations_placed.txt\\fP files) of all samples are combined and appended to the VCF output file that " + "can be specified using the --insertions option."); // Setup the options. addSection(parser, "Input/output options"); @@ -874,18 +891,20 @@ void setupParser(seqan::ArgumentParser &parser, PlacingOptions<SplitCombine> &op setHiddenOptions(parser, true, options); } - void setupParser(seqan::ArgumentParser &parser, GenotypingOptions &options){ - setShortDescription(parser, "Genotyping a sample for insertions."); + setShortDescription(parser, "Determines genotypes of all insertions in a sample."); setVersion(parser, VERSION); setDate(parser, DATE); addUsageLine(parser, "[\\fIOPTIONS\\fP] \\fISAMPLE_ID\\fP"); - addDescription(parser, "Computes genotype likelihoods for a sample for all insertions given in the input VCF file " - "by aligning all reads, which are mapped to the reference genome around the insertion breakpoint or to the " - "contig, to the reference and to the alternative insertion sequence. VCF records with the genotype " - "likelihoods in GT:PL format for the individual are written to a file \\fIinsertions.vcf\\fP in the sample " - "directory."); + addDescription(parser, "Computes genotype likelihoods for the sample \\fISAMPLE_ID\\fP for all insertions given in " + "the input VCF file specified using the --insertions option. Aligns all reads, which are mapped to the " + "reference genome around the insertion breakpoint or to the contig, to the reference and to the " + "alternative insertion sequence. VCF records with the genotype likelihoods in GT:PL format for the " + "individual are written to a file \\fIinsertions.vcf\\fP in the sample directory. Expects the two files " + "\\fIPOPINS_SAMPLE_INFO\\fP (written by the crop-unmapped command) and \\fInon_ref_new.bam\\fP to be " + "present in the sample directory and the original BAM file specified in \\fIPOPINS_SAMPLE_INFO\\fP to " + "exist."); addArgument(parser, ArgParseArgument(ArgParseArgument::STRING, "SAMPLE_ID")); @@ -917,13 +936,13 @@ void setupParser(seqan::ArgumentParser &parser, GenotypingOptions &options){ // Misc hidden options. addOption(parser, ArgParseOption("v", "verbose", "Enable verbose output.")); - hideOption(parser, "verbose", true); + hideOption(parser, "verbose", true); addOption(parser, ArgParseOption("", "callBoth", "Call both models.")); - hideOption(parser, "callBoth", true); + hideOption(parser, "callBoth", true); addOption(parser, ArgParseOption("", "readCounts", "Use read counts.")); - hideOption(parser, "readCounts", true); + hideOption(parser, "readCounts", true); addOption(parser, ArgParseOption("", "fullOverlap", "Full overlap of read.")); - hideOption(parser, "fullOverlap", true); + hideOption(parser, "fullOverlap", true); // Set valid values. setValidValues(parser, "contigs", "fa fna fasta"); @@ -983,12 +1002,6 @@ ArgumentParser::ParseResult checkInput(CropUnmappedOptions & options){ res = ArgumentParser::PARSE_ERROR; } - if (options.matepairFile != "" && !exists(options.matepairFile)) - { - std::cerr << "ERROR: Input BAM file \'" << options.matepairFile << "\' does not exist." << std::endl; - res = ArgumentParser::PARSE_ERROR; - } - return res; } @@ -1077,15 +1090,24 @@ ArgumentParser::ParseResult checkInput(FindLocationsOptions &options){ res = ArgumentParser::PARSE_ERROR; } - //if (!exists(options.non_ref_bam)){ - // std::cerr << "ERROR: Non-reference reads input file \'" << options.non_ref_bam << "\' does not exist." << std::endl; - // res = ArgumentParser::PARSE_ERROR; - //} - return res; } +ArgumentParser::ParseResult checkInput(MergeLocationsOptions &options){ + + ArgumentParser::ParseResult res = ArgumentParser::PARSE_OK; + + if (options.prefix != "." && !exists(options.prefix)) + { + std::cerr << "ERROR: Path to sample directories \'" << options.prefix << "\' does not exist." << std::endl; + res = ArgumentParser::PARSE_ERROR; + } + + return res; +} + + ArgumentParser::ParseResult checkInput(PlacingOptions<RefAlign> &options){ ArgumentParser::ParseResult res = ArgumentParser::PARSE_OK; @@ -1163,7 +1185,7 @@ ArgumentParser::ParseResult checkInput(GenotypingOptions &options){ ArgumentParser::ParseResult res = ArgumentParser::PARSE_OK; if (options.prefix != "." && !exists(options.prefix)) - { + { std::cerr << "ERROR: Path to sample directories \'" << options.prefix << "\' does not exist." << std::endl; res = ArgumentParser::PARSE_ERROR; } @@ -1195,20 +1217,23 @@ ArgumentParser::ParseResult checkInput(GenotypingOptions &options){ // ========================= void printHelp(char const * name){ - std::cerr << "Population-scale detection of non-reference sequence insertions using colored de Bruijn Graphs" << std::endl; - std::cerr << "================================================================" << std::endl; + std::cerr << "=====================================================================" << std::endl; + std::cerr << "A modularized version of the program PopIns2" << std::endl; + std::cerr << " for population-scale detection of non-reference sequence variants" << std::endl; + std::cerr << "=====================================================================" << std::endl; std::cerr << std::endl; std::cerr << "\033[1mSYNOPSIS\033[0m" << std::endl; std::cerr << " \033[1m" << name << " COMMAND\033[0m [\033[4mOPTIONS\033[0m]" << std::endl; std::cerr << std::endl; std::cerr << "\033[1mCOMMAND\033[0m" << std::endl; - std::cerr << " \033[1mcrop-unmapped\033[0m Clip unmapped and poorly aligned reads from a sample." << std::endl; - std::cerr << " \033[1mmerge-set-mate\033[0m Merge and mate poorly aligned reads into contigs." << std::endl; - std::cerr << " \033[1mmerge\033[0m Generate supercontigs from a colored compacted de Bruijn Graph." << std::endl; - std::cerr << " \033[1mfind-locations\033[0m Find insertion locations per individual." << std::endl; - std::cerr << " \033[1mplace-refalign\033[0m Find position of (super-)contigs by aligning contig ends to the reference genome." << std::endl; - std::cerr << " \033[1mplace-splitalign\033[0m Find position of (super-)contigs by split-read alignment (per sample)." << std::endl; - std::cerr << " \033[1mplace-finish\033[0m Combine position found by split-read alignment from all samples." << std::endl; + std::cerr << " \033[1mcrop-unmapped\033[0m Extract unmapped and poorly aligned reads from a BAM file." << std::endl; + std::cerr << " \033[1mmerge-bams\033[0m Merge two name-sorted BAM files of the same sample and set mate information of now paired reads." << std::endl; + std::cerr << " \033[1mmerge-contigs\033[0m Merge sets of contigs into supercontigs using a colored compacted de Bruijn Graph." << std::endl; + std::cerr << " \033[1mfind-locations\033[0m Find insertion locations of (super-)contigs per sample." << std::endl; + std::cerr << " \033[1mmerge-locations\033[0m Merge insertion locations from all samples into one file." << std::endl; + std::cerr << " \033[1mplace-refalign\033[0m Find positions of (super-)contigs by aligning contig ends to the reference genome." << std::endl; + std::cerr << " \033[1mplace-splitalign\033[0m Find positions of (super-)contigs by split-read alignment (per sample)." << std::endl; + std::cerr << " \033[1mplace-finish\033[0m Combine (super-)contig positions found by split-read alignment from all samples." << std::endl; std::cerr << " \033[1mgenotype\033[0m Determine genotypes of all insertions in a sample." << std::endl; std::cerr << std::endl; std::cerr << "\033[1mVERSION\033[0m" << std::endl; @@ -1219,42 +1244,116 @@ void printHelp(char const * name){ } -void printMergeOptions(const MergeOptions &options){ - cout << "=========================================================" << endl; - cout << "popins2 version : " << VERSION << endl; - cout << "PARAMETER ======== : VALUE ==============================" << endl; - cout << "verbose : " << options.verbose << endl; - cout << "threads : " << options.nb_threads << endl; - cout << "#filename_seq_in : " << options.filename_seq_in.size() << endl; - cout << "#filename_ref_in : " << options.filename_ref_in.size() << endl; - cout << "filename_graph_in : " << options.filename_graph_in << endl; - cout << "filename_colors_in : " << options.filename_colors_in << endl; - cout << "k : " << options.k << endl; - cout << "g : " << options.g << endl; - cout << "clip-tips : " << options.clipTips << endl; - cout << "delete-isolated : " << options.deleteIsolated << endl; - cout << "mercy-kmers : " << options.useMercyKmers << endl; - cout << "outputfile-prefix : " << options.prefixFilenameOut << endl; - cout << "setcover-min-kmers : " << options.setcover_min_kmers << endl; - cout << "min-entropy : " << options.min_entropy << endl; - cout << "write-setcover : " << options.write_setcover << endl; - cout << "write-lecc : " << options.write_lecc << endl; - cout << "=========================================================" << endl; +void printOptions(const CropUnmappedOptions &/*options*/){ + // TODO + //cout << "==========================================================" << endl; + //cout << "popins4snake version: " << VERSION << endl; + //cout << "PARAMETER ========= : VALUE ==============================" << endl; + //cout << "prefix : " << options.prefix << endl; + /* + CharString matesBam; + CharString pe1; + CharString pe2; + CharString se; + CharString mappingFile; + CharString prefix; + CharString sampleID; + CharString adapters; + int humanSeqs; + float alignment_score_factor; + */ + //cout << "==========================================================" << endl; +} + +void printOptions(const MergeSetMateOptions &/*options*/){ + // TODO + //cout << "==========================================================" << endl; + //cout << "popins4snake version: " << VERSION << endl; + //cout << "PARAMETER ========= : VALUE ==============================" << endl; + //cout << "prefix : " << options.prefix << endl; + //cout << "==========================================================" << endl; } -void printFindLocationsOptions(const FindLocationsOptions &options){ +void printOptions(const MergeOptions &options){ cout << "==========================================================" << endl; - cout << "popins version : " << VERSION << endl; + cout << "popins4snake version: " << VERSION << endl; + cout << "PARAMETER ========= : VALUE ==============================" << endl; + cout << "verbose : " << options.verbose << endl; + cout << "threads : " << options.nb_threads << endl; + cout << "#filename_seq_in : " << options.filename_seq_in.size() << endl; + cout << "#filename_ref_in : " << options.filename_ref_in.size() << endl; + cout << "filename_graph_in : " << options.filename_graph_in << endl; + cout << "filename_colors_in : " << options.filename_colors_in << endl; + cout << "k : " << options.k << endl; + cout << "g : " << options.g << endl; + cout << "clip-tips : " << options.clipTips << endl; + cout << "delete-isolated : " << options.deleteIsolated << endl; + cout << "mercy-kmers : " << options.useMercyKmers << endl; + cout << "outputfile-prefix : " << options.prefixFilenameOut << endl; + cout << "setcover-min-kmers : " << options.setcover_min_kmers << endl; + cout << "min-entropy : " << options.min_entropy << endl; + cout << "write-setcover : " << options.write_setcover << endl; + cout << "write-lecc : " << options.write_lecc << endl; + cout << "==========================================================" << endl; +} + +void printOptions(const FindLocationsOptions &options){ + cout << "==========================================================" << endl; + cout << "popins4snake version: " << VERSION << endl; cout << "PARAMETER ========= : VALUE ==============================" << endl; cout << "prefix : " << options.prefix << endl; cout << "sample ID : " << options.sampleID << endl; cout << "reference file : " << options.referenceFile << endl; cout << "non-reference reads : " << options.non_ref_bam << endl; cout << "max insert size : " << options.maxInsertSize << endl; - cout << "delete non_ref_new : " << options.deleteNonRefNew << endl; cout << "==========================================================" << endl; } +void printOptions(const MergeLocationsOptions &/*options*/){ + // TODO + //cout << "==========================================================" << endl; + //cout << "popins4snake version: " << VERSION << endl; + //cout << "PARAMETER ========= : VALUE ==============================" << endl; + //cout << "prefix : " << options.prefix << endl; + //cout << "==========================================================" << endl; +} + +void printOptions(const PlacingOptions<RefAlign> &/*options*/){ + // TODO + //cout << "==========================================================" << endl; + //cout << "popins4snake version: " << VERSION << endl; + //cout << "PARAMETER ========= : VALUE ==============================" << endl; + //cout << "prefix : " << options.prefix << endl; + //cout << "==========================================================" << endl; +} + +void printOptions(const PlacingOptions<SplitAlign> &/*options*/){ + // TODO + //cout << "==========================================================" << endl; + //cout << "popins4snake version: " << VERSION << endl; + //cout << "PARAMETER ========= : VALUE ==============================" << endl; + //cout << "prefix : " << options.prefix << endl; + //cout << "==========================================================" << endl; +} + +void printOptions(const PlacingOptions<SplitCombine> &/*options*/){ + // TODO + //cout << "==========================================================" << endl; + //cout << "popins4snake version: " << VERSION << endl; + //cout << "PARAMETER ========= : VALUE ==============================" << endl; + //cout << "prefix : " << options.prefix << endl; + //cout << "==========================================================" << endl; +} + +void printOptions(const GenotypingOptions &/*options*/){ + // TODO + //cout << "==========================================================" << endl; + //cout << "popins4snake version: " << VERSION << endl; + //cout << "PARAMETER ========= : VALUE ==============================" << endl; + //cout << "prefix : " << options.prefix << endl; + //cout << "==========================================================" << endl; +} + // ========================================================================== // Function parseCommandLine() @@ -1303,13 +1402,11 @@ seqan::ArgumentParser::ParseResult parseCommandLine(TOptions &options, int argc, // Check if input files exist. res = checkInput(options); + // Print all option values. + printOptions(options); + return res; } - - - - -#endif /*ARGUMENT_PARSING_H_*/ - +#endif /*ARGUMENT_PARSING_H_*/ \ No newline at end of file diff --git a/src/crop_unmapped.h b/src/crop_unmapped.h index 2a66f30..eea8f3e 100644 --- a/src/crop_unmapped.h +++ b/src/crop_unmapped.h @@ -464,7 +464,7 @@ crop_unmapped(double & avgCov, if (found == -1) return 1; msg.str(""); - msg << "Mapped mates of unmapped reads written to " << matesBam << " , " << found << " found in second pass."; + msg << "Mapped mates of unmapped reads written to " << matesBam << ", " << found << " found in second pass."; printStatus(msg); return 0; diff --git a/src/location.h b/src/location.h index f7904a3..a62bba6 100644 --- a/src/location.h +++ b/src/location.h @@ -565,8 +565,6 @@ findLocations(String<Location> & locations, CharString & nonRefFile, std::set<Ch BamFileIn inStream(toCString(nonRefFile)); -std::cout << "nonContigSeqs=" << nonContigSeqs << std::endl; - // Read the header and clear it since we don't need it. BamHeader header; readHeader(header, inStream); diff --git a/src/popins2_crop_unmapped.h b/src/popins2_crop_unmapped.h index c972551..4d64020 100644 --- a/src/popins2_crop_unmapped.h +++ b/src/popins2_crop_unmapped.h @@ -73,11 +73,7 @@ int popins2_crop_unmapped(int argc, char const ** argv){ float as_factor = options.alignment_score_factor; - //CharString matesBam = getFileName(workingDirectory, "mates.bam"); - CharString matesBam = getFileName(workingDirectory, options.outputFile); - //CharString fastqFirst = getFileName(workingDirectory, "paired.1.fastq"); - //CharString fastqSecond = getFileName(workingDirectory, "paired.2.fastq"); - //CharString fastqSingle = getFileName(workingDirectory, "single.fastq"); + CharString matesBam = getFileName(workingDirectory, options.matesBam); CharString fastqFirst = getFileName(workingDirectory, options.pe1); CharString fastqSecond = getFileName(workingDirectory, options.pe2); CharString fastqSingle = getFileName(workingDirectory, options.se); @@ -110,14 +106,12 @@ int popins2_crop_unmapped(int argc, char const ** argv){ return 7; } - if(options.printSampleInfo){ - CharString sampleInfoFile = getFileName(workingDirectory, "POPINS_SAMPLE_INFO"); - writeSampleInfo(info, sampleInfoFile); + CharString sampleInfoFile = getFileName(workingDirectory, "POPINS_SAMPLE_INFO"); + writeSampleInfo(info, sampleInfoFile); - msg.str(""); - msg << "Sample info written to \'" << sampleInfoFile << "\'."; - printStatus(msg); - } + msg.str(""); + msg << "Sample info written to \'" << sampleInfoFile << "\'."; + printStatus(msg); } diff --git a/src/popins2_find_locations.h b/src/popins2_find_locations.h new file mode 100644 index 0000000..ffd630e --- /dev/null +++ b/src/popins2_find_locations.h @@ -0,0 +1,80 @@ +#ifndef POPINS_FIND_LOCATIONS_H_ +#define POPINS_FIND_LOCATIONS_H_ + +#include <sstream> + +#include <seqan/file.h> +#include <seqan/sequence.h> +#include <seqan/bam_io.h> + +#include "util.h" +#include "argument_parsing.h" +#include "location.h" + +using namespace seqan; + +// ========================================================================== +// Function popins_find_locations() +// ========================================================================== + +int popins_find_locations(int argc, char const ** argv){ + + // Parse the command line to get option values. + FindLocationsOptions options; + ArgumentParser::ParseResult res = parseCommandLine(options, argc, argv); + if (res != ArgumentParser::PARSE_OK) + return res; + + CharString workingDirectory = getFileName(options.prefix, options.sampleID); + + // Check for input files to exist. + CharString nonRefBam = getFileName(workingDirectory, options.non_ref_bam); + CharString nonRefNew = getFileName(workingDirectory, "non_ref_new.bam"); + CharString locationsFile = getFileName(workingDirectory, "locations.txt"); + + if ( !exists(nonRefNew) || !exists(nonRefBam) ){ + std::cerr << "ERROR: Could not find all input files:"; + std::cerr << nonRefBam << " and " << nonRefNew << std::endl; + return 7; + } + + unsigned nonContigSeqs = 0; + + std::ostringstream msg; + + msg.str(""); + msg << "Counting sequence names listed in header of " << nonRefBam; + printStatus(msg); + + // Count the number of reference sequences in header of non_ref.bam + // possibly including the sequences from the ALTREF. + BamFileIn nonRefStream(toCString(nonRefBam)); + BamHeader header; + readHeader(header, nonRefStream); + nonContigSeqs = length(contigNames(context(nonRefStream))); + + msg.str(""); + msg << "Reading chromosome names from " << options.referenceFile; + printStatus(msg); + + // Read chromosome names in the given reference genome (not ALTREF!). + std::set<CharString> chromosomes; + if (readChromosomes(chromosomes, options.referenceFile) != 0) + return 7; + + msg.str(""); + msg << "Computing contig locations from anchoring reads in " << nonRefNew; + printStatus(msg); + + // Find anchoring locations of contigs for this individual. + String<Location> locations; + findLocations(locations, nonRefNew, chromosomes, nonContigSeqs, options.maxInsertSize); + scoreLocations(locations); + if (writeLocations(locationsFile, locations) != 0) return 7; + + return 0; +} + +#endif // POPINS_FIND_LOCATIONS_H_ + + diff --git a/src/popins_genotype.h b/src/popins2_genotype.h similarity index 98% rename from src/popins_genotype.h rename to src/popins2_genotype.h index 69a4b39..7f7b724 100644 --- a/src/popins_genotype.h +++ b/src/popins2_genotype.h @@ -116,7 +116,7 @@ popins_genotype(int argc, char const ** argv) if (!build(faIndex, toCString(options.referenceFile))) { std::cerr << "ERROR: Could not find nor build the index of " << options.referenceFile << std::endl; - return 7; + return 1; } } @@ -124,7 +124,7 @@ popins_genotype(int argc, char const ** argv) BamIndex<Bai> bamIndex; BamFileIn bamStream; if (initializeBam(sampleInfo.bam_file, bamIndex, bamStream) != 0) - return 7; + return 1; // Build an index of the insertion sequences' fasta file. FaiIndex faIndexAlt; @@ -133,7 +133,7 @@ popins_genotype(int argc, char const ** argv) if (!build(faIndexAlt, toCString(options.supercontigFile))) { std::cerr << "ERROR: Could not build the index of " << options.supercontigFile << std::endl; - return 7; + return 1; } } @@ -142,7 +142,7 @@ popins_genotype(int argc, char const ** argv) BamFileIn bamStreamAlt; CharString altBamFile = getFileName(samplePath, "non_ref_new.bam"); if (initializeBam(altBamFile, bamIndexAlt, bamStreamAlt)) - return 7; + return 1; std::ostringstream msg; msg << "Genotyping VCF records in \'" << options.vcfFile << "\' for " << options.sampleID << "."; diff --git a/src/popins2_merge.h b/src/popins2_merge.h index 397e563..4651d51 100644 --- a/src/popins2_merge.h +++ b/src/popins2_merge.h @@ -42,8 +42,6 @@ int popins2_merge(int argc, char const *argv[]){ CCDBG_Build_opt ccdbg_build_opt; setupBifrostOptions(mo, ccdbg_build_opt); - printMergeOptions(mo); - std::ostringstream msg; // ============================== diff --git a/src/popins2_merge_and_set_mate.h b/src/popins2_merge_and_set_mate.h index e6b4345..c3ad19d 100644 --- a/src/popins2_merge_and_set_mate.h +++ b/src/popins2_merge_and_set_mate.h @@ -266,7 +266,7 @@ compare_qName(CharString & nameA, CharString & nameB) // ========================================================================== bool -merge_and_set_mate(CharString &mergedBam, unsigned &nonContigSeqs, CharString &nonRefBam, CharString &remappedBam) +merge_and_set_mate(CharString &mergedBam, CharString &nonRefBam, CharString &remappedBam) { std::ostringstream msg; msg << "Merging bam files " << nonRefBam << " and " << remappedBam; @@ -290,8 +290,6 @@ merge_and_set_mate(CharString &mergedBam, unsigned &nonContigSeqs, CharString &n BamFileOut outStream(bamContextDep, toCString(mergedBam)); writeHeader(outStream, outHeader); - nonContigSeqs = length(contigNames(context(nonRefStream))); - printStatus(" - merging read records..."); // Read the first record from each input file. Correct ids in records from remappedStreams for new header. @@ -353,16 +351,18 @@ popins2_merge_and_set_mate(int argc, char const ** argv) // Retrieve the sample ID from the first read group listed in BAM file header. if (options.sampleID == "" && retrieveSampleID_(options.sampleID, options.nonRefBam) == 1) - return 7; + { + std::cerr << "ERROR: Cannot retrieve sample ID from BAM file header. Please use option -s or --sample." << std::endl; + return 1; + } CharString workingDirectory = getFileName(options.prefix, options.sampleID); CharString nonRefBam = getFileName(workingDirectory, options.nonRefBam); CharString remappedBam = getFileName(workingDirectory, options.remappedBam); CharString mergedBam = getFileName(workingDirectory, options.mergedBam); - unsigned nonContigSeqs = options.nonContigSeqs; - bool ret = merge_and_set_mate(mergedBam, nonContigSeqs, nonRefBam, remappedBam); + bool ret = merge_and_set_mate(mergedBam, nonRefBam, remappedBam); return ret; } diff --git a/src/popins2_merge_locations.h b/src/popins2_merge_locations.h new file mode 100644 index 0000000..03edbf5 --- /dev/null +++ b/src/popins2_merge_locations.h @@ -0,0 +1,70 @@ +#ifndef POPINS_MERGE_LOCATIONS_H_ +#define POPINS_MERGE_LOCATIONS_H_ + +#include <ctime> + +#include <seqan/sequence.h> +#include <seqan/stream.h> +#include <seqan/seq_io.h> + +#include "util.h" +#include "argument_parsing.h" +#include "location.h" +#include "location_info.h" + +using namespace seqan; + + +// ======================================================================================= +// Function mergeLocations() +// ======================================================================================= + +int +mergeLocations(String<Location> & locations, MergeLocationsOptions & options) +{ + // Open output file. + std::fstream stream(toCString(options.locationsFile), std::ios::out); + if (!stream.good()) + { + std::cerr << "ERROR: Could not open locations file " << options.locationsFile << " for writing." << std::endl; + return 1; + } + + printStatus("Listing locations files."); + + CharString filename = "locations.txt"; + String<Pair<CharString> > locationsFiles = listFiles(options.prefix, filename); + + std::ostringstream msg; + msg << "Merging " << length(locationsFiles) << " locations files."; + printStatus(msg); + + // Merge approximate locations and write them to a file. + if (mergeLocations(stream, locations, locationsFiles, options.locationsFile, options.maxInsertSize) != 0) + return 1; + + close(stream); + + return 0; +} + +// ========================================================================== +// Function popins2_merge_locations() +// ========================================================================== + +int popins_merge_locations(int argc, char const ** argv) +{ + // Parse the command line to get option values. + MergeLocationsOptions options; + ArgumentParser::ParseResult res = parseCommandLine(options, argc, argv); + if (res != ArgumentParser::PARSE_OK) + return res; + + // Merge the locations from all individuals. + String<Location> locs; + int ret = mergeLocations(locs, options); + + return ret; +} + +#endif // POPINS_MERGE_LOCATIONS_H_ \ No newline at end of file diff --git a/src/popins_place.h b/src/popins2_place.h similarity index 87% rename from src/popins_place.h rename to src/popins2_place.h index 2b7de76..9f56b3a 100644 --- a/src/popins_place.h +++ b/src/popins2_place.h @@ -74,6 +74,8 @@ initVcf(TStream & vcfStream, PlacingOptions<TTag> & options, FaiIndex & fai) vcfStream << "##INFO=<ID=AR,Number=1,Type=Integer,Description=\"Number of anchoring read pairs.\">" << std::endl; vcfStream << "##INFO=<ID=AS,Number=1,Type=Float,Description=\"Anchoring score.\">" << std::endl; vcfStream << "##INFO=<ID=GS,Number=1,Type=Integer,Description=\"Number of contigs in the contig group.\">" << std::endl; + vcfStream << "##INFO=<ID=SR,Number=1,Type=Integer,Description=\"Number of supporting reads in most supportive sample.\">" << std::endl; + vcfStream << "##INFO=<ID=NOANCHOR,Number=0,Type=Flag,Description=\"Absence of anchoring read pairs for contig end.\">" << std::endl; vcfStream << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << std::endl; vcfStream << "##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"PHRED-scaled genotype likelihoods\">" << std::endl; @@ -85,40 +87,6 @@ initVcf(TStream & vcfStream, PlacingOptions<TTag> & options, FaiIndex & fai) return 0; } -// ======================================================================================= -// Function mergeLocations() -// ======================================================================================= - -template<typename TTag> -int -mergeLocations(String<Location> & locations, PlacingOptions<TTag> & options) -{ - // Open output file. - std::fstream stream(toCString(options.locationsFile), std::ios::out); - if (!stream.good()) - { - std::cerr << "ERROR: Could not open locations file " << options.locationsFile << " for writing." << std::endl; - return 1; - } - - printStatus("Listing locations files."); - - CharString filename = "locations.txt"; - String<Pair<CharString> > locationsFiles = listFiles(options.prefix, filename); - - std::ostringstream msg; - msg << "Merging " << length(locationsFiles) << " locations files."; - printStatus(msg); - - // Merge approximate locations and write them to a file. - if (mergeLocations(stream, locations, locationsFiles, options.locationsFile, options.maxInsertSize) != 0) - return 1; - - close(stream); - - return 0; -} - // ======================================================================================= // Function loadLocations() // ======================================================================================= @@ -244,19 +212,6 @@ int popins_place_refalign(int argc, char const ** argv) if (res != ArgumentParser::PARSE_OK) return res; - // Placing step 1: MERGE THE LOCATIONS FROM ALL INDIVIDUALS - if (!exists(options.locationsFile)) - { - String<Location> locs; - mergeLocations(locs, options); - } - else - { - std::ostringstream msg; - msg << "WARNING: Locations file \'" << options.locationsFile << "\' exists. Skipping merging step."; - printStatus(msg); - } - // Placing step 2: REFERENCE ALIGNMENT FOR ALL LOCATIONS // Open the FAI file of the reference genome. diff --git a/src/popins4snake.cpp b/src/popins4snake.cpp index 79970ac..ebe9c3b 100644 --- a/src/popins4snake.cpp +++ b/src/popins4snake.cpp @@ -5,9 +5,10 @@ #include "popins2_crop_unmapped.h" #include "popins2_merge_and_set_mate.h" #include "popins2_merge.h" -#include "popins_find_locations.h" -#include "popins_place.h" -#include "popins_genotype.h" +#include "popins2_find_locations.h" +#include "popins2_merge_locations.h" +#include "popins2_place.h" +#include "popins2_genotype.h" using namespace std; @@ -31,9 +32,10 @@ int main(int argc, char const *argv[]){ const char * command = argv[1]; if (strcmp(command,"crop-unmapped") == 0) ret = popins2_crop_unmapped(argc, argv); - else if (strcmp(command,"merge-set-mate") == 0) ret = popins2_merge_and_set_mate(argc, argv); - else if (strcmp(command,"merge") == 0) ret = popins2_merge(argc, argv); + else if (strcmp(command,"merge-bams") == 0) ret = popins2_merge_and_set_mate(argc, argv); + else if (strcmp(command,"merge-contigs") == 0) ret = popins2_merge(argc, argv); else if (strcmp(command,"find-locations") == 0) ret = popins_find_locations(argc, argv); + else if (strcmp(command,"merge-locations") == 0) ret = popins_merge_locations(argc, argv); else if (strcmp(command,"place-refalign") == 0) ret = popins_place_refalign(argc, argv); else if (strcmp(command,"place-splitalign") == 0) ret = popins_place_splitalign(argc, argv); else if (strcmp(command,"place-finish") == 0) ret = popins_place_finish(argc, argv); diff --git a/src/popins_find_locations.h b/src/popins_find_locations.h deleted file mode 100644 index dc98eae..0000000 --- a/src/popins_find_locations.h +++ /dev/null @@ -1,312 +0,0 @@ -#ifndef POPINS_FIND_LOCATIONS_H_ -#define POPINS_FIND_LOCATIONS_H_ - -#include <sstream> - -#include <seqan/file.h> -#include <seqan/sequence.h> -#include <seqan/bam_io.h> - -#include "util.h" -#include "argument_parsing.h" -//#include "popins2_merge_and_set_mate.h" -#include "location.h" - -using namespace seqan; - -// ========================================================================== -// Function write_fastq() -// ========================================================================== -/* -bool write_fastq(CharString & fastqFirst, CharString & fastqSecond, CharString & fastqSingle, CharString & unmappedBam){ - - typedef std::map<CharString, Pair<CharString> > TFastqMap; - - // Create maps for fastq records (first read in pair and second read in pair). - TFastqMap firstReads, secondReads; - - // Open bam file. - BamFileIn inStream(toCString(unmappedBam)); - - // Read bam header and clear it since we don't need it. - BamHeader header; - readHeader(header, inStream); - clear(header); - - // Open the output fastq files. - SeqFileOut fastqFirstStream(toCString(fastqFirst)); - SeqFileOut fastqSecondStream(toCString(fastqSecond)); - SeqFileOut fastqSingleStream(toCString(fastqSingle)); - - // Iterate over bam file and append fastq records. - BamAlignmentRecord record; - while(!atEnd(inStream)) - { - readRecord(record, inStream); - if (hasFlagUnmapped(record)) - appendFastqRecord(fastqFirstStream, fastqSecondStream, firstReads, secondReads, record); - } - - // Write the fastq files. - if (writeFastq(fastqFirstStream, fastqSecondStream, fastqSingleStream, firstReads, secondReads) != 0) return 1; - - return 0; -} -*/ - - -// ========================================================================== -// Function fill_sequences() -// ========================================================================== -/* -bool fill_sequences(CharString & outFile, CharString & inFile){ - typedef Position<Dna5String>::Type TPos; - - BamFileIn inStream(toCString(inFile)); - BamFileOut outStream(context(inStream), toCString(outFile)); - - BamHeader header; - readHeader(header, inStream); - writeHeader(outStream, header); - - BamAlignmentRecord firstRecord, nextRecord; - while (!atEnd(inStream)) - { - readRecord(nextRecord, inStream); - - if (firstRecord.qName != nextRecord.qName || hasFlagFirst(firstRecord) != hasFlagFirst(nextRecord)) - { - // update first record - firstRecord = nextRecord; - if (length(firstRecord.seq) == 0 || length(firstRecord.qual) == 0) - { - std::cerr << "ERROR: First record of read " << firstRecord.qName << " has no sequence." << std::endl; - return 1; - } - } - else - { - // fill sequence field and quality string - if (length(nextRecord.seq) == 0 || length(nextRecord.qual) == 0) - { - TPos last = length(nextRecord.cigar)-1; - if (nextRecord.cigar[0].operation == 'H' || nextRecord.cigar[last].operation == 'H') - { - TPos begin = 0; - if (nextRecord.cigar[0].operation == 'H') - begin = nextRecord.cigar[0].count; - - TPos end = length(firstRecord.seq); - if (nextRecord.cigar[last].operation == 'H') - end -= nextRecord.cigar[last].count; - - nextRecord.seq = infix(firstRecord.seq, begin, end); - nextRecord.qual = infix(firstRecord.qual, begin, end); - } - else - { - nextRecord.seq = firstRecord.seq; - nextRecord.qual = firstRecord.qual; - } - } - } - - writeRecord(outStream, nextRecord); - } - - close(outStream); - - return 0; -} -*/ - - -// ========================================================================== -// Function popins_contigmap() -// ========================================================================== - -int popins_find_locations(int argc, char const ** argv){ - - // Parse the command line to get option values. - FindLocationsOptions options; - ArgumentParser::ParseResult res = parseCommandLine(options, argc, argv); - if (res != ArgumentParser::PARSE_OK) - return res; - - printFindLocationsOptions(options); - - CharString workingDirectory = getFileName(options.prefix, options.sampleID); - - // Check for input files to exist. - //CharString fastqFirst = getFileName(workingDirectory, "paired.1.fastq"); - //CharString fastqSecond = getFileName(workingDirectory, "paired.2.fastq"); - //CharString fastqSingle = getFileName(workingDirectory, "single.fastq"); - CharString nonRefBam = getFileName(workingDirectory, options.non_ref_bam); - CharString nonRefNew = getFileName(workingDirectory, "non_ref_new.bam"); - CharString locationsFile = getFileName(workingDirectory, "locations.txt"); - - if ( !exists(nonRefNew) || !exists(nonRefBam) ){ - std::cerr << "ERROR: Could not find all input files:"; - std::cerr << nonRefBam << " and " << nonRefNew << std::endl; - return 7; - } - - unsigned nonContigSeqs = 0; - - std::ostringstream msg; - - /* - if (!exists(nonRefNew)){ - - // Create names of temporary files. - CharString mappedSam = getFileName(workingDirectory, "contig_mapped_unsorted.sam"); - CharString mappedBamUnsorted = getFileName(workingDirectory, "contig_mapped_unsorted.bam"); - CharString mappedBam = getFileName(workingDirectory, "contig_mapped.bam"); - CharString mergedBam = getFileName(workingDirectory, "merged.bam"); - - std::stringstream cmd; - - CharString indexFile = options.contigFile; - indexFile += ".bwt"; - if (!exists(indexFile)) - { - std::ostringstream msg; - msg << "Indexing contigs in \'" << options.contigFile << "\' using " << BWA; - printStatus(msg); - - cmd.str(""); - cmd << BWA << " index " << options.contigFile; - if (system(cmd.str().c_str()) != 0) - { - std::cerr << "ERROR while indexing \'" << options.contigFile << "\' using " << BWA << std::endl; - return 7; - } - } - - msg << "Mapping reads to contigs using " << BWA; - printStatus(msg); - - // Remapping to contigs with bwa. - cmd.str(""); - if (!options.bestAlignment) cmd << BWA << " mem -a "; - else cmd << BWA << " mem "; - cmd << "-t " << options.threads << " " << options.contigFile << " " << fastqFirst << " " << fastqSecond << " > " << mappedSam; - if (system(cmd.str().c_str()) != 0) - { - std::cerr << "ERROR while running bwa on " << fastqFirst << " and " << fastqSecond << std::endl; - return 7; - } - //remove(toCString(fastqFirst)); - //remove(toCString(fastqSecond)); - - cmd.str(""); - if (!options.bestAlignment) cmd << BWA << " mem -a "; - else cmd << BWA << " mem "; - cmd << "-t " << options.threads << " " << options.contigFile << " " << fastqSingle << " | awk '$1 !~ /@/' >> " << mappedSam; - if (system(cmd.str().c_str()) != 0) - { - std::cerr << "ERROR while running bwa on " << fastqSingle << std::endl; - return 7; - } - //remove(toCString(fastqSingle)); - - printStatus("Filling in sequences of secondary records in bwa output"); - - // Fill in sequences in bwa output. - if (fill_sequences(mappedBamUnsorted, mappedSam) != 0) - { - return 7; - } - remove(toCString(mappedSam)); - - msg.str(""); - msg << "Sorting " << mappedBamUnsorted << " by read name using " << SAMTOOLS; - printStatus(msg); - - // Sort <WD>/contig_mapped.bam by read name - cmd.str(""); - cmd << SAMTOOLS << " sort -n -@ " << options.threads << " -m " << options.memory << " -o " << mappedBam << " " << mappedBamUnsorted; - if (system(cmd.str().c_str()) != 0) - { - std::cerr << "ERROR while sorting " << mappedBamUnsorted << " by read name using " << SAMTOOLS << std::endl; - return 7; - } - remove(toCString(mappedBamUnsorted)); - - // Merge non_ref.bam with contig_mapped and set the mates. - if (merge_and_set_mate(mergedBam, nonContigSeqs, nonRefBam, mappedBam) != 0) - return 7; - - remove(toCString(mappedBam)); - //remove(toCString(nonRefBam)); - - msg.str(""); - msg << "Sorting " << mergedBam << " using " << SAMTOOLS; - printStatus(msg); - - // Sort <WD>/merged.bam by beginPos, output is <WD>/non_ref.bam. - cmd.str(""); - cmd << SAMTOOLS << " sort -@ " << options.threads << " -m " << options.memory << " -o " << nonRefNew << " " << mergedBam; - if (system(cmd.str().c_str()) != 0) - { - std::cerr << "ERROR while sorting " << mergedBam << " by beginPos using " << SAMTOOLS << std::endl; - return 7; - } - remove(toCString(mergedBam)); - - msg.str(""); - msg << "Indexing " << nonRefNew << " by beginPos using " << SAMTOOLS; - printStatus(msg); - - // Index <WD>/non_ref_new.bam. - cmd.str(""); - cmd << SAMTOOLS << " index " << nonRefNew; - if (system(cmd.str().c_str()) != 0) - { - std::cerr << "ERROR while indexing " << nonRefNew << " using " << SAMTOOLS << std::endl; - return 7; - } - } - */ - - msg.str(""); - msg << "Reading contig names from " << nonRefBam; - printStatus(msg); - - BamFileIn nonRefStream(toCString(nonRefBam)); - BamHeader header; - readHeader(header, nonRefStream); - nonContigSeqs = length(contigNames(context(nonRefStream))); - - - - msg.str(""); - msg << "Reading chromosomes from " << options.referenceFile; - printStatus(msg); - - std::set<CharString> chromosomes; - if (readChromosomes(chromosomes, options.referenceFile) != 0) - return 7; - - - - msg.str(""); - msg << "Computing contig locations from anchoring reads in " << nonRefNew; - printStatus(msg); - - // Find anchoring locations of contigs for this individual. - String<Location> locations; - findLocations(locations, nonRefNew, chromosomes, nonContigSeqs, options.maxInsertSize); - scoreLocations(locations); - if (writeLocations(locationsFile, locations) != 0) return 7; - - // Remove the non_ref_new.bam file. - if (options.deleteNonRefNew) - remove(toCString(nonRefNew)); - - return 0; -} - -#endif // POPINS_FIND_LOCATIONS_H_ - - diff --git a/src/variant_caller.h b/src/variant_caller.h index 31e84fa..91b64fe 100644 --- a/src/variant_caller.h +++ b/src/variant_caller.h @@ -663,7 +663,7 @@ int initializeBam(CharString fileName, BamIndex<Bai> & bamIndex, BamFileIn & bam } CharString baifileName = fileName; - fileName += ".bai"; + baifileName += ".bai"; if (!open(bamIndex, toCString(baifileName))) { std::cerr << "ERROR: Could not read BAI index file " << fileName << "\n"; -- GitLab