diff --git a/Makefile b/Makefile index 7a606b656fd0605336965dab7829b0773ff2be74..45eb510b820e52b7201226b72524a7c2f813823b 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ TOOLS=-DSAMTOOLS=\"$(SAMTOOLS)\" -DBWA=\"$(BWA)\" -DSICKLE=\"$(SICKLE)\" -DVELVE # Date and version number from git DATE := on $(shell git log --pretty=format:"%cd" --date=iso | cut -f 1,2 -d " " | head -n 1) -VERSION := 0.13.0-$(shell git log --pretty=format:"%h" --date=iso | head -n 1) +VERSION := 0.0.2-snake-$(shell git log --pretty=format:"%h" --date=iso | head -n 1) CXXFLAGS += -DDATE=\""$(DATE)"\" -DVERSION=\""$(VERSION)"\" # Compiler flags diff --git a/build/ColoredDeBruijnGraph.o b/build/ColoredDeBruijnGraph.o deleted file mode 100644 index fd2a3929f099cc6ee3a2120f2879f52ceda062ad..0000000000000000000000000000000000000000 Binary files a/build/ColoredDeBruijnGraph.o and /dev/null differ diff --git a/build/LECC_Finder.o b/build/LECC_Finder.o deleted file mode 100644 index 579249b968d795226a1934e7070fea24ea8dc09d..0000000000000000000000000000000000000000 Binary files a/build/LECC_Finder.o and /dev/null differ diff --git a/build/Setcover.o b/build/Setcover.o deleted file mode 100644 index 2e6e86703cda14c6e956cb65d908f06c2c9abc93..0000000000000000000000000000000000000000 Binary files a/build/Setcover.o and /dev/null differ diff --git a/build/Traceback.o b/build/Traceback.o deleted file mode 100644 index ca879ab784957160518fa26d936ca0110d44a0ca..0000000000000000000000000000000000000000 Binary files a/build/Traceback.o and /dev/null differ diff --git a/build/UnitigExtension.o b/build/UnitigExtension.o deleted file mode 100644 index 553494b4e88b799e0ab5cfe311d94c5f41f15da9..0000000000000000000000000000000000000000 Binary files a/build/UnitigExtension.o and /dev/null differ diff --git a/build/popins2.o b/build/popins2.o deleted file mode 100644 index 1363e9adcd4aaeb6dc12583d9c38d95d8dfd702c..0000000000000000000000000000000000000000 Binary files a/build/popins2.o and /dev/null differ diff --git a/popins2 b/popins2 deleted file mode 100755 index 1f24a555bc2acc0427f0dd1cd133ed2b9242d721..0000000000000000000000000000000000000000 Binary files a/popins2 and /dev/null differ diff --git a/src/argument_parsing.h b/src/argument_parsing.h index cb4522a51a7c2bbc7113d22776fc1075762076ee..27fb6178e8670988c58e78d85576cf602bbe920c 100644 --- a/src/argument_parsing.h +++ b/src/argument_parsing.h @@ -226,6 +226,24 @@ struct ContigMapOptions { }; +struct FindLocationsOptions { + CharString prefix; + CharString sampleID; + CharString referenceFile; + + int maxInsertSize; + bool deleteNonRefNew; + + FindLocationsOptions() : + prefix("."), + sampleID(""), + referenceFile("genome.fa"), + maxInsertSize(800), + deleteNonRefNew(false) + {} +}; + + struct RefAlign_; typedef Tag<RefAlign_> RefAlign; struct SplitAlign_; @@ -399,7 +417,6 @@ bool getOptionValues(MergeSetMateOptions & options, ArgumentParser const & parse return true; } - bool getOptionValues(MergeOptions &options, seqan::ArgumentParser &parser){ if (isSet(parser, "verbose")) @@ -452,7 +469,6 @@ bool getOptionValues(MergeOptions &options, seqan::ArgumentParser &parser){ return true; } - /** * This function transfers all relevant options of the merge module to * a CCDBG_Build_opt instance. @@ -521,6 +537,22 @@ bool getOptionValues(ContigMapOptions &options, seqan::ArgumentParser &parser){ } +bool getOptionValues(FindLocationsOptions &options, seqan::ArgumentParser &parser){ + getArgumentValue(options.sampleID, parser, 0); + + if (isSet(parser, "prefix")) + getOptionValue(options.prefix, parser, "prefix"); + if (isSet(parser, "reference")) + getOptionValue(options.referenceFile, parser, "reference"); + if (isSet(parser, "maxInsertSize")) + getOptionValue(options.maxInsertSize, parser, "maxInsertSize"); + if (isSet(parser, "delNonRefNew")) + options.deleteNonRefNew = true; + + return true; +} + + bool getOptionValues(PlacingOptions<RefAlign> &options, seqan::ArgumentParser &parser){ if (isSet(parser, "prefix")) getOptionValue(options.prefix, parser, "prefix"); @@ -677,6 +709,11 @@ void setHiddenOptions(seqan::ArgumentParser &parser, bool hide, ContigMapOptions } +void setHiddenOptions(seqan::ArgumentParser & /*parser*/, bool /*hide*/, FindLocationsOptions &){ + // TODO +} + + void setHiddenOptions(seqan::ArgumentParser & parser, bool hide, PlacingOptions<RefAlign> &){ hideOption(parser, "groupDist", hide); } @@ -1034,6 +1071,40 @@ void setupParser(seqan::ArgumentParser &parser, ContigMapOptions &options){ } +void setupParser(seqan::ArgumentParser &parser, FindLocationsOptions &options){ + setShortDescription(parser, "Find insertion locations per sample."); + setVersion(parser, VERSION); + setDate(parser, DATE); + + // Define usage line and long description. + addUsageLine(parser, "[\\fIOPTIONS\\fP] \\fISAMPLE_ID\\fP"); + addDescription(parser, "todo."); + + addArgument(parser, ArgParseArgument(ArgParseArgument::STRING, "SAMPLE_ID")); + + // Setup the options. + addSection(parser, "Input/output options"); + addOption(parser, ArgParseOption("p", "prefix", "Path to the sample directories.", ArgParseArgument::STRING, "PATH")); + addOption(parser, ArgParseOption("r", "reference", "Name of reference genome file.", ArgParseArgument::INPUT_FILE, "FASTA_FILE")); + + addSection(parser, "Algorithm options"); + addOption(parser, ArgParseOption("e", "maxInsertSize", "The maximum expected insert size of the read pairs.", ArgParseArgument::INTEGER, "INT")); + addOption(parser, ArgParseOption("d", "delNonRefNew", "Delete the non_ref_new.bam file after writing locations.")); + + // Set valid values. + setValidValues(parser, "reference", "fa fna fasta"); + + // Set default values. + setDefaultValue(parser, "prefix", "\'.\'"); + setDefaultValue(parser, "reference", options.referenceFile); + setDefaultValue(parser, "maxInsertSize", options.maxInsertSize); + setDefaultValue(parser, "delNonRefNew", "false"); + + // Hide some options from default help. + setHiddenOptions(parser, true, options); +} + + void setupParser(seqan::ArgumentParser &parser, PlacingOptions<RefAlign> &options){ setShortDescription(parser, "Contig placing by alignment of contig ends to reference genome."); setVersion(parser, VERSION); @@ -1336,6 +1407,7 @@ ArgumentParser::ParseResult checkInput(MergeSetMateOptions & options){ std::cerr << "ERROR: Path to sample directories \'" << options.prefix << "\' does not exist." << std::endl; res = ArgumentParser::PARSE_ERROR; } +<<<<<<< HEAD CharString filename = options.prefix; filename += "/"; filename += options.sampleID; @@ -1354,6 +1426,30 @@ ArgumentParser::ParseResult checkInput(MergeSetMateOptions & options){ if (!exists(filename)) { std::cerr << "ERROR: Input BAM file \'" << filename << "\' does not exist." << std::endl; +======= + + CharString filenameNonRefBam = options.prefix; + filenameNonRefBam += "/"; + filenameNonRefBam += options.sampleID; + filenameNonRefBam += "/"; + filenameNonRefBam += options.nonRefBam; + + if (!exists(filenameNonRefBam)) + { + std::cerr << "ERROR: Input BAM file \'" << filenameNonRefBam << "\' does not exist." << std::endl; + res = ArgumentParser::PARSE_ERROR; + } + + CharString filenameRemappedBam = options.prefix; + filenameRemappedBam += "/"; + filenameRemappedBam += options.sampleID; + filenameRemappedBam += "/"; + filenameRemappedBam += options.remappedBam; + + if (!exists(filenameRemappedBam)) + { + std::cerr << "ERROR: Input BAM file \'" << filenameRemappedBam << "\' does not exist." << std::endl; +>>>>>>> 3c91fa0fb1120cec92e7feb76afce165d0e62af0 res = ArgumentParser::PARSE_ERROR; } @@ -1452,6 +1548,24 @@ ArgumentParser::ParseResult checkInput(ContigMapOptions &options){ } +ArgumentParser::ParseResult checkInput(FindLocationsOptions &options){ + + ArgumentParser::ParseResult res = ArgumentParser::PARSE_OK; + + if (options.prefix != "." && !exists(options.prefix)){ + std::cerr << "ERROR: Path to sample directories \'" << options.prefix << "\' does not exist." << std::endl; + res = ArgumentParser::PARSE_ERROR; + } + + if (!exists(options.referenceFile)){ + std::cerr << "ERROR: Reference genome file \'" << options.referenceFile << "\' does not exist." << std::endl; + res = ArgumentParser::PARSE_ERROR; + } + + return res; +} + + ArgumentParser::ParseResult checkInput(PlacingOptions<RefAlign> &options){ ArgumentParser::ParseResult res = ArgumentParser::PARSE_OK; @@ -1574,6 +1688,7 @@ void printHelp(char const * name){ std::cerr << " \033[1mmerge\033[0m Generate supercontigs from a colored compacted de Bruijn Graph." << std::endl; std::cerr << " \033[1mmultik\033[0m Multi-k framework for a colored compacted de Bruijn Graph." << std::endl; std::cerr << " \033[1mcontigmap\033[0m Map unmapped reads to (super-)contigs." << std::endl; + std::cerr << " \033[1mfind-locations\033[0m Find insertion locations per individual." << std::endl; std::cerr << " \033[1mplace-refalign\033[0m Find position of (super-)contigs by aligning contig ends to the reference genome." << std::endl; std::cerr << " \033[1mplace-splitalign\033[0m Find position of (super-)contigs by split-read alignment (per sample)." << std::endl; std::cerr << " \033[1mplace-finish\033[0m Combine position found by split-read alignment from all samples." << std::endl; @@ -1626,6 +1741,19 @@ void printMultikOptions(const MultikOptions &options){ } +void printMultikOptions(const FindLocationsOptions &options){ + cout << "=========================================================" << endl; + cout << "popins version : " << VERSION << endl; + cout << "PARAMETER ======== : VALUE ==============================" << endl; + cout << "prefix : " << options.prefix << endl; + cout << "sample ID : " << options.sampleID << endl; + cout << "reference file : " << options.referenceFile << endl; + cout << "max insert size : " << options.maxInsertSize << endl; + cout << "delete non_ref_new : " << options.deleteNonRefNew << endl; + cout << "=========================================================" << endl; +} + + // ========================================================================== // Function parseCommandLine() // ========================================================================== diff --git a/src/popins2.cpp b/src/popins2.cpp index 34abd8bc324f72fd283fcde3beeb1e3de4d74c5f..7fa5eec85cc46d2c28b6ae9408a66c700d8858a2 100644 --- a/src/popins2.cpp +++ b/src/popins2.cpp @@ -8,6 +8,7 @@ #include "popins2_merge.h" #include "popins2_multik.h" #include "popins_contigmap.h" +#include "popins_find_locations.h" #include "popins_place.h" #include "popins_genotype.h" @@ -38,6 +39,7 @@ int main(int argc, char const *argv[]){ else if (strcmp(command,"merge") == 0) ret = popins2_merge(argc, argv); else if (strcmp(command,"multik") == 0) ret = popins2_multik(argc, argv); else if (strcmp(command,"contigmap") == 0) ret = popins_contigmap(argc, argv); + else if (strcmp(command,"find-locations") == 0) ret = popins_find_locations(argc, argv); else if (strcmp(command,"place-refalign") == 0) ret = popins_place_refalign(argc, argv); else if (strcmp(command,"place-splitalign") == 0) ret = popins_place_splitalign(argc, argv); else if (strcmp(command,"place-finish") == 0) ret = popins_place_finish(argc, argv); @@ -57,7 +59,7 @@ int main(int argc, char const *argv[]){ if (ret == 0){ std::ostringstream msg; - msg << "[popins2 " << command << "] finished in " << (std::time(0) - start_time) << " seconds."; + msg << "[popins4snake " << command << "] finished in " << (std::time(0) - start_time) << " seconds."; printTimeStatus(msg); } diff --git a/src/popins_find_locations.h b/src/popins_find_locations.h new file mode 100644 index 0000000000000000000000000000000000000000..82e177865b43457caf186f3f51d6861b491d5a1c --- /dev/null +++ b/src/popins_find_locations.h @@ -0,0 +1,310 @@ +#ifndef POPINS_FIND_LOCATIONS_H_ +#define POPINS_FIND_LOCATIONS_H_ + +#include <sstream> + +#include <seqan/file.h> +#include <seqan/sequence.h> +#include <seqan/bam_io.h> + +#include "util.h" +#include "argument_parsing.h" +//#include "popins2_merge_and_set_mate.h" +#include "location.h" + +using namespace seqan; + +// ========================================================================== +// Function write_fastq() +// ========================================================================== +/* +bool write_fastq(CharString & fastqFirst, CharString & fastqSecond, CharString & fastqSingle, CharString & unmappedBam){ + + typedef std::map<CharString, Pair<CharString> > TFastqMap; + + // Create maps for fastq records (first read in pair and second read in pair). + TFastqMap firstReads, secondReads; + + // Open bam file. + BamFileIn inStream(toCString(unmappedBam)); + + // Read bam header and clear it since we don't need it. + BamHeader header; + readHeader(header, inStream); + clear(header); + + // Open the output fastq files. + SeqFileOut fastqFirstStream(toCString(fastqFirst)); + SeqFileOut fastqSecondStream(toCString(fastqSecond)); + SeqFileOut fastqSingleStream(toCString(fastqSingle)); + + // Iterate over bam file and append fastq records. + BamAlignmentRecord record; + while(!atEnd(inStream)) + { + readRecord(record, inStream); + if (hasFlagUnmapped(record)) + appendFastqRecord(fastqFirstStream, fastqSecondStream, firstReads, secondReads, record); + } + + // Write the fastq files. + if (writeFastq(fastqFirstStream, fastqSecondStream, fastqSingleStream, firstReads, secondReads) != 0) return 1; + + return 0; +} +*/ + + +// ========================================================================== +// Function fill_sequences() +// ========================================================================== +/* +bool fill_sequences(CharString & outFile, CharString & inFile){ + typedef Position<Dna5String>::Type TPos; + + BamFileIn inStream(toCString(inFile)); + BamFileOut outStream(context(inStream), toCString(outFile)); + + BamHeader header; + readHeader(header, inStream); + writeHeader(outStream, header); + + BamAlignmentRecord firstRecord, nextRecord; + while (!atEnd(inStream)) + { + readRecord(nextRecord, inStream); + + if (firstRecord.qName != nextRecord.qName || hasFlagFirst(firstRecord) != hasFlagFirst(nextRecord)) + { + // update first record + firstRecord = nextRecord; + if (length(firstRecord.seq) == 0 || length(firstRecord.qual) == 0) + { + std::cerr << "ERROR: First record of read " << firstRecord.qName << " has no sequence." << std::endl; + return 1; + } + } + else + { + // fill sequence field and quality string + if (length(nextRecord.seq) == 0 || length(nextRecord.qual) == 0) + { + TPos last = length(nextRecord.cigar)-1; + if (nextRecord.cigar[0].operation == 'H' || nextRecord.cigar[last].operation == 'H') + { + TPos begin = 0; + if (nextRecord.cigar[0].operation == 'H') + begin = nextRecord.cigar[0].count; + + TPos end = length(firstRecord.seq); + if (nextRecord.cigar[last].operation == 'H') + end -= nextRecord.cigar[last].count; + + nextRecord.seq = infix(firstRecord.seq, begin, end); + nextRecord.qual = infix(firstRecord.qual, begin, end); + } + else + { + nextRecord.seq = firstRecord.seq; + nextRecord.qual = firstRecord.qual; + } + } + } + + writeRecord(outStream, nextRecord); + } + + close(outStream); + + return 0; +} +*/ + + +// ========================================================================== +// Function popins_contigmap() +// ========================================================================== + +int popins_find_locations(int argc, char const ** argv){ + + // Parse the command line to get option values. + FindLocationsOptions options; + ArgumentParser::ParseResult res = parseCommandLine(options, argc, argv); + if (res != ArgumentParser::PARSE_OK) + return res; + + CharString workingDirectory = getFileName(options.prefix, options.sampleID); + + // Check for input files to exist. + //CharString fastqFirst = getFileName(workingDirectory, "paired.1.fastq"); + //CharString fastqSecond = getFileName(workingDirectory, "paired.2.fastq"); + //CharString fastqSingle = getFileName(workingDirectory, "single.fastq"); + CharString nonRefBam = getFileName(workingDirectory, "non_ref.bam"); + CharString nonRefNew = getFileName(workingDirectory, "non_ref_new.bam"); + CharString locationsFile = getFileName(workingDirectory, "locations.txt"); + + if ( !exists(nonRefNew) || !exists(nonRefBam) ){ + std::cerr << "ERROR: Could not find all input files:"; + std::cerr << nonRefBam << " and " << nonRefNew << std::endl; + return 7; + } + + unsigned nonContigSeqs = 0; + + std::ostringstream msg; + + /* + if (!exists(nonRefNew)){ + + // Create names of temporary files. + CharString mappedSam = getFileName(workingDirectory, "contig_mapped_unsorted.sam"); + CharString mappedBamUnsorted = getFileName(workingDirectory, "contig_mapped_unsorted.bam"); + CharString mappedBam = getFileName(workingDirectory, "contig_mapped.bam"); + CharString mergedBam = getFileName(workingDirectory, "merged.bam"); + + std::stringstream cmd; + + CharString indexFile = options.contigFile; + indexFile += ".bwt"; + if (!exists(indexFile)) + { + std::ostringstream msg; + msg << "Indexing contigs in \'" << options.contigFile << "\' using " << BWA; + printStatus(msg); + + cmd.str(""); + cmd << BWA << " index " << options.contigFile; + if (system(cmd.str().c_str()) != 0) + { + std::cerr << "ERROR while indexing \'" << options.contigFile << "\' using " << BWA << std::endl; + return 7; + } + } + + msg << "Mapping reads to contigs using " << BWA; + printStatus(msg); + + // Remapping to contigs with bwa. + cmd.str(""); + if (!options.bestAlignment) cmd << BWA << " mem -a "; + else cmd << BWA << " mem "; + cmd << "-t " << options.threads << " " << options.contigFile << " " << fastqFirst << " " << fastqSecond << " > " << mappedSam; + if (system(cmd.str().c_str()) != 0) + { + std::cerr << "ERROR while running bwa on " << fastqFirst << " and " << fastqSecond << std::endl; + return 7; + } + //remove(toCString(fastqFirst)); + //remove(toCString(fastqSecond)); + + cmd.str(""); + if (!options.bestAlignment) cmd << BWA << " mem -a "; + else cmd << BWA << " mem "; + cmd << "-t " << options.threads << " " << options.contigFile << " " << fastqSingle << " | awk '$1 !~ /@/' >> " << mappedSam; + if (system(cmd.str().c_str()) != 0) + { + std::cerr << "ERROR while running bwa on " << fastqSingle << std::endl; + return 7; + } + //remove(toCString(fastqSingle)); + + printStatus("Filling in sequences of secondary records in bwa output"); + + // Fill in sequences in bwa output. + if (fill_sequences(mappedBamUnsorted, mappedSam) != 0) + { + return 7; + } + remove(toCString(mappedSam)); + + msg.str(""); + msg << "Sorting " << mappedBamUnsorted << " by read name using " << SAMTOOLS; + printStatus(msg); + + // Sort <WD>/contig_mapped.bam by read name + cmd.str(""); + cmd << SAMTOOLS << " sort -n -@ " << options.threads << " -m " << options.memory << " -o " << mappedBam << " " << mappedBamUnsorted; + if (system(cmd.str().c_str()) != 0) + { + std::cerr << "ERROR while sorting " << mappedBamUnsorted << " by read name using " << SAMTOOLS << std::endl; + return 7; + } + remove(toCString(mappedBamUnsorted)); + + // Merge non_ref.bam with contig_mapped and set the mates. + if (merge_and_set_mate(mergedBam, nonContigSeqs, nonRefBam, mappedBam) != 0) + return 7; + + remove(toCString(mappedBam)); + //remove(toCString(nonRefBam)); + + msg.str(""); + msg << "Sorting " << mergedBam << " using " << SAMTOOLS; + printStatus(msg); + + // Sort <WD>/merged.bam by beginPos, output is <WD>/non_ref.bam. + cmd.str(""); + cmd << SAMTOOLS << " sort -@ " << options.threads << " -m " << options.memory << " -o " << nonRefNew << " " << mergedBam; + if (system(cmd.str().c_str()) != 0) + { + std::cerr << "ERROR while sorting " << mergedBam << " by beginPos using " << SAMTOOLS << std::endl; + return 7; + } + remove(toCString(mergedBam)); + + msg.str(""); + msg << "Indexing " << nonRefNew << " by beginPos using " << SAMTOOLS; + printStatus(msg); + + // Index <WD>/non_ref_new.bam. + cmd.str(""); + cmd << SAMTOOLS << " index " << nonRefNew; + if (system(cmd.str().c_str()) != 0) + { + std::cerr << "ERROR while indexing " << nonRefNew << " using " << SAMTOOLS << std::endl; + return 7; + } + } + */ + + msg.str(""); + msg << "Reading contig names from " << nonRefBam; + printStatus(msg); + + BamFileIn nonRefStream(toCString(nonRefBam)); + BamHeader header; + readHeader(header, nonRefStream); + nonContigSeqs = length(contigNames(context(nonRefStream))); + + + + msg.str(""); + msg << "Reading chromosomes from " << options.referenceFile; + printStatus(msg); + + std::set<CharString> chromosomes; + if (readChromosomes(chromosomes, options.referenceFile) != 0) + return 7; + + + + msg.str(""); + msg << "Computing contig locations from anchoring reads in " << nonRefNew; + printStatus(msg); + + // Find anchoring locations of contigs for this individual. + String<Location> locations; + findLocations(locations, nonRefNew, chromosomes, nonContigSeqs, options.maxInsertSize); + scoreLocations(locations); + if (writeLocations(locationsFile, locations) != 0) return 7; + + // Remove the non_ref_new.bam file. + if (options.deleteNonRefNew) + remove(toCString(nonRefNew)); + + return 0; +} + +#endif // POPINS_FIND_LOCATIONS_H_ + +