Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
popins2_multik.h 10.61 KiB
/**
 *  @file   src/popins2_multik.h
 *  @brief  Creating a multi-k-iterative colored compacted de Bruijn Graph.
 *          In the past, the multi-k paradigm for de Bruijn Graphs (dBG) has been shown to be a successful
 *          method to transform the topology of a dBG for de novo sequence assembly. The multi-k paradigm
 *          aims to increase the average unitig length without the fragmentation of large k values.
 *          The multik module extends this idea for colored de Bruijn Graphs by maintaining the color
 *          awareness in each iteration.
 */

#ifndef POPINS2_MULTIK_H_
#define POPINS2_MULTIK_H_

#include <bifrost/ColoredCDBG.hpp>          // ColoredCDBG
#include <seqan/seq_io.h>                   // getAbsolutePath, toCString, readRecords
#include "util.h"                           // getFastx, printTimeStatus, getAbsoluteFileName
#include "argument_parsing.h"               // MultikOptions, parseCommandLine, printMultikOptions

using namespace seqan;



/**
 *          Function to convert FASTQ to FASTA.
 * @details This is the seqAn2 way to do this. Converting this with the new C++20 concepts used in
 *          seqAn3 is so much more elegant.
 * @param   fastq_file is the name of the fastq file to convert to fasta, including its full path
 * @param   outpath is a (optional) string to define an output directory for the FASTA.
 *          If outpath is "" the FASTA will be written to the current working directory
 * @param   fasta_names is a vector reference to store the absolute FASTA filenames in
 * @return  bool; 1 if error, 0 else
 */
inline bool fastq2fasta(const std::string &fastq_file, const std::string &outpath, std::vector<std::string> &fasta_names){
    // read FASTQ
    CharString seqFileName = fastq_file;
    SeqFileIn seqFileIn;

    if (!open(seqFileIn, toCString(seqFileName))){
        std::cerr << "ERROR: Could not open FASTQ file to read from.\n";
        return 1;
    }

    StringSet<CharString> ids;
    StringSet<Dna5String> seqs;
    StringSet<CharString> quals;

    try{
        readRecords(ids, seqs, quals, seqFileIn);
    }
    catch (Exception const & e){
        std::cout << "ERROR: " << e.what() << std::endl;
        return 1;
    }

    close(seqFileIn);
    clear(quals);   // we don't need them

    // get FASTQ filename without path and without file ending
    size_t lastSlashPos = fastq_file.find_last_of("/");
    std::string fname   = fastq_file.substr(lastSlashPos+1);
    size_t lastDotPos   = fname.find_last_of(".");
    std::string fname_  = fname.substr(0,lastDotPos);

    // create FASTA filename
    std::string ofile_name;
    if (strcmp(outpath.c_str(), "") == 0){
        ofile_name = fname_+".fasta";
    }
    else{
        ofile_name = getAbsoluteFileName(outpath, fname_+".fasta");
    }
    fasta_names.push_back(ofile_name);

    // write FASTA
    SeqFileOut seqFileOut(ofile_name.c_str());
    if (!open(seqFileOut, ofile_name.c_str())){
        std::cerr << "ERROR: Could not open FASTA file to write into.\n";
        return 1;
    }

    try{
        writeRecords(seqFileOut, ids, seqs);
    }
    catch (Exception const & e){
        std::cout << "ERROR: " << e.what() << std::endl;
        return 1;
    }


    close(seqFileOut);

    return 0;
}


/**
 *          This function defines the boolean logic to decide if a color for unitig is significant.
 *  @param  b1, b2, b3 are boolean values to compare
 */
inline bool logicDecision(const bool b1, const bool b2, const bool b3){
    // true if at least 2 out of 3 are true
    return b1 ? (b2 || b3) : (b2 && b3);
}


/**
 *          This function extract the color bits of a kmer position of UnitigColorMap
 *  @param  ucm is a unitig of the graph
 *  @param  colorBits is a vector to store the color bits of the kmer
 *  @param  pos is the position of the kmer in the unitig (ucm)
 */
inline void getColorBitsOfPosition(const UnitigColorMap<void> &ucm, std::vector<bool> &colorBits, const size_t pos){
    UnitigColorMap<void> kmer;
    UnitigColors* colors;

    kmer   = ucm.getKmerMapping(pos);
    colors = kmer.getData()->getUnitigColors(kmer);

    UnitigColors::const_iterator cit = colors->begin(kmer);
    for (; cit != colors->end(); ++cit)
        colorBits[cit.getColorID()] = true;
}


/**
 *          This function defines how to subsample kmers from a unitig.
 *  @param  ucm is a unitig of the graph
 *  @param  color_indices is a list of samples the unitig should be added to
 */
inline void colorProbing(const UnitigColorMap<void> &ucm, std::vector<size_t> &color_indices, const size_t nb_colors){
    // the current approach is to ckeck the unitig colors at three positions: start, middle and end
    // NOTE: finding the end position usually requires strand awareness, but this is not important here
    size_t end = ucm.len - 1;
    size_t mid = floor(end / 2);

    std::vector<bool> startColorBits(nb_colors, false);
    std::vector<bool> middleColorBits(nb_colors, false);
    std::vector<bool> endColorBits(nb_colors, false);

    getColorBitsOfPosition(ucm, startColorBits, 0);
    getColorBitsOfPosition(ucm, middleColorBits, mid);
    getColorBitsOfPosition(ucm, endColorBits, end);

    for (size_t i = 0; i < nb_colors; ++i)
        if (logicDecision(startColorBits[i], middleColorBits[i], endColorBits[i]))
            color_indices.push_back(i);
}


/**
 *          Multik main routine.
 *  @return 0=success, 1=error
 */
int popins2_multik(int argc, char const *argv[]){
    // =====================
    // Argument parsing
    // =====================
    MultikOptions mko;
    seqan::ArgumentParser::ParseResult res = parseCommandLine(mko, argc, argv);
    if (res != seqan::ArgumentParser::PARSE_OK){
        if (res == seqan::ArgumentParser::PARSE_HELP)
            return 0;
        cerr << "[popins2 multik] seqan::ArgumentParser::PARSE_ERROR" << endl;
        return 1;
    }

    printMultikOptions(mko);

    int delta_k = mko.delta_k;
    int k_max   = mko.k_max;

    std::ostringstream msg;
    msg << "[popins2 multik] Starting ..."; printTimeStatus(msg);

    // manage a temporary directory
    if (strcmp(mko.tempPath.c_str(), "") != 0)      // if mko.tempPath != ""
        (mkdir(mko.tempPath.c_str(), 0750) == -1) ? cerr << "Error :  " << strerror(errno) << endl : cout << "[popins2 multik] Temp directory created.\n";

    // read original FASTQ samples
    //std::vector<std::string> samples;
    //getFastx(samples, mko.samplePath);

    // create a FASTA at tempDir for every input FASTQ
    std::vector<std::string> temp_fastas;
    bool fastq2fasta_failed = false;

    for (auto &sample : mko.inputFiles){
        bool ret = fastq2fasta(sample, mko.tempPath, temp_fastas);
        fastq2fasta_failed = fastq2fasta_failed || ret;
    }
    if (fastq2fasta_failed){
        std::cerr << "[Error] Initial FASTQ to FASTA conversion failed]" << '\n';
        return 1;
    }

    (mko.inputFiles).clear();

    // =====================
    // Graph options
    // =====================
    CCDBG_Build_opt opt;
    opt.filename_seq_in   = temp_fastas;
    opt.deleteIsolated    = true;
    opt.clipTips          = true;
    opt.useMercyKmers     = false;
    opt.prefixFilenameOut = mko.prefixFilenameOut;
    opt.nb_threads        = 16;
    opt.outputGFA         = true;
    opt.verbose           = false;
    opt.k                 = mko.k_init;

    // =====================
    // Multi-k framework
    // =====================
    unsigned k_iter_counter = 0;
    while (opt.k <= k_max) {
        ++k_iter_counter;
        msg << "[popins2 multik] Multi-k iteration "+std::to_string(k_iter_counter)+" using k="+std::to_string(opt.k); printTimeStatus(msg);

        ColoredCDBG<> g(opt.k);

        msg << "[popins2 multik] Building dBG..."; printTimeStatus(msg);
        g.buildGraph(opt);

        msg << "[popins2 multik] Simplifying dBG..."; printTimeStatus(msg);
        g.simplify(opt.deleteIsolated, opt.clipTips, opt.verbose);

        msg << "[popins2 multik] Annotating colors..."; printTimeStatus(msg);
        g.buildColors(opt);

        opt.k += delta_k;

        // exit point
        if (opt.k > k_max){
            msg << "[popins2 multik] Writing "+opt.prefixFilenameOut+".gfa of de Bruijn Graph built with k="+std::to_string(opt.k - delta_k)+"..."; printTimeStatus(msg);
            g.write(opt.prefixFilenameOut, opt.nb_threads, opt.verbose);
            break;
        }

        // get sample-unitig assignment
        msg << "[popins2 multik] Get sample-unitig assignment..."; printTimeStatus(msg);
        const size_t nb_colors = g.getNbColors();
        std::unordered_map <std::string, std::vector<unsigned> > unitig_propagation_table;
        std::vector<size_t> color_indices;
        unsigned ucm_index = 0;

        for (auto &ucm : g){
            // don't process unitigs of size less than current k, they wouldn't survive the include of the next k iteration anyway
            if (ucm.size < (unsigned)opt.k){
                ++ucm_index;
                continue;
            }

            colorProbing(ucm, color_indices, nb_colors);

            for (size_t i = 0; i < color_indices.size(); ++i)
                unitig_propagation_table[g.getColorName(i)].push_back(ucm_index);

            ++ucm_index;
            color_indices.clear();
        }

        // write unitig file per sample
        msg << "[popins2 multik] Adding (k + delta_k)-unitigs to temp FASTAs..."; printTimeStatus(msg);
        for (auto sample = unitig_propagation_table.cbegin(); sample != unitig_propagation_table.cend(); ++sample){

            std::ofstream stream(sample->first, std::ofstream::out | std::ofstream::app);
            if (!stream.good())
            {
                std::cerr << "ERROR: Could not open sample file \'" << sample->first << "\' for writing." << std::endl;
                return 1;
            }

            // iterators pointing to the unitig ID list of a sample
            std::vector<unsigned>::const_iterator idx = sample->second.cbegin();
            std::vector<unsigned>::const_iterator idx_end = sample->second.cend();

            // loop through graph
            unsigned current_ucm_idx = 0;
            for (auto &ucm : g){

                if (*idx == current_ucm_idx){

                    // write unitig to file
                    stream << ">unitig_" << std::to_string(current_ucm_idx) << "\n";
                    stream << ucm.referenceUnitigToString() << "\n";

                    ++idx;

                    if (idx == idx_end)
                        break;
                }

                current_ucm_idx += 1;
            }

            stream.close();
        }

    }

    // =====================
    // EOF
    // =====================
    msg << "[popins2 multik] Done."; printTimeStatus(msg);

    return 0;
}



#endif /*POPINS2_MULTIK_H_*/