From e03b0e22e3ffc403301f6207dff71604c451a93c Mon Sep 17 00:00:00 2001
From: kdc715 <kedic@LIT-PF3VB193.localdomain>
Date: Fri, 21 Feb 2025 15:02:07 +0100
Subject: [PATCH] workflow can either automatically detect samples from
 input_dir or analyze only the samples provided in the config file; drop the
 deprecated conda defaults channel; add logging to the python script

---
 Snakefile                             |  24 ++-
 config/snake_config.yaml              | 165 +++++++++---------
 snakemodules/contigmap.smk            |   8 +-
 snakemodules/envs/biopython.yml       |   1 -
 snakemodules/envs/samtools21.yml      |   1 -
 snakemodules/merge.smk                |   8 -
 .../scripts/remap_classified_human.py |  76 ++++----
 7 files changed, 147 insertions(+), 136 deletions(-)

diff --git a/Snakefile b/Snakefile
index 908ac1d..701b385 100644
--- a/Snakefile
+++ b/Snakefile
@@ -6,6 +6,7 @@
 # This allows snakemake load yaml for configuration
 import glob
 import os
+import re
 
 configfile: os.path.join(workflow.basedir,"config","snake_config.yaml")
 configfile: os.path.join(workflow.basedir,"config","cluster_config.yaml")
@@ -31,20 +32,29 @@ relative_paths = {
     VELVET := os.path.join(WORKFLOW_PATH, config["VELVET_BIN"])
 }
 
+# Natural sort helper
+def natural_key(s):
+    # Splits on any digits (\d+) and tries to convert them to integers,
+    # ensuring numeric chunks are sorted as numbers, not strings.
+    return [int(chunk) if chunk.isdigit() else chunk for chunk in re.split(r'(\d+)', s)]
 
 ################################################################################
 # Automatically grabbing sample names from INPUT_DIR
-
+selected_samples = config["SAMPLES"]
 # Recursively find all BAM files in INPUT_DIR (including subdirectories)
 bam_files = glob.glob(os.path.join(INPUT_DIR, "**", "*.bam"), recursive=True)
+# Extract sample names (assuming the sample name is the filename without the .bam extension)
+auto_samples = [os.path.splitext(os.path.basename(bam))[0] for bam in bam_files]
 
-# Extract sample names from the BAM file basenames
-# Assuming the sample name is the filename without the .bam extension.
-SAMPLES = sorted(set(os.path.splitext(os.path.basename(b))[0] for b in bam_files))
-sample_num = len(SAMPLES)
+# If the user specified a list of samples, filter to those; otherwise, take all
+if selected_samples:
+    auto_samples = set(auto_samples).intersection(selected_samples)
+# Sort the sample names with the natural-sort key so numeric suffixes order as numbers.
+SAMPLES = sorted(auto_samples, key=natural_key)
+sample_num = len(SAMPLES)
 ################################################################################
 
 input_files = [
@@ -56,7 +66,9 @@ input_files = [
 
 if config["remove_contamination"]=="no":
     input_files.append(expand("{p}/{s}/{f}", p=WORK_DIR, s=SAMPLES, f=["POPINS_SAMPLE_INFO"]))
-
+elif config["remove_contamination"]=="yes":
+    input_files.append(expand("{p}/{s}/contaminate_info/paired_report.txt", p=WORK_DIR, s=SAMPLES))
+    input_files.append(expand("{p}/{s}/contaminate_info/single_report.txt", p=WORK_DIR, s=SAMPLES))
 
 if config["ANALYSIS"]=="yes":
     input_files.append(expand("{p}/{f}", p=RESULTS_DIR, f=["analysis_table.png","gc_content_dist.png","contig_length_dist.png","heatmap_coverage.png"]))
diff --git a/config/snake_config.yaml b/config/snake_config.yaml
index 47c06b6..087f754 100644
--- a/config/snake_config.yaml
+++ b/config/snake_config.yaml
@@ -1,79 +1,86 @@
-### Define references here
-REFERENCE:
-  example_data/ref/chr21_ins.fa
-
-### Provide input sample folder here
-INPUT_DIR:
-  example_data/
-
-### Provide full path of where your workflow is located
-# this path is a prefix for submodules and log files within the workflow
-# e.g.: /home/user/popinSnake
-WORKFLOW_PATH:
-  /home/user/popinSnake/
-
-### Provide full path of where you wish the analyzed output to be stored
-# this path is a prefix for WORK_DIR and RESULTS_DIR
-# e.g.: /home/user/../output
-OUTPUT_PATH:
-  /home/user/popinSnake/
-
-### Define workflow behaviour here
-SICKLE:
-  "no"
-ASSEMBLER:
-  "minia"
-ANALYSIS:
-  "yes"
-
-### Define contamination removal module
-remove_contamination:
-  "yes"
-# Provid an alternative reference genome after contamination removal to ensure that cleaned reads are accurately realigned
-ALTREF:
-  example_data/altref/chr21_remap.fa
-# Define where the kraken database is installed
-kraken_db_path:
-  example_data/kraken_db
-# deprecated parameters for snakefile download_krakendb
-# check readme file for more information on how to download krakendb
-# Pipeline to automatically download the chosen kraken database
-#install_kraken_db:
-#  "no"
-#kraken_lib:
-#  "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20240605.tar.gz"
-
-### Define other workflow parameter values here
-receive_email_notifications: "no" # choose between "yes" or "no" to receive email notifications or not
-email: "user@example.com" # specify your email address here to receive email notifications before each notebook execution.
-kmerlength: 29 # for velvet assembly
-k_merge: 63 # for rule popins2_merge_contigs
-readlen: 150 # for rules popins2_place_refalign and popins2_place_splitalign
-min-qual: 30 # for rule popins2_crop_unmapped, Minimum average quality value in windows for read quality trimming.
-min-read-len: 60 # for rule popins2_crop_unmapped, Minimum read length to keep the read after quality trimming.
-
-
-# Sub-directories for OUTPUT_PATH
-# These folders will be automatically created after snakemake execution
-# WORK_DIR includes intermediate output, processed data
-# RESUTLS_DIR includes analyzed output, final results
-WORK_DIR:
-  workdir
-RESULTS_DIR:
-  results
-
-# Define paths to binaries:
-# (Change only if you are not using the default installation locations.)
-POPINS2_BIN:
-  submodules/popins4snake/popins4snake
-MINIA_BIN:
-  submodules/gatb-minia-pipeline/gatb
-SICKLE_BIN:
-  submodules/sickle/sickle
-VELVET_BIN:
-  submodules/velvet
-# Part of the contamination removal module
-python_script:
-  snakemodules/scripts/remap_classified_human.py
-
-
+### Define *full path* of references here
+REFERENCE:
+  /home/user/popinSnake/example_data/ref/chr21_ins.fa
+
+### Provide *full path* of input sample folder here
+INPUT_DIR:
+  /home/user/popinSnake/example_data/
+
+### Provide selected sample names from the input sample folder (optional)
+### If no list of samples is provided, all samples found under INPUT_DIR will be analyzed
+SAMPLES:
+  # - S0001
+  # - S0002
+  # - S0003
+
+### Provide *full path* of where your workflow is located
+# this path will be a prefix for submodules and log files within the workflow
+# e.g.: /home/user/popinSnake
+WORKFLOW_PATH:
+  /home/user/popinSnake/
+
+### Provide *full path* of where you wish the analyzed output to be stored
+# this path is a prefix for WORK_DIR and RESULTS_DIR
+# e.g.: /home/user/../output
+OUTPUT_PATH:
+  /home/user/output_path/
+
+### Choose workflow behaviour here
+SICKLE:
+  "yes"
+ASSEMBLER:
+  "minia"
+ANALYSIS:
+  "yes"
+
+### Define contamination removal module
+remove_contamination:
+  "yes"
+# Provide an alternative reference genome after contamination removal to ensure that cleaned reads are accurately realigned
+ALTREF:
+  /home/user/popinSnake/example_data/altref/chr21_remap.fa
+# Define the *full path* of where the kraken database is installed
+kraken_db_path:
+  /home/user/popinSnake/example_data/kraken_db/
+# deprecated parameters for snakefile download_krakendb
+# check readme file for more information on how to download krakendb
+# Pipeline to automatically download the chosen kraken database
+#install_kraken_db:
+#  "no"
+#kraken_lib:
+#  "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20240605.tar.gz"
+
+### Define other workflow parameter values here
+receive_email_notifications: "no" # choose between "yes" or "no" to receive email notifications or not
+email: "user@example.com" # specify your email address here to receive email notifications before each notebook execution.
+kmerlength: 29 # for velvet assembly
+k_merge: 63 # for rule popins2_merge_contigs
+readlen: 150 # for rules popins2_place_refalign and popins2_place_splitalign
+min-qual: 30 # for rule popins2_crop_unmapped, Minimum average quality value in windows for read quality trimming.
+min-read-len: 60 # for rule popins2_crop_unmapped, Minimum read length to keep the read after quality trimming.
+
+
+# Sub-directories for OUTPUT_PATH
+# These folders will be automatically created after snakemake execution
+# WORK_DIR includes intermediate output, processed data
+# RESULTS_DIR includes analyzed output, final results
+WORK_DIR:
+  workdir
+RESULTS_DIR:
+  results
+
+# Define paths to binaries:
+# (Change only if you are not using the default installation locations.)
+POPINS2_BIN:
+  submodules/popins4snake/popins4snake
+MINIA_BIN:
+  submodules/gatb-minia-pipeline/gatb
+SICKLE_BIN:
+  submodules/sickle/sickle
+VELVET_BIN:
+  submodules/velvet
+# Part of the contamination removal module
+python_script:
+  snakemodules/scripts/remap_classified_human.py
+
+
diff --git a/snakemodules/contigmap.smk b/snakemodules/contigmap.smk
index 165d41a..ca26a47 100644
--- a/snakemodules/contigmap.smk
+++ b/snakemodules/contigmap.smk
@@ -41,8 +41,8 @@ if config["remove_contamination"] == 'yes':
             os.path.join(WORKFLOW_PATH,"snakemodules/envs/bwa.yml")
         resources:
             # Dynamically iterating mem
-            # mem_mb = lambda wildcards, attempt: config["dynamic_schedule"]["mem_init"]* (2** (attempt - 1))
-            mem_mb = resources["bwa"]["mem"],
+            mem_mb = lambda wildcards, attempt: config["dynamic_schedule"]["mem_init"]* (2** (attempt - 1)),
+            # mem_mb = resources["bwa"]["mem"],
             runtime = resources["bwa"]["time"]
         threads:
             threads["multi"]["bwa"]
@@ -71,8 +71,8 @@ elif config["remove_contamination"] == 'no':
             os.path.join(WORKFLOW_PATH,"snakemodules/envs/bwa.yml")
         resources:
             # Dynamically iterating mem
-            # mem_mb = lambda wildcards, attempt: config["dynamic_schedule"]["mem_init"]* (2** (attempt - 1))
-            mem_mb = resources["bwa"]["mem"],
+            mem_mb = lambda wildcards, attempt: config["dynamic_schedule"]["mem_init"]* (2** (attempt - 1)),
+            # mem_mb = resources["bwa"]["mem"],
             runtime = resources["bwa"]["time"]
         threads:
             threads["multi"]["bwa"]
diff --git a/snakemodules/envs/biopython.yml b/snakemodules/envs/biopython.yml
index 8a3b302..97a0f1f 100644
--- a/snakemodules/envs/biopython.yml
+++ b/snakemodules/envs/biopython.yml
@@ -1,7 +1,6 @@
 name: biopython
 channels:
   - conda-forge
-  - defaults
 dependencies:
   - python=3.9
   - biopython
\ No newline at end of file
diff --git a/snakemodules/envs/samtools21.yml b/snakemodules/envs/samtools21.yml
index 3e55604..1cbcdfa 100644
--- a/snakemodules/envs/samtools21.yml
+++ b/snakemodules/envs/samtools21.yml
@@ -2,7 +2,6 @@ name: samtools21
 channels:
   - conda-forge
   - bioconda
-  - defaults
 dependencies:
   - _libgcc_mutex=0.1
   - _openmp_mutex=4.5
diff --git a/snakemodules/merge.smk b/snakemodules/merge.smk
index 0398011..876808a 100644
--- a/snakemodules/merge.smk
+++ b/snakemodules/merge.smk
@@ -1,9 +1,3 @@
-# WORK_DIR = config["WORK_DIR"]
-# RESULTS_DIR = config["RESULTS_DIR"]
-# REFERENCE = config["REFERENCE"]
-# SAMPLES = config["SAMPLES"]
-# ASSEMBLER = config["ASSEMBLER"]
-
 
 rule popins2_merge_contigs:
     input:
@@ -11,8 +5,6 @@ rule popins2_merge_contigs:
         os.path.join(RESULTS_DIR, "analysis_table.png") if config["ANALYSIS"]=="yes" else [],
         os.path.join(RESULTS_DIR, "contig_length_dist.png") if config["ANALYSIS"]=="yes" else [],
         os.path.join(RESULTS_DIR, "gc_content_dist.png") if config["ANALYSIS"]=="yes" else []
-
-        # expand("{p}/{f}",p=RESULTS_DIR, f=["analysis_table.png","contig_length_dist.png","gc_content_dist.png"])
     output:
         supercontigs = RESULTS_DIR + "/supercontigs.fa",
         gfa = RESULTS_DIR + "/supercontigs.gfa",
diff --git a/snakemodules/scripts/remap_classified_human.py b/snakemodules/scripts/remap_classified_human.py
index f49926e..334897f 100644
--- a/snakemodules/scripts/remap_classified_human.py
+++ b/snakemodules/scripts/remap_classified_human.py
@@ -1,75 +1,77 @@
 from Bio import SeqIO
 import argparse
+import os
+import logging
 
-
-# Parse classification files and extract relevant read IDs
-# the classification files are Tab seperated files from kraken output
 def extract_human_read_ids(classification_file):
     human_reads = set()
+    lines_processed = 0
     with open(classification_file, 'r') as f:
         for line in f:
+            lines_processed += 1
             parts = line.strip().split()
-            #first column contains classification information of classified/unclassified(C/U)
+            if len(parts) < 3:
+                logging.warning(f"Line {lines_processed} in {classification_file} skipped (insufficient columns): {line.strip()}")
+                continue
+            # First column: classification (C/U)
             classification = parts[0]
-            #second column contains id of each reads from the fastq files
+            # Second column: read ID
             read_id = parts[1]
-            #third column contains taxonomy id of either human-'9606' or other species
+            # Third column: taxonomy ID (human is '9606')
             taxonomy = parts[2]
 
             if classification == 'C' and taxonomy == '9606':
                 human_reads.add(read_id)
+    logging.info(f"Extracted {len(human_reads)} human read IDs from {classification_file} (processed {lines_processed} lines)")
     return human_reads
 
-# Extract human reads from fastq file based on read IDs
 def extract_reads_from_fastq(fastq_file, human_read_ids, output_fastq):
+    total_records = 0
+    human_records = 0
     with open(output_fastq, 'w') as out_fq:
         for record in SeqIO.parse(fastq_file, "fastq"):
+            total_records += 1
             if record.id in human_read_ids:
                 SeqIO.write(record, out_fq, "fastq")
+                human_records += 1
+    logging.info(f"Processed {total_records} records from {fastq_file}; extracted {human_records} human reads into {output_fastq}")
 
 
 def main(single_output, paired_output, single_fq, paired_fq1, paired_fq2, output_prefix):
-    # Extract human read IDs from the output files
+    # Extract human read IDs from the classification files
+    logging.info("Starting extraction of human read IDs from classified single reads...")
     single_human_reads = extract_human_read_ids(single_output)
+    logging.info("Starting extraction of human read IDs from classified paired reads...")
     paired_human_reads = extract_human_read_ids(paired_output)
 
-    # Combine the read IDs from single and paired reads
+    # Combine the read IDs from single and paired outputs
     all_human_reads = single_human_reads.union(paired_human_reads)
 
-    # Extract human reads from the fastq files
-    extract_reads_from_fastq(single_fq, all_human_reads, f'{output_prefix}/human_classified_single.fastq')
-    extract_reads_from_fastq(paired_fq1, all_human_reads, f'{output_prefix}/human_classified_paired_1.fastq')
-    extract_reads_from_fastq(paired_fq2, all_human_reads, f'{output_prefix}/human_classified_paired_2.fastq')
+    logging.info(f"Total unique human read IDs combined: {len(all_human_reads)}")
+
+    # Construct output file paths using os.path.join
+    single_out = os.path.join(output_prefix, 'human_classified_single.fastq')
+    paired_out1 = os.path.join(output_prefix, 'human_classified_paired_1.fastq')
+    paired_out2 = os.path.join(output_prefix, 'human_classified_paired_2.fastq')
 
-    print(f"Human reads extracted into '{output_prefix}/human_classified_single.fastq', '{output_prefix}/human_classified_paired_1.fastq', and '{output_prefix}/human_classified_paired_2.fastq'")
+    # Extract human reads from the FASTQ files
+    logging.info("Extracting human reads from FASTQ files...")
+    extract_reads_from_fastq(single_fq, all_human_reads, single_out)
+    extract_reads_from_fastq(paired_fq1, all_human_reads, paired_out1)
+    extract_reads_from_fastq(paired_fq2, all_human_reads, paired_out2)
+
+    logging.info("Extraction complete.")
+    logging.info(f"Outputs saved to:\n  {single_out}\n  {paired_out1}\n  {paired_out2}")
 
 
 if __name__ == "__main__":
+    # Set up logging to include the time, log level, and message
+    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
     parser = argparse.ArgumentParser(description="Extract human reads based on classification and taxonomy.")
     parser.add_argument("--classified_single", required=True, help="Path to the single_output.txt file")
     parser.add_argument("--classified_paired", required=True, help="Path to the paired_output.txt file")
     parser.add_argument("--single_fq", required=True, help="Path to the single.fq file")
     parser.add_argument("--paired_fq1", required=True, help="Path to the paired.1.fq file")
     parser.add_argument("--paired_fq2", required=True, help="Path to the paired.2.fq file")
-    parser.add_argument("--output_prefix", required=True, help="Prefix for the output fastq files")
-    # parser.add_argument("--single_out", default="human_single.fastq", help="Output name for the single fastq file (default: human_single.fastq)")
-    # parser.add_argument("--paired1_out", default="human_paired_1.fastq", help="Output name for the first paired fastq file (default: human_paired_1.fastq)")
-    # parser.add_argument("--paired2_out", default="human_paired_2.fastq", help="Output name for the second paired fastq file (default: human_paired_2.fastq)")
-
-
+    parser.add_argument("--output_prefix", required=True, help="Directory for the output FASTQ files")
+
     args = parser.parse_args()
-
     main(args.classified_single, args.classified_paired, args.single_fq, args.paired_fq1, args.paired_fq2, args.output_prefix)
-# # Combine all the reads into one fastq file
-# with open('all_human_reads.fastq', 'w') as combined_output:
-#     with open('human_single.fastq', 'r') as single_fq:
-#         combined_output.write(single_fq.read())
-#     with open('human_paired_1.fastq', 'r') as paired1_fq:
-#         combined_output.write(paired1_fq.read())
-#     with open('human_paired_2.fastq', 'r') as paired2_fq:
-#         combined_output.write(paired2_fq.read())
-# # Clean up temporary files
-# import os
-# os.remove('human_single.fastq')
-# os.remove('human_paired_1.fastq')
-# os.remove('human_paired_2.fastq')
-# print("All human reads have been extracted and saved to 'all_human_reads.fastq'")
\ No newline at end of file
-- 
GitLab
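
Note on the new sample selection: the helper added to the Snakefile can be exercised on its own outside Snakemake. The sketch below uses made-up sample names (S1, S2, S10) to show how natural_key keeps numeric suffixes in numeric order and how an optional SAMPLES list from the config narrows the run; everything else mirrors the lines added above.

import re

def natural_key(s):
    # Numeric chunks compare as integers, so "S2" sorts before "S10".
    return [int(chunk) if chunk.isdigit() else chunk for chunk in re.split(r'(\d+)', s)]

# Hypothetical BAM basenames found under INPUT_DIR.
auto_samples = ["S10", "S2", "S1"]

# Optional selection, mirroring config["SAMPLES"]; None or an empty list keeps all samples.
selected_samples = ["S2", "S10"]
if selected_samples:
    auto_samples = set(auto_samples).intersection(selected_samples)

SAMPLES = sorted(auto_samples, key=natural_key)
print(SAMPLES)  # ['S2', 'S10'] -- a plain sorted() would give ['S10', 'S2']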
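
The per-line filter that the new log messages summarise reduces to the column checks in extract_human_read_ids(). Below is a minimal sketch with made-up Kraken-style classification lines; the read IDs and the non-human taxid are illustrative, only the column layout and the human taxid 9606 come from the script.

classified_lines = [
    "C\tread_001\t9606\t150\t9606:120",  # classified as human (taxid 9606) -> kept
    "C\tread_002\t562\t150\t562:95",     # classified, but not human -> skipped
    "U\tread_003\t0\t150\t",             # unclassified -> skipped
]

human_reads = set()
for line in classified_lines:
    parts = line.strip().split()
    if len(parts) < 3:
        continue  # the script logs a warning for such lines
    classification, read_id, taxonomy = parts[0], parts[1], parts[2]
    if classification == 'C' and taxonomy == '9606':
        human_reads.add(read_id)

print(human_reads)  # {'read_001'}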