From e03b0e22e3ffc403301f6207dff71604c451a93c Mon Sep 17 00:00:00 2001
From: kdc715 <kedic@LIT-PF3VB193.localdomain>
Date: Fri, 21 Feb 2025 15:02:07 +0100
Subject: [PATCH] workflow can either automatically detect samples from
 input_dir or analyze only samples provided in the config file; remove the
 deprecated conda defaults channel; add logging for python script
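
As a sketch of the intended usage (sample names are illustrative): leaving
SAMPLES empty in config/snake_config.yaml keeps the auto-detection of every
*.bam found under INPUT_DIR, while restricting a run to a subset of samples
could look like:

    SAMPLES:
       - S0001
       - S0002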

---
 Snakefile                                     |  24 ++-
 config/snake_config.yaml                      | 165 +++++++++---------
 snakemodules/contigmap.smk                    |   8 +-
 snakemodules/envs/biopython.yml               |   1 -
 snakemodules/envs/samtools21.yml              |   1 -
 snakemodules/merge.smk                        |   8 -
 .../scripts/remap_classified_human.py         |  76 ++++----
 7 files changed, 147 insertions(+), 136 deletions(-)

diff --git a/Snakefile b/Snakefile
index 908ac1d..701b385 100644
--- a/Snakefile
+++ b/Snakefile
@@ -6,6 +6,7 @@
 # This allows snakemake load yaml for configuration
 import glob
 import os
+import re
 
 configfile: os.path.join(workflow.basedir,"config","snake_config.yaml")
 configfile: os.path.join(workflow.basedir,"config","cluster_config.yaml")
@@ -31,20 +32,29 @@ relative_paths = {
     VELVET := os.path.join(WORKFLOW_PATH, config["VELVET_BIN"])
     }
 
+# Natural sort helper
+def natural_key(s):
+    # Split the string on runs of digits (\d+) and convert those runs to integers,
+    # so numeric chunks are compared as numbers rather than strings.
+    return [int(chunk) if chunk.isdigit() else chunk for chunk in re.split(r'(\d+)', s)]
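+# e.g. sorted(["S10", "S2", "S1"], key=natural_key) -> ["S1", "S2", "S10"] (illustrative names)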
 
 ################################################################################
 # Automatically grabbing sample names from INPUT_DIR
-
+# Optional list of sample names from the config; empty/None means "analyze all samples"
+selected_samples = config.get("SAMPLES")
 # Recursively find all BAM files in INPUT_DIR (including subdirectories)
 bam_files = glob.glob(os.path.join(INPUT_DIR, "**", "*.bam"), recursive=True)
+# Extract sample names (assuming the sample name is the filename without the .bam extension)
+auto_samples = [os.path.splitext(os.path.basename(bam))[0] for bam in bam_files]
 
-# Extract sample names from the BAM file basenames
-# Assuming the sample name is the filename without the .bam extension.
-SAMPLES = sorted(set(os.path.splitext(os.path.basename(b))[0] for b in bam_files))
 
-sample_num = len(SAMPLES)
+# If the user specified a list of samples, filter to those; otherwise, take all
+if selected_samples:
+    auto_samples = set(auto_samples).intersection(selected_samples)
 
+# Deduplicate and sort sample names naturally (e.g. S2 before S10)
+SAMPLES = sorted(set(auto_samples), key=natural_key)
 
+sample_num = len(SAMPLES)
 ################################################################################
 
 input_files = [
@@ -56,7 +66,9 @@ input_files = [
 
 if config["remove_contamination"]=="no":
     input_files.append(expand("{p}/{s}/{f}", p=WORK_DIR, s=SAMPLES, f=["POPINS_SAMPLE_INFO"]))
-
+elif config["remove_contamination"]=="yes":
+    input_files.append(expand("{p}/{s}/contaminate_info/paired_report.txt", p=WORK_DIR, s=SAMPLES))
+    input_files.append(expand("{p}/{s}/contaminate_info/single_report.txt", p=WORK_DIR, s=SAMPLES))
 
 if config["ANALYSIS"]=="yes":
     input_files.append(expand("{p}/{f}", p=RESULTS_DIR, f=["analysis_table.png","gc_content_dist.png","contig_length_dist.png","heatmap_coverage.png"]))
diff --git a/config/snake_config.yaml b/config/snake_config.yaml
index 47c06b6..087f754 100644
--- a/config/snake_config.yaml
+++ b/config/snake_config.yaml
@@ -1,79 +1,86 @@
-### Define references here
-REFERENCE:
-   example_data/ref/chr21_ins.fa
-
-### Provide input sample folder here
-INPUT_DIR:
-   example_data/
-
-### Provide full path of where your workflow is located
-# this path is a prefix for submodules and log files within the workflow
-# e.g.: /home/user/popinSnake
-WORKFLOW_PATH:
-   /home/user/popinSnake/
-
-### Provide full path of where you wish the analyzed output to be stored
-# this path is a prefix for WORK_DIR and RESULTS_DIR
-# e.g.: /home/user/../output
-OUTPUT_PATH:
-   /home/user/popinSnake/
-
-### Define workflow behaviour here
-SICKLE:
-   "no"
-ASSEMBLER:
-   "minia"
-ANALYSIS:
-   "yes"
-
-### Define contamination removal module
-remove_contamination:
-   "yes"
-# Provid an alternative reference genome after contamination removal to ensure that cleaned reads are accurately realigned
-ALTREF:
-   example_data/altref/chr21_remap.fa
-# Define where the kraken database is installed
-kraken_db_path:
-   example_data/kraken_db
-# deprecated parameters for snakefile download_krakendb
-# check readme file for more information on how to download krakendb
-# Pipeline to automatically download the chosen kraken database
-#install_kraken_db:
-#   "no"
-#kraken_lib:
-#   "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20240605.tar.gz"
-
-### Define other workflow parameter values here
-receive_email_notifications: "no" # choose between "yes" or "no" to receive email notifications or not
-email: "user@example.com" # specify your email address here to receive email notifications before each notebook execution.
-kmerlength: 29  # for velvet assembly
-k_merge: 63     # for rule popins2_merge_contigs
-readlen: 150    # for rules popins2_place_refalign and popins2_place_splitalign
-min-qual: 30    # for rule popins2_crop_unmapped, Minimum average quality value in windows for read quality trimming.
-min-read-len: 60 # for rule popins2_crop_unmapped, Minimum read length to keep the read after quality trimming.
-
-
-# Sub-directories for OUTPUT_PATH
-# These folders will be automatically created after snakemake execution
-# WORK_DIR includes intermediate output, processed data
-# RESUTLS_DIR includes analyzed output, final results
-WORK_DIR:
-   workdir
-RESULTS_DIR:
-   results
-
-# Define paths to binaries:
-# (Change only if you are not using the default installation locations.)
-POPINS2_BIN:
-   submodules/popins4snake/popins4snake
-MINIA_BIN:
-   submodules/gatb-minia-pipeline/gatb
-SICKLE_BIN:
-   submodules/sickle/sickle
-VELVET_BIN:
-   submodules/velvet
-# Part of the contamination removal module
-python_script:
-   snakemodules/scripts/remap_classified_human.py
-
-
+### Define *full path* of references here
+REFERENCE:
+   /home/user/popinSnake/example_data/ref/chr21_ins.fa
+
+### Provide *full path* of input sample folder here
+INPUT_DIR:
+   /home/user/popinSnake/example_data/
+
+### Optionally provide selected sample names from the input sample folder
+### If no list of samples is provided, all samples found under INPUT_DIR will be analyzed
+SAMPLES:
+   # - S0001
+   # - S0002
+   # - S0003
+
+### Provide *full path* of where your workflow is located
+# this path will be a prefix for submodules and log files within the workflow
+# e.g.: /home/user/popinSnake
+WORKFLOW_PATH:
+   /home/user/popinSnake/
+
+### Provide *full path* of where you wish the analyzed output to be stored
+# this path is a prefix for WORK_DIR and RESULTS_DIR
+# e.g.: /home/user/../output
+OUTPUT_PATH:
+   /home/user/output_path/
+
+### Choose workflow behaviour here
+SICKLE:
+   "yes"
+ASSEMBLER:
+   "minia"
+ANALYSIS:
+   "yes"
+
+### Define contamination removal module
+remove_contamination:
+   "yes"
+# Provide an alternative reference genome used after contamination removal to ensure that cleaned reads are accurately realigned
+ALTREF:
+   /home/user/popinSnake/example_data/altref/chr21_remap.fa
+# Define the *full path* of where the kraken database is installed
+kraken_db_path:
+   /home/user/popinSnake/example_data/kraken_db/
+# deprecated parameters for snakefile download_krakendb
+# check readme file for more information on how to download krakendb
+# Pipeline to automatically download the chosen kraken database
+#install_kraken_db:
+#   "no"
+#kraken_lib:
+#   "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20240605.tar.gz"
+
+### Define other workflow parameter values here
+receive_email_notifications: "no" # "yes" or "no": whether to receive email notifications
+email: "user@example.com" # specify your email address here to receive email notifications before each notebook execution.
+kmerlength: 29  # for velvet assembly
+k_merge: 63     # for rule popins2_merge_contigs
+readlen: 150    # for rules popins2_place_refalign and popins2_place_splitalign
+min-qual: 30    # for rule popins2_crop_unmapped, Minimum average quality value in windows for read quality trimming.
+min-read-len: 60 # for rule popins2_crop_unmapped, Minimum read length to keep the read after quality trimming.
+
+
+# Sub-directories for OUTPUT_PATH
+# These folders will be automatically created after snakemake execution
+# WORK_DIR includes intermediate output, processed data
+# RESULTS_DIR includes analyzed output, final results
+WORK_DIR:
+   workdir
+RESULTS_DIR:
+   results
+
+# Define paths to binaries:
+# (Change only if you are not using the default installation locations.)
+POPINS2_BIN:
+   submodules/popins4snake/popins4snake
+MINIA_BIN:
+   submodules/gatb-minia-pipeline/gatb
+SICKLE_BIN:
+   submodules/sickle/sickle
+VELVET_BIN:
+   submodules/velvet
+# Part of the contamination removal module
+python_script:
+   snakemodules/scripts/remap_classified_human.py
+
+
diff --git a/snakemodules/contigmap.smk b/snakemodules/contigmap.smk
index 165d41a..ca26a47 100644
--- a/snakemodules/contigmap.smk
+++ b/snakemodules/contigmap.smk
@@ -41,8 +41,8 @@ if config["remove_contamination"] == 'yes':
             os.path.join(WORKFLOW_PATH,"snakemodules/envs/bwa.yml")
         resources:
             # Dynamically iterating mem
-            # mem_mb = lambda wildcards, attempt: config["dynamic_schedule"]["mem_init"]* (2** (attempt - 1))
-            mem_mb = resources["bwa"]["mem"],
+            mem_mb = lambda wildcards, attempt: config["dynamic_schedule"]["mem_init"]* (2** (attempt - 1)),
+            # mem_mb = resources["bwa"]["mem"],
             runtime = resources["bwa"]["time"]
         threads: 
             threads["multi"]["bwa"]
@@ -71,8 +71,8 @@ elif config["remove_contamination"] == 'no':
             os.path.join(WORKFLOW_PATH,"snakemodules/envs/bwa.yml")
         resources:
             # Dynamically iterating mem
-            # mem_mb = lambda wildcards, attempt: config["dynamic_schedule"]["mem_init"]* (2** (attempt - 1))
-            mem_mb = resources["bwa"]["mem"],
+            mem_mb = lambda wildcards, attempt: config["dynamic_schedule"]["mem_init"]* (2** (attempt - 1)),
+            # mem_mb = resources["bwa"]["mem"],
             runtime = resources["bwa"]["time"]
         threads: 
             threads["multi"]["bwa"]
diff --git a/snakemodules/envs/biopython.yml b/snakemodules/envs/biopython.yml
index 8a3b302..97a0f1f 100644
--- a/snakemodules/envs/biopython.yml
+++ b/snakemodules/envs/biopython.yml
@@ -1,7 +1,6 @@
 name: biopython
 channels:
   - conda-forge
-  - defaults
 dependencies:
   - python=3.9  
   - biopython    
\ No newline at end of file
diff --git a/snakemodules/envs/samtools21.yml b/snakemodules/envs/samtools21.yml
index 3e55604..1cbcdfa 100644
--- a/snakemodules/envs/samtools21.yml
+++ b/snakemodules/envs/samtools21.yml
@@ -2,7 +2,6 @@ name: samtools21
 channels:
   - conda-forge
   - bioconda
-  - defaults
 dependencies:
   - _libgcc_mutex=0.1
   - _openmp_mutex=4.5
diff --git a/snakemodules/merge.smk b/snakemodules/merge.smk
index 0398011..876808a 100644
--- a/snakemodules/merge.smk
+++ b/snakemodules/merge.smk
@@ -1,9 +1,3 @@
-# WORK_DIR = config["WORK_DIR"]
-# RESULTS_DIR = config["RESULTS_DIR"]
-# REFERENCE = config["REFERENCE"]
-# SAMPLES = config["SAMPLES"]
-# ASSEMBLER = config["ASSEMBLER"]
-
 
 rule popins2_merge_contigs:
     input:
@@ -11,8 +5,6 @@ rule popins2_merge_contigs:
         os.path.join(RESULTS_DIR, "analysis_table.png") if config["ANALYSIS"]=="yes" else [],
         os.path.join(RESULTS_DIR, "contig_length_dist.png") if config["ANALYSIS"]=="yes" else [],
         os.path.join(RESULTS_DIR, "gc_content_dist.png") if config["ANALYSIS"]=="yes" else []
-
-        # expand("{p}/{f}",p=RESULTS_DIR, f=["analysis_table.png","contig_length_dist.png","gc_content_dist.png"])
     output:
         supercontigs = RESULTS_DIR + "/supercontigs.fa",
         gfa          = RESULTS_DIR + "/supercontigs.gfa",
diff --git a/snakemodules/scripts/remap_classified_human.py b/snakemodules/scripts/remap_classified_human.py
index f49926e..334897f 100644
--- a/snakemodules/scripts/remap_classified_human.py
+++ b/snakemodules/scripts/remap_classified_human.py
@@ -1,75 +1,77 @@
 from Bio import SeqIO
 import argparse
+import os
+import logging
 
-
-# Parse classification files and extract relevant read IDs
-# the classification files are Tab seperated files from kraken output
 def extract_human_read_ids(classification_file):
     human_reads = set()
+    lines_processed = 0
     with open(classification_file, 'r') as f:
         for line in f:
+            lines_processed += 1
             parts = line.strip().split()
-            #first column contains classification information of classified/unclassified(C/U)
+            if len(parts) < 3:
+                logging.warning(f"Line {lines_processed} in {classification_file} skipped (insufficient columns): {line.strip()}")
+                continue
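+            # A typical kraken2 classification line (illustrative): "C\tread_001\t9606\t..."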
+            # First column: classification (C/U)
             classification = parts[0]
-            #second column contains id of each reads from the fastq files
+            # Second column: read ID
             read_id = parts[1]
-            #third column contains taxonomy id of either human-'9606' or other species
+            # Third column: taxonomy ID (human is '9606')
             taxonomy = parts[2]
             if classification == 'C' and taxonomy == '9606':
                 human_reads.add(read_id)
+    logging.info(f"Extracted {len(human_reads)} human read IDs from {classification_file} (processed {lines_processed} lines)")
     return human_reads
 
-# Extract human reads from fastq file based on read IDs
 def extract_reads_from_fastq(fastq_file, human_read_ids, output_fastq):
+    total_records = 0
+    human_records = 0
     with open(output_fastq, 'w') as out_fq:
         for record in SeqIO.parse(fastq_file, "fastq"):
+            total_records += 1
             if record.id in human_read_ids:
                 SeqIO.write(record, out_fq, "fastq")
+                human_records += 1
+    logging.info(f"Processed {total_records} records from {fastq_file}; extracted {human_records} human reads into {output_fastq}")
 
 def main(single_output, paired_output, single_fq, paired_fq1, paired_fq2, output_prefix):
-    # Extract human read IDs from the output files
+    # Extract human read IDs from the classification files
+    logging.info("Starting extraction of human read IDs from classified single reads...")
     single_human_reads = extract_human_read_ids(single_output)
+    logging.info("Starting extraction of human read IDs from classified paired reads...")
     paired_human_reads = extract_human_read_ids(paired_output)
 
-    # Combine the read IDs from single and paired reads
+    # Combine the read IDs from single and paired outputs
     all_human_reads = single_human_reads.union(paired_human_reads)
-    # Extract human reads from the fastq files
-    extract_reads_from_fastq(single_fq, all_human_reads, f'{output_prefix}/human_classified_single.fastq')
-    extract_reads_from_fastq(paired_fq1, all_human_reads, f'{output_prefix}/human_classified_paired_1.fastq')
-    extract_reads_from_fastq(paired_fq2, all_human_reads, f'{output_prefix}/human_classified_paired_2.fastq')
+    logging.info(f"Total unique human read IDs combined: {len(all_human_reads)}")
+
+    # Construct output file paths using os.path.join
+    single_out = os.path.join(output_prefix, 'human_classified_single.fastq')
+    paired_out1 = os.path.join(output_prefix, 'human_classified_paired_1.fastq')
+    paired_out2 = os.path.join(output_prefix, 'human_classified_paired_2.fastq')
 
-    print(f"Human reads extracted into '{output_prefix}/human_classified_single.fastq', '{output_prefix}/human_classified_paired_1.fastq', and '{output_prefix}/human_classified_paired_2.fastq'")
+    # Extract human reads from the FASTQ files
+    logging.info("Extracting human reads from FASTQ files...")
+    extract_reads_from_fastq(single_fq, all_human_reads, single_out)
+    extract_reads_from_fastq(paired_fq1, all_human_reads, paired_out1)
+    extract_reads_from_fastq(paired_fq2, all_human_reads, paired_out2)
+
+    logging.info("Extraction complete.")
+    logging.info(f"Outputs saved to:\n  {single_out}\n  {paired_out1}\n  {paired_out2}")
 
 if __name__ == "__main__":
+    # Set up logging to include the time, log level, and message
+    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+    
     parser = argparse.ArgumentParser(description="Extract human reads based on classification and taxonomy.")
     parser.add_argument("--classified_single", required=True, help="Path to the single_output.txt file")
     parser.add_argument("--classified_paired", required=True, help="Path to the paired_output.txt file")
     parser.add_argument("--single_fq", required=True, help="Path to the single.fq file")
     parser.add_argument("--paired_fq1", required=True, help="Path to the paired.1.fq file")
     parser.add_argument("--paired_fq2", required=True, help="Path to the paired.2.fq file")
-    parser.add_argument("--output_prefix", required=True, help="Prefix for the output fastq files")
-    # parser.add_argument("--single_out", default="human_single.fastq", help="Output name for the single fastq file (default: human_single.fastq)")
-    # parser.add_argument("--paired1_out", default="human_paired_1.fastq", help="Output name for the first paired fastq file (default: human_paired_1.fastq)")
-    # parser.add_argument("--paired2_out", default="human_paired_2.fastq", help="Output name for the second paired fastq file (default: human_paired_2.fastq)")
-
-
+    parser.add_argument("--output_prefix", required=True, help="Directory for the output FASTQ files")
+    
     args = parser.parse_args()
-
     main(args.classified_single, args.classified_paired, args.single_fq, args.paired_fq1, args.paired_fq2, args.output_prefix)
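+
+# Example invocation (illustrative paths):
+#   python remap_classified_human.py \
+#       --classified_single single_output.txt --classified_paired paired_output.txt \
+#       --single_fq single.fq --paired_fq1 paired.1.fq --paired_fq2 paired.2.fq \
+#       --output_prefix contaminate_info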
 
-# # Combine all the reads into one fastq file
-# with open('all_human_reads.fastq', 'w') as combined_output:
-#     with open('human_single.fastq', 'r') as single_fq:
-#         combined_output.write(single_fq.read())
-#     with open('human_paired_1.fastq', 'r') as paired1_fq:
-#         combined_output.write(paired1_fq.read())
-#     with open('human_paired_2.fastq', 'r') as paired2_fq:
-#         combined_output.write(paired2_fq.read())
-
-# # Clean up temporary files
-# import os
-# os.remove('human_single.fastq')
-# os.remove('human_paired_1.fastq')
-# os.remove('human_paired_2.fastq')
-
-# print("All human reads have been extracted and saved to 'all_human_reads.fastq'")
\ No newline at end of file
-- 
GitLab