From b0bf1af4066b6ebe4de04c8c3898a9372597b107 Mon Sep 17 00:00:00 2001
From: kdc715 <kedic@LIT-PF3VB193.localdomain>
Date: Tue, 8 Oct 2024 15:52:18 +0200
Subject: [PATCH] modify coverage calculation method, add analysis for
 comparing popins filter params and sickle

---
 Snakefile                                     |  19 ++-
 modules/analysis.smk                          | 151 ++++++++++++++----
 modules/contigmap.smk                         |  18 +--
 modules/crop_remapped.smk                     |   4 +-
 modules/crop_unmapped.smk                     |   4 +-
 modules/genotype.smk                          |   9 +-
 .../contig_coverage_heatmap.py.ipynb          |  49 ++----
 modules/notebooks/src/stat_func/calculator.py |  33 ++++
 modules/position.smk                          |   8 -
 9 files changed, 183 insertions(+), 112 deletions(-)

diff --git a/Snakefile b/Snakefile
index 6925a07..c98e58e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,6 +1,6 @@
 #---------------------------
 # @author: Kedi Cao
-# @email: ckedi.cao@ukr.de
+# @email: kedi.cao@ukr.de
 # @date: 28.08.2024
 #---------------------------
 
@@ -36,12 +36,7 @@ kraken_lib = config["kraken_lib"]
 
 # REMAP = config["REMAP"]
 
-import os
 
-def get_results_filename(filenames):
-    paths = [os.path.join(RESULTS_DIR, filename) for filename in filenames]
-    existing_paths = [path if os.path.exists(path) else [] for path in paths]
-    return existing_paths 
 ################################################################################
 
 input_files = [
@@ -60,10 +55,18 @@ if config["remove_contamination"]=="no":
 elif config["remove_contamination"]=="yes":
     input_files.append(expand("{p}/{s}/{f}",p=WORK_DIR, s=SAMPLES, f=["single_clean.fastq","paired_1_clean.fastq","paired_2_clean.fastq"]))
     input_files.append(expand("{p}/{s}/remapped_non_ref.bam", p=WORK_DIR, s=SAMPLES))
-    # input_files.append(RESULTS_DIR + "/read_numbers.png")
 
 if config["ANALYSIS"]=="yes":
-    input_files.append(expand("{p}/{f}", p=RESULTS_DIR, f=["read_numbers.png","analysis_table.png","gc_content_dist.png","contig_length_dist.png","heatmap_coverage.png"]))
+    input_files.append(expand("{p}/{f}", p=RESULTS_DIR, f=["analysis_table.png","gc_content_dist.png","contig_length_dist.png","heatmap_coverage.png"]))
+    if config["SICKLE"]=="no":
+        input_files.append(expand("{p}/{f}", p=RESULTS_DIR, f=["read_numbers.png"]))
+    elif config["SICKLE"]=="yes":
+        if config["remove_contamination"]=="no":
+            input_files.append(expand("{p}/{f}", p=RESULTS_DIR, f=["read_numbers_before_filter.png","read_numbers_after_filter.png"]))
+        elif config["remove_contamination"]=="yes":
+            input_files.append(expand("{p}/{f}", p=RESULTS_DIR, f=["read_numbers_before_clean_filter.png","read_numbers_after_clean.png","read_numbers_after_clean_filter.png"]))
+
+
 
 
 ################################################################################
diff --git a/modules/analysis.smk b/modules/analysis.smk
index 4048428..5204b0b 100644
--- a/modules/analysis.smk
+++ b/modules/analysis.smk
@@ -1,21 +1,107 @@
 
-rule unmapped_analysis:
-    input:
-        single = expand("{p}/{s}/single.fastq", p=WORK_DIR, s=SAMPLES),
-        paired1 = expand("{p}/{s}/paired.1.fastq", p=WORK_DIR, s=SAMPLES),
-        paired2 = expand("{p}/{s}/paired.2.fastq", p=WORK_DIR, s=SAMPLES)
-    output:
-        RESULTS_DIR + "/read_numbers.png"
-    conda:
-        "envs/notebooks.yml"
-    log:
-        "logs/analysis/unmapped_analysis.log"
-    benchmark:
-        "benchmarks/analysis/unmapped_analysis.txt"
-    notebook:
-        MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb"
+if config["SICKLE"]=="no":
+    rule unmapped_analysis:
+        input:
+            single = expand("{p}/{s}/single.fastq", p=WORK_DIR, s=SAMPLES),
+            paired1 = expand("{p}/{s}/paired.1.fastq", p=WORK_DIR, s=SAMPLES),
+            paired2 = expand("{p}/{s}/paired.2.fastq", p=WORK_DIR, s=SAMPLES)
+        output:
+            RESULTS_DIR + "/read_numbers.png"
+        conda:
+            "envs/notebooks.yml"
+        log:
+            "logs/analysis/unmapped_analysis.log"
+        benchmark:
+            "benchmarks/analysis/unmapped_analysis.txt"
+        notebook:
+            MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb"
+
+elif config["SICKLE"]=="yes":
+    if config["remove_contamination"]=="no":
+        rule unmapped_analysis_before_filter:
+            input:
+                single = expand("{p}/{s}/single.fastq", p=WORK_DIR, s=SAMPLES),
+                paired1 = expand("{p}/{s}/paired.1.fastq", p=WORK_DIR, s=SAMPLES),
+                paired2 = expand("{p}/{s}/paired.2.fastq", p=WORK_DIR, s=SAMPLES)
+            output:
+                RESULTS_DIR + "/read_numbers_before_filter.png"
+            conda:
+                "envs/notebooks.yml"
+            log:
+                "logs/analysis/unmapped_analysis_before_filter.log"
+            benchmark:
+                "benchmarks/analysis/unmapped_analysis_before_filter.txt"
+            notebook:
+                MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb"
+
+        rule unmapped_analysis_after_filter:
+            input:
+                single = expand("{p}/{s}/sickle.single.fastq", p=WORK_DIR, s=SAMPLES),
+                paired1 = expand("{p}/{s}/sickle.paired.1.fastq", p=WORK_DIR, s=SAMPLES),
+                paired2 = expand("{p}/{s}/sickle.paired.2.fastq", p=WORK_DIR, s=SAMPLES)
+            output:
+                RESULTS_DIR + "/read_numbers_after_filter.png"
+            conda:
+                "envs/notebooks.yml"
+            log:
+                "logs/analysis/unmapped_analysis_after_filter.log"
+            benchmark:
+                "benchmarks/analysis/unmapped_analysis_after_filter.txt"
+            notebook:
+                MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb"
+
+    elif config["remove_contamination"]=="yes":
+        rule unmapped_analysis_before_clean_filter:
+            input:
+                single = expand("{p}/{s}/single.fastq", p=WORK_DIR, s=SAMPLES),
+                paired1 = expand("{p}/{s}/paired.1.fastq", p=WORK_DIR, s=SAMPLES),
+                paired2 = expand("{p}/{s}/paired.2.fastq", p=WORK_DIR, s=SAMPLES)
+            output:
+                RESULTS_DIR + "/read_numbers_before_clean_filter.png"
+            conda:
+                "envs/notebooks.yml"
+            log:
+                "logs/analysis/unmapped_analysis_before_clean_filter.log"
+            benchmark:
+                "benchmarks/analysis/unmapped_analysis_before_clean_filte.txt"
+            notebook:
+                MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb"
+
+        rule unmapped_analysis_after_clean:
+            input:
+                single = expand("{p}/{s}/single_clean.fastq", p=WORK_DIR, s=SAMPLES),
+                paired1 = expand("{p}/{s}/paired_1_clean.fastq", p=WORK_DIR, s=SAMPLES),
+                paired2 = expand("{p}/{s}/paired_2_clean.fastq", p=WORK_DIR, s=SAMPLES)
+            output:
+                RESULTS_DIR + "/read_numbers_after_clean.png"
+            conda:
+                "envs/notebooks.yml"
+            log:
+                "logs/analysis/unmapped_analysis_after_clean.log"
+            benchmark:
+                "benchmarks/analysis/unmapped_analysis_after_cleaning.txt"
+            notebook:
+                MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb"    
+
+        rule unmapped_analysis_after_clean_filter:
+            input:
+                single = expand("{p}/{s}/sickle.single_clean.fastq", p=WORK_DIR, s=SAMPLES),
+                paired1 = expand("{p}/{s}/sickle.paired_1_clean.fastq", p=WORK_DIR, s=SAMPLES),
+                paired2 = expand("{p}/{s}/sickle.paired_2_clean.fastq", p=WORK_DIR, s=SAMPLES)
+            output:
+                RESULTS_DIR + "/read_numbers_after_clean_filter.png"
+            conda:
+                "envs/notebooks.yml"
+            log:
+                "logs/analysis/unmapped_analysis_after_clean_filter.log"
+            benchmark:
+                "benchmarks/analysis/unmapped_analysis_after_clean_filter.txt"
+            notebook:
+                MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb"
+
+
+     
 
-            
 rule assembly_analysis_table:
     input:
         contigs = expand("{p}/{s}/{assemblr}.contigs.fa", p=WORK_DIR, s=SAMPLES, assemblr=ASSEMBLER)
@@ -46,6 +132,22 @@ rule assembly_analysis:
     notebook:
         MYPATH + "/modules/notebooks/content_distribution.py.ipynb"
 
+    
+rule calculate_coverage:
+    input:
+        bams = WORK_DIR + "/{sample}/non_ref_new.bam",
+        bais = WORK_DIR + "/{sample}/non_ref_new.bam.bai"
+    output:
+        cov = WORK_DIR + "/{sample}/coverage.txt"
+    conda:
+        "envs/samtools.yml"
+    log:
+        "logs/analysis/{sample}_calculate_coverage.log"
+    benchmark:
+        "benchmarks/analysis/{sample}_calculate_coverage.txt"
+    shell:
+        "samtools coverage {input.bams} > {output.cov}"
+
 
 rule coverage_analysis:
     input:
@@ -61,20 +163,3 @@ rule coverage_analysis:
         "benchmarks/analysis/coverage_analysis.txt"
     notebook:
         MYPATH + "/modules/notebooks/contig_coverage_heatmap.py.ipynb"
-
-# rule coverage_analysis:
-#     input:
-#         cov = RESULTS_DIR + "/{sample}_coverage.txt"
-#         bams = expand(WORK_DIR + "/{s}/non_ref_new.bam", s=SAMPLES),
-#         bais = expand(WORK_DIR + "/{s}/non_ref_new.bam.bai", s=SAMPLES),
-#         fasta = RESULTS_DIR + "/supercontigs.fa"
-#     output:
-#         RESULTS_DIR + "/heatmap_coverage.png"
-#     conda:
-#         "envs/notebooks.yml"
-#     log:
-#         "logs/analysis/coverage_analysis.log"
-#     benchmark:
-#         "benchmarks/analysis/coverage_analysis.txt"
-#     notebook:
-#         MYPATH + "/modules/notebooks/contig_coverage_heatmap.py.ipynb"
\ No newline at end of file
diff --git a/modules/contigmap.smk b/modules/contigmap.smk
index 4e6a46e..9b89be3 100644
--- a/modules/contigmap.smk
+++ b/modules/contigmap.smk
@@ -165,20 +165,4 @@ rule index_sorted:
     benchmark:
         "benchmarks/contigmap/{sample}_index_sorted.txt"
     shell:
-        "samtools index {input}"
-
-    
-rule calculate_coverage:
-    input:
-        bams = WORK_DIR + "/{sample}/non_ref_new.bam",
-        bais = WORK_DIR + "/{sample}/non_ref_new.bam.bai"
-    output:
-        cov = WORK_DIR + "/{sample}/coverage.txt"
-    conda:
-        "envs/samtools.yml"
-    log:
-        "logs/contigmap/{sample}_calculate_coverage.log"
-    benchmark:
-        "benchmarks/contigmap/{sample}_calculate_coverage.txt"
-    shell:
-        "samtools coverage {input.bams} > {output.cov}"
\ No newline at end of file
+        "samtools index {input}"
\ No newline at end of file
diff --git a/modules/crop_remapped.smk b/modules/crop_remapped.smk
index 8690fe6..bea5f1d 100644
--- a/modules/crop_remapped.smk
+++ b/modules/crop_remapped.smk
@@ -70,7 +70,7 @@ elif config["SICKLE"]=="yes":
 
     rule popins2_sort:
         input:
-            os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [],
+            # os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [],
             mates = WORK_DIR + "/{sample}/mates.bam"
         output:
             WORK_DIR + "/{sample}/non_ref.bam"
@@ -89,7 +89,7 @@ elif config["SICKLE"]=="yes":
 
     rule popins2_sickle:
         input:
-            os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [],
+            # os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [],
             single = WORK_DIR + "/{sample}/single_clean.fastq", 
             pair_1 = WORK_DIR + "/{sample}/paired_1_clean.fastq", 
             pair_2 = WORK_DIR + "/{sample}/paired_2_clean.fastq"
diff --git a/modules/crop_unmapped.smk b/modules/crop_unmapped.smk
index 6e95d53..036c5c3 100644
--- a/modules/crop_unmapped.smk
+++ b/modules/crop_unmapped.smk
@@ -68,7 +68,7 @@ elif config["SICKLE"]=="yes":
             
     rule popins2_sort:
         input:
-            os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [],
+            # os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [],
             mates = WORK_DIR + "/{sample}/mates.bam"
         output:
             WORK_DIR + "/{sample}/non_ref.bam"
@@ -87,7 +87,7 @@ elif config["SICKLE"]=="yes":
      
     rule popins2_sickle:
         input:
-            os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [],
+            # os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [],
             single = WORK_DIR + "/{sample}/single.fastq", 
             pair_1 = WORK_DIR + "/{sample}/paired.1.fastq", 
             pair_2 = WORK_DIR + "/{sample}/paired.2.fastq"
diff --git a/modules/genotype.smk b/modules/genotype.smk
index f21caa6..df1c581 100644
--- a/modules/genotype.smk
+++ b/modules/genotype.smk
@@ -1,11 +1,7 @@
-# RESULTS_DIR = config["RESULTS_DIR"]
-# WORK_DIR = config["WORK_DIR"]
-# SAMPLES = config["SAMPLES"]
-
 rule sort_vcf:
     input:
         RESULTS_DIR + "/place-finish.done",
-        ins=rules.popins2_place_refalign.output.vcf
+        ins=RESULTS_DIR + "/insertions_unsorted.vcf"
     output:
         temp(RESULTS_DIR + "/insertions.vcf")
     conda:
@@ -15,7 +11,6 @@ rule sort_vcf:
     benchmark:
         "benchmarks/genotype/sort_vcf.txt"
     shell:
-        "bcftools view {input.ins} | "
         "bcftools sort {input.ins} "
         "   --output-file {output}"
         "   --output-type v"
@@ -33,7 +28,7 @@ rule popins2_genotype:
     shell:
         "{POPINS2_BIN} genotype "
         "   --prefix {WORK_DIR} "
-        "   --insertions {rules.sort_vcf.output} "
+        "   --insertions {input} "
         "   --contigs {rules.popins2_merge_contigs.output.supercontigs} "
         "   --reference {REFERENCE} "
         "   {wildcards.sample} "
diff --git a/modules/notebooks/contig_coverage_heatmap.py.ipynb b/modules/notebooks/contig_coverage_heatmap.py.ipynb
index 20ae7d9..46ac0ca 100644
--- a/modules/notebooks/contig_coverage_heatmap.py.ipynb
+++ b/modules/notebooks/contig_coverage_heatmap.py.ipynb
@@ -126,20 +126,8 @@
     "    'contig_id':contig_id,\n",
     "    'contig_length':contig_length\n",
     "})\n",
-    "df_sort_contig\n",
     "df_sort_contig = df_sort_contig.sort_values(by=['contig_length'], ascending=True)\n",
-    "order_by_length = df_sort_contig['contig_id'].tolist()\n",
-    "order_by_length"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 89,
-   "id": "39150e4b-d2f4-43d4-a6da-6427453899fa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# df = pd.read_csv(coverage_files[0], sep='\\t')"
+    "order_by_length = df_sort_contig['contig_id'].tolist()"
    ]
   },
   {
@@ -167,12 +155,7 @@
     "        coverage_df = df\n",
     "    else:\n",
     "        # Merge with the main DataFrame on the contig name\n",
-    "        coverage_df = pd.merge(coverage_df, df, on='#rname', how='outer')\n",
-    " # Set the contig names as the index\n",
-    "# coverage_df.set_index('#rname', inplace=True)\n",
-    "# Replace NaN values with zeros (if any contigs are missing in some samples)\n",
-    "# coverage_df.fillna(0, inplace=True)\n",
-    "# coverage_df.fillna(0)"
+    "        coverage_df = pd.merge(coverage_df, df, on='#rname', how='outer')\n"
    ]
   },
   {
@@ -187,6 +170,16 @@
     "coverage_df = coverage_df.reindex(order_by_length)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab7e6633",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "coverage_df_cleaned = calculator.remove_outliers_replace_with_bounds(coverage_df)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 85,
@@ -195,9 +188,8 @@
    "outputs": [],
    "source": [
     "# Plot the heatmap\n",
-    "plt.figure(figsize=(10, 10))\n",
-    "sns.set(font_scale = 1.3)\n",
-    "sns.heatmap(coverage_df, annot=False, cmap='YlOrBr')\n",
+    "sns.set(font_scale = 0.8)\n",
+    "sns.heatmap(coverage_df_cleaned, annot=False, cmap='YlOrBr')\n",
     "plt.title('Coverage of Contigs')\n",
     "plt.xlabel('Samples')\n",
     "plt.ylabel('Contigs')\n",
@@ -206,19 +198,6 @@
     "plt.savefig(snakemake.output[0], dpi=400, bbox_inches = 'tight')"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 79,
-   "id": "f0e2f837-0307-4269-928d-417d89303aeb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "\n",
-    "\n",
-    "# plt.savefig(wkdir+'/contig_coverage_no_outlier', dpi=400, bbox_inches = 'tight')"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/modules/notebooks/src/stat_func/calculator.py b/modules/notebooks/src/stat_func/calculator.py
index 8c080fc..f85adca 100644
--- a/modules/notebooks/src/stat_func/calculator.py
+++ b/modules/notebooks/src/stat_func/calculator.py
@@ -250,6 +250,39 @@ def replace_column_outliers_with_max(df, column):
     
     return df
 
+def remove_outliers_replace_with_bounds(df):
+    """
+    Removes outliers from the 1st and 4th quartiles and replaces them with the lower and upper bounds.
+    
+    Parameters:
+    df: DataFrame where the first column is the index and the next columns are the contig coverages for samples.
+    
+    Returns:
+    df: DataFrame with outliers replaced by the respective bounds.
+    """
+    
+    # Apply the process to each column starting from the second (ignoring index column)
+    for col in df.columns:
+        # Calculate Q1 (25th percentile) and Q3 (75th percentile)
+        Q1 = df[col].quantile(0.25)
+        Q3 = df[col].quantile(0.75)
+        
+        # Calculate IQR
+        IQR = Q3 - Q1
+        
+        # Define bounds for outliers
+        lower_bound = Q1 - 1.5 * IQR
+        upper_bound = Q3 + 1.5 * IQR
+        
+        # Replace values below the lower bound with the lower bound
+        df[col] = df[col].apply(lambda x: lower_bound if x < lower_bound else x)
+        
+        # Replace values above the upper bound with the upper bound
+        df[col] = df[col].apply(lambda x: upper_bound if x > upper_bound else x)
+    
+    return df
+
+
 #quick function for getting an average of a list of numbers as a string    
 def quickstats(x):
     x_array = np.asarray(x,dtype=np.float64)
diff --git a/modules/position.smk b/modules/position.smk
index 6fcfed9..ae9cf8b 100644
--- a/modules/position.smk
+++ b/modules/position.smk
@@ -1,10 +1,3 @@
-# WORK_DIR = config["WORK_DIR"]
-# RESULTS_DIR = config["RESULTS_DIR"]
-# REFERENCE = config["REFERENCE"]
-# SAMPLES = config["SAMPLES"]
-# ASSEMBLER = config["ASSEMBLER"]
-
-
 
 
 if config["remove_contamination"] == 'yes':
@@ -54,7 +47,6 @@ rule popins2_merge_locations:
     input:
         os.path.join(RESULTS_DIR, "heatmap_coverage.png") if config["ANALYSIS"]=="yes" else [],
         expand(WORK_DIR + "/{s}/locations.txt", s=SAMPLES)
-        # RESULTS_DIR + "/heatmap_coverage.png"
     output:
         loc=RESULTS_DIR + "/locations.txt"
     log:
-- 
GitLab