From b0bf1af4066b6ebe4de04c8c3898a9372597b107 Mon Sep 17 00:00:00 2001 From: kdc715 <kedic@LIT-PF3VB193.localdomain> Date: Tue, 8 Oct 2024 15:52:18 +0200 Subject: [PATCH] modify coverage calculation method, add analysis for comparing popins filter params and sickle --- Snakefile | 19 ++- modules/analysis.smk | 151 ++++++++++++++---- modules/contigmap.smk | 18 +-- modules/crop_remapped.smk | 4 +- modules/crop_unmapped.smk | 4 +- modules/genotype.smk | 9 +- .../contig_coverage_heatmap.py.ipynb | 49 ++---- modules/notebooks/src/stat_func/calculator.py | 33 ++++ modules/position.smk | 8 - 9 files changed, 183 insertions(+), 112 deletions(-) diff --git a/Snakefile b/Snakefile index 6925a07..c98e58e 100644 --- a/Snakefile +++ b/Snakefile @@ -1,6 +1,6 @@ #--------------------------- # @author: Kedi Cao -# @email: ckedi.cao@ukr.de +# @email: kedi.cao@ukr.de # @date: 28.08.2024 #--------------------------- @@ -36,12 +36,7 @@ kraken_lib = config["kraken_lib"] # REMAP = config["REMAP"] -import os -def get_results_filename(filenames): - paths = [os.path.join(RESULTS_DIR, filename) for filename in filenames] - existing_paths = [path if os.path.exists(path) else [] for path in paths] - return existing_paths ################################################################################ input_files = [ @@ -60,10 +55,18 @@ if config["remove_contamination"]=="no": elif config["remove_contamination"]=="yes": input_files.append(expand("{p}/{s}/{f}",p=WORK_DIR, s=SAMPLES, f=["single_clean.fastq","paired_1_clean.fastq","paired_2_clean.fastq"])) input_files.append(expand("{p}/{s}/remapped_non_ref.bam", p=WORK_DIR, s=SAMPLES)) - # input_files.append(RESULTS_DIR + "/read_numbers.png") if config["ANALYSIS"]=="yes": - input_files.append(expand("{p}/{f}", p=RESULTS_DIR, f=["read_numbers.png","analysis_table.png","gc_content_dist.png","contig_length_dist.png","heatmap_coverage.png"])) + input_files.append(expand("{p}/{f}", p=RESULTS_DIR, f=["analysis_table.png","gc_content_dist.png","contig_length_dist.png","heatmap_coverage.png"])) + if config["SICKLE"]=="no": + input_files.append(expand("{p}/{f}", p=RESULTS_DIR, f=["read_numbers.png"])) + elif config["SICKLE"]=="yes": + if config["remove_contamination"]=="no": + input_files.append(expand("{p}/{f}", p=RESULTS_DIR, f=["read_numbers_before_filter.png","read_numbers_after_filter.png"])) + elif config["remove_contamination"]=="yes": + input_files.append(expand("{p}/{f}", p=RESULTS_DIR, f=["read_numbers_before_clean_filter.png","read_numbers_after_clean.png","read_numbers_after_clean_filter.png"])) + + ################################################################################ diff --git a/modules/analysis.smk b/modules/analysis.smk index 4048428..5204b0b 100644 --- a/modules/analysis.smk +++ b/modules/analysis.smk @@ -1,21 +1,107 @@ -rule unmapped_analysis: - input: - single = expand("{p}/{s}/single.fastq", p=WORK_DIR, s=SAMPLES), - paired1 = expand("{p}/{s}/paired.1.fastq", p=WORK_DIR, s=SAMPLES), - paired2 = expand("{p}/{s}/paired.2.fastq", p=WORK_DIR, s=SAMPLES) - output: - RESULTS_DIR + "/read_numbers.png" - conda: - "envs/notebooks.yml" - log: - "logs/analysis/unmapped_analysis.log" - benchmark: - "benchmarks/analysis/unmapped_analysis.txt" - notebook: - MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb" +if config["SICKLE"]=="no": + rule unmapped_analysis: + input: + single = expand("{p}/{s}/single.fastq", p=WORK_DIR, s=SAMPLES), + paired1 = expand("{p}/{s}/paired.1.fastq", p=WORK_DIR, s=SAMPLES), + paired2 = expand("{p}/{s}/paired.2.fastq", p=WORK_DIR, s=SAMPLES) + output: + RESULTS_DIR + "/read_numbers.png" + conda: + "envs/notebooks.yml" + log: + "logs/analysis/unmapped_analysis.log" + benchmark: + "benchmarks/analysis/unmapped_analysis.txt" + notebook: + MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb" + +elif config["SICKLE"]=="yes": + if config["remove_contamination"]=="no": + rule unmapped_analysis_before_filter: + input: + single = expand("{p}/{s}/single.fastq", p=WORK_DIR, s=SAMPLES), + paired1 = expand("{p}/{s}/paired.1.fastq", p=WORK_DIR, s=SAMPLES), + paired2 = expand("{p}/{s}/paired.2.fastq", p=WORK_DIR, s=SAMPLES) + output: + RESULTS_DIR + "/read_numbers_before_filter.png" + conda: + "envs/notebooks.yml" + log: + "logs/analysis/unmapped_analysis_before_filter.log" + benchmark: + "benchmarks/analysis/unmapped_analysis_before_filter.txt" + notebook: + MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb" + + rule unmapped_analysis_after_filter: + input: + single = expand("{p}/{s}/sickle.single.fastq", p=WORK_DIR, s=SAMPLES), + paired1 = expand("{p}/{s}/sickle.paired.1.fastq", p=WORK_DIR, s=SAMPLES), + paired2 = expand("{p}/{s}/sickle.paired.2.fastq", p=WORK_DIR, s=SAMPLES) + output: + RESULTS_DIR + "/read_numbers_after_filter.png" + conda: + "envs/notebooks.yml" + log: + "logs/analysis/unmapped_analysis_after_filter.log" + benchmark: + "benchmarks/analysis/unmapped_analysis_after_filter.txt" + notebook: + MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb" + + elif config["remove_contamination"]=="yes": + rule unmapped_analysis_before_clean_filter: + input: + single = expand("{p}/{s}/single.fastq", p=WORK_DIR, s=SAMPLES), + paired1 = expand("{p}/{s}/paired.1.fastq", p=WORK_DIR, s=SAMPLES), + paired2 = expand("{p}/{s}/paired.2.fastq", p=WORK_DIR, s=SAMPLES) + output: + RESULTS_DIR + "/read_numbers_before_clean_filter.png" + conda: + "envs/notebooks.yml" + log: + "logs/analysis/unmapped_analysis_before_clean_filter.log" + benchmark: + "benchmarks/analysis/unmapped_analysis_before_clean_filte.txt" + notebook: + MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb" + + rule unmapped_analysis_after_clean: + input: + single = expand("{p}/{s}/single_clean.fastq", p=WORK_DIR, s=SAMPLES), + paired1 = expand("{p}/{s}/paired_1_clean.fastq", p=WORK_DIR, s=SAMPLES), + paired2 = expand("{p}/{s}/paired_2_clean.fastq", p=WORK_DIR, s=SAMPLES) + output: + RESULTS_DIR + "/read_numbers_after_clean.png" + conda: + "envs/notebooks.yml" + log: + "logs/analysis/unmapped_analysis_after_clean.log" + benchmark: + "benchmarks/analysis/unmapped_analysis_after_cleaning.txt" + notebook: + MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb" + + rule unmapped_analysis_after_clean_filter: + input: + single = expand("{p}/{s}/sickle.single_clean.fastq", p=WORK_DIR, s=SAMPLES), + paired1 = expand("{p}/{s}/sickle.paired_1_clean.fastq", p=WORK_DIR, s=SAMPLES), + paired2 = expand("{p}/{s}/sickle.paired_2_clean.fastq", p=WORK_DIR, s=SAMPLES) + output: + RESULTS_DIR + "/read_numbers_after_clean_filter.png" + conda: + "envs/notebooks.yml" + log: + "logs/analysis/unmapped_analysis_after_clean_filter.log" + benchmark: + "benchmarks/analysis/unmapped_analysis_after_clean_filter.txt" + notebook: + MYPATH + "/modules/notebooks/read_numbers_in_bar_graph.py.ipynb" + + + - rule assembly_analysis_table: input: contigs = expand("{p}/{s}/{assemblr}.contigs.fa", p=WORK_DIR, s=SAMPLES, assemblr=ASSEMBLER) @@ -46,6 +132,22 @@ rule assembly_analysis: notebook: MYPATH + "/modules/notebooks/content_distribution.py.ipynb" + +rule calculate_coverage: + input: + bams = WORK_DIR + "/{sample}/non_ref_new.bam", + bais = WORK_DIR + "/{sample}/non_ref_new.bam.bai" + output: + cov = WORK_DIR + "/{sample}/coverage.txt" + conda: + "envs/samtools.yml" + log: + "logs/analysis/{sample}_calculate_coverage.log" + benchmark: + "benchmarks/analysis/{sample}_calculate_coverage.txt" + shell: + "samtools coverage {input.bams} > {output.cov}" + rule coverage_analysis: input: @@ -61,20 +163,3 @@ rule coverage_analysis: "benchmarks/analysis/coverage_analysis.txt" notebook: MYPATH + "/modules/notebooks/contig_coverage_heatmap.py.ipynb" - -# rule coverage_analysis: -# input: -# cov = RESULTS_DIR + "/{sample}_coverage.txt" -# bams = expand(WORK_DIR + "/{s}/non_ref_new.bam", s=SAMPLES), -# bais = expand(WORK_DIR + "/{s}/non_ref_new.bam.bai", s=SAMPLES), -# fasta = RESULTS_DIR + "/supercontigs.fa" -# output: -# RESULTS_DIR + "/heatmap_coverage.png" -# conda: -# "envs/notebooks.yml" -# log: -# "logs/analysis/coverage_analysis.log" -# benchmark: -# "benchmarks/analysis/coverage_analysis.txt" -# notebook: -# MYPATH + "/modules/notebooks/contig_coverage_heatmap.py.ipynb" \ No newline at end of file diff --git a/modules/contigmap.smk b/modules/contigmap.smk index 4e6a46e..9b89be3 100644 --- a/modules/contigmap.smk +++ b/modules/contigmap.smk @@ -165,20 +165,4 @@ rule index_sorted: benchmark: "benchmarks/contigmap/{sample}_index_sorted.txt" shell: - "samtools index {input}" - - -rule calculate_coverage: - input: - bams = WORK_DIR + "/{sample}/non_ref_new.bam", - bais = WORK_DIR + "/{sample}/non_ref_new.bam.bai" - output: - cov = WORK_DIR + "/{sample}/coverage.txt" - conda: - "envs/samtools.yml" - log: - "logs/contigmap/{sample}_calculate_coverage.log" - benchmark: - "benchmarks/contigmap/{sample}_calculate_coverage.txt" - shell: - "samtools coverage {input.bams} > {output.cov}" \ No newline at end of file + "samtools index {input}" \ No newline at end of file diff --git a/modules/crop_remapped.smk b/modules/crop_remapped.smk index 8690fe6..bea5f1d 100644 --- a/modules/crop_remapped.smk +++ b/modules/crop_remapped.smk @@ -70,7 +70,7 @@ elif config["SICKLE"]=="yes": rule popins2_sort: input: - os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [], + # os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [], mates = WORK_DIR + "/{sample}/mates.bam" output: WORK_DIR + "/{sample}/non_ref.bam" @@ -89,7 +89,7 @@ elif config["SICKLE"]=="yes": rule popins2_sickle: input: - os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [], + # os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [], single = WORK_DIR + "/{sample}/single_clean.fastq", pair_1 = WORK_DIR + "/{sample}/paired_1_clean.fastq", pair_2 = WORK_DIR + "/{sample}/paired_2_clean.fastq" diff --git a/modules/crop_unmapped.smk b/modules/crop_unmapped.smk index 6e95d53..036c5c3 100644 --- a/modules/crop_unmapped.smk +++ b/modules/crop_unmapped.smk @@ -68,7 +68,7 @@ elif config["SICKLE"]=="yes": rule popins2_sort: input: - os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [], + # os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [], mates = WORK_DIR + "/{sample}/mates.bam" output: WORK_DIR + "/{sample}/non_ref.bam" @@ -87,7 +87,7 @@ elif config["SICKLE"]=="yes": rule popins2_sickle: input: - os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [], + # os.path.join(RESULTS_DIR, "read_numbers.png") if config["ANALYSIS"]=="yes" else [], single = WORK_DIR + "/{sample}/single.fastq", pair_1 = WORK_DIR + "/{sample}/paired.1.fastq", pair_2 = WORK_DIR + "/{sample}/paired.2.fastq" diff --git a/modules/genotype.smk b/modules/genotype.smk index f21caa6..df1c581 100644 --- a/modules/genotype.smk +++ b/modules/genotype.smk @@ -1,11 +1,7 @@ -# RESULTS_DIR = config["RESULTS_DIR"] -# WORK_DIR = config["WORK_DIR"] -# SAMPLES = config["SAMPLES"] - rule sort_vcf: input: RESULTS_DIR + "/place-finish.done", - ins=rules.popins2_place_refalign.output.vcf + ins=RESULTS_DIR + "/insertions_unsorted.vcf" output: temp(RESULTS_DIR + "/insertions.vcf") conda: @@ -15,7 +11,6 @@ rule sort_vcf: benchmark: "benchmarks/genotype/sort_vcf.txt" shell: - "bcftools view {input.ins} | " "bcftools sort {input.ins} " " --output-file {output}" " --output-type v" @@ -33,7 +28,7 @@ rule popins2_genotype: shell: "{POPINS2_BIN} genotype " " --prefix {WORK_DIR} " - " --insertions {rules.sort_vcf.output} " + " --insertions {input} " " --contigs {rules.popins2_merge_contigs.output.supercontigs} " " --reference {REFERENCE} " " {wildcards.sample} " diff --git a/modules/notebooks/contig_coverage_heatmap.py.ipynb b/modules/notebooks/contig_coverage_heatmap.py.ipynb index 20ae7d9..46ac0ca 100644 --- a/modules/notebooks/contig_coverage_heatmap.py.ipynb +++ b/modules/notebooks/contig_coverage_heatmap.py.ipynb @@ -126,20 +126,8 @@ " 'contig_id':contig_id,\n", " 'contig_length':contig_length\n", "})\n", - "df_sort_contig\n", "df_sort_contig = df_sort_contig.sort_values(by=['contig_length'], ascending=True)\n", - "order_by_length = df_sort_contig['contig_id'].tolist()\n", - "order_by_length" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "id": "39150e4b-d2f4-43d4-a6da-6427453899fa", - "metadata": {}, - "outputs": [], - "source": [ - "# df = pd.read_csv(coverage_files[0], sep='\\t')" + "order_by_length = df_sort_contig['contig_id'].tolist()" ] }, { @@ -167,12 +155,7 @@ " coverage_df = df\n", " else:\n", " # Merge with the main DataFrame on the contig name\n", - " coverage_df = pd.merge(coverage_df, df, on='#rname', how='outer')\n", - " # Set the contig names as the index\n", - "# coverage_df.set_index('#rname', inplace=True)\n", - "# Replace NaN values with zeros (if any contigs are missing in some samples)\n", - "# coverage_df.fillna(0, inplace=True)\n", - "# coverage_df.fillna(0)" + " coverage_df = pd.merge(coverage_df, df, on='#rname', how='outer')\n" ] }, { @@ -187,6 +170,16 @@ "coverage_df = coverage_df.reindex(order_by_length)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab7e6633", + "metadata": {}, + "outputs": [], + "source": [ + "coverage_df_cleaned = calculator.remove_outliers_replace_with_bounds(coverage_df)" + ] + }, { "cell_type": "code", "execution_count": 85, @@ -195,9 +188,8 @@ "outputs": [], "source": [ "# Plot the heatmap\n", - "plt.figure(figsize=(10, 10))\n", - "sns.set(font_scale = 1.3)\n", - "sns.heatmap(coverage_df, annot=False, cmap='YlOrBr')\n", + "sns.set(font_scale = 0.8)\n", + "sns.heatmap(coverage_df_cleaned, annot=False, cmap='YlOrBr')\n", "plt.title('Coverage of Contigs')\n", "plt.xlabel('Samples')\n", "plt.ylabel('Contigs')\n", @@ -206,19 +198,6 @@ "plt.savefig(snakemake.output[0], dpi=400, bbox_inches = 'tight')" ] }, - { - "cell_type": "code", - "execution_count": 79, - "id": "f0e2f837-0307-4269-928d-417d89303aeb", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "\n", - "\n", - "# plt.savefig(wkdir+'/contig_coverage_no_outlier', dpi=400, bbox_inches = 'tight')" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/modules/notebooks/src/stat_func/calculator.py b/modules/notebooks/src/stat_func/calculator.py index 8c080fc..f85adca 100644 --- a/modules/notebooks/src/stat_func/calculator.py +++ b/modules/notebooks/src/stat_func/calculator.py @@ -250,6 +250,39 @@ def replace_column_outliers_with_max(df, column): return df +def remove_outliers_replace_with_bounds(df): + """ + Removes outliers from the 1st and 4th quartiles and replaces them with the lower and upper bounds. + + Parameters: + df: DataFrame where the first column is the index and the next columns are the contig coverages for samples. + + Returns: + df: DataFrame with outliers replaced by the respective bounds. + """ + + # Apply the process to each column starting from the second (ignoring index column) + for col in df.columns: + # Calculate Q1 (25th percentile) and Q3 (75th percentile) + Q1 = df[col].quantile(0.25) + Q3 = df[col].quantile(0.75) + + # Calculate IQR + IQR = Q3 - Q1 + + # Define bounds for outliers + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + + # Replace values below the lower bound with the lower bound + df[col] = df[col].apply(lambda x: lower_bound if x < lower_bound else x) + + # Replace values above the upper bound with the upper bound + df[col] = df[col].apply(lambda x: upper_bound if x > upper_bound else x) + + return df + + #quick function for getting an average of a list of numbers as a string def quickstats(x): x_array = np.asarray(x,dtype=np.float64) diff --git a/modules/position.smk b/modules/position.smk index 6fcfed9..ae9cf8b 100644 --- a/modules/position.smk +++ b/modules/position.smk @@ -1,10 +1,3 @@ -# WORK_DIR = config["WORK_DIR"] -# RESULTS_DIR = config["RESULTS_DIR"] -# REFERENCE = config["REFERENCE"] -# SAMPLES = config["SAMPLES"] -# ASSEMBLER = config["ASSEMBLER"] - - if config["remove_contamination"] == 'yes': @@ -54,7 +47,6 @@ rule popins2_merge_locations: input: os.path.join(RESULTS_DIR, "heatmap_coverage.png") if config["ANALYSIS"]=="yes" else [], expand(WORK_DIR + "/{s}/locations.txt", s=SAMPLES) - # RESULTS_DIR + "/heatmap_coverage.png" output: loc=RESULTS_DIR + "/locations.txt" log: -- GitLab