remove chromosome info from coverage analysis, only keep contigs

4dcf79b4 · kdc715 · f30daff1 · 4dcf79b4
Commit 4dcf79b4 authored 2 months ago by kdc715
--- a/snakemodules/notebooks/contig_coverage_heatmap.py.ipynb
+++ b/snakemodules/notebooks/contig_coverage_heatmap.py.ipynb
@@ -160,13 +160,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 91,
+   "execution_count": null,
   "id": "36f31bc9-8eaa-434a-9a81-6a97282fe469",
   "metadata": {},
   "outputs": [],
   "source": [
    "coverage_df.rename(columns={'#rname': 'contigs'}, inplace=True)\n",
-    "coverage_df.set_index('contigs', inplace=True)\n",
+    "coverage_df.set_index('contigs', inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "345e2ab4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "coverage_df = coverage_df[coverage_df.index.str.startswith(\"contig_\")]\n",
    "coverage_df = coverage_df.reindex(order_by_length)"
   ]
  },
@@ -217,7 +227,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
@@ -231,7 +241,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.9.12"
  }
 },
 "nbformat": 4,

 %% Cell type:code id:99f09a73-ce8c-46b0-87c1-0f2c801a63ca tags:
 ``` python
 %load_ext autoreload
 %autoreload 2
 ```
 %% Cell type:code id:0bcadc62-0e3e-41e7-ba9a-1623c666520f tags:
 ``` python
 from Bio import SeqIO
 import glob
 import os
 import sys
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 ```
 %% Cell type:code id:96c89b03-b6e0-4d20-a5ee-9df6607f79fb tags:
 ``` python
 # If executing with snakemake, use the path below
 sys.path.append(snakemake.config["WORKFLOW_PATH"]+'/snakemodules/notebooks/src/')
 # If executing directly with jupyter notebook from the commandline, use the path below
 # sys.path.append(os.getcwd()+'/src/')
 from stat_func import calculator
 from stat_func import visualization
 %matplotlib inline
 ```
 %% Cell type:code id:b61116c6-48c1-40e5-8e7a-42f31b05bfe2 tags:
 ``` python
 # # Get the current working directory
 # print("Current Path:", current_path)
 # # Navigate up two directories to reach '/home/usr/program'
 # print("Program Path:", program_path)
 # Join 'results' to get the desired path
 # current_path = os.getcwd()
 # program_path = os.path.dirname(os.path.dirname(current_path))
 # results_path = os.path.join(program_path, 'results')
 # If executing with snakemake, use the path below
 results_path = os.path.join(os.getcwd(), 'results')
 print("Results Path:", results_path)
 ```
 %% Cell type:code id:496682dd-a457-4510-b611-34ebd28fbd4f tags:
 ``` python
 # Use paths below for snakemake execution
 supercontigs = snakemake.input.fasta
 coverage_files = snakemake.input.cov
 # Use paths below for jupyter notebook execution
 # For jupyter notebook execution, supercontigs is a list after glob.glob
 # supercontig_fa = glob.glob('/home/kedic/popinSnake/results/supercontigs.fa')
 # supercontigs = supercontig_fa[0]
 # coverage_files = glob.glob('/home/kedic/popinSnake/workdir/**/coverage.txt')
 ```
 %% Cell type:code id:e2c85a70-b05a-4ee3-b964-638639f9343a tags:
 ``` python
 # For jupyter notebook execution, change supercontigs to supercontigs[0]
 seq_objects=SeqIO.parse(supercontigs,'fasta')
 sequences=[]
 for seq in seq_objects:
    sequences.append(seq)
 num_of_contigs = len(sequences)
 ```
 %% Cell type:code id:e88ab748-87cb-493d-b2c2-15134073d87d tags:
 ``` python
 contig_id = []
 contig_length = []
 for record in sequences:
    record_id = record.id
    sequence = record.seq
    length=len(sequence)
    contig_id.append(record_id)
    contig_length.append(length)
 ```
 %% Cell type:code id:95441eb2-f5dd-42b0-bae1-c43f5b4b0cb6 tags:
 ``` python
 df_sort_contig = pd.DataFrame({
    'contig_id':contig_id,
    'contig_length':contig_length
 })
 df_sort_contig = df_sort_contig.sort_values(by=['contig_length'], ascending=True)
 order_by_length = df_sort_contig['contig_id'].tolist()
 ```
 %% Cell type:code id:495f2814-9fcb-4843-a2b6-fc288301101a tags:
 ``` python
 # Initialize an empty DataFrame to hold all coverage data
 coverage_df = pd.DataFrame()
 for file in coverage_files:
    # Extract sample name from the filename
    folder_path = os.path.dirname(file)
    sample_name = os.path.basename(folder_path)
    # Read the coverage data, skipping comment lines starting with '#'
    df = pd.read_csv(file, sep='\t')
    # Select the contig name and mean depth columns
    df = df[['#rname', 'meandepth']]
    # Rename the mean depth column to the sample name
    df.rename(columns={'meandepth': sample_name}, inplace=True)
    if coverage_df.empty:
        # Initialize the main DataFrame with the first sample
        coverage_df = df
    else:
        # Merge with the main DataFrame on the contig name
        coverage_df = pd.merge(coverage_df, df, on='#rname', how='outer')
 ```
 %% Cell type:code id:36f31bc9-8eaa-434a-9a81-6a97282fe469 tags:
 ``` python
 coverage_df.rename(columns={'#rname': 'contigs'}, inplace=True)
 coverage_df.set_index('contigs', inplace=True)
+```
+%% Cell type:code id:345e2ab4 tags:
+``` python
+coverage_df = coverage_df[coverage_df.index.str.startswith("contig_")]
 coverage_df = coverage_df.reindex(order_by_length)
 ```
 %% Cell type:code id:ab7e6633 tags:
 ``` python
 coverage_df_cleaned = calculator.remove_outliers_replace_with_bounds(coverage_df)
 ```
 %% Cell type:code id:3f61507d-b554-4dfd-830c-fa3555cad215 tags:
 ``` python
 # Plot the heatmap
 sns.set(font_scale = 0.8)
 sns.heatmap(coverage_df_cleaned, annot=False, cmap='YlOrBr')
 plt.title('Coverage of Contigs')
 plt.xlabel('Samples')
 plt.ylabel('Contigs')
 plt.savefig(snakemake.output[0], dpi=400, bbox_inches = 'tight')
 plt.show()
 ```
 %% Cell type:code id:b61cdb41-3f9e-48fa-86ca-3249642fd7b4 tags:
 ``` python
 ```
 %% Cell type:code id:de6a634e-ab13-4297-b7c9-349cc69bad03 tags:
 ``` python
 ```