Skip to content
Snippets Groups Projects
Commit 4dcf79b4 authored by kdc715's avatar kdc715
Browse files

remove chromosome info from coverage analysis, only keep contigs

parent f30daff1
Branches
No related merge requests found
%% Cell type:code id:99f09a73-ce8c-46b0-87c1-0f2c801a63ca tags: %% Cell type:code id:99f09a73-ce8c-46b0-87c1-0f2c801a63ca tags:
``` python ``` python
%load_ext autoreload %load_ext autoreload
%autoreload 2 %autoreload 2
``` ```
%% Cell type:code id:0bcadc62-0e3e-41e7-ba9a-1623c666520f tags: %% Cell type:code id:0bcadc62-0e3e-41e7-ba9a-1623c666520f tags:
``` python ``` python
from Bio import SeqIO from Bio import SeqIO
import glob import glob
import os import os
import sys import sys
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
``` ```
%% Cell type:code id:96c89b03-b6e0-4d20-a5ee-9df6607f79fb tags: %% Cell type:code id:96c89b03-b6e0-4d20-a5ee-9df6607f79fb tags:
``` python ``` python
# If executing with snakemake, use the path below # If executing with snakemake, use the path below
sys.path.append(snakemake.config["WORKFLOW_PATH"]+'/snakemodules/notebooks/src/') sys.path.append(snakemake.config["WORKFLOW_PATH"]+'/snakemodules/notebooks/src/')
# If executing directly with jupyter notebook from the commandline, use the path below # If executing directly with jupyter notebook from the commandline, use the path below
# sys.path.append(os.getcwd()+'/src/') # sys.path.append(os.getcwd()+'/src/')
from stat_func import calculator from stat_func import calculator
from stat_func import visualization from stat_func import visualization
%matplotlib inline %matplotlib inline
``` ```
%% Cell type:code id:b61116c6-48c1-40e5-8e7a-42f31b05bfe2 tags: %% Cell type:code id:b61116c6-48c1-40e5-8e7a-42f31b05bfe2 tags:
``` python ``` python
# # Get the current working directory # # Get the current working directory
# print("Current Path:", current_path) # print("Current Path:", current_path)
# # Navigate up two directories to reach '/home/usr/program' # # Navigate up two directories to reach '/home/usr/program'
# print("Program Path:", program_path) # print("Program Path:", program_path)
# Join 'results' to get the desired path # Join 'results' to get the desired path
# current_path = os.getcwd() # current_path = os.getcwd()
# program_path = os.path.dirname(os.path.dirname(current_path)) # program_path = os.path.dirname(os.path.dirname(current_path))
# results_path = os.path.join(program_path, 'results') # results_path = os.path.join(program_path, 'results')
# If executing with snakemake, use the path below # If executing with snakemake, use the path below
results_path = os.path.join(os.getcwd(), 'results') results_path = os.path.join(os.getcwd(), 'results')
print("Results Path:", results_path) print("Results Path:", results_path)
``` ```
%% Cell type:code id:496682dd-a457-4510-b611-34ebd28fbd4f tags: %% Cell type:code id:496682dd-a457-4510-b611-34ebd28fbd4f tags:
``` python ``` python
# Use paths below for snakemake execution # Use paths below for snakemake execution
supercontigs = snakemake.input.fasta supercontigs = snakemake.input.fasta
coverage_files = snakemake.input.cov coverage_files = snakemake.input.cov
# Use paths below for jupyter notebook execution # Use paths below for jupyter notebook execution
# For jupyter notebook execution, supercontigs is a list after glob.glob # For jupyter notebook execution, supercontigs is a list after glob.glob
# supercontig_fa = glob.glob('/home/kedic/popinSnake/results/supercontigs.fa') # supercontig_fa = glob.glob('/home/kedic/popinSnake/results/supercontigs.fa')
# supercontigs = supercontig_fa[0] # supercontigs = supercontig_fa[0]
# coverage_files = glob.glob('/home/kedic/popinSnake/workdir/**/coverage.txt') # coverage_files = glob.glob('/home/kedic/popinSnake/workdir/**/coverage.txt')
``` ```
%% Cell type:code id:e2c85a70-b05a-4ee3-b964-638639f9343a tags: %% Cell type:code id:e2c85a70-b05a-4ee3-b964-638639f9343a tags:
``` python ``` python
# For jupyter notebook execution, change supercontigs to supercontigs[0] # For jupyter notebook execution, change supercontigs to supercontigs[0]
seq_objects=SeqIO.parse(supercontigs,'fasta') seq_objects=SeqIO.parse(supercontigs,'fasta')
sequences=[] sequences=[]
for seq in seq_objects: for seq in seq_objects:
sequences.append(seq) sequences.append(seq)
num_of_contigs = len(sequences) num_of_contigs = len(sequences)
``` ```
%% Cell type:code id:e88ab748-87cb-493d-b2c2-15134073d87d tags: %% Cell type:code id:e88ab748-87cb-493d-b2c2-15134073d87d tags:
``` python ``` python
contig_id = [] contig_id = []
contig_length = [] contig_length = []
for record in sequences: for record in sequences:
record_id = record.id record_id = record.id
sequence = record.seq sequence = record.seq
length=len(sequence) length=len(sequence)
contig_id.append(record_id) contig_id.append(record_id)
contig_length.append(length) contig_length.append(length)
``` ```
%% Cell type:code id:95441eb2-f5dd-42b0-bae1-c43f5b4b0cb6 tags: %% Cell type:code id:95441eb2-f5dd-42b0-bae1-c43f5b4b0cb6 tags:
``` python ``` python
df_sort_contig = pd.DataFrame({ df_sort_contig = pd.DataFrame({
'contig_id':contig_id, 'contig_id':contig_id,
'contig_length':contig_length 'contig_length':contig_length
}) })
df_sort_contig = df_sort_contig.sort_values(by=['contig_length'], ascending=True) df_sort_contig = df_sort_contig.sort_values(by=['contig_length'], ascending=True)
order_by_length = df_sort_contig['contig_id'].tolist() order_by_length = df_sort_contig['contig_id'].tolist()
``` ```
%% Cell type:code id:495f2814-9fcb-4843-a2b6-fc288301101a tags: %% Cell type:code id:495f2814-9fcb-4843-a2b6-fc288301101a tags:
``` python ``` python
# Initialize an empty DataFrame to hold all coverage data # Initialize an empty DataFrame to hold all coverage data
coverage_df = pd.DataFrame() coverage_df = pd.DataFrame()
for file in coverage_files: for file in coverage_files:
# Extract sample name from the filename # Extract sample name from the filename
folder_path = os.path.dirname(file) folder_path = os.path.dirname(file)
sample_name = os.path.basename(folder_path) sample_name = os.path.basename(folder_path)
# Read the coverage data, skipping comment lines starting with '#' # Read the coverage data, skipping comment lines starting with '#'
df = pd.read_csv(file, sep='\t') df = pd.read_csv(file, sep='\t')
# Select the contig name and mean depth columns # Select the contig name and mean depth columns
df = df[['#rname', 'meandepth']] df = df[['#rname', 'meandepth']]
# Rename the mean depth column to the sample name # Rename the mean depth column to the sample name
df.rename(columns={'meandepth': sample_name}, inplace=True) df.rename(columns={'meandepth': sample_name}, inplace=True)
if coverage_df.empty: if coverage_df.empty:
# Initialize the main DataFrame with the first sample # Initialize the main DataFrame with the first sample
coverage_df = df coverage_df = df
else: else:
# Merge with the main DataFrame on the contig name # Merge with the main DataFrame on the contig name
coverage_df = pd.merge(coverage_df, df, on='#rname', how='outer') coverage_df = pd.merge(coverage_df, df, on='#rname', how='outer')
``` ```
%% Cell type:code id:36f31bc9-8eaa-434a-9a81-6a97282fe469 tags: %% Cell type:code id:36f31bc9-8eaa-434a-9a81-6a97282fe469 tags:
``` python ``` python
coverage_df.rename(columns={'#rname': 'contigs'}, inplace=True) coverage_df.rename(columns={'#rname': 'contigs'}, inplace=True)
coverage_df.set_index('contigs', inplace=True) coverage_df.set_index('contigs', inplace=True)
```
%% Cell type:code id:345e2ab4 tags:
``` python
coverage_df = coverage_df[coverage_df.index.str.startswith("contig_")]
coverage_df = coverage_df.reindex(order_by_length) coverage_df = coverage_df.reindex(order_by_length)
``` ```
%% Cell type:code id:ab7e6633 tags: %% Cell type:code id:ab7e6633 tags:
``` python ``` python
coverage_df_cleaned = calculator.remove_outliers_replace_with_bounds(coverage_df) coverage_df_cleaned = calculator.remove_outliers_replace_with_bounds(coverage_df)
``` ```
%% Cell type:code id:3f61507d-b554-4dfd-830c-fa3555cad215 tags: %% Cell type:code id:3f61507d-b554-4dfd-830c-fa3555cad215 tags:
``` python ``` python
# Plot the heatmap # Plot the heatmap
sns.set(font_scale = 0.8) sns.set(font_scale = 0.8)
sns.heatmap(coverage_df_cleaned, annot=False, cmap='YlOrBr') sns.heatmap(coverage_df_cleaned, annot=False, cmap='YlOrBr')
plt.title('Coverage of Contigs') plt.title('Coverage of Contigs')
plt.xlabel('Samples') plt.xlabel('Samples')
plt.ylabel('Contigs') plt.ylabel('Contigs')
plt.savefig(snakemake.output[0], dpi=400, bbox_inches = 'tight') plt.savefig(snakemake.output[0], dpi=400, bbox_inches = 'tight')
plt.show() plt.show()
``` ```
%% Cell type:code id:b61cdb41-3f9e-48fa-86ca-3249642fd7b4 tags: %% Cell type:code id:b61cdb41-3f9e-48fa-86ca-3249642fd7b4 tags:
``` python ``` python
``` ```
%% Cell type:code id:de6a634e-ab13-4297-b7c9-349cc69bad03 tags: %% Cell type:code id:de6a634e-ab13-4297-b7c9-349cc69bad03 tags:
``` python ``` python
``` ```
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment