# Source code for msions.msplot

"""
This module contains functions that are useful for plotting MS data in Python.
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # for despine of plots
from msions.mzml import tic_df
from msions.encyclopedia import dia_df
from msions.hardklor import hk2df
from msions.encyclopedia import match_hk
from msions.hardklor import summarize_df
import numpy as np
from typing import List, Union


def _label_axes(y_label, title=None):
    """Label the current axes: time on x, *y_label* on y, optional left-aligned title."""
    plt.xticks(fontsize=14)
    plt.xlabel("Time (min)", fontsize=18)
    plt.yticks(fontsize=14)
    plt.ylabel(y_label, fontsize=18)
    if title is not None:
        plt.title(title, loc="left", fontsize=18)


def _finish_axes(no_labels):
    """Apply the shared axis styling to the current axes, optionally stripping all labels."""
    # scientific notation on the y-axis
    plt.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))

    # removes right side & top of plot
    sns.despine()

    # x- and y-axis start at 0
    plt.xlim(left=0)
    plt.ylim(bottom=0)

    if no_labels:
        # get rid of x- and y-axis titles
        plt.xlabel(None)
        plt.ylabel(None)

        # plain numbers instead of scientific notation; needed when removing
        # every label, otherwise the offset text would remain
        plt.ticklabel_format(style="plain")

        # hide every tick and tick label (major and minor)
        plt.tick_params(
            axis='both',
            which='both',
            bottom=False,
            top=False,
            left=False,
            right=False,
            labelbottom=False,
            labelleft=False,
            labeltop=False,
            labelright=False)


def plot_data(mzml_input: Union[pd.DataFrame, str],
              feat_input: Union[pd.DataFrame, str] = None,
              id_input: Union[pd.DataFrame, str] = None,
              method: str = None,
              data_type: str = "TIC",
              stats: str = None,
              return_dfs: bool = False,
              color: Union[str, List[str]] = ["black", "#1f77b4"],
              no_labels: bool = False,
              alpha: float = 1.0,
              fig_params: List[float] = None):
    """
    Plot TIC and/or ion counts against retention time.

    When both ``feat_input`` and ``id_input`` are given and ``method`` is
    ``"DIA"``, the signal of the identified features is drawn on top of the
    full signal in a second color.

    Parameters
    ----------
    mzml_input : pd.DataFrame or str
        A DataFrame with an ``rt`` column plus ``TIC`` and/or ``ions``
        columns (whichever ``data_type`` needs), or a path that is passed
        to ``tic_df`` to build that DataFrame.
    feat_input : pd.DataFrame or str, optional
        Feature table, or a path ending in ``"hk"`` that is loaded with
        ``hk2df``. The DataFrame must have a ``scan_num`` column. If it
        needs matching, an ``in_encyclo`` column is added to it in place.
    id_input : pd.DataFrame or str, optional
        Identification table, or a path ending in ``"elib"`` that is loaded
        with ``dia_df``. Its length is reported as the number of peptide IDs.
    method : str, optional
        Type of acquisition method. The identified-signal overlay is only
        computed when this is ``"DIA"``.
    data_type : str
        Data chosen for the plot: ``"TIC"`` (default), ``"ions"`` or
        ``"both"`` (case-insensitive). Any other value is treated as TIC.
    stats : str, optional
        ``"print"`` prints summary statistics, ``"title"`` puts them in the
        plot title (case-insensitive). ``None`` (default) or any other value
        reports nothing.
    return_dfs : bool
        Return the calculated DataFrames if True.
    color : str or List[str]
        Line colors: the first is used for the full signal, the second for
        the identified-signal overlay. If only one color is supplied the
        overlay falls back to ``"#1f77b4"``.
    no_labels : bool
        Removes ticks and labels.
    alpha : float
        Alpha value for the line plots.
    fig_params : List[float], optional
        ``[width, height]`` or ``[width, height, dpi]`` for a new figure.
        When omitted, a new figure with a default size is created. A list
        of any other length creates no new figure.

    Returns
    -------
    tuple or None
        ``(df, sumid_feat_df, id_df, feat_df)`` when ``return_dfs`` is True,
        otherwise ``None``. Entries that were never computed are returned as
        empty strings.

    Raises
    ------
    ValueError
        If the DIA overlay is requested but ``feat_input`` (or, when feature
        matching is required, ``id_input``) is a path with an unrecognized
        extension.

    Examples
    --------
    >>> from msions.msplot import plot_data
    >>> from msions.mzml import tic_df
    >>> import matplotlib.pyplot as plt
    >>> ms1_df = tic_df("test.mzML")
    >>> plot_data(ms1_df)
    >>> plt.show()
    """
    # placeholders that are returned unchanged when a table is never built
    feat_df = ""
    id_df = ""
    sumid_feat_df = ""
    title_txt = []

    # accept either an mzML path or an already-built DataFrame
    df = tic_df(mzml_input) if isinstance(mzml_input, str) else mzml_input

    # copy so the shared default list can never be modified through this call
    color = [color] if isinstance(color, str) else list(color)
    overlay_color = color[1] if len(color) > 1 else "#1f77b4"

    # normalize once so every branch is case-insensitive and None-safe
    mode = data_type.lower()
    stats_mode = stats.lower() if isinstance(stats, str) else None

    def percent(part, whole):
        return float(part / whole * 100)

    # user-defined figure size (and optionally dpi)
    if fig_params is not None:
        if len(fig_params) == 2:
            plt.figure(figsize=(fig_params[0], fig_params[1]))
        elif len(fig_params) == 3:
            plt.figure(figsize=(fig_params[0], fig_params[1]), dpi=fig_params[2])

    # ---- full signal -------------------------------------------------
    if mode == "ions":
        if stats_mode == "print":
            print("Total # of ions: %.2e" % sum(df.ions))
        elif stats_mode == "title":
            title_txt.append("Total # of ions: %.2e\n" % sum(df.ions))

        if fig_params is None:
            plt.figure(figsize=(10, 8))

        plt.plot(df['rt'], df['ions'], color=color[0], alpha=alpha)

    elif mode == "both":
        if stats_mode == "print":
            print("Total Ion Current (TIC): %.2e \t Total # of ions: %.2e" % (sum(df.TIC), sum(df.ions)))
        elif stats_mode == "title":
            # one title per subplot: [0] is the TIC panel, [1] the ions panel
            title_txt.append("Total Ion Current (TIC): %.2e\n" % sum(df.TIC))
            title_txt.append("Total # of ions: %.2e\n" % sum(df.ions))

        if fig_params is None:
            plt.figure(figsize=(16, 6))

        plt.subplot(1, 2, 1)
        plt.plot(df['rt'], df['TIC'], color=color[0], alpha=alpha)

        plt.subplot(1, 2, 2)
        plt.plot(df['rt'], df['ions'], color=color[0], alpha=alpha)

    else:
        if stats_mode == "print":
            print("Total Ion Current (TIC): %.2e" % sum(df.TIC))
        elif stats_mode == "title":
            title_txt.append("Total Ion Current (TIC): %.2e\n" % sum(df.TIC))

        if fig_params is None:
            plt.figure(figsize=(10, 8))

        plt.plot(df['rt'], df['TIC'], color=color[0], alpha=alpha)

    # ---- identified-signal overlay -----------------------------------
    if feat_input is not None and id_input is not None:
        if isinstance(feat_input, str):
            # only Hardklor (.hk) paths are loaded here
            if feat_input.endswith("hk"):
                feat_df = hk2df(feat_input)
        else:
            feat_df = feat_input

        if isinstance(id_input, str):
            # only EncyclopeDIA (.elib) paths are loaded here
            if id_input.endswith("elib"):
                id_df = dia_df(id_input)
        else:
            id_df = id_input

        # the summarized table only exists for DIA, so the overlay is
        # skipped for any other method instead of failing on a placeholder
        if method == "DIA":
            if isinstance(feat_df, str):
                raise ValueError(
                    "feat_input must be a DataFrame or a path ending in 'hk'")

            # if scans have been summed already (one row per scan)
            if len(np.unique(feat_df.scan_num)) == len(feat_df.scan_num):
                sumid_feat_df = feat_df
            else:
                # match features to identifications unless already done
                if "in_encyclo" not in feat_df.columns:
                    if isinstance(id_df, str):
                        raise ValueError(
                            "id_input must be a DataFrame or a path ending in 'elib'")
                    feat_df["in_encyclo"] = feat_df.apply(match_hk, axis=1, other_df=id_df)

                # keep only identified features, then summarize them
                id_feat_df = feat_df[feat_df["in_encyclo"] > 0].reset_index(drop=True)
                sumid_feat_df = summarize_df(id_feat_df, full_ms1_df=df)

            if mode == "ions":
                if stats_mode == "print":
                    print("Ions mapped to peptides: %.2e" % sum(sumid_feat_df.ions))
                    print("%.1f%% of the signal" % percent(sum(sumid_feat_df.ions), sum(df.ions)))
                    print("Number of peptide IDs: %.0f" % len(id_df))
                elif stats_mode == "title":
                    title_txt[0] += "Ions mapped to peptides: %.2e\n" % sum(sumid_feat_df.ions)
                    title_txt[0] += "%.1f%% of the signal\n" % percent(sum(sumid_feat_df.ions), sum(df.ions))
                    title_txt[0] += "Number of peptide IDs: %.0f\n" % len(id_df)

                plt.plot(sumid_feat_df['rt'], sumid_feat_df['ions'], color=overlay_color, alpha=alpha)

            elif mode == "both":
                if stats_mode == "print":
                    print("ID'd TIC: %.2e \t\t\t Ions mapped to peptides: %.2e" % (sum(sumid_feat_df.TIC), sum(sumid_feat_df.ions)))
                    print("%.1f%% of the signal \t\t\t %.1f%% of the signal" % (
                        percent(sum(sumid_feat_df.TIC), sum(df.TIC)),
                        percent(sum(sumid_feat_df.ions), sum(df.ions))))
                    print("Number of peptide IDs: %.0f" % len(id_df))
                elif stats_mode == "title":
                    title_txt[0] += "ID'd TIC: %.2e\n" % sum(sumid_feat_df.TIC)
                    title_txt[0] += "%.1f%% of the signal\n" % percent(sum(sumid_feat_df.TIC), sum(df.TIC))
                    title_txt[0] += "Number of peptide IDs: %.0f\n" % len(id_df)
                    title_txt[1] += "Ions mapped to peptides: %.2e\n" % sum(sumid_feat_df.ions)
                    # extra newline keeps both panel titles the same height
                    # (the ID count is only shown on the TIC panel)
                    title_txt[1] += "%.1f%% of the signal\n\n" % percent(sum(sumid_feat_df.ions), sum(df.ions))

                plt.subplot(1, 2, 1)
                plt.plot(sumid_feat_df['rt'], sumid_feat_df['TIC'], color=overlay_color, alpha=alpha)

                plt.subplot(1, 2, 2)
                plt.plot(sumid_feat_df['rt'], sumid_feat_df['ions'], color=overlay_color, alpha=alpha)

            else:
                if stats_mode == "print":
                    print("ID'd TIC: %.2e" % sum(sumid_feat_df.TIC))
                    print("%.1f%% of the signal" % percent(sum(sumid_feat_df.TIC), sum(df.TIC)))
                    print("Number of peptide IDs: %.0f" % len(id_df))
                elif stats_mode == "title":
                    title_txt[0] += "ID'd TIC: %.2e\n" % sum(sumid_feat_df.TIC)
                    title_txt[0] += "%.1f%% of the signal\n" % percent(sum(sumid_feat_df.TIC), sum(df.TIC))
                    title_txt[0] += "Number of peptide IDs: %.0f\n" % len(id_df)

                plt.plot(sumid_feat_df['rt'], sumid_feat_df['TIC'], color=overlay_color, alpha=alpha)

    # ---- labels and styling ------------------------------------------
    show_title = stats_mode == "title"

    if mode == "ions":
        _label_axes("Ions", title_txt[0] if show_title else None)

    elif mode == "both":
        # left panel is fully styled here; the right panel is styled by the
        # shared _finish_axes call below because it is the current axes
        plt.subplot(1, 2, 1)
        _label_axes("Total Ion Current", title_txt[0] if show_title else None)
        _finish_axes(no_labels)

        plt.subplot(1, 2, 2)
        _label_axes("Ions", title_txt[1] if show_title else None)

    else:
        _label_axes("Total Ion Current", title_txt[0] if show_title else None)

    _finish_axes(no_labels)

    if return_dfs:
        return df, sumid_feat_df, id_df, feat_df