# Source code for msions.kronik

"""
This module contains functions that are useful for interacting with
Kronik output files in Python.
"""
import pandas as pd
import numpy as np
from typing import Union


def simple_df(
	kro_input: Union[pd.DataFrame, str],
	cv: Union[int, str, None] = None,
	topN: Union[int, None] = None,
	bestInt_thresh: Union[float, None] = None,
	sumInt_thresh: Union[float, None] = None,
	remove1: bool = False,
	by_int: bool = False,
) -> pd.DataFrame:
	"""
	Create a simplified Kronik pandas DataFrame.

	The selected Kronik columns are renamed to short, space-free names
	(``first_scan``, ``last_scan``, ``num_scans``, ``mass``, ``charge``,
	``best_int``, ``sum_int``, ``best_rt``) and two derived columns are added
	(``mz`` and ``best_rt_s``). The result can optionally be filtered by
	charge, topN summed intensity and intensity thresholds. The input
	DataFrame is never modified.

	Parameters
	----------
	kro_input : pd.DataFrame or str
		The Kronik pandas DataFrame, or the Kronik tab-delimited file to read.
		Anything that is not a DataFrame (path string, path object, open
		file/buffer) is handed to ``pandas.read_csv``.
	cv : int or str
		CV value associated with the dataset, or "given" if a "CV" column is
		already present. For file input an integer always sets the "CV"
		column. For DataFrame input an existing "CV" column is kept as is;
		an integer is only used to create the column when it is missing.
	topN : int
		Only include the topN features by summed intensity.
	bestInt_thresh : float
		Only include features with apex intensity at or above this threshold.
	sumInt_thresh : float
		Only include features with summed intensity at or above this threshold.
	remove1 : bool
		Remove +1 charges from DataFrame.
	by_int : bool
		Sort data by summed intensity (descending).

	Returns
	-------
	pd.DataFrame
		The simplified (and optionally filtered) DataFrame.

	Raises
	------
	AssertionError
		If a file is given and `cv` is neither an integer nor "given".
	KeyError
		If a required Kronik column (or "CV" when `cv` is "given") is missing.

	Examples
	--------
	>>> import msions.kronik as kro
	>>> kro.simple_df("test.kro")
	"""
	base_cols = ["First Scan", "Last Scan", "Num of Scans", "Monoisotopic Mass",
				 "Charge", "Best Intensity", "Summed Intensity", "Best RTime"]
	short_names = {'First Scan': 'first_scan', 'Last Scan': 'last_scan',
				   'Num of Scans': 'num_scans',
				   'Monoisotopic Mass': 'mass', 'Charge': 'charge',
				   'Best Intensity': 'best_int',
				   'Summed Intensity': 'sum_int', 'Best RTime': 'best_rt'}
	# mass added per charge when converting monoisotopic mass to m/z
	proton_mass = 1.00728

	# anything that is not already a DataFrame is read as a Kronik file
	from_file = not isinstance(kro_input, pd.DataFrame)
	kro_df = pd.read_csv(kro_input, header=0, sep='\t') if from_file else kro_input

	# decide where the CV column comes from (if the dataset has a FAIMS CV)
	wanted = list(base_cols)
	add_cv = False
	if cv is not None:
		cv_is_int = isinstance(cv, int)

		# raised explicitly (not via `assert`) so the check survives `python -O`;
		# the exception type is kept for backward compatibility
		if from_file and not cv_is_int and cv != "given":
			raise AssertionError("CV is not an integer or 'given.' Please check input.")

		if cv_is_int and (from_file or "CV" not in kro_df.columns):
			# CV is supplied by the caller
			add_cv = True
		else:
			# CV is already present in the data
			wanted.append("CV")

	# select columns of interest and rename to remove spaces;
	# rename returns a new object, so the caller's DataFrame is left untouched
	df_short = kro_df.loc[:, wanted].rename(columns=short_names)
	if add_cv:
		df_short['CV'] = cv

	# round retention time
	df_short['best_rt'] = df_short['best_rt'].round(4)

	# remove features with +1 charge
	if remove1:
		df_short = df_short[df_short['charge'] != 1].reset_index(drop=True)

	# sort DataFrame by summed intensity of features
	if by_int:
		df_short = df_short.sort_values(by="sum_int", ascending=False).reset_index(drop=True)

	# filter DataFrame to only include topN features
	if topN is not None:
		if by_int:
			# already sorted by intensity: keep the first topN rows
			df_short = df_short.iloc[0:topN].reset_index(drop=True)
		else:
			# pick the topN most intense features, then restore retention time order
			top = df_short.sort_values(by="sum_int", ascending=False).reset_index(drop=True).iloc[0:topN]
			df_short = top.sort_values(by="best_rt").reset_index(drop=True)

	# filter DataFrame to only include features with apex intensity at/above threshold
	if bestInt_thresh is not None:
		df_short = df_short[df_short["best_int"] >= bestInt_thresh].reset_index(drop=True)

	# filter DataFrame to only include features with summed intensity at/above threshold
	if sumInt_thresh is not None:
		df_short = df_short[df_short["sum_int"] >= sumInt_thresh].reset_index(drop=True)

	# calculate m/z for each feature
	df_short['mz'] = (df_short['mass'] + df_short['charge'] * proton_mass) / df_short['charge']

	# calculate retention time in seconds
	df_short['best_rt_s'] = df_short['best_rt'] * 60

	return df_short


def filter_df(df: pd.DataFrame, start: float = 0, stop: Union[float, None] = None) -> pd.DataFrame:
	"""
	Filter a pandas DataFrame containing Kronik data with a start and stop time.

	Rows are kept when their ``best_rt`` lies in the inclusive range
	[`start`, `stop`]. Rows whose ``best_rt`` is NaN are never kept. The
	original index labels are preserved.

	Parameters
	----------
	df : pd.DataFrame
		pandas DataFrame containing Kronik data (must have a ``best_rt`` column).
	start : float
		Starting time to use to filter the DataFrame.
	stop : float
		Ending time to use to filter the DataFrame. If omitted, there is no
		upper limit.

	Returns
	-------
	pd.DataFrame
		A filtered copy of the DataFrame.

	Examples
	--------
	>>> import msions.kronik as kro
	>>> kro_df = kro.simple_df("test.kro")
	>>> kro.filter_df(kro_df, start=15.0)
	"""
	rt = df["best_rt"]

	# lower bound always applies
	keep = rt >= start

	# without a stop time every row from `start` onwards is kept; this avoids
	# looking up the last retention time, which fails on an empty DataFrame
	# and turns into NaN (matching nothing) when any retention time is missing
	if stop is not None:
		keep &= rt <= stop

	# return an independent copy so callers can add columns without warnings
	return df.loc[keep].copy()


def match_rt_mass(ref_row: pd.Series, other_df: pd.DataFrame, rt_diff: Union[float, None] = None,
				  mass_rtol: float = 5e-6) -> int:
	"""
	Match Kronik output with itself.

	Counts the rows of `other_df` that have the same charge as `ref_row`, a
	mass within the relative tolerance `mass_rtol` of the reference mass and,
	if `rt_diff` is given, a ``best_rt`` within `rt_diff` of the reference
	retention time (inclusive). One match is subtracted on the assumption
	that `ref_row` is itself a row of `other_df`.

	Parameters
	----------
	ref_row : pd.Series
		The row of data to match (needs ``mass`` and ``charge``, plus
		``best_rt`` when `rt_diff` is given).
	other_df : pd.DataFrame
		The other DataFrame to match.
	rt_diff : float
		Retention time difference window to use to search for a match.
		If omitted, retention time is not considered.
	mass_rtol : float
		Relative mass tolerance passed to ``numpy.isclose`` (default 5e-6).

	Returns
	-------
	int
		Number of matches in DataFrame, not counting the self-match.
		Never negative.

	Examples
	--------
	>>> from msions.kronik import simple_df
	>>> from msions.kronik import match_rt_mass
	>>> kro_df = simple_df("test.kro")
	>>> redund_df = kro_df.copy()
	>>> redund_df["redund"] = redund_df.apply(match_rt_mass, axis=1, other_df=kro_df, rt_diff=1)
	"""
	# only consider features with the same charge
	is_match = other_df["charge"].to_numpy() == ref_row.charge

	# look for mass that matches
	is_match = is_match & np.isclose(other_df["mass"].to_numpy(), ref_row.mass, rtol=mass_rtol)

	# if a retention time difference is given, restrict to that window
	if rt_diff is not None:
		rt = other_df["best_rt"].to_numpy()
		rt2match = ref_row.best_rt
		is_match = is_match & (rt >= rt2match - rt_diff) & (rt <= rt2match + rt_diff)

	# subtract 1 for the self-match; clamp so that a reference row that is
	# not found at all (e.g. NaN values, or a row from a different DataFrame)
	# reports zero matches instead of -1
	return max(int(np.count_nonzero(is_match)) - 1, 0)