"""
This module contains functions that are useful for interacting with
Hardklor output files in Python.
"""
import pandas as pd
import numpy as np
from typing import Union
def hk2df(hk_file: str, by_int: bool = False) -> pd.DataFrame:
    """
    Read a Hardklor tab-delimited file to a pandas DataFrame.

    Lines starting with ``S`` are treated as scan headers: their second
    and third whitespace-separated fields are kept as the current scan
    number and retention time. Every other non-blank line is treated as
    a peptide record and is tagged with the most recent scan number and
    retention time. Blank lines are ignored.

    After import, the ``mass``, ``base_peak`` and ``rt`` columns are
    converted to float and the ``charge``, ``intensity`` and ``scan_num``
    columns to 64-bit integers. The ``window``, ``unk``, ``mod`` and
    ``corr`` columns are left as the strings read from the file.

    Two derived columns are appended:

    * ``mz``: ``(mass + charge * 1.00728) / charge``, rounded to 4
      decimal places.
    * ``rt_s``: ``rt * 60`` (retention time in seconds).

    Parameters
    ----------
    hk_file : str
        The Hardklor tab-delimited file to read.
    by_int : bool
        Sort data by intensity (descending) and renumber the index.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame of the input file.

    Raises
    ------
    ValueError
        If a scan line or peptide record cannot be converted to the
        expected numeric types, or a peptide record does not have the
        expected number of fields.

    Examples
    --------
    >>> import msions.hardklor as hk
    >>> hk.hk2df("test.hk")
    """
    columns = ['mass', 'charge',
               'intensity', 'base_peak',
               'window', 'unk',
               'mod', 'corr', 'scan_num', 'rt']

    # scan context carried forward onto each following peptide record
    scan_num = 0
    rt = 0.0
    pep_rows = []

    # read file, keep scan number, retention time, and all peptide info
    with open(hk_file, "r") as open_file:
        for line in open_file:
            fields = line.split()

            # blank lines (e.g. a trailing newline) carry no record and
            # would otherwise produce a malformed two-field row
            if not fields:
                continue

            if line.startswith('S'):
                scan_num = int(fields[1])
                rt = float(fields[2])
            else:
                # drop the leading record tag, then tag with scan context
                pep_rows.append(fields[1:] + [scan_num, rt])

    # create data frame from info
    pep_df = pd.DataFrame(pep_rows, columns=columns)

    # change data types
    pep_df = pep_df.astype({'mass': 'float', 'charge': 'int64',
                            'intensity': 'int64', 'base_peak': 'float',
                            'scan_num': 'int64', 'rt': 'float'})

    # sort by intensity if requested
    if by_int:
        pep_df = (pep_df.sort_values(by="intensity", ascending=False)
                  .reset_index(drop=True))

    # calculate m/z, rounded to 4 decimal places
    mz = (pep_df['mass'] + pep_df['charge'] * 1.00728) / pep_df['charge']
    pep_df['mz'] = mz.round(4)

    # calculate retention time in seconds
    pep_df['rt_s'] = pep_df['rt'] * 60

    return pep_df
def summarize_df(hk_input: Union[pd.DataFrame, str],
                 full_ms1_df: Union[pd.DataFrame, None] = None) -> pd.DataFrame:
    """
    Summarize the TIC in each scan from a Hardklor pandas DataFrame or
    Hardklor tab-delimited file.

    The ``intensity`` column is summed for each (``scan_num``, ``rt``)
    pair and reported as ``TIC``.

    If an additional pandas DataFrame is provided with the MS1 scan
    information, the summary is left-merged onto it by ``scan_num`` so
    that every MS1 scan appears in the result. Scans with no Hardklor
    records get a ``TIC`` of 0, the ion injection time (``IT``) is
    carried over, and an ``ions`` column is computed as
    ``TIC * IT / 1000``.

    Parameters
    ----------
    hk_input : pd.DataFrame or str
        The Hardklor pandas DataFrame or Hardklor tab-delimited file.
        A DataFrame must contain the ``scan_num``, ``rt`` and
        ``intensity`` columns.
    full_ms1_df : pd.DataFrame, optional
        The pandas DataFrame containing the MS1 scan information. It
        must contain the ``scan_num``, ``rt`` and ``IT`` columns.

    Returns
    -------
    pd.DataFrame
        A summarized pandas DataFrame. Without ``full_ms1_df`` the
        columns are ``rt``, ``scan_num``, ``TIC``; with it they are
        ``scan_num``, ``rt``, ``TIC``, ``IT``, ``ions`` (where ``rt`` is
        taken from ``full_ms1_df``).

    Examples
    --------
    >>> import msions.hardklor as hk
    >>> hk_df = hk.hk2df("test.hk")
    >>> hk.summarize_df(hk_df)
    """
    # a string is taken to be a Hardklor file path; anything else is
    # used as an already-parsed data frame
    hk_df = hk2df(hk_input) if isinstance(hk_input, str) else hk_input

    # sum intensity per scan number & rt; use the groupby's own sum
    # rather than passing the Python builtin to aggregate()
    sum_group = (hk_df.groupby(["scan_num", "rt"])["intensity"]
                 .sum()
                 .rename("TIC")
                 .reset_index())

    # keep the column order this function has always returned
    sum_group = sum_group[["rt", "scan_num", "TIC"]]

    # if complete data frame is given
    if full_ms1_df is not None:
        # merge ion injection time into data frame; rt comes from the
        # MS1 table, so only scan_num and TIC are taken from the summary
        sum_group = pd.merge(full_ms1_df[['scan_num', 'rt', 'IT']],
                             sum_group[["scan_num", "TIC"]],
                             on='scan_num', how="left")

        # scans without any Hardklor record have no TIC: fill NaN with 0
        sum_group["TIC"] = sum_group["TIC"].fillna(0)

        # calculate ions per scan
        # ions per scan = ion current (for scan) * inject time /1000
        sum_group["ions"] = sum_group['TIC'] * sum_group['IT'] / 1000

        # re-order columns
        sum_group = sum_group[["scan_num", "rt", "TIC", "IT", "ions"]]

    return sum_group