"""
This module contains functions that are useful for interacting with
XMLs, such as Percolator output.
"""
import xml.etree.ElementTree as ET
import pandas as pd
from typing import Union, List
import numpy as np
def parse_psms(xmlfile: str) -> List[dict]:
    """
    Parse the PSMs in a Percolator XML file.

    Parameters
    ----------
    xmlfile : str
        The XML file.

    Returns
    -------
    List[dict]
        One dictionary per ``psm`` element. Keys are namespace-qualified
        tags (``'{http://per-colator.com/percolator_out/15}<name>'``):

        * ``psm_id`` -- the ``psm_id`` attribute of the ``psm`` element.
        * ``protein_id`` -- list with the text of every ``protein_id`` child.
        * ``peptide_seq`` -- the ``seq`` attribute of the ``peptide_seq`` child.
        * any other child tag -- the text of that child.

    Raises
    ------
    KeyError
        If a ``psm`` element lacks the ``psm_id`` attribute or a
        ``peptide_seq`` child lacks the ``seq`` attribute.

    Examples
    --------
    >>> from msions.percolator import parse_psms
    >>> parse_psms("test.xml")
    """
    # namespace prefix ElementTree puts in front of every tag/attribute name
    ns = '{http://per-colator.com/percolator_out/15}'
    id_key = ns + 'psm_id'
    protein_key = ns + 'protein_id'
    seq_key = ns + 'peptide_seq'

    root = ET.parse(xmlfile).getroot()

    psms = []
    for psm in root.findall('./' + ns + 'psms/' + ns + 'psm'):
        psm_dict = {id_key: psm.attrib[id_key]}
        for child in psm:
            if child.tag == protein_key:
                # a PSM may map to several proteins; collect all of them
                psm_dict.setdefault(protein_key, []).append(child.text)
            elif child.tag == seq_key:
                # the sequence is stored as an attribute, not as element text
                psm_dict[seq_key] = child.attrib['seq']
            else:
                psm_dict[child.tag] = child.text
        psms.append(psm_dict)

    return psms
def parse_peps(xmlfile: str) -> List[dict]:
    """
    Parse the peptides in a Percolator XML file.

    Parameters
    ----------
    xmlfile : str
        The XML file.

    Returns
    -------
    List[dict]
        One dictionary per ``peptide`` element. Keys are namespace-qualified
        tags (``'{http://per-colator.com/percolator_out/15}<name>'``):

        * ``peptide_id`` -- the ``peptide_id`` attribute of the element.
        * ``psm_ids`` -- list with the text of every child of ``psm_ids``;
          the key is absent when ``psm_ids`` has no children.
        * any other child tag -- the text of that child.

    Raises
    ------
    KeyError
        If a ``peptide`` element lacks the ``peptide_id`` attribute.

    Examples
    --------
    >>> from msions.percolator import parse_peps
    >>> parse_peps("test.xml")
    """
    # namespace prefix ElementTree puts in front of every tag/attribute name
    ns = '{http://per-colator.com/percolator_out/15}'
    id_key = ns + 'peptide_id'
    psm_ids_key = ns + 'psm_ids'

    root = ET.parse(xmlfile).getroot()

    peptides = []
    for peptide in root.findall('./' + ns + 'peptides/' + ns + 'peptide'):
        pep = {id_key: peptide.attrib[id_key]}
        for child in peptide:
            if child.tag == psm_ids_key:
                # the PSM identifiers are nested one level down
                for grand_child in child:
                    pep.setdefault(psm_ids_key, []).append(grand_child.text)
            else:
                # NOTE(review): repeated tags (e.g. several protein_id
                # children) overwrite each other here, so only the last
                # value is kept -- confirm this is intended.
                pep[child.tag] = child.text
        peptides.append(pep)

    return peptides
def psms2df(xml_input: Union[List[dict], str]) -> pd.DataFrame:
    """
    Create a pandas DataFrame of PSM XML information.

    Parameters
    ----------
    xml_input : list[dict] or str
        The PSM list of dictionaries (as produced by ``parse_psms``) or the
        XML file, which is then parsed with ``parse_psms``.

    Returns
    -------
    pd.DataFrame
        One row per PSM with columns ``peptide``, ``protein_s`` (protein
        identifiers joined with ``,``), ``q_value``, ``exp_mass``,
        ``calc_mass`` (all float) and ``scan_num`` (int64).

    Raises
    ------
    KeyError
        If a PSM dictionary lacks one of the ``peptide_seq``, ``protein_id``,
        ``q_value``, ``exp_mass``, ``calc_mass`` or ``psm_id`` keys.

    Examples
    --------
    >>> from msions.percolator import psms2df
    >>> psms2df("test.xml")
    """
    # a string is taken to be a path to the XML file; anything else is
    # assumed to already be the parsed list of dictionaries
    psm_xml = parse_psms(xml_input) if isinstance(xml_input, str) else xml_input

    prefix = '{http://per-colator.com/percolator_out/15}'

    rows = [
        [
            psm[prefix + 'peptide_seq'],
            ','.join(psm[prefix + 'protein_id']),
            psm[prefix + 'q_value'],
            psm[prefix + 'exp_mass'],
            psm[prefix + 'calc_mass'],
            # scan number is the third '_'-separated field of the PSM id
            psm[prefix + 'psm_id'].strip().split('_')[2],
        ]
        for psm in psm_xml
    ]

    xml_df = pd.DataFrame(rows,
                          columns=['peptide', 'protein_s',
                                   'q_value',
                                   'exp_mass', 'calc_mass',
                                   'scan_num'])

    # the XML values are text; convert the numeric columns
    return xml_df.astype({'q_value': 'float',
                          'exp_mass': 'float',
                          'calc_mass': 'float',
                          'scan_num': 'int64'})
def peps2df(xml_input: Union[List[dict], str]) -> pd.DataFrame:
    """
    Create a pandas DataFrame of peptide XML information.

    Parameters
    ----------
    xml_input : list[dict] or str
        The peptide list of dictionaries (as produced by ``parse_peps``) or
        the XML file, which is then parsed with ``parse_peps``.

    Returns
    -------
    pd.DataFrame
        One row per (peptide, PSM id) pair with columns ``peptide``,
        ``q_value``, ``exp_mass``, ``calc_mass`` (float), ``protein`` and
        ``scan_num`` (int64). A peptide without a ``psm_ids`` entry
        contributes no rows.

    Raises
    ------
    KeyError
        If a peptide dictionary lacks one of the ``peptide_id``, ``q_value``,
        ``exp_mass``, ``calc_mass`` or ``protein_id`` keys.

    Examples
    --------
    >>> from msions.percolator import peps2df
    >>> peps2df("test.xml")
    """
    # a string is taken to be a path to the XML file; anything else is
    # assumed to already be the parsed list of dictionaries
    pep_xml = parse_peps(xml_input) if isinstance(xml_input, str) else xml_input

    prefix = '{http://per-colator.com/percolator_out/15}'

    rows = []
    for peptide in pep_xml:
        seq = peptide[prefix + 'peptide_id']
        q_val = peptide[prefix + 'q_value']
        exp_mass = peptide[prefix + 'exp_mass']
        calc_mass = peptide[prefix + 'calc_mass']
        prot = peptide[prefix + 'protein_id']
        # the 'psm_ids' key is missing when the peptide element listed no
        # PSM ids; treat that as "no PSMs" instead of failing
        for psm_id in peptide.get(prefix + 'psm_ids', []):
            # scan number is the third '_'-separated field of the PSM id
            scan = psm_id.strip().split('_')[2]
            rows.append([seq, q_val, exp_mass, calc_mass, prot, scan])

    xml_df = pd.DataFrame(rows,
                          columns=['peptide', 'q_value',
                                   'exp_mass', 'calc_mass',
                                   'protein', 'scan_num'])

    # the XML values are text; convert the numeric columns
    return xml_df.astype({'q_value': 'float',
                          'exp_mass': 'float',
                          'calc_mass': 'float',
                          'scan_num': 'int64'})
def id_scans(perc_target: str, ms2_tic_df: pd.DataFrame,
             q_threshold: float = 0.01) -> None:
    """
    Create a column saying whether an MS2 was identified.

    The result is written in place to a boolean ``IDd`` column of
    `ms2_tic_df`; nothing is returned.

    Parameters
    ----------
    perc_target : str
        The tab-separated TXT file of percolator output. It must contain the
        columns ``scan`` and ``percolator q-value``.
    ms2_tic_df : pd.DataFrame
        The pandas DataFrame of MS2 scan information. It must contain a
        ``scan_num`` column.
    q_threshold : float
        A scan counts as identified when its q-value is strictly below this
        value. Defaults to 0.01.

    Examples
    --------
    >>> from msions.percolator import id_scans
    >>> ms2_tic_df = mzml.tic_df("test.mzML", level="2")
    >>> id_scans("test.percolator.target.peptides.txt", ms2_tic_df)
    """
    perc_df = pd.read_csv(perc_target, header=0, sep="\t")

    # keep only confident identifications; ordering is irrelevant for the
    # membership test below, so no sorting is needed
    perc_sig = perc_df[perc_df['percolator q-value'] < q_threshold]

    ms2_tic_df["IDd"] = np.isin(ms2_tic_df["scan_num"], perc_sig['scan'])
def _ms_rows_for_scan(ms_df: pd.DataFrame, scan_num) -> pd.DataFrame:
    """Return the rows of `ms_df` whose ``scan_num`` equals `scan_num`, or raise KeyError."""
    rows = ms_df[ms_df["scan_num"] == scan_num]
    if rows.empty:
        raise KeyError(f"scan {scan_num!r} not found in ms_input")
    return rows


def match_kro(kro_df: pd.DataFrame, xml_input: pd.DataFrame, ms_input: pd.DataFrame, faims: bool = False):
    """
    Determine if Kronik features were identified or not.

    For every row of `xml_input` the MS2 scan is looked up in `ms_input` to
    get its precursor m/z and MS1 scan. A Kronik feature matches when the
    MS1 scan lies within ``[first_scan, last_scan]``, its ``mz`` is close to
    the precursor m/z, and its ``mass`` is close to the row's ``exp_mass``
    (``np.isclose`` with ``atol=1.01``). With `faims`, the feature's ``CV``
    must also equal the MS2 scan's ``CV``.

    Both `kro_df` and `xml_input` are modified in place; nothing is returned.

    * `kro_df` gains ``ID_d``: the number of features that matched together
      with this one for the last `xml_input` row that matched it, 0 if
      never matched.
    * `xml_input` gains ``in_kro`` (number of matching features),
      ``best_int`` (largest ``best_int`` among the matches, 0 if none),
      ``TIC`` and ``IT`` (taken from the MS1 scan) and ``ions``
      (``best_int * IT / 1000``).

    Parameters
    ----------
    kro_df : pd.DataFrame
        The pandas DataFrame of Kronik features. Needs the columns
        ``first_scan``, ``last_scan``, ``mz``, ``mass``, ``best_int`` and,
        when `faims` is true, ``CV``.
    xml_input : pd.DataFrame
        The pandas DataFrame of Percolator XML output. Needs the columns
        ``scan_num`` and ``exp_mass``.
    ms_input : pd.DataFrame
        The pandas DataFrame of MS2 scan and precursor information. Needs
        the columns ``scan_num``, ``ms1_mz``, ``ms1_scan``, ``TIC``, ``IT``
        and, when `faims` is true, ``CV``.
    faims : bool
        Whether data is from FAIMS runs.

    Raises
    ------
    KeyError
        If an MS2 scan from `xml_input`, or the MS1 scan it refers to, is
        not present in `ms_input`.

    Examples
    --------
    >>> from msions.percolator import match_kro
    >>> match_kro(kro_df, perc_xml_df, ms_df)
    """
    # pull the Kronik columns out once; matching is done positionally so it
    # does not depend on what index kro_df happens to carry
    kro_first = kro_df["first_scan"].to_numpy()
    kro_last = kro_df["last_scan"].to_numpy()
    kro_mz = kro_df["mz"].to_numpy()
    kro_mass = kro_df["mass"].to_numpy()
    kro_int = kro_df["best_int"].to_numpy()
    kro_cv = kro_df["CV"].to_numpy() if faims else None

    kro_hits = np.zeros(len(kro_df), dtype=np.int64)
    match_counts = []
    best_ints = []
    tics = []
    its = []

    for row in xml_input.itertuples():
        # precursor information recorded for the identified MS2 scan
        ms2_rows = _ms_rows_for_scan(ms_input, row.scan_num)
        ms1_mz = ms2_rows["ms1_mz"].iloc[0]
        ms1_scan = ms2_rows["ms1_scan"].iloc[0]

        # TIC and injection time come from the parent MS1 scan
        ms1_rows = _ms_rows_for_scan(ms_input, ms1_scan)
        tics.append(ms1_rows["TIC"].iloc[0])
        its.append(ms1_rows["IT"].iloc[0])

        matched = (
            (kro_first <= ms1_scan)
            & (kro_last >= ms1_scan)
            & np.isclose(kro_mz, ms1_mz, atol=1.01)
            & np.isclose(kro_mass, row.exp_mass, atol=1.01)
        )
        if faims:
            ref_cv = int(ms2_rows["CV"].iloc[0])
            matched &= (kro_cv == ref_cv)

        positions = np.flatnonzero(matched)
        count = len(positions)
        if count:
            # every matching feature records how many features matched
            kro_hits[positions] = count
            best_ints.append(kro_int[positions].max())
        else:
            best_ints.append(0)
        match_counts.append(count)

    kro_df["ID_d"] = kro_hits
    xml_input["in_kro"] = match_counts
    xml_input["best_int"] = best_ints
    xml_input["TIC"] = tics
    xml_input["IT"] = its
    xml_input["ions"] = xml_input["best_int"] * xml_input["IT"] / 1000