Source code for msions.percolator

"""
This module contains functions that are useful for interacting with
XMLs, such as Percolator output.
"""
import xml.etree.ElementTree as ET
import pandas as pd
from typing import Union, List
import numpy as np


[docs]def parse_psms(xmlfile: str) -> List[dict]: """ Parse the PSMs in an XML file. Parameters ---------- xmlfile : str The XML file. Returns ------- List[dict] A list of dictionaries containing PSM information. Examples ------- >>> from msions.percolator import parse_psms >>> parse_psms("test.xml") """ # create element tree object tree = ET.parse(xmlfile) # get root element root = tree.getroot() # create empty list for PSMs psms = [] # define root string root_string = './{http://per-colator.com/percolator_out/15}psms/{http://per-colator.com/percolator_out/15}psm' # iterate PSMs for psm in root.findall(root_string): # empty PSM dictionary psm_dict = {} psm_dict['{http://per-colator.com/percolator_out/15}psm_id'] = psm.attrib['{http://per-colator.com/percolator_out/15}psm_id'] # iterate child elements of PSM for child in psm: # record PSM information in dictionary if child.tag == '{http://per-colator.com/percolator_out/15}protein_id': if '{http://per-colator.com/percolator_out/15}protein_id' in psm_dict.keys(): psm_dict['{http://per-colator.com/percolator_out/15}protein_id'].append(child.text) # might need .encode(utf8)? else: psm_dict['{http://per-colator.com/percolator_out/15}protein_id'] = [child.text] # might need encode? elif child.tag == '{http://per-colator.com/percolator_out/15}peptide_seq': psm_dict['{http://per-colator.com/percolator_out/15}peptide_seq'] = (child.attrib['seq']) else: psm_dict[child.tag] = child.text # might need encode? # append PSM dictionary to PSM list psms.append(psm_dict) # return PSM list return psms
[docs]def parse_peps(xmlfile: str) -> List[dict]: """ Parse the peptides in an XML file. Parameters ---------- xmlfile : str The XML file. Returns ------- List[dict] A list of dictionaries containing peptide information. Examples ------- >>> from msions.msxml import parse_peps >>> parse_peps("test.xml") """ # create element tree object tree = ET.parse(xmlfile) # get root element root = tree.getroot() # create empty list for peptides peptides = [] root_string = './{http://per-colator.com/percolator_out/15}peptides/{http://per-colator.com/percolator_out/15}peptide' # iterate peptides for peptide in root.findall(root_string): # empty peptide dictionary pep = {} pep['{http://per-colator.com/percolator_out/15}peptide_id'] = peptide.attrib['{http://per-colator.com/percolator_out/15}peptide_id'] # iterate child elements of peptide for child in peptide: # record peptide information in dictionary if child.tag == '{http://per-colator.com/percolator_out/15}psm_ids': for grand_child in child: if '{http://per-colator.com/percolator_out/15}psm_ids' in pep.keys(): pep['{http://per-colator.com/percolator_out/15}psm_ids'].append(grand_child.text) # might need .encode(utf8)? else: pep['{http://per-colator.com/percolator_out/15}psm_ids'] = [grand_child.text] # might need encode? else: pep[child.tag] = child.text # might need encode? # append peptide dictionary to peptides list peptides.append(pep) # return peptides list return peptides
[docs]def psms2df(xml_input: Union[List[dict], str]) -> pd.DataFrame: """ Create a pandas DataFrame of PSM XML information. Parameters ---------- xml_input : list[dict] or str The PSM list of dictionaries or the XML file. Returns ------- pd.DataFrame The pandas DataFrame of PSM information. Examples ------- >>> from msions.percolator import psms2df >>> psms2df("test.xml") """ # if it's an XML file if isinstance(xml_input, str): # create list of dictionaries psm_xml = parse_psms(xml_input) # if it's a list of dictionaries already else: psm_xml = xml_input # define prefix prefix = '{http://per-colator.com/percolator_out/15}' # initiate array xml_lst = [] # iterate through peptide xml for psm in psm_xml: seq = psm[prefix+'peptide_seq'] prots = ','.join(psm[prefix+'protein_id']) q_val = psm[prefix+'q_value'] exp_mass = psm[prefix+'exp_mass'] calc_mass = psm[prefix+'calc_mass'] scan = psm[prefix+'psm_id'].strip().split('_')[2] xml_lst.append([seq, prots, q_val, exp_mass, calc_mass, scan]) # create pandas DataFrame xml_df = pd.DataFrame(xml_lst, columns=['peptide', 'protein_s', 'q_value', 'exp_mass', 'calc_mass', 'scan_num']) # change data types xml_df = xml_df.astype({'q_value': 'float', 'exp_mass': 'float', 'calc_mass': 'float', 'scan_num': 'int64'}) # return data frame return xml_df
[docs]def peps2df(xml_input: Union[List[dict], str]) -> pd.DataFrame: """ Create a pandas DataFrame of peptide XML information. Parameters ---------- xml_input : list[dict] or str The peptide list of dictionaries or the XML file. Returns ------- pd.DataFrame The pandas DataFrame of peptide information. Examples ------- >>> from msions.percolator import peps2df >>> peps2df("test.xml") """ # if it's an XML file if isinstance(xml_input, str): # create list of dictionaries pep_xml = parse_peps(xml_input) # if it's a list of dictionaries already else: pep_xml = xml_input # define prefix prefix = '{http://per-colator.com/percolator_out/15}' # initiate array xml_lst = [] # iterate through peptide xml for peptide in pep_xml: seq = peptide[prefix+'peptide_id'] q_val = peptide[prefix+'q_value'] exp_mass = peptide[prefix+'exp_mass'] calc_mass = peptide[prefix+'calc_mass'] prot = peptide[prefix+'protein_id'] for psm in peptide[prefix+'psm_ids']: scan = psm.strip().split('_')[2] xml_lst.append([seq, q_val, exp_mass, calc_mass, prot, scan]) # create pandas DataFrame xml_df = pd.DataFrame(xml_lst, columns=['peptide', 'q_value', 'exp_mass', 'calc_mass', 'protein', 'scan_num']) # change data types xml_df = xml_df.astype({'q_value': 'float', 'exp_mass': 'float', 'calc_mass': 'float', 'scan_num': 'int64'}) # return data frame return xml_df
[docs]def id_scans(perc_target, ms2_tic_df): """ Create a column saying whether an MS2 was identified Parameters ---------- perc_target : str The TXT file of percolator output. ms2_tic_df: pd.DataFrame The pandas DataFrame of MS2 scan information. Examples ------- >>> from msions.percolator import id_scans >>> ms2_tic_df = mzml.tic_df("test.mzML", level="2") >>> id_scans("test.percolator.target.peptides.txt", ms2_tic_df) """ # create DataFrame of percolator results for run perc_df = pd.read_csv(perc_target, header=0, sep="\t") # remove q-values greater than 0.01 and sort by scan perc_sig = perc_df[perc_df['percolator q-value'] < 0.01].sort_values(by="scan").reset_index(drop=True) # create list for whether MS2 was ID'd ms2_tic_df["IDd"] = np.isin(ms2_tic_df["scan_num"], perc_sig['scan'])
[docs]def match_kro(kro_df: pd.DataFrame, xml_input: pd.DataFrame, ms_input: pd.DataFrame, faims: bool = False): """ Determine if Kronik features were identified or not Parameters ---------- kro_df : pd.DataFrame The pandas DataFrame of Kronik features. xml_input : pd.DataFrame The pandas DataFrame of Percolator XML output. ms_input: pd.DataFrame The pandas DataFrame of MS2 scan and precursor information. faims : bool Whether data is from FAIMS runs Examples ------- >>> from msions.percolator import match_kro >>> match_kro(kro_df, perc_xml_df, ms_df) """ # define DataFrames xml_df = xml_input ms_df = ms_input # initiate ID list kro_id_lst = len(kro_df)*[0] xml_id_lst = [] xml_int_lst = [] xml_tic_lst = [] xml_it_lst = [] for row in xml_df.itertuples(): # define info to match ms2_ref_scan = row.scan_num ms1_ref_mass = row.exp_mass subset_ms2_df = ms_df[ms_df.scan_num == ms2_ref_scan] ms1_ref_mz = subset_ms2_df.ms1_mz ms1_ref_scan = subset_ms2_df.ms1_scan.reset_index(drop=True)[0] # find TIC and IT for MS1 subset_ms_df = ms_df[ms_df.scan_num == ms1_ref_scan] xml_tic_lst.append(subset_ms_df.TIC.reset_index(drop=True)[0]) xml_it_lst.append(subset_ms_df.IT.reset_index(drop=True)[0]) # filter Kronik DataFrame kro_filt = kro_df[(kro_df.first_scan <= ms1_ref_scan) & (kro_df.last_scan >= ms1_ref_scan)] kro_filt = kro_filt[np.isclose(kro_filt.mz, ms1_ref_mz, atol=1.01)] kro_filt = kro_filt[np.isclose(kro_filt.mass, ms1_ref_mass, atol=1.01)] # if FAIMS experiment if faims: # define CV to match ref_cv = int(ms_df[ms_df.scan_num == ms2_ref_scan].CV) # further filter DataFrame kro_filt = kro_filt[kro_filt.CV == ref_cv] # determine indices that are ID'd idx_lst = kro_filt.index num_idxs = len(idx_lst) # add ID'd indices to ID lists and feature intensities to XML dataframe if num_idxs == 1: kro_id_lst[idx_lst[0]] = 1 xml_int_lst.append(kro_filt.best_int.reset_index(drop=True)[0]) elif num_idxs > 1: for idx_single in kro_filt.index: kro_id_lst[idx_single] = num_idxs xml_int_lst.append(max(kro_filt.best_int)) else: xml_int_lst.append(0) xml_id_lst.append(num_idxs) # add IDs to Kronik DataFrame kro_df["ID_d"] = kro_id_lst xml_df["in_kro"] = xml_id_lst xml_df["best_int"] = xml_int_lst xml_df["TIC"] = xml_tic_lst xml_df["IT"] = xml_it_lst xml_df["ions"] = xml_df['best_int']*xml_df['IT']/1000
# xml_df["norm_int"] = xml_df["best_int"]/xml_df["TIC"]