diff --git a/.bumpversion.cfg b/.bumpversion.cfg index fb3d1325..48dad86e 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.0.2 +current_version = 1.0.3 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/CHANGELOG.md b/CHANGELOG.md index 911b1c9b..ce8b2168 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,21 @@ Follow the changelog format from https://keepachangelog.com/en/1.0.0/. +## 1.1.0 - 2023.xx.xx + +### Added + +- Separate `library_reader_base` in `psm_reader.yaml` config for `LibraryReaderBase`. + +### Changed + +- `mod@Any N-term` and `mod@Any_N-term` are both supported, `Any_N-term` is preferred as there are no spaces and hence better for command line tools. The same for `mod@Protein N-term`, `mod@Any C-term`, and `mod@Protein C-term`. +- Enable customizing dtypes of peak mz and intensity values. +- Renamed `SWATHLibraryReader` to `LibraryReaderBase` in `alphabase.spectral_library.reader`. +- New `LibraryReaderBase._get_fragment_intensity` implementation which is called at the end of the parsing process in `PSMReaderBase._post_process`. This allows it to operate only on the translated column names. By default, all non-fragment columns will be grouped and part of the final output. +- `SpecLibBase.copy()` for copying spectral libraries including all attributes. +- `SpecLibBase.append()` for appending spectral libraries while maintaining the fragment index mapping. 
+ ## 1.0.2 - 2023.02.10 ### Changed diff --git a/alphabase/__init__.py b/alphabase/__init__.py index 24b0af48..9e24bcee 100644 --- a/alphabase/__init__.py +++ b/alphabase/__init__.py @@ -2,7 +2,7 @@ __project__ = "alphabase" -__version__ = "1.0.2" +__version__ = "1.0.3" __license__ = "Apache" __description__ = "An infrastructure Python package of the AlphaX ecosystem" __author__ = "Mann Labs" @@ -39,5 +39,5 @@ "PyPi": "https://pypi.org/project/alphabase/", } __extra_requirements__ = { - "development": "requirements_development.txt", + "development": "extra_requirements/development.txt", } diff --git a/alphabase/constants/_const.py b/alphabase/constants/_const.py index a2f1cd1c..819b23a9 100644 --- a/alphabase/constants/_const.py +++ b/alphabase/constants/_const.py @@ -1,6 +1,23 @@ import os +import numpy as np + +from alphabase.yaml_utils import load_yaml CONST_FILE_FOLDER = os.path.join( os.path.dirname(__file__), - 'const_files' -) \ No newline at end of file + "const_files" +) + +common_const_dict:dict = load_yaml( + os.path.join(CONST_FILE_FOLDER, "common_constants.yaml") +) + +# Only applied in peak and fragment dataframes to save RAM. +# Using float32 still keeps 0.1 ppm precision in any value range. +# Default float dtype is "float64" for value calculation and other scenarios. 
+PEAK_MZ_DTYPE:np.dtype = np.dtype( + common_const_dict["PEAK_MZ_DTYPE"] +).type +PEAK_INTENSITY_DTYPE:np.dtype = np.dtype( + common_const_dict["PEAK_INTENSITY_DTYPE"] +).type \ No newline at end of file diff --git a/alphabase/constants/atom.py b/alphabase/constants/atom.py index e6b930c0..ec3ac4be 100644 --- a/alphabase/constants/atom.py +++ b/alphabase/constants/atom.py @@ -4,10 +4,9 @@ from alphabase.yaml_utils import load_yaml -from alphabase.constants._const import CONST_FILE_FOLDER - -common_const_dict:dict = load_yaml( - os.path.join(CONST_FILE_FOLDER, 'common_constants.yaml') +from alphabase.constants._const import ( + CONST_FILE_FOLDER, + common_const_dict ) MASS_PROTON:float = common_const_dict['MASS_PROTON'] diff --git a/alphabase/constants/const_files/common_constants.yaml b/alphabase/constants/const_files/common_constants.yaml index b5165029..98c659b4 100644 --- a/alphabase/constants/const_files/common_constants.yaml +++ b/alphabase/constants/const_files/common_constants.yaml @@ -6,4 +6,8 @@ MOBILITY: # Mason Schamp equation of Burker. 
CCS_IM_COEF: 1059.62245 # 28 is the mass of N(2), the default gas in IM bruker - IM_GAS_MASS: 28.0 \ No newline at end of file + IM_GAS_MASS: 28.0 + +# Only applied in peak/fragment dataframes to save RAM +PEAK_MZ_DTYPE: float32 +PEAK_INTENSITY_DTYPE: float32 \ No newline at end of file diff --git a/alphabase/constants/const_files/psm_reader.yaml b/alphabase/constants/const_files/psm_reader.yaml index 449b756d..1981f8a2 100644 --- a/alphabase/constants/const_files/psm_reader.yaml +++ b/alphabase/constants/const_files/psm_reader.yaml @@ -20,6 +20,7 @@ alphapept: 'Phospho@T': 'pT' 'Phospho@Y': 'pY' 'Acetyl@Protein N-term': 'a' + maxquant: reader_type: maxquant rt_unit: minute @@ -45,6 +46,7 @@ maxquant: 'genes': ['Gene Names','Gene names'] 'decoy': 'Reverse' 'intensity': 'Intensity' + modification_mapping: 'Acetyl@Protein N-term': - '_(Acetyl (Protein N-term))' @@ -74,6 +76,7 @@ maxquant: 'Deamidated@N': ['N(Deamidation (NQ))','N(de)'] 'Deamidated@Q': ['Q(Deamidation (NQ))','Q(de)'] 'GlyGly@K': ['K(GlyGly (K))', 'K(gl)'] + pfind: reader_type: pfind rt_unit: minute @@ -117,7 +120,9 @@ msfragger_pepxml: - 'Glu->pyro-Glu@E^Any N-term' - 'Gln->pyro-Glu@Q^Any N-term' - 'Dimethyl@K' # Any N-term is not needed here as it will be infered in-the-fly + - 'Methyl@E' #an example of a PTM that can be C-term mod_mass_tol: 0.1 # Da + diann: reader_type: diann rt_unit: minute @@ -148,6 +153,7 @@ spectronaut_report: 'uniprot_ids': 'PG.UniProtIds' 'charge': 'charge' modification_mapping: 'maxquant' + spectronaut: reader_type: spectronaut rt_unit: irt @@ -171,4 +177,35 @@ spectronaut: 'uniprot_ids': ['UniProtIds','UniProtID','UniprotId'] 'genes': ['Genes','Gene','GeneName','GeneNames'] modification_mapping: 'maxquant' - \ No newline at end of file + +library_reader_base: + reader_type: library_reader_base + rt_unit: irt + fixed_C57: False + csv_sep: "\t" + mod_seq_columns: + - 'ModifiedPeptideSequence' + - 'ModifiedPeptide' + - 'ModifiedSequence' + - 'FullUniModPeptideName' + - 
'LabeledSequence' + - 'FullUniModPeptideName' + column_mapping: + 'raw_name': 'ReferenceRun' + 'sequence': ['PeptideSequence', 'StrippedPeptide'] + 'modified_sequence': ['ModifiedPeptideSequence','ModifiedPeptide'] + 'charge': 'PrecursorCharge' + 'rt': ['RT','iRT','Tr_recalibrated','RetentionTime','NormalizedRetentionTime'] + 'ccs': 'CCS' + 'precursor_mz': 'PrecursorMz' + 'mobility': ['Mobility','IonMobility','PrecursorIonMobility'] + 'proteins': ['ProteinId','ProteinID','ProteinName','Protein Name',] + 'uniprot_ids': ['UniProtIds','UniProtID','UniprotId'] + 'genes': ['GeneName','Genes','Gene',] + 'fragment_intensity': ['LibraryIntensity','RelativeIntensity', 'RelativeFragmentIntensity', 'RelativeFragmentIonIntensity'] + 'fragment_mz': ['ProductMz'] + 'fragment_type': ['FragmentType', 'FragmentIonType', 'ProductType', 'ProductIonType'] + 'fragment_charge' : ['FragmentCharge', 'FragmentIonCharge', 'ProductCharge', 'ProductIonCharge'] + 'fragment_series': ['FragmentSeriesNumber','FragmentNumber'] + 'fragment_loss_type': ['FragmentLossType', 'FragmentIonLossType', 'ProductLossType', 'ProductIonLossType'] + modification_mapping: 'maxquant' diff --git a/alphabase/constants/modification.py b/alphabase/constants/modification.py index 63a1c583..08a843f3 100644 --- a/alphabase/constants/modification.py +++ b/alphabase/constants/modification.py @@ -84,6 +84,11 @@ def load_mod_df( ): global MOD_DF MOD_DF = pd.read_table(tsv) + _df = MOD_DF[MOD_DF.mod_name.str.contains(' ', regex=False)].copy() + _df["mod_name"] = MOD_DF.mod_name.str.replace(' ', '_', regex=False) + MOD_DF = pd.concat( + [MOD_DF, _df], ignore_index=True + ).drop_duplicates("mod_name") MOD_DF.fillna('',inplace=True) MOD_DF['unimod_id'] = MOD_DF.unimod_id.astype(np.int32) MOD_DF.set_index('mod_name', drop=False, inplace=True) diff --git a/alphabase/peptide/fragment.py b/alphabase/peptide/fragment.py index ef866fc6..c4fa3e28 100644 --- a/alphabase/peptide/fragment.py +++ b/alphabase/peptide/fragment.py @@ -5,6 
+5,9 @@ import numba as nb import logging +from alphabase.constants._const import ( + PEAK_MZ_DTYPE, PEAK_INTENSITY_DTYPE +) from alphabase.peptide.mass_calc import * from alphabase.constants.modification import ( calc_modloss_mass @@ -129,7 +132,7 @@ def parse_charged_frag_type( def init_zero_fragment_dataframe( peplen_array:np.ndarray, charged_frag_types:List[str], - dtype=np.float64 + dtype=PEAK_MZ_DTYPE )->Tuple[pd.DataFrame, np.ndarray, np.ndarray]: '''Initialize a zero dataframe based on peptide length (nAA) array (peplen_array) and charge_frag_types (column number). @@ -163,7 +166,7 @@ def init_zero_fragment_dataframe( def init_fragment_dataframe_from_other( reference_fragment_df: pd.DataFrame, - dtype=np.float64 + dtype=PEAK_MZ_DTYPE ): ''' Init zero fragment dataframe from the `reference_fragment_df` (same rows and same columns) @@ -178,7 +181,7 @@ def init_fragment_by_precursor_dataframe( charged_frag_types: List[str], *, reference_fragment_df: pd.DataFrame = None, - dtype:np.dtype=np.float64, + dtype:np.dtype=PEAK_MZ_DTYPE, inplace_in_reference:bool=False, ): ''' @@ -205,6 +208,9 @@ def init_fragment_by_precursor_dataframe( initialized by :func:`alphabase.peptide.fragment.init_zero_fragment_dataframe`. Defaults to None. + dtype: np.dtype + dtype of fragment mz values, Defaults to :data:`PEAK_MZ_DTYPE`. 
+ inplace_in_reference : bool, optional if calculate the fragment mz inplace in the reference_fragment_df (default: False) @@ -234,7 +240,7 @@ def init_fragment_by_precursor_dataframe( np.zeros(( precursor_df.frag_stop_idx.max(), len(charged_frag_types) - )), + ), dtype=dtype), columns = charged_frag_types ) else: @@ -248,7 +254,7 @@ def init_fragment_by_precursor_dataframe( np.zeros(( len(reference_fragment_df), len(charged_frag_types) - )), + ), dtype=dtype), columns = charged_frag_types ) return fragment_df @@ -289,10 +295,12 @@ def update_sliced_fragment_dataframe( frag_slice_list = [slice(start,end) for start,end in frag_start_end_list] frag_slices = np.r_[tuple(frag_slice_list)] if charged_frag_types is None or len(charged_frag_types)==0: - fragment_df.values[frag_slices, :] = values + fragment_df.values[frag_slices, :] = values.astype(fragment_df.dtypes[0]) else: charged_frag_idxes = [fragment_df.columns.get_loc(c) for c in charged_frag_types] - fragment_df.iloc[frag_slices, charged_frag_idxes] = values + fragment_df.iloc[ + frag_slices, charged_frag_idxes + ] = values.astype(fragment_df.dtypes[0]) return fragment_df def get_sliced_fragment_dataframe( @@ -546,9 +554,10 @@ def flatten_fragments( fragment_intensity_df: pd.DataFrame, min_fragment_intensity: float = -1, keep_top_k_fragments: int = 1000, - custom_columns:list = [ + custom_columns : list = [ 'type','number','position','charge','loss_type' ], + custom_df : Dict[str, pd.DataFrame] = {} )->Tuple[pd.DataFrame, pd.DataFrame]: """ Converts the tabular fragment format consisting of @@ -560,8 +569,8 @@ def flatten_fragments( `type`, `number`, `charge` and `loss_type`, where each column refers to: - - mz: float64, fragment mz value - - intensity: float32, fragment intensity value + - mz: :data:`PEAK_MZ_DTYPE`, fragment mz value + - intensity: :data:`PEAK_INTENSITY_DTYPE`, fragment intensity value - type: int8, ASCII code of the ion type (97=a, 98=b, 99=c, 120=x, 121=y, 122=z), or more ion types in the 
future. See https://en.wikipedia.org/wiki/ASCII for more ASCII information - number: uint32, fragment series number - position: uint32, fragment position in sequence (from left to right, starts with 0) @@ -590,6 +599,9 @@ def flatten_fragments( custom_columns : list, optional 'mz' and 'intensity' columns are required. Others could be customized. Defaults to ['type','number','position','charge','loss_type'] + + custom_df : Dict[str, pd.DataFrame], optional + Append custom columns by providing additional dataframes of the same shape as fragment_mz_df and fragment_intensity_df. Defaults to {}. Returns ------- @@ -599,8 +611,8 @@ def flatten_fragments( fragment dataframe with columns: `mz`, `intensity`, `type`, `number`, `charge` and `loss_type`, where each column refers to: - - mz: float, fragment mz value - - intensity: float32, fragment intensity value + - mz: :data:`PEAK_MZ_DTYPE`, fragment mz value + - intensity: :data:`PEAK_INTENSITY_DTYPE`, fragment intensity value - type: int8, ASCII code of the ion type (97=a, 98=b, 99=c, 120=x, 121=y, 122=z), or more ion types in the future. 
See https://en.wikipedia.org/wiki/ASCII for more ASCII information - number: uint32, fragment series number - position: uint32, fragment position in sequence (from left to right, starts with 0) @@ -614,11 +626,18 @@ def flatten_fragments( frag_df = pd.DataFrame() frag_df['mz'] = fragment_mz_df.values.reshape(-1) if len(fragment_intensity_df) > 0: - frag_df['intensity'] = fragment_intensity_df.values.astype(np.float32).reshape(-1) + frag_df['intensity'] = fragment_intensity_df.values.astype( + PEAK_INTENSITY_DTYPE + ).reshape(-1) use_intensity = True else: use_intensity = False + # add additional columns to the fragment dataframe + # each column in the flat fragment dataframe is a whole pandas dataframe in the dense representation + for col_name, df in custom_df.items(): + frag_df[col_name] = df.values.reshape(-1) + frag_types = [] frag_loss_types = [] frag_charges = [] @@ -772,7 +791,11 @@ def remove_unused_fragments( output_tuple = [] for i in range(len(fragment_df_list)): - output_tuple.append(fragment_df_list[i].iloc[fragment_pointer].copy().reset_index(drop=True)) + output_tuple.append( + fragment_df_list[i].iloc[ + fragment_pointer + ].copy().reset_index(drop=True) + ) return precursor_df, tuple(output_tuple) @@ -780,6 +803,7 @@ def create_fragment_mz_dataframe_by_sort_precursor( precursor_df: pd.DataFrame, charged_frag_types:List, batch_size:int=500000, + dtype:np.dtype=PEAK_MZ_DTYPE, )->pd.DataFrame: """Sort nAA in precursor_df for faster fragment mz dataframe creation. 
@@ -808,7 +832,8 @@ def create_fragment_mz_dataframe_by_sort_precursor( refine_precursor_df(precursor_df) fragment_mz_df = init_fragment_by_precursor_dataframe( - precursor_df, charged_frag_types + precursor_df, charged_frag_types, + dtype=dtype, ) _grouped = precursor_df.groupby('nAA') @@ -825,7 +850,7 @@ def create_fragment_mz_dataframe_by_sort_precursor( fragment_mz_df.iloc[ df_group.frag_start_idx.values[0]: df_group.frag_stop_idx.values[-1], : - ] = mz_values + ] = mz_values.astype(PEAK_MZ_DTYPE) return mask_fragments_for_charge_greater_than_precursor_charge( fragment_mz_df, precursor_df.charge.values, @@ -839,6 +864,7 @@ def create_fragment_mz_dataframe( reference_fragment_df: pd.DataFrame = None, inplace_in_reference:bool = False, batch_size:int=500000, + dtype:np.dtype=PEAK_MZ_DTYPE, )->pd.DataFrame: ''' Generate fragment mass dataframe for the precursor_df. If @@ -882,6 +908,7 @@ def create_fragment_mz_dataframe( # ) fragment_mz_df = init_fragment_by_precursor_dataframe( precursor_df, charged_frag_types, + dtype=dtype, ) return create_fragment_mz_dataframe( precursor_df=precursor_df, @@ -889,11 +916,13 @@ def create_fragment_mz_dataframe( reference_fragment_df=fragment_mz_df, inplace_in_reference=True, batch_size=batch_size, + dtype=dtype, ) if 'nAA' not in precursor_df.columns: # fast return create_fragment_mz_dataframe_by_sort_precursor( - precursor_df, charged_frag_types, batch_size + precursor_df, charged_frag_types, + batch_size, dtype=dtype, ) if (is_precursor_sorted(precursor_df) and @@ -901,7 +930,8 @@ def create_fragment_mz_dataframe( ): # fast return create_fragment_mz_dataframe_by_sort_precursor( - precursor_df, charged_frag_types, batch_size + precursor_df, charged_frag_types, + batch_size, dtype=dtype ) else: @@ -917,12 +947,13 @@ def create_fragment_mz_dataframe( np.zeros(( len(reference_fragment_df), len(charged_frag_types) - )), + ), dtype=dtype), columns = charged_frag_types ) else: fragment_mz_df = init_fragment_by_precursor_dataframe( 
precursor_df, charged_frag_types, + dtype=dtype, ) _grouped = precursor_df.groupby('nAA') @@ -1004,3 +1035,194 @@ def join_left( joined_index[left_indices] = joined_index return joined_index + +def calc_fragment_count( + precursor_df : pd.DataFrame, + fragment_intensity_df : pd.DataFrame + ): + + """ + Calculates the number of fragments for each precursor. + + Parameters + ---------- + + precursor_df : pd.DataFrame + precursor dataframe which contains the frag_start_idx and frag_stop_idx columns + + fragment_intensity_df : pd.DataFrame + fragment intensity dataframe which contains the fragment intensities + + Returns + ------- + numpy.ndarray + array with the number of fragments for each precursor + """ + if not set(['frag_start_idx', 'frag_stop_idx']).issubset(precursor_df.columns): + raise KeyError('frag_start_idx and frag_stop_idx not in dataframe') + + n_fragments = [] + + for start, stop in zip(precursor_df['frag_start_idx'].values, precursor_df['frag_stop_idx'].values): + n_fragments += [np.sum(fragment_intensity_df.iloc[start:stop].values > 0)] + + return np.array(n_fragments) + +def filter_fragment_number( + precursor_df : pd.DataFrame, + fragment_intensity_df : pd.DataFrame, + n_fragments_allowed_column_name : str = 'n_fragments_allowed', + n_allowed : int = 999 + ): + + """ + Filters the number of fragments for each precursor. 
+ + Parameters + ---------- + + precursor_df : pd.DataFrame + + precursor dataframe which contains the frag_start_idx and frag_stop_idx columns + + fragment_intensity_df : pd.DataFrame + fragment intensity dataframe which contains the fragment intensities + + n_fragments_allowed_column_name : str, default = 'n_fragments_allowed' + column name in precursor_df which contains the number of allowed fragments + + n_allowed : int, default = 999 + number of fragments which should be allowed + + Returns + ------- + None + """ + + if not set(['frag_start_idx', 'frag_stop_idx']).issubset(precursor_df.columns): + raise KeyError('frag_start_idx and frag_stop_idx not in dataframe') + + for i, (start_idx, stop_idx, n_allowed_lib) in enumerate( + zip( + precursor_df['frag_start_idx'].values, + precursor_df['frag_stop_idx'].values, + precursor_df[n_fragments_allowed_column_name].values + ) + ): + + _allowed = min(n_allowed_lib, n_allowed) + + intensies = fragment_intensity_df.iloc[start_idx:stop_idx].values + flat_intensities = np.sort(intensies.flatten())[::-1] + intensies[intensies <= flat_intensities[_allowed]] = 0 + fragment_intensity_df.iloc[start_idx:stop_idx] = intensies + +def calc_fragment_cardinality( + precursor_df, + fragment_mz_df, + group_column = 'elution_group_idx', + split_target_decoy = True + ): + + """ + Calculate the cardinality for a given fragment across a group of precursors. + The cardinality is the number of precursors that have a given fragment at a given position. + + All precursors within a group are expected to have the same number of fragments. + The precursor dataframe. + + fragment_mz_df : pd.DataFrame + The fragment mz dataframe. + + group_column : str + The column to group the precursors by. Integer column is expected. + + split_target_decoy : bool + If True, the cardinality is calculated for the target and decoy precursors separately. 
+ + """ + + if len(precursor_df) == 0: + raise ValueError('Precursor dataframe is empty.') + + if len(fragment_mz_df) == 0: + raise ValueError('Fragment dataframe is empty.') + + if group_column not in precursor_df.columns: + raise KeyError('Group column not in precursor dataframe.') + + if ('frag_start_idx' not in precursor_df.columns) or ('frag_stop_idx' not in precursor_df.columns): + raise KeyError('Precursor dataframe does not contain fragment indices.') + + precursor_df = precursor_df.sort_values(group_column) + fragment_mz = fragment_mz_df.values + fragment_cardinality = np.ones(fragment_mz.shape, dtype=np.uint8) + + @nb.njit + def _calc_fragment_cardinality( + elution_group_idx, + start_idx, + stop_idx, + fragment_mz, + fragment_cardinality, + ): + elution_group = elution_group_idx[0] + elution_group_start = 0 + + for i in range(len(elution_group_idx)): + if i == len(elution_group_idx)-1 or elution_group_idx[i] != elution_group_idx[i+1]: + elution_group_stop = i+1 + + # check if whole elution group is covered + n_precursor = elution_group_stop - elution_group_start + + # Check that all precursors within a group have the same number of fragments. 
+ nAA = stop_idx[elution_group_start:elution_group_stop] - start_idx[elution_group_start:elution_group_stop] + if not np.all(nAA[0] == nAA): + raise ValueError('All precursors within a group must have the same number of fragments.') + + # within a group, check for each precursor if it has the same fragment as another precursor + for i in range(n_precursor): + + precursor_start_idx = start_idx[elution_group_start + i] + precursor_stop_idx = stop_idx[elution_group_start + i] + + precursor_fragment_mz = fragment_mz[precursor_start_idx:precursor_stop_idx] + + for j in range(n_precursor): + if i == j: + continue + + other_precursor_start_idx = start_idx[elution_group_start + j] + other_precursor_stop_idx = stop_idx[elution_group_start + j] + other_precursor_fragment_mz = fragment_mz[other_precursor_start_idx:other_precursor_stop_idx] + + binary_mask = np.abs(precursor_fragment_mz - other_precursor_fragment_mz) < 0.00001 + + fragment_cardinality[precursor_start_idx:precursor_stop_idx] += binary_mask.astype(np.uint8) + + elution_group_start = elution_group_stop + if ('decoy' in precursor_df.columns) and (split_target_decoy): + decoy_classes = precursor_df['decoy'].unique() + for decoy_class in decoy_classes: + df = precursor_df[precursor_df['decoy'] == decoy_class] + _calc_fragment_cardinality( + df[group_column].values, + df['frag_start_idx'].values, + df['frag_stop_idx'].values, + fragment_mz, + fragment_cardinality, + ) + else: + _calc_fragment_cardinality( + precursor_df[group_column].values, + precursor_df['frag_start_idx'].values, + precursor_df['frag_stop_idx'].values, + fragment_mz, + fragment_cardinality, + ) + + return pd.DataFrame( + fragment_cardinality, + columns = fragment_mz_df.columns + ) \ No newline at end of file diff --git a/alphabase/peptide/precursor.py b/alphabase/peptide/precursor.py index 52bd87a4..14597b29 100644 --- a/alphabase/peptide/precursor.py +++ b/alphabase/peptide/precursor.py @@ -2,6 +2,7 @@ import numpy as np import numba import 
multiprocessing as mp +from tqdm import tqdm from xxhash import xxh64_intdigest from functools import partial @@ -542,3 +543,109 @@ def calc_precursor_isotope_mp( for df in processing: df_list.append(df) return pd.concat(df_list) + +def calc_precursor_isotope_intensity( + precursor_df, + max_isotope = 6, + min_right_most_intensity = 0.001 + ): + """Calculate isotope intensity values for precursor_df inplace. + + Parameters + ---------- + + precursor_df : pd.DataFrame + Precursor_df to calculate isotope intensity + + max_isotope : int + Max isotope number to calculate. Optional, by default 6 + + min_right_most_intensity : float + The minimal intensity value of the right-most peak relative to apex peak. + + Returns + ------- + + pd.DataFrame + precursor_df with additional columns: + + """ + + isotope_dist = IsotopeDistribution() + + col_names = ['i_{}'.format(i) for i in range(max_isotope)] + + precursor_dist = np.zeros((len(precursor_df), max_isotope), dtype=np.float32) + + for i in range(len(precursor_df)): + + row = precursor_df.iloc[i] + dist, mono = isotope_dist.calc_formula_distribution( + get_mod_seq_formula(row['sequence'], row['mods']) + ) + dist[dist <= min_right_most_intensity] = 0. + dist = dist / dist.sum() + precursor_dist[i] = dist[:max_isotope] + + precursor_df[col_names] = precursor_dist + + return precursor_df + +def calc_precursor_isotope_intensity_mp( + precursor_df, + max_isotope = 6, + min_right_most_intensity = 0.001, + mp_batch_size = 1000, + mp_process_num = 8, + progress_bar = True + ): + + """Calculate isotope intensity values for precursor_df using multiprocessing. + + Parameters + ---------- + + precursor_df : pd.DataFrame + Precursor_df to calculate isotope intensity + + max_isotope : int + Max isotope number to calculate. Optional, by default 6 + + min_right_most_intensity : float + The minimal intensity value of the right-most peak relative to apex peak. + + mp_batch_size : int + Multiprocessing batch size. Optional, by default 1000. 
+ + mp_process_num : int + Process number. Optional, by default 8 + + progress_bar : bool + Whether to show progress bar. Optional, by default True + + Returns + ------- + + pd.DataFrame + precursor_df with additional columns i_0, i_1, i_2, ... i_{max_isotope-1} + + """ + + df_list = [] + df_group = precursor_df.groupby('nAA') + + with mp.get_context("spawn").Pool(mp_process_num) as p: + processing = p.imap( + partial( + calc_precursor_isotope_intensity, + max_isotope=max_isotope, + min_right_most_intensity=min_right_most_intensity + ), _batchify_df(df_group, mp_batch_size) + ) + + if progress_bar: + df_list = list(tqdm(processing, total=_count_batchify_df(df_group, mp_batch_size))) + else: + df_list = list(processing) + + return pd.concat(df_list, ignore_index=True) \ No newline at end of file diff --git a/alphabase/protein/fasta.py b/alphabase/protein/fasta.py index 95a8171d..245ca672 100644 --- a/alphabase/protein/fasta.py +++ b/alphabase/protein/fasta.py @@ -68,6 +68,12 @@ def load_all_proteins(fasta_file_list:list): protein_dict[protein['full_name']] = protein return protein_dict +def load_fasta_list_as_protein_df(fasta_list:list): + protein_dict = load_all_proteins(fasta_list) + return pd.DataFrame().from_dict( + protein_dict, orient="index" + ).reset_index(drop=True) + def concat_proteins(protein_dict:dict, sep='$')->str: """Concatenate all protein sequences into a single sequence, seperated by `sep ($ by default)`. 
@@ -466,9 +472,9 @@ def parse_labels(labels:list): if len(aa) == 1: label_aas += aa label_mod_dict[aa] = label - elif aa == 'Any N-term': + elif aa == 'Any N-term' or aa == "Any_N-term": nterm_label_mod = label - elif aa == 'Any C-term': + elif aa == 'Any C-term' or aa == "Any_C-term": cterm_label_mod = label return label_aas, label_mod_dict, nterm_label_mod, cterm_label_mod @@ -638,7 +644,7 @@ def __init__(self, precursor_charge_max:int = 4, precursor_mz_min:float = 400.0, precursor_mz_max:float = 2000.0, - var_mods:list = ['Acetyl@Protein N-term','Oxidation@M'], + var_mods:list = ['Acetyl@Protein_N-term','Oxidation@M'], min_var_mod_num:int = 0, max_var_mod_num:int = 2, fix_mods:list = ['Carbamidomethyl@C'], @@ -687,7 +693,7 @@ def __init__(self, var_mods : list, optional list of variable modifications, - by default ['Acetyl@Protein N-term','Oxidation@M'] + by default ['Acetyl@Protein_N-term','Oxidation@M'] max_var_mod_num : int, optional Minimal number of variable modifications on a peptide sequence, @@ -795,19 +801,19 @@ def _set_dict(term_dict,site,mod, else: term_dict[site] = term_mod site, term = parse_term_mod(term_mod) - if term == "Any N-term": + if term == "Any N-term" or term == "Any_N-term": _set_dict(pep_nterm, site, term_mod, allow_conflicts ) - elif term == 'Protein N-term': + elif term == 'Protein N-term' or term == "Protein_N-term": _set_dict(prot_nterm, site, term_mod, allow_conflicts ) - elif term == 'Any C-term': + elif term == 'Any C-term' or term == "Any_C-term": _set_dict(pep_cterm, site, term_mod, allow_conflicts ) - elif term == 'Protein C-term': + elif term == 'Protein C-term' or term == "Protein_C-term": _set_dict(prot_cterm, site, term_mod, allow_conflicts ) @@ -1207,9 +1213,9 @@ def add_peptide_labeling(self, labeling_channel_dict:dict=None): ``` { -1: [], # not labeled - 0: ['Dimethyl@Any N-term','Dimethyl@K'], - 4: ['Dimethyl:2H(4)@Any N-term','Dimethyl:2H(4)@K'], - 8: ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'], + 0: 
['Dimethyl@Any_N-term','Dimethyl@K'], + 4: ['Dimethyl:2H(4)@Any_N-term','Dimethyl:2H(4)@K'], + 8: ['Dimethyl:2H(6)13C(2)@Any_N-term','Dimethyl:2H(6)13C(2)@K'], } ```. The key name could be int (highly recommended or diff --git a/alphabase/psm_reader/dia_psm_reader.py b/alphabase/psm_reader/dia_psm_reader.py index f7bc1208..d145d7d5 100644 --- a/alphabase/psm_reader/dia_psm_reader.py +++ b/alphabase/psm_reader/dia_psm_reader.py @@ -44,7 +44,6 @@ def __init__(self, ) self.mod_seq_column = 'ModifiedPeptide' - self._min_max_rt_norm = True def _init_column_mapping(self): diff --git a/alphabase/psm_reader/maxquant_reader.py b/alphabase/psm_reader/maxquant_reader.py index 1a419632..9738980b 100644 --- a/alphabase/psm_reader/maxquant_reader.py +++ b/alphabase/psm_reader/maxquant_reader.py @@ -198,7 +198,8 @@ def _init_column_mapping(self): ]['column_mapping'] def _load_file(self, filename): - df = pd.read_csv(filename, sep='\t') + csv_sep = self._get_table_delimiter(filename) + df = pd.read_csv(filename, sep=csv_sep) self._find_mod_seq_column(df) df = df[~pd.isna(df['Retention time'])] df.fillna('', inplace=True) diff --git a/alphabase/psm_reader/msfragger_reader.py b/alphabase/psm_reader/msfragger_reader.py index bdb92286..cbc4fbcd 100644 --- a/alphabase/psm_reader/msfragger_reader.py +++ b/alphabase/psm_reader/msfragger_reader.py @@ -6,7 +6,7 @@ psm_reader_provider ) from alphabase.constants.aa import AA_ASCII_MASS -from alphabase.constants.atom import MASS_H +from alphabase.constants.atom import MASS_H, MASS_O, MASS_PROTON from alphabase.constants.modification import MOD_MASS try: @@ -32,8 +32,12 @@ def _get_mods_from_masses(sequence, msf_aa_mods): _mass_str, site_str = mod.split('@') mod_mass = float(_mass_str) site = int(site_str) + cterm_position = len(sequence) + 1 if site > 0: - mod_mass = mod_mass - AA_ASCII_MASS[ord(sequence[site-1])] + if site < cterm_position: + mod_mass = mod_mass - AA_ASCII_MASS[ord(sequence[site-1])] + else: + mod_mass -= (2* MASS_H + 
MASS_O) else: mod_mass -= MASS_H @@ -48,6 +52,12 @@ def _get_mods_from_masses(sequence, msf_aa_mods): site_str = '0' else: _mod = mod_name.split('@')[0]+'@'+sequence[0] + elif site==cterm_position: + if mod_name.endswith('C-term'): + _mod = mod_name + else: + _mod = mod_name.split('@')[0]+'@Any C-term' #what if only Protein C-term is listed? + site_str = '-1' else: _mod = mod_name.split('@')[0]+'@'+sequence[site-1] if _mod in MOD_MASS: diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py index 6066e26e..81132f06 100644 --- a/alphabase/psm_reader/psm_reader.py +++ b/alphabase/psm_reader/psm_reader.py @@ -10,20 +10,9 @@ ) from alphabase.constants._const import CONST_FILE_FOLDER +from alphabase.utils import get_delimiter from alphabase.yaml_utils import load_yaml -def _get_delimiter(tsv_file:str): - if isinstance(tsv_file, io.StringIO): - # for unit tests - line = tsv_file.readline().strip() - tsv_file.seek(0) - else: - with open(tsv_file, "r") as f: - line = f.readline().strip() - if '\t' in line: return '\t' - elif ',' in line: return ',' - else: return '\t' - def translate_other_modification( mod_str: str, mod_dict: dict @@ -179,49 +168,13 @@ def __init__(self, self.keep_decoy = keep_decoy self._min_max_rt_norm = False self._engine_rt_unit = rt_unit + self._min_irt_value = -100 + self._max_irt_value = 200 @property def psm_df(self)->pd.DataFrame: return self._psm_df - def import_files(self, file_list:list): - df_list = [] - for _file in file_list: - df_list.append(self.import_file(_file)) - self._psm_df = pd.concat(df_list, ignore_index=True) - return self._psm_df - - def import_file(self, _file:str)->pd.DataFrame: - """ - This is the main entry function of PSM readers, - it imports the file with following steps: - ``` - origin_df = self._load_file(_file) - self._translate_columns(origin_df) - self._translate_decoy(origin_df) - self._translate_score(origin_df) - self._load_modifications(origin_df) - self._translate_modifications() - 
self._post_process(origin_df) - ``` - - Parameters - ---------- - _file: str - file path or file stream (io). - """ - origin_df = self._load_file(_file) - if len(origin_df) == 0: - self._psm_df = pd.DataFrame() - else: - self._translate_columns(origin_df) - self._translate_decoy(origin_df) - self._translate_score(origin_df) - self._load_modifications(origin_df) - self._translate_modifications() - self._post_process(origin_df) - return self._psm_df - def add_modification_mapping(self, modification_mapping:dict): """ Append additional modification mappings for the search engine. @@ -314,6 +267,45 @@ def load(self, _file)->pd.DataFrame: else: return self.import_file(_file) + def import_files(self, file_list:list): + df_list = [] + for _file in file_list: + df_list.append(self.import_file(_file)) + self._psm_df = pd.concat(df_list, ignore_index=True) + return self._psm_df + + def import_file(self, _file:str)->pd.DataFrame: + """ + This is the main entry function of PSM readers, + it imports the file with following steps: + ``` + origin_df = self._load_file(_file) + self._translate_columns(origin_df) + self._translate_decoy(origin_df) + self._translate_score(origin_df) + self._load_modifications(origin_df) + self._translate_modifications() + self._post_process(origin_df) + ``` + + Parameters + ---------- + _file: str + file path or file stream (io). 
+ """ + origin_df = self._load_file(_file) + if len(origin_df) == 0: + self._psm_df = pd.DataFrame() + else: + self._translate_columns(origin_df) + self._transform_table(origin_df) + self._translate_decoy(origin_df) + self._translate_score(origin_df) + self._load_modifications(origin_df) + self._translate_modifications() + self._post_process(origin_df) + return self._psm_df + def _translate_decoy( self, origin_df:pd.DataFrame=None @@ -329,7 +321,7 @@ def _translate_score( pass def _get_table_delimiter(self, _filename): - return _get_delimiter(_filename) + return get_delimiter(_filename) def normalize_rt(self): if 'rt' in self.psm_df.columns: @@ -339,11 +331,19 @@ def normalize_rt(self): # elif self._engine_rt_unit == 'minute': # self.psm_df['rt_sec'] = self.psm_df.rt*60 min_rt = self.psm_df.rt.min() - if not self._min_max_rt_norm or min_rt > 0: + max_rt = self.psm_df.rt.max() + if min_rt < 0: # iRT + if min_rt < self._min_irt_value: + min_rt = self._min_irt_value + if max_rt > self._max_irt_value: + max_rt = self._max_irt_value + + elif not self._min_max_rt_norm : min_rt = 0 - self.psm_df['rt_norm'] = ( + + self.psm_df['rt_norm'] = (( self.psm_df.rt - min_rt - ) / (self.psm_df.rt.max()-min_rt) + ) / (max_rt-min_rt)).clip(0, 1) def norm_rt(self): self.normalize_rt() @@ -416,13 +416,29 @@ def _translate_columns(self, origin_df:pd.DataFrame): self._psm_df = pd.DataFrame() for col, map_col in mapped_columns.items(): self._psm_df[col] = origin_df[map_col] - + if ( 'scan_num' in self._psm_df.columns and not 'spec_idx' in self._psm_df.columns ): self._psm_df['spec_idx'] = self._psm_df.scan_num - 1 + def _transform_table(self, origin_df:pd.DataFrame): + """ + Transform the dataframe format if needed. + Usually only needed in combination with spectral libraries. 
+ + Parameters + ---------- + origin_df : pd.DataFrame + df of other search engines + + Returns + ------- + None + Add information inplace into self._psm_df + """ + pass def _load_modifications(self, origin_df:pd.DataFrame): """Read modification information from 'origin_df'. diff --git a/alphabase/spectral_library/base.py b/alphabase/spectral_library/base.py index c6c4d3fd..8ad4ee42 100644 --- a/alphabase/spectral_library/base.py +++ b/alphabase/spectral_library/base.py @@ -2,6 +2,8 @@ import numpy as np import typing import logging +import copy +import warnings import alphabase.peptide.fragment as fragment import alphabase.peptide.precursor as precursor @@ -131,6 +133,134 @@ def fragment_intensity_df(self)->pd.DataFrame: fragment types as columns (['b_z1', 'y_z2', ...]) """ return self._fragment_intensity_df + + def copy(self): + """ + Return a copy of the spectral library object. + + Returns + ------- + SpecLibBase + A copy of the spectral library object. + """ + new_instance = self.__class__() + new_instance.__dict__ = copy.deepcopy(self.__dict__) + + return new_instance + + def append( + self, + other : 'SpecLibBase', + dfs_to_append : typing.List[str] = ['_precursor_df','_fragment_intensity_df', '_fragment_mz_df','_fragment_intensity_predicted_df'], + ): + """ + + Append another SpecLibBase object to the current one in place. + All matching dataframes in the second object will be appended to the current one. Dataframes missing in the current object will be ignored. + All matching columns in the second object will be appended to the current one. Columns missing in the current object will be ignored. + Dataframes and columns missing in the second object will raise an error. + + Parameters + ---------- + other : SpecLibBase + Second SpecLibBase object to be appended. + + dfs_to_append : list, optional + List of dataframes to be appended. + Defaults to ['_precursor_df','_fragment_intensity_df', '_fragment_mz_df','_fragment_intensity_predicted_df']. 
+ + Returns + ------- + None + + """ + + def check_matching_columns(df1, df2): + # check if the columns are compatible + # the first dataframe should have all the columns of the second dataframe, otherwise raise error + # the second dataframe may have more columns, but they will be dropped with a warning + missing_columns = set(df1.columns) - set(df2.columns) + if len(missing_columns) > 0: + raise ValueError( + f"The columns are not compatible. {missing_columns} are missing in the dataframe which should be appended." + ) + + missing_columns = set(df2.columns) - set(df1.columns) + if len(missing_columns) > 0: + warnings.warn( + f"Unmatched columns in second dataframe will be dropped: {missing_columns}." + ) + + return df1.columns.values + + # get subset of dataframes and columns to append + # will fail if the speclibs are not compatible + matching_columns = [] + for attr in dfs_to_append: + if hasattr(self, attr) and hasattr(other, attr): + matching_columns.append( + check_matching_columns( + getattr(self, attr), getattr(other, attr) + ) + ) + elif hasattr(self, attr) and not hasattr(other, attr): + raise ValueError( + f"The libraries can't be appended as {attr} is missing in the second library." + ) + else: + matching_columns.append([]) + + n_fragments = [] + # get subset of dfs_to_append starting with _fragment + for attr in dfs_to_append: + if attr.startswith('_fragment'): + if hasattr(self, attr): + n_current_fragments = len(getattr(self, attr)) + if n_current_fragments > 0: + n_fragments += [n_current_fragments] + + if not np.all(np.array(n_fragments) == n_fragments[0]): + raise ValueError( + f"The libraries can't be appended as the number of fragments in the current libraries are not the same." 
+ ) + + for attr, matching_columns in zip( + dfs_to_append, + matching_columns + ): + if hasattr(self, attr) and hasattr(other, attr): + + current_df = getattr(self, attr) + + # copy dataframes to avoid changing the original ones + other_df = getattr(other, attr)[matching_columns].copy() + + if attr.startswith('_precursor'): + + frag_idx_increment = 0 + for fragment_df in ['_fragment_intensity_df', '_fragment_mz_df']: + if hasattr(self, fragment_df): + if len(getattr(self, fragment_df)) > 0: + frag_idx_increment = len(getattr(self, fragment_df)) + + if 'frag_start_idx' in other_df.columns: + other_df['frag_start_idx'] += frag_idx_increment + + if 'frag_stop_idx' in other_df.columns: + other_df['frag_stop_idx'] += frag_idx_increment + + setattr( + self, attr, + pd.concat( + [ + current_df, + other_df + ], + axis=0, + ignore_index=True, + sort=False + ).reset_index(drop=True) + ) def refine_df(self): """ @@ -192,6 +322,57 @@ def update_precursor_mz(self): and clip the self._precursor_df using `self.clip_by_precursor_mz_` """ self.calc_precursor_mz() + + def calc_precursor_isotope_intensity(self, + multiprocessing : bool=True, + max_isotope = 6, + min_right_most_intensity = 0.001, + mp_batch_size = 1000, + mp_process_num = 8 + ): + """ + Calculate and append the isotope intensity columns into self.precursor_df. + See `alphabase.peptide.precursor.calc_precursor_isotope_intensity` for details. + + Parameters + ---------- + + max_isotope : int, optional + The maximum isotope to calculate. + + min_right_most_intensity : float, optional + The minimum intensity of the right most isotope. + + mp_batch_size : int, optional + The batch size for multiprocessing. + + mp_processes : int, optional + The number of processes for multiprocessing. 
+ + """ + + if 'precursor_mz' not in self._precursor_df.columns: + self.calc_precursor_mz() + self.clip_by_precursor_mz_() + + if multiprocessing and len(self.precursor_df)>mp_batch_size: + ( + self._precursor_df + ) = precursor.calc_precursor_isotope_intensity_mp( + self.precursor_df, + max_isotope = max_isotope, + min_right_most_intensity = min_right_most_intensity, + mp_process_num = mp_process_num, + ) + else: + ( + self._precursor_df + ) = precursor.calc_precursor_isotope_intensity( + self.precursor_df, + max_isotope = max_isotope, + min_right_most_intensity = min_right_most_intensity, + ) + def calc_precursor_isotope(self, multiprocessing:bool=True, @@ -290,7 +471,41 @@ def remove_unused_fragments(self): self._precursor_df, (self._fragment_mz_df,) ) + def calc_fragment_count(self): + """ + Count the number of non-zero fragments for each precursor. + Creates the column 'n_fragments' in self._precursor_df. + """ + + self._precursor_df['n_fragments'] = fragment.calc_fragment_count( + self._precursor_df, + self._fragment_intensity_df + ) + + def filter_fragment_number( + self, + n_fragments_allowed_column_name='n_fragments_allowed', + n_allowed=999 + ): + """ + Filter the top k fragments for each precursor based on a global setting and a precursor wise column. + The smaller one will be used. Can be used to make sure that target and decoy have the same number of fragments. + + Parameters + ---------- + n_fragments_allowed_column_name : str, optional, default 'n_fragments_allowed' + The column name in self._precursor_df that contains the number of fragments allowed for each precursor. + n_allowed : int, optional, default 999 + The global setting for the number of fragments allowed for each precursor. 
+ """ + + fragment.filter_fragment_number( + self._precursor_df, + self._fragment_intensity_df, + n_fragments_allowed_column_name=n_fragments_allowed_column_name, + n_allowed=n_allowed + ) def _get_hdf_to_save(self, hdf_file, diff --git a/alphabase/spectral_library/flat.py b/alphabase/spectral_library/flat.py index bc2f82e9..60f465f4 100644 --- a/alphabase/spectral_library/flat.py +++ b/alphabase/spectral_library/flat.py @@ -82,6 +82,7 @@ def parse_base_library(self, library:SpecLibBase, keep_original_frag_dfs:bool=True, copy_precursor_df:bool=False, + **kwargs ): """ Flatten an library object of SpecLibBase or its inherited class. @@ -111,6 +112,7 @@ def parse_base_library(self, min_fragment_intensity=self.min_fragment_intensity, keep_top_k_fragments=self.keep_top_k_fragments, custom_columns=self.custom_fragment_df_columns, + **kwargs ) if hasattr(library, 'protein_df'): diff --git a/alphabase/spectral_library/reader.py b/alphabase/spectral_library/reader.py index 4a52f73b..0655913a 100644 --- a/alphabase/spectral_library/reader.py +++ b/alphabase/spectral_library/reader.py @@ -1,14 +1,19 @@ import typing +import os import numpy as np import pandas as pd from alphabase.peptide.mobility import mobility_to_ccs_for_df from alphabase.io.psm_reader.dia_search_reader import SpectronautReader +from alphabase.io.psm_reader.maxquant_reader import MaxQuantReader from alphabase.spectral_library.base import SpecLibBase from alphabase.psm_reader.psm_reader import psm_reader_yaml from alphabase.psm_reader import psm_reader_provider -class SWATHLibraryReader(SpectronautReader, SpecLibBase): +from alphabase.constants._const import CONST_FILE_FOLDER, PEAK_INTENSITY_DTYPE +from alphabase.yaml_utils import load_yaml + +class LibraryReaderBase(MaxQuantReader, SpecLibBase): def __init__(self, charged_frag_types:typing.List[str] = [ 'b_z1','b_z2','y_z1', 'y_z2', @@ -20,7 +25,7 @@ def __init__(self, fdr = 0.01, fixed_C57 = False, mod_seq_columns=psm_reader_yaml[ - 'spectronaut' + 
'library_reader_base' ]['mod_seq_columns'], rt_unit='irt', precursor_mz_min:float = 400, @@ -28,6 +33,49 @@ def __init__(self, decoy:str = None, **kwargs ): + """ + + Base class for reading spectral libraries from long format csv files. + + Parameters + ---------- + + charged_frag_types: list of str + List of fragment types to be used in the spectral library. + The default is ['b_z1','b_z2','y_z1', 'y_z2', 'b_modloss_z1','b_modloss_z2','y_modloss_z1', 'y_modloss_z2'] + + column_mapping: dict + Dictionary mapping the column names in the csv file to the column names in the spectral library. + The default is None, which uses the `library_reader_base` column mapping in `psm_reader.yaml` + + modification_mapping: dict + Dictionary mapping the modification names in the csv file to the modification names in the spectral library. + + fdr: float + False discovery rate threshold for filtering the spectral library. + default is 0.01 + + fixed_C57: bool + + mod_seq_columns: list of str + List of column names in the csv file containing the modified sequence. + By default the mapping is taken from `psm_reader.yaml` + + rt_unit: str + Unit of the retention time column in the csv file. + The default is 'irt' + + precursor_mz_min: float + Minimum precursor m/z value for filtering the spectral library. + + precursor_mz_max: float + Maximum precursor m/z value for filtering the spectral library. + + decoy: str + Decoy type for the spectral library. 
+ Can be either `pseudo_reverse` or `diann` + + """ SpecLibBase.__init__(self, charged_frag_types = charged_frag_types, precursor_mz_min=precursor_mz_min, @@ -35,7 +83,7 @@ def __init__(self, decoy=decoy ) - SpectronautReader.__init__(self, + MaxQuantReader.__init__(self, column_mapping = column_mapping, modification_mapping = modification_mapping, fdr = fdr, @@ -45,71 +93,95 @@ def __init__(self, rt_unit=rt_unit, ) - self._frag_type_columns = "FragmentType FragmentIonType ProductType ProductIonType".split(' ') - self._frag_number_columns = "FragmentNumber FragmentSeriesNumber".split(' ') - self._frag_charge_columns = "FragmentCharge FragmentIonCharge ProductCharge ProductIonCharge".split(' ') - self._frag_loss_type_columns = "FragmentLossType FragmentIonLossType ProductLossType ProductIonLossType".split(' ') - self._frag_inten_columns = "RelativeIntensity RelativeFragmentIntensity RelativeFragmentIonIntensity LibraryIntensity".split(' ') + def _init_column_mapping(self): + """ + Initialize the column mapping from the `psm_reader.yaml` file. + """ + self.column_mapping = psm_reader_yaml[ + 'library_reader_base' + ]['column_mapping'] def _find_key_columns(self, lib_df:pd.DataFrame): - def find_col(target_columns, df_columns): - for col in target_columns: - if col in df_columns: - return col - return None - self.mod_seq_col = find_col(self._mod_seq_columns, lib_df.columns) + """ + Find and create the key columns for the spectral library. + + Parameters + ---------- + + lib_df: pd.DataFrame + Dataframe containing the spectral library. 
- self.mapped_peptide_columns = self._find_mapped_columns(lib_df) + """ + + if 'fragment_loss_type' not in lib_df.columns: + lib_df['fragment_loss_type'] = '' + + lib_df['fragment_loss_type'].fillna('', inplace=True) + lib_df['fragment_loss_type'].replace('noloss','',inplace=True) + - self.frag_type_col = find_col(self._frag_type_columns, lib_df.columns) - self.frag_num_col = find_col(self._frag_number_columns, lib_df.columns) - self.frag_charge_col = find_col(self._frag_charge_columns, lib_df.columns) - self.frag_loss_type_col = find_col(self._frag_loss_type_columns, lib_df.columns) - self.frag_inten_col = find_col(self._frag_inten_columns, lib_df.columns) + if 'mods' not in lib_df.columns: + lib_df['mods'] = '' - if self.frag_loss_type_col is None: - self.frag_loss_type_col = 'FragmentLossType' - lib_df[self.frag_loss_type_col] = '' + if 'mod_sites' not in lib_df.columns: + lib_df['mod_sites'] = '' def _get_fragment_intensity(self, lib_df:pd.DataFrame): + """ + + Create the self._fragment_intensity dataframe from a given spectral library. + In the process, the input dataframe is converted from long format to a precursor dataframe and returned. + + Parameters + ---------- + lib_df: pd.DataFrame + Dataframe containing the spectral library. + + Returns + ------- + precursor_df: pd.DataFrame + Dataframe containing the fragment intensity. 
+ + """ frag_col_dict = dict(zip( self.charged_frag_types, range(len(self.charged_frag_types)) )) self._find_key_columns(lib_df) - lib_df[self.frag_loss_type_col].fillna('', inplace=True) - lib_df[self.frag_loss_type_col].replace('noloss','',inplace=True) - group_cols = [ - self.mod_seq_col, - self.mapped_peptide_columns['sequence'], - self.mapped_peptide_columns['charge'], - ] + # drop all columns which are all NaN as they prohibit grouping + lib_df = lib_df.dropna(axis=1, how='all') - if 'raw_name' in self.mapped_peptide_columns: - group_cols.append(self.mapped_peptide_columns['raw_name']) - - col_list_dict = dict([(col, []) for col in self.mapped_peptide_columns.values()]) - col_list_dict[self.mod_seq_col] = [] + precursor_df_list = [] frag_intens_list = [] nAA_list = [] + + fragment_columns = [ + 'fragment_mz','fragment_type','fragment_charge','fragment_series','fragment_loss_type','fragment_intensity' + ] + + # by default, all non-fragment columns are used to group the library + non_fragment_columns = list(set(lib_df.columns) - set(fragment_columns)) + for keys, df_group in lib_df.groupby( - group_cols + non_fragment_columns ): + precursor_columns = dict(zip(non_fragment_columns, keys)) + + nAA = len(precursor_columns['sequence']) - nAA = len(keys[1]) - nAA_list.append(nAA) intens = np.zeros( - (nAA-1, len(self.charged_frag_types)),dtype=np.float32 + (nAA-1, len(self.charged_frag_types)), + dtype=PEAK_INTENSITY_DTYPE, ) for frag_type, frag_num, loss_type, frag_charge, inten in df_group[ [ - self.frag_type_col,self.frag_num_col,self.frag_loss_type_col, - self.frag_charge_col,self.frag_inten_col + 'fragment_type','fragment_series','fragment_loss_type', + 'fragment_charge','fragment_intensity' ] ].values: if frag_type in 'abc': @@ -127,6 +199,8 @@ def _get_fragment_intensity(self, lib_df:pd.DataFrame): frag_type = f'{frag_type}_H2O_z{frag_charge}' elif loss_type == 'NH3': frag_type = f'{frag_type}_NH3_z{frag_charge}' + elif loss_type == 'unknown': # DiaNN+fragger 
+ frag_type = f'{frag_type}_z{frag_charge}' else: continue @@ -138,11 +212,12 @@ def _get_fragment_intensity(self, lib_df:pd.DataFrame): if max_inten <= 0: continue intens /= max_inten - for col, col_list in col_list_dict.items(): - col_list.append(df_group[col].values[0]) + precursor_df_list.append(precursor_columns) frag_intens_list.append(intens) - - df = pd.DataFrame(col_list_dict) + nAA_list.append(nAA) + + df = pd.DataFrame(precursor_df_list) + self._fragment_intensity_df = pd.DataFrame( np.concatenate(frag_intens_list), @@ -158,43 +233,53 @@ def _get_fragment_intensity(self, lib_df:pd.DataFrame): return df - def _load_file(self, filename): - self.csv_sep = self._get_table_delimiter(filename) - df = pd.read_csv(filename, sep=self.csv_sep) - self._find_mod_seq_column(df) + def _load_file( + self, + filename:str + ): + """ + Load the spectral library from a csv file. + Reimplementation of `PSMReaderBase._translate_columns`. + """ - df = self._get_fragment_intensity(df) + csv_sep = self._get_table_delimiter(filename) + df = pd.read_csv(filename, sep=csv_sep) + self._find_mod_seq_column(df) + return df - - def _post_process(self, - lib_df + + def _post_process( + self, + lib_df:pd.DataFrame, ): - self._psm_df['nAA'] = self._psm_df.sequence.str.len() - self._psm_df[ - ['frag_start_idx','frag_stop_idx'] - ] = lib_df[['frag_start_idx','frag_stop_idx']] + """ + Process the spectral library and create the `fragment_intensity`, `fragment_mz`dataframe. + Reimplementation of `PSMReaderBase._post_process`. 
+ """ + + if 'nAA' not in self._psm_df.columns: + self._psm_df['nAA'] = self._psm_df.sequence.str.len() + self._psm_df = self._get_fragment_intensity(self._psm_df) + self.normalize_rt_by_raw_name() - - if ( - 'mobility' in self._psm_df.columns - ): + + if 'mobility' in self._psm_df.columns: self._psm_df['ccs'] = ( mobility_to_ccs_for_df( self._psm_df, 'mobility' ) ) - - self._psm_df = self._psm_df[ - ~self._psm_df.mods.isna() - ].reset_index(drop=True) + self._psm_df.drop('modified_sequence', axis=1, inplace=True) self._precursor_df = self._psm_df self.calc_fragment_mz_df() +# legacy +SWATHLibraryReader = LibraryReaderBase class LibraryReaderFromRawData(SpecLibBase): def __init__(self, diff --git a/alphabase/spectral_library/reader_from_raw.py b/alphabase/spectral_library/reader_from_raw.py deleted file mode 100644 index 0c7350b8..00000000 --- a/alphabase/spectral_library/reader_from_raw.py +++ /dev/null @@ -1,6 +0,0 @@ -import typing -import pandas as pd - -from alphabase.psm_reader import psm_reader_provider -from alphabase.spectral_library.base import SpecLibBase - diff --git a/alphabase/spectral_library/translate.py b/alphabase/spectral_library/translate.py index e8e03758..203d13fe 100644 --- a/alphabase/spectral_library/translate.py +++ b/alphabase/spectral_library/translate.py @@ -93,8 +93,8 @@ def merge_precursor_fragment_df( frag_mass_head:str='FragmentMz', frag_inten_head:str='RelativeIntensity', frag_charge_head:str='FragmentCharge', + frag_series_head:str='FragmentNumber', frag_loss_head:str='FragmentLossType', - frag_num_head:str='FragmentNumber', verbose=True, ): ''' @@ -143,8 +143,8 @@ def merge_precursor_fragment_df( df[frag_mass_head] = frag_mass_list df[frag_inten_head] = frag_inten_list df[frag_charge_head] = frag_charge_list + df[frag_series_head] = frag_num_list df[frag_loss_head] = frag_loss_list - df[frag_num_head] = frag_num_list return explode_multiple_columns(df, [ @@ -152,8 +152,8 @@ def merge_precursor_fragment_df( frag_mass_head, 
frag_inten_head, frag_charge_head, + frag_series_head, frag_loss_head, - frag_num_head ] ) @@ -241,7 +241,7 @@ def speclib_to_single_df( frag_inten_head:str='RelativeIntensity', frag_charge_head:str='FragmentCharge', frag_loss_head:str='FragmentLossType', - frag_num_head:str='FragmentNumber', + frag_series_head:str='FragmentNumber', verbose = True, )->pd.DataFrame: ''' @@ -353,7 +353,7 @@ def speclib_to_single_df( frag_inten_head=frag_inten_head, frag_charge_head=frag_charge_head, frag_loss_head=frag_loss_head, - frag_num_head=frag_num_head, + frag_series_head=frag_series_head, verbose=verbose ) df = df[df['RelativeIntensity']>min_frag_intensity] @@ -388,7 +388,7 @@ def run(self): while True: df, batch = self.task_queue.get() if df is None: break - df.to_csv(self.tsv, header=(batch==0), sep="\t", mode="a", index=False) + df.to_csv(self.tsv, header=(batch==0), sep="\t", mode="a", index=False, lineterminator="\n") def translate_to_tsv( speclib:SpecLibBase, @@ -443,7 +443,7 @@ def translate_to_tsv( if multiprocessing: df_head_queue.put((df, i)) else: - df.to_csv(tsv, header=(i==0), sep="\t", mode='a', index=False) + df.to_csv(tsv, header=(i==0), sep="\t", mode='a', index=False, lineterminator="\n") if multiprocessing: df_head_queue.put((None, None)) print("Translation finished, it will take several minutes to export the rest precursors to the tsv file...") diff --git a/alphabase/spectral_library/validate.py b/alphabase/spectral_library/validate.py new file mode 100644 index 00000000..4381daa6 --- /dev/null +++ b/alphabase/spectral_library/validate.py @@ -0,0 +1,228 @@ +import pandas as pd +import numpy as np + +from typing import Union, List + +class Column(): + def __init__( + self, + name : str, + type : Union[str, type, np.dtype], + allow_NaN : bool = False, + allow_inf : bool = False + ): + """ + Base class for validating a single column. + The column is safely cast to the specified type inplace. + NaN and inf values are checked. 
+ + Parameters + ---------- + + name: str + Name of the column + + type: Union[str, type, np.dtype] + Type of the column + + allow_NaN: bool + If True, allow NaN values + + allow_inf: bool + If True, allow inf values + + Properties + ---------- + + name: str + Name of the column + + type: Union[type, np.dtype] + Type of the column + + """ + + self.name = name + + if isinstance(type, str): + self.type = np.dtype(type) + else: + self.type = type + + self.allow_NaN = allow_NaN + self.allow_inf = allow_inf + + def __call__( + self, + df : pd.DataFrame + ): + """ + Validates the column. + + Parameters + ---------- + + df: pd.DataFrame + Dataframe which contains the column. + + """ + if df[self.name].dtype != self.type: + if np.can_cast(df[self.name].dtype, self.type): + df[self.name] = df[self.name].astype(self.type) + else: + raise ValueError(f"Validation failed: Column {self.name} of type {_get_type_name(df[self.name].dtype)} cannot be cast to {_get_type_name(self.type)}") + + if not self.allow_NaN: + if df[self.name].isna().any(): + raise ValueError(f"Validation failed: Column {self.name} contains NaN values") + + if not self.allow_inf: + if not np.isfinite(df[self.name]).all(): + raise ValueError(f"Validation failed: Column {self.name} contains inf values") + +class Optional(Column): + """ + Optional column to be validated. + If the column is not present in the dataframe, the validation is skipped. + """ + def __init__(self, *args, **kwargs): + """ + Optional column + + Parameters + ---------- + + name: str + Name of the column + + type: type + Type of the column + + """ + + super().__init__( *args, **kwargs) + + + def __call__( + self, + df : pd.DataFrame + ): + """ + Casts the column to the specified type if it is present in the dataframe + + Parameters + ---------- + + df: pd.DataFrame + Dataframe to validate + + """ + + if self.name in df.columns: + super().__call__(df) + +class Required(Column): + """ + Required column to be validated. 
+ If the column is not present in the dataframe, the validation fails. + """ + def __init__(self, *args, **kwargs): + """ + Required column + + Parameters + ---------- + + name: str + Name of the column + + type: type + Type of the column + + """ + super().__init__(*args, **kwargs) + + def __call__( + self, + df : pd.DataFrame + ): + """ + Validates the column if it is present in the dataframe, otherwise raises a ValueError + + Parameters + ---------- + + df: pd.DataFrame + Dataframe to validate + + """ + + if self.name in df.columns: + super().__call__(df) + else: + raise ValueError(f"Validation failed: Column {self.name} is not present in the dataframe") + +class Schema(): + def __init__( + self, + name : str, + properties: List[Column]): + """ + Schema for validating dataframes + + Parameters + ---------- + + name: str + Name of the schema + + properties: list + List of Column objects + + """ + + self.name = name + self.schema = properties + for column in self.schema: + if not isinstance(column, Column): + raise ValueError(f"Schema must contain only Property objects") + + def __call__(self, df): + """ + Validates the dataframe + + Parameters + ---------- + + df: pd.DataFrame + Dataframe to validate + + """ + + for column in self.schema: + column(df) + +def _get_type_name( + type : Union[str, type, np.dtype]) -> str: + """ + Returns the human readable name of the type + + Parameters + ---------- + + type: Union[str, type, np.dtype] + Type to get the name of + + Returns + ------- + + name: str + Human readable name of the type + + """ + if isinstance(type, str): + return type + elif isinstance(type, np.dtype): + return type.name + else: + return type.__name__ \ No newline at end of file diff --git a/alphabase/statistics/regression.py b/alphabase/statistics/regression.py index 18a5b2d9..618a84ad 100644 --- a/alphabase/statistics/regression.py +++ b/alphabase/statistics/regression.py @@ -184,6 +184,12 @@ def fit(self, x: np.ndarray, y: np.ndarray): if len(y.shape) == 1: y = 
y[...,np.newaxis] + + # remove outliers by using only the 0.1 to 99.9 percentile + percentiles = np.percentile(x, [0.1, 99.9]) + mask = (percentiles[0] < x[:,0]) & (x[:,0] < percentiles[1]) + x = x[mask,...] + y = y[mask,...] # === end === sanity checks === @@ -191,6 +197,7 @@ def fit(self, x: np.ndarray, y: np.ndarray): idx_sorted = np.argsort(x.flat) x_sorted = x.flat[idx_sorted] + # stores if uniform training is still possible this round uniform = self.uniform diff --git a/alphabase/utils.py b/alphabase/utils.py index 3f51537b..bd67e805 100644 --- a/alphabase/utils.py +++ b/alphabase/utils.py @@ -3,6 +3,7 @@ import sys import pandas as pd import itertools +import io # from alphatims def process_bar(iterator, len_iter): @@ -31,3 +32,15 @@ def explode_multiple_columns(df:pd.DataFrame, columns:list): for col in columns[1:]: ret_df[col] = _flatten(df[col].values) return ret_df + +def get_delimiter(tsv_file:str): + if isinstance(tsv_file, io.StringIO): + # for unit tests + line = tsv_file.readline().strip() + tsv_file.seek(0) + else: + with open(tsv_file, "r") as f: + line = f.readline().strip() + if '\t' in line: return '\t' + elif ',' in line: return ',' + else: return '\t' diff --git a/docs/_static/css/custom.css b/docs/_static/css/custom.css new file mode 100644 index 00000000..fb2f0aa3 --- /dev/null +++ b/docs/_static/css/custom.css @@ -0,0 +1,12 @@ +.toc-drawer { + width: 20em !important; +} + +.sidebar-container { + width: 20em !important; +} + + +.autosummary.longtable{ + width: 100%; +} \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index ed3f762f..f7df678a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -23,7 +23,7 @@ copyright = '2022, Mann Labs, MPIB' author = 'Mann Labs, MPIB' -release = "1.0.2" +release = "1.0.3" # -- General configuration --------------------------------------------------- @@ -101,6 +101,10 @@ def linkcode_resolve(domain, info): # so a file named "default.css" will overwrite the builtin "default.css". 
html_static_path = ['_static'] +html_css_files = [ + 'css/custom.css', +] + autodoc_default_options = { 'autosummary': True, 'special-members': '__init__', # Include __init__ methods. diff --git a/requirements/requirements_development.txt b/extra_requirements/development.txt similarity index 100% rename from requirements/requirements_development.txt rename to extra_requirements/development.txt diff --git a/nbdev_nbs/peptide/fragment.ipynb b/nbdev_nbs/peptide/fragment.ipynb index f551735b..9f2c186f 100644 --- a/nbdev_nbs/peptide/fragment.ipynb +++ b/nbdev_nbs/peptide/fragment.ipynb @@ -1614,11 +1614,134 @@ "\n", "test_join_left()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "precursor_df = pd.DataFrame([\n", + " {'elution_group_idx': 0, 'frag_start_idx': 0, 'frag_stop_idx': 10, 'decoy': 0},\n", + " {'elution_group_idx': 0, 'frag_start_idx': 10, 'frag_stop_idx': 20, 'decoy': 0},\n", + " {'elution_group_idx': 0, 'frag_start_idx': 20, 'frag_stop_idx': 30, 'decoy': 1},\n", + " {'elution_group_idx': 0, 'frag_start_idx': 30, 'frag_stop_idx': 40, 'decoy': 1},\n", + " {'elution_group_idx': 1, 'frag_start_idx': 40, 'frag_stop_idx': 50, 'decoy': 0},\n", + " {'elution_group_idx': 1, 'frag_start_idx': 50, 'frag_stop_idx': 60, 'decoy': 0},\n", + " {'elution_group_idx': 1, 'frag_start_idx': 60, 'frag_stop_idx': 70, 'decoy': 1},\n", + " {'elution_group_idx': 1, 'frag_start_idx': 70, 'frag_stop_idx': 80, 'decoy': 1},\n", + "])\n", + "\n", + "fragment_mz = np.arange(0,160).reshape(80,2)\n", + "\n", + "fragment_mz[0::2,:] = 0\n", + "\n", + "fragment_df = pd.DataFrame(\n", + " fragment_mz,\n", + " columns=['y1','y2']\n", + ")\n", + "\n", + "cardinality_df = calc_fragment_cardinality(\n", + " precursor_df,\n", + " fragment_df,\n", + " group_column='elution_group_idx',\n", + " split_target_decoy=True\n", + ")\n", + "\n", + "assert np.all(cardinality_df.values[0::2,:]==2)\n", + "assert 
np.all(cardinality_df.values[1::2,:]==1)\n", + "\n", + "cardinality_df = calc_fragment_cardinality(\n", + " precursor_df,\n", + " fragment_df,\n", + " group_column='elution_group_idx',\n", + " split_target_decoy=False\n", + ")\n", + "\n", + "assert np.all(cardinality_df.values[0::2,:]==4)\n", + "assert np.all(cardinality_df.values[1::2,:]==1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "peptides = ['AGHCEWQMKAADER', 'AGHCEWQMKAADER']\n", + "mods = ['Dimethyl:2H(4)@Any N-term', 'Dimethyl@Any N-term']\n", + "sites = ['0','0']\n", + "charge = [2, 2]\n", + "elution_group_idx = [0, 0]\n", + "\n", + "precursor_df = pd.DataFrame({\n", + " 'elution_group_idx': elution_group_idx,\n", + " 'sequence': peptides,\n", + " 'mods': mods,\n", + " 'mod_sites': sites,\n", + " 'charge': charge\n", + "})\n", + "precursor_df['nAA'] = precursor_df['sequence'].str.len()\n", + "precursor_df = update_precursor_mz(precursor_df)\n", + "\n", + "fragment_mz_df = create_fragment_mz_dataframe_by_sort_precursor(\n", + " precursor_df,\n", + " get_charged_frag_types(['b','y'],2)\n", + ")\n", + "fragment_intensity_df = fragment_mz_df.copy()\n", + "fragment_intensity_df[fragment_intensity_df.columns] = np.random.randint(0,11, size=(fragment_mz_df.shape))/10.0\n", + "\n", + "cardinality_df = calc_fragment_cardinality(\n", + " precursor_df,\n", + " fragment_mz_df,\n", + " group_column='elution_group_idx',\n", + " split_target_decoy=False\n", + ")\n", + "\n", + "# flattening the fragments will create a dataframe with one column for each of the following dataframes: fragment_mz_df, fragment_intensity_df, cardinality_df\n", + "# cardinality_df is provided as item in the custom_df dictionary\n", + "\n", + "precursor_new_df, fragment_df = flatten_fragments(\n", + " precursor_df, fragment_mz_df, fragment_intensity_df, \n", + " min_fragment_intensity=-1,keep_top_k_fragments=6,\n", + " custom_columns=['type','position'],\n", + " 
custom_df={'cardinality': cardinality_df}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " mz intensity cardinality type position\n", + "0 265.620114 1.0 1 98 4\n", + "1 358.659770 1.0 1 98 5\n", + "2 948.456741 0.9 2 121 5\n", + "3 410.702720 0.9 2 121 6\n", + "4 680.807368 1.0 1 98 11\n", + "5 304.161545 1.0 2 121 11\n", + "6 157.097154 1.0 1 98 1\n", + "7 712.287157 1.0 1 98 5\n", + "8 474.732009 0.9 2 121 5\n", + "9 550.244230 0.9 1 98 8\n", + "10 1356.582353 0.9 1 98 11\n", + "11 88.063114 1.0 2 121 12\n" + ] + } + ], + "source": [ + "print(fragment_df)" + ] } ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "alpha", "language": "python", "name": "python3" } diff --git a/nbdev_nbs/peptide/precursor.ipynb b/nbdev_nbs/peptide/precursor.ipynb index c55a85b9..290a4763 100644 --- a/nbdev_nbs/peptide/precursor.ipynb +++ b/nbdev_nbs/peptide/precursor.ipynb @@ -10,6 +10,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -26,6 +27,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -418,12 +420,65 @@ "assert get_mod_seq_charge_hash(\"AGHCEWQMKAADER\",'Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M','0;4;8',2) == precursor_df.mod_seq_charge_hash.values[0]" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# test precursor.calc_precursor_isotope_intensity_mp" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "repeat = 2\n", + "peptides = ['AGHCEWQMKAADER']*repeat\n", + "mods = ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat\n", + "sites = ['0;4;8']*repeat\n", + "peptides += ['AGHCEWQMK']*repeat\n", + "mods += ['']*repeat\n", + "sites += ['']*repeat\n", + "\n", + "precursor_df = pd.DataFrame({\n", + " 'sequence': 
peptides,\n", + " 'mods': mods,\n", + " 'mod_sites': sites\n", + "})\n", + "precursor_df['nAA'] = precursor_df['sequence'].str.len()\n", + "precursor_df['charge'] = 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 2/2 [00:02<00:00, 1.07s/it]\n" + ] + } + ], + "source": [ + "precursor_df = calc_precursor_isotope_intensity_mp(precursor_df)\n", + "\n", + "assert all(col in precursor_df.columns for col in ['i_0','i_1','i_2','i_3','i_4','i_5'])\n", + "\n", + "assert np.allclose(\n", + " precursor_df[['i_0','i_1','i_2','i_3','i_4','i_5']].values,\n", + " np.array([[0.504251,0.290763,0.139951,0.048122,0.013660,0.003253],\n", + " [0.504251,0.290763,0.139951,0.048122,0.013660,0.003253],\n", + " [0.360538,0.320501,0.190923,0.085047,0.030905,0.009528],\n", + " [0.360538,0.320501,0.190923,0.085047,0.030905,0.009528]]\n", + " ),\n", + " 0.01\n", + ")" + ] } ], "metadata": { diff --git a/nbdev_nbs/psm_reader/dia_psm_reader.ipynb b/nbdev_nbs/psm_reader/dia_psm_reader.ipynb index 3cf3b794..33e88b26 100644 --- a/nbdev_nbs/psm_reader/dia_psm_reader.ipynb +++ b/nbdev_nbs/psm_reader/dia_psm_reader.ipynb @@ -34,7 +34,20 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/wenfengzeng/workspace/alphabase/alphabase/constants/modification.py:88: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " _df[\"mod_name\"] = MOD_DF.mod_name.str.replace(' ', '_', regex=False)\n" + ] + } + ], "source": [ "from alphabase.psm_reader.dia_psm_reader import *" ] @@ -65,9 +78,14 @@ " 'ccs': 
'CCS',\n", " 'precursor_mz': 'PrecursorMz',\n", " 'mobility': ['Mobility', 'IonMobility', 'PrecursorIonMobility'],\n", - " 'proteins': ['Protein Name', 'ProteinId', 'ProteinID', 'ProteinName'],\n", + " 'proteins': ['Protein Name',\n", + " 'ProteinId',\n", + " 'ProteinID',\n", + " 'ProteinName',\n", + " 'ProteinGroup',\n", + " 'ProteinGroups'],\n", " 'uniprot_ids': ['UniProtIds', 'UniProtID', 'UniprotId'],\n", - " 'genes': ['Genes', 'Gene', 'GeneName']}" + " 'genes': ['Genes', 'Gene', 'GeneName', 'GeneNames']}" ] }, "execution_count": null, @@ -535,7 +553,7 @@ " 0.843331\n", " 0.006937\n", " 11190\n", - " Acetyl@Any N-term;Oxidation@M\n", + " Acetyl@Any_N-term;Oxidation@M\n", " 0;12\n", " 14\n", " 0.372721\n", @@ -556,7 +574,7 @@ " 0.951820\n", " 0.001225\n", " 11238\n", - " Acetyl@Any N-term;Oxidation@M\n", + " Acetyl@Any_N-term;Oxidation@M\n", " 0;12\n", " 14\n", " 0.374635\n", @@ -577,7 +595,7 @@ " 0.999997\n", " 0.000040\n", " 30052\n", - " Acetyl@Any N-term\n", + " Acetyl@Any_N-term\n", " 0\n", " 17\n", " 1.000000\n", @@ -598,7 +616,7 @@ " 0.995505\n", " 0.000184\n", " 30028\n", - " Acetyl@Any N-term\n", + " Acetyl@Any_N-term\n", " 0\n", " 17\n", " 1.000000\n", @@ -619,7 +637,7 @@ " 0.997286\n", " 0.000185\n", " 30004\n", - " Acetyl@Any N-term\n", + " Acetyl@Any_N-term\n", " 0\n", " 17\n", " 1.000000\n", @@ -640,7 +658,7 @@ " 0.996593\n", " 0.000153\n", " 29980\n", - " Acetyl@Any N-term\n", + " Acetyl@Any_N-term\n", " 0\n", " 17\n", " 1.000000\n", @@ -661,7 +679,7 @@ " 0.999999\n", " 0.000040\n", " 22186\n", - " Acetyl@Any N-term\n", + " Acetyl@Any_N-term\n", " 0\n", " 21\n", " 0.738374\n", @@ -682,7 +700,7 @@ " 0.999996\n", " 0.000050\n", " 22090\n", - " Acetyl@Any N-term\n", + " Acetyl@Any_N-term\n", " 0\n", " 21\n", " 0.735775\n", @@ -703,7 +721,7 @@ " 0.999999\n", " 0.000061\n", " 22066\n", - " Acetyl@Any N-term\n", + " Acetyl@Any_N-term\n", " 0\n", " 21\n", " 0.735576\n", @@ -724,7 +742,7 @@ " 0.999997\n", " 0.000044\n", " 21946\n", - " Acetyl@Any 
N-term\n", + " Acetyl@Any_N-term\n", " 0\n", " 21\n", " 0.732136\n", @@ -869,16 +887,16 @@ "13 MECP2 10957 0.971834 0.000604 10956 \n", "\n", " mods mod_sites nAA rt_norm precursor_mz \\\n", - "0 Acetyl@Any N-term;Oxidation@M 0;12 14 0.372721 650.819344 \n", - "1 Acetyl@Any N-term;Oxidation@M 0;12 14 0.374635 650.819344 \n", - "2 Acetyl@Any N-term 0 17 1.000000 834.428635 \n", - "3 Acetyl@Any N-term 0 17 1.000000 834.428635 \n", - "4 Acetyl@Any N-term 0 17 1.000000 834.428635 \n", - "5 Acetyl@Any N-term 0 17 1.000000 834.428635 \n", - "6 Acetyl@Any N-term 0 21 0.738374 895.991600 \n", - "7 Acetyl@Any N-term 0 21 0.735775 895.991600 \n", - "8 Acetyl@Any N-term 0 21 0.735576 895.991600 \n", - "9 Acetyl@Any N-term 0 21 0.732136 895.991600 \n", + "0 Acetyl@Any_N-term;Oxidation@M 0;12 14 0.372721 650.819344 \n", + "1 Acetyl@Any_N-term;Oxidation@M 0;12 14 0.374635 650.819344 \n", + "2 Acetyl@Any_N-term 0 17 1.000000 834.428635 \n", + "3 Acetyl@Any_N-term 0 17 1.000000 834.428635 \n", + "4 Acetyl@Any_N-term 0 17 1.000000 834.428635 \n", + "5 Acetyl@Any_N-term 0 17 1.000000 834.428635 \n", + "6 Acetyl@Any_N-term 0 21 0.738374 895.991600 \n", + "7 Acetyl@Any_N-term 0 21 0.735775 895.991600 \n", + "8 Acetyl@Any_N-term 0 21 0.735576 895.991600 \n", + "9 Acetyl@Any_N-term 0 21 0.732136 895.991600 \n", "10 23 0.368908 695.666290 \n", "11 23 0.367626 695.666290 \n", "12 23 0.366309 695.666290 \n", @@ -927,8 +945,18 @@ "_df = diann_reader.import_file(tsv)\n", "assert 'ccs' in diann_reader.psm_df.columns\n", "assert len(diann_reader.psm_df) == 14\n", - "assert np.sum(diann_reader.psm_df.mods.str.contains('Acetyl@Protein N-term')|diann_reader.psm_df.mods.str.contains('Acetyl@Any N-term')) == 10\n", - "assert np.sum(~diann_reader.psm_df.mods.str.contains('Acetyl@Protein N-term')&~diann_reader.psm_df.mods.str.contains('Acetyl@Any N-term')) == 4\n", + "assert np.sum(\n", + " diann_reader.psm_df.mods.str.contains('Acetyl@Protein N-term')|\n", + " 
diann_reader.psm_df.mods.str.contains('Acetyl@Any N-term')|\n", + " diann_reader.psm_df.mods.str.contains('Acetyl@Protein_N-term')|\n", + " diann_reader.psm_df.mods.str.contains('Acetyl@Any_N-term')\n", + ") == 10\n", + "assert np.sum(\n", + " ~diann_reader.psm_df.mods.str.contains('Acetyl@Protein N-term')&\n", + " ~diann_reader.psm_df.mods.str.contains('Acetyl@Any N-term')&\n", + " ~diann_reader.psm_df.mods.str.contains('Acetyl@Protein_N-term')&\n", + " ~diann_reader.psm_df.mods.str.contains('Acetyl@Any_N-term')\n", + ") == 4\n", "assert np.sum(diann_reader.psm_df.mods.str.contains('Oxidation@M')) == 2\n", "assert np.all(np.array(diann_reader.modification_mapping['Phospho@S'])==np.array([\n", " 'S(Phospho (S))',\n", @@ -973,8 +1001,8 @@ "\n", "assert 'ccs' in diann_reader.psm_df.columns\n", "assert len(diann_reader.psm_df) == 14\n", - "assert np.sum(diann_reader.psm_df.mods.str.contains('Acetyl@Any N-term')) == 10\n", - "assert np.sum(~diann_reader.psm_df.mods.str.contains('Acetyl@Any N-term')) == 4\n", + "assert np.sum(diann_reader.psm_df.mods.str.contains('Acetyl@Any_N-term')) == 10\n", + "assert np.sum(~diann_reader.psm_df.mods.str.contains('Acetyl@Any_N-term')) == 4\n", "assert np.sum(diann_reader.psm_df.mods.str.contains('Oxidation@M')) == 2\n", "assert np.all(np.array(diann_reader.modification_mapping['Phospho@S'])==np.array([\n", " 'S(Phospho (S))',\n", @@ -999,7 +1027,7 @@ { "data": { "text/plain": [ - "'Acetyl@Any N-term'" + "'Acetyl@Any_N-term'" ] }, "execution_count": null, @@ -1011,6 +1039,13 @@ "diann_reader.rev_mod_mapping['(UniMod:1)']" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -1021,7 +1056,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.8.3 ('base')", + "display_name": "alpha", "language": "python", "name": "python3" } diff --git a/nbdev_nbs/scoring/fdr.ipynb b/nbdev_nbs/scoring/fdr.ipynb index 
20dc6383..87d2435f 100644 --- a/nbdev_nbs/scoring/fdr.ipynb +++ b/nbdev_nbs/scoring/fdr.ipynb @@ -38,167 +38,33 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
scoredecoykindfdr
41720.9867510True0.000000
48620.9588030True0.000000
4620.9542440True0.000000
13120.8324400True0.000000
23620.8095950True0.000000
...............
11110.0463660False0.504008
7090.0408411False0.504505
12090.0308410False0.504505
9390.0137041False0.505000
14390.0037040False0.505000
\n", - "

1505 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " score decoy kind fdr\n", - "417 20.986751 0 True 0.000000\n", - "486 20.958803 0 True 0.000000\n", - "46 20.954244 0 True 0.000000\n", - "131 20.832440 0 True 0.000000\n", - "236 20.809595 0 True 0.000000\n", - "... ... ... ... ...\n", - "1111 0.046366 0 False 0.504008\n", - "709 0.040841 1 False 0.504505\n", - "1209 0.030841 0 False 0.504505\n", - "939 0.013704 1 False 0.505000\n", - "1439 0.003704 0 False 0.505000\n", - "\n", - "[1505 rows x 4 columns]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "df = pd.DataFrame(\n", - " {\n", - " 'score': np.random.random(500)*10+11,\n", - " 'decoy': 0,\n", - " 'kind': True,\n", - " }\n", - ")\n", + "\n", "f_score = np.random.random(500)*9.9\n", - "df = df.append(\n", + "\n", + "df = pd.concat([\n", + " pd.DataFrame(\n", + " {\n", + " 'score': np.random.random(500)*10+11,\n", + " 'decoy': 0,\n", + " 'kind': True,\n", + " }\n", + " ),\n", " pd.DataFrame(\n", " {\n", " 'score': f_score+0.01,\n", " 'decoy': 1,\n", " 'kind': False\n", " }\n", - " )\n", - ")\n", - "df = df.append(\n", + " ),\n", " pd.DataFrame(\n", " {\n", " 'score': f_score,\n", " 'decoy': 0,\n", " 'kind': False\n", " }\n", - " )\n", - ")\n", - "df = df.append(\n", + " ),\n", " pd.DataFrame(\n", " {\n", " 'score': np.random.random(5)+10,\n", @@ -206,7 +72,7 @@ " 'kind': False\n", " }\n", " )\n", - ")\n", + "])\n", "\n", "df = calculate_fdr(df, 'score', 'decoy')\n", "df" @@ -216,139 +82,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
scoredecoykindfdr
41720.9867510True0.0
48620.9588030True0.0
4620.9542440True0.0
13120.8324400True0.0
23620.8095950True0.0
...............
31311.0706950True0.0
22711.0284310True0.0
15311.0143300True0.0
11311.0139780True0.0
4811.0106290True0.0
\n", - "

500 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " score decoy kind fdr\n", - "417 20.986751 0 True 0.0\n", - "486 20.958803 0 True 0.0\n", - "46 20.954244 0 True 0.0\n", - "131 20.832440 0 True 0.0\n", - "236 20.809595 0 True 0.0\n", - ".. ... ... ... ...\n", - "313 11.070695 0 True 0.0\n", - "227 11.028431 0 True 0.0\n", - "153 11.014330 0 True 0.0\n", - "113 11.013978 0 True 0.0\n", - "48 11.010629 0 True 0.0\n", - "\n", - "[500 rows x 4 columns]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df[(df.fdr < 0.01)&(df.decoy==0)]" ] @@ -370,37 +104,36 @@ "outputs": [], "source": [ "#| hide\n", - "dff = pd.DataFrame(\n", - " {\n", - " 'score': np.random.random(500)*10+11,\n", - " 'decoy': 0\n", - " }\n", - ")\n", + "\n", "f_score = np.random.random(500)*9.9\n", - "dff = dff.append(\n", + "\n", + "dff = pd.concat([\n", + " pd.DataFrame(\n", + " {\n", + " 'score': np.random.random(500)*10+11,\n", + " 'decoy': 0\n", + " }\n", + " ),\n", " pd.DataFrame(\n", " {\n", " 'score': f_score+0.01,\n", " 'decoy': 1\n", " }\n", - " )\n", - ")\n", - "dff = dff.append(\n", + " ),\n", " pd.DataFrame(\n", " {\n", " 'score': f_score,\n", " 'decoy': 0\n", " }\n", - " )\n", - ")\n", - "dff = dff.append(\n", + " ),\n", " pd.DataFrame(\n", " {\n", " 'score': np.random.random(5)+10,\n", " 'decoy': 1\n", " }\n", " )\n", - ")\n", + "\n", + "])\n", "\n", "dff['fdr'] = fdr_from_ref(dff.score.values, df.score.values, df.fdr.values)\n", "\n", @@ -417,13 +150,6 @@ "dff = calculate_fdr_from_ref(dff, df.score.values, df.fdr.values, 'score')\n", "assert len(dff[(dff.fdr < 0.01)&(dff.decoy==0)]) == 500" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/nbdev_nbs/spectral_library/flat_library.ipynb b/nbdev_nbs/spectral_library/flat_library.ipynb index 08964e05..55404f6a 100644 --- a/nbdev_nbs/spectral_library/flat_library.ipynb +++ 
b/nbdev_nbs/spectral_library/flat_library.ipynb @@ -26,7 +26,7 @@ "source": [ "#| hide\n", "from io import StringIO\n", - "from alphabase.spectral_library.reader import SWATHLibraryReader" + "from alphabase.spectral_library.reader import LibraryReaderBase" ] }, { @@ -413,7 +413,7 @@ "2\t_YSLS[Phospho (STY)]PSK_\tYSLSPSK\t-6.428198\t_YSLS[Phospho (STY)]PSK_\t431.1913264\t1(+H2+O)1(+H3+O4+P)\t6\ty\t1\t582.3245847\t5.233977\t0.9\n", "\"\"\"\n", "\n", - "reader = SWATHLibraryReader()\n", + "reader = LibraryReaderBase()\n", "reader.import_file(StringIO(tsv_str))\n", "flat_lib = SpecLibFlat(custom_fragment_df_columns=['type'])\n", "flat_lib.parse_base_library(reader, keep_original_frag_dfs=True)\n", diff --git a/nbdev_nbs/spectral_library/library_base.ipynb b/nbdev_nbs/spectral_library/library_base.ipynb index 4b156633..c1e806ec 100644 --- a/nbdev_nbs/spectral_library/library_base.ipynb +++ b/nbdev_nbs/spectral_library/library_base.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -10,6 +10,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -18,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -26,6 +27,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -34,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -44,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -60,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -103,8 +105,8 @@ " 9\n", " 2\n", " 602.747333\n", - " -5783464648586361190\n", - " -5783464648586361188\n", + " 15612371025470493168\n", + " 15612371025470493170\n", " 
\n", " \n", " 1\n", @@ -114,8 +116,8 @@ " 9\n", " 2\n", " 602.747333\n", - " -5783464648586361190\n", - " -5783464648586361188\n", + " 15612371025470493168\n", + " 15612371025470493170\n", " \n", " \n", " 2\n", @@ -125,8 +127,8 @@ " 9\n", " 2\n", " 602.747333\n", - " -5783464648586361190\n", - " -5783464648586361188\n", + " 15612371025470493168\n", + " 15612371025470493170\n", " \n", " \n", " 3\n", @@ -136,8 +138,8 @@ " 14\n", " 2\n", " 816.356299\n", - " -1606275412423975023\n", - " -1606275412423975021\n", + " 6831658824673244135\n", + " 6831658824673244137\n", " \n", " \n", " 4\n", @@ -147,8 +149,8 @@ " 14\n", " 2\n", " 816.356299\n", - " -1606275412423975023\n", - " -1606275412423975021\n", + " 6831658824673244135\n", + " 6831658824673244137\n", " \n", " \n", " 5\n", @@ -158,8 +160,8 @@ " 14\n", " 2\n", " 816.356299\n", - " -1606275412423975023\n", - " -1606275412423975021\n", + " 6831658824673244135\n", + " 6831658824673244137\n", " \n", " \n", "\n", @@ -174,24 +176,24 @@ "4 AGHCEWQMKAADER \n", "5 AGHCEWQMKAADER \n", "\n", - " mod_sites nAA charge precursor_mz mod_seq_hash \\\n", - "0 0;4;8 9 2 602.747333 -5783464648586361190 \n", - "1 0;4;8 9 2 602.747333 -5783464648586361190 \n", - "2 0;4;8 9 2 602.747333 -5783464648586361190 \n", - "3 14 2 816.356299 -1606275412423975023 \n", - "4 14 2 816.356299 -1606275412423975023 \n", - "5 14 2 816.356299 -1606275412423975023 \n", + " mod_sites nAA charge precursor_mz mod_seq_hash \\\n", + "0 0;4;8 9 2 602.747333 15612371025470493168 \n", + "1 0;4;8 9 2 602.747333 15612371025470493168 \n", + "2 0;4;8 9 2 602.747333 15612371025470493168 \n", + "3 14 2 816.356299 6831658824673244135 \n", + "4 14 2 816.356299 6831658824673244135 \n", + "5 14 2 816.356299 6831658824673244135 \n", "\n", - " mod_seq_charge_hash \n", - "0 -5783464648586361188 \n", - "1 -5783464648586361188 \n", - "2 -5783464648586361188 \n", - "3 -1606275412423975021 \n", - "4 -1606275412423975021 \n", - "5 -1606275412423975021 " + " mod_seq_charge_hash \n", + 
"0 15612371025470493170 \n", + "1 15612371025470493170 \n", + "2 15612371025470493170 \n", + "3 6831658824673244137 \n", + "4 6831658824673244137 \n", + "5 6831658824673244137 " ] }, - "execution_count": null, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -258,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -302,8 +304,8 @@ " 9\n", " 2\n", " 602.747333\n", - " -5783464648586361190\n", - " -5783464648586361188\n", + " 15612371025470493168\n", + " 15612371025470493170\n", " 0\n", " \n", " \n", @@ -314,8 +316,8 @@ " 9\n", " 2\n", " 602.747333\n", - " -5783464648586361190\n", - " -5783464648586361188\n", + " 15612371025470493168\n", + " 15612371025470493170\n", " 0\n", " \n", " \n", @@ -326,8 +328,8 @@ " 9\n", " 2\n", " 602.747333\n", - " -5783464648586361190\n", - " -5783464648586361188\n", + " 15612371025470493168\n", + " 15612371025470493170\n", " 0\n", " \n", " \n", @@ -338,8 +340,8 @@ " 9\n", " 2\n", " 602.747333\n", - " -5783464648586361190\n", - " -5783464648586361188\n", + " 15612371025470493168\n", + " 15612371025470493170\n", " 1\n", " \n", " \n", @@ -350,8 +352,8 @@ " 9\n", " 2\n", " 602.747333\n", - " -5783464648586361190\n", - " -5783464648586361188\n", + " 15612371025470493168\n", + " 15612371025470493170\n", " 1\n", " \n", " \n", @@ -362,8 +364,8 @@ " 9\n", " 2\n", " 602.747333\n", - " -5783464648586361190\n", - " -5783464648586361188\n", + " 15612371025470493168\n", + " 15612371025470493170\n", " 1\n", " \n", " \n", @@ -374,8 +376,8 @@ " 14\n", " 2\n", " 816.356299\n", - " -1606275412423975023\n", - " -1606275412423975021\n", + " 6831658824673244135\n", + " 6831658824673244137\n", " 0\n", " \n", " \n", @@ -386,8 +388,8 @@ " 14\n", " 2\n", " 816.356299\n", - " -1606275412423975023\n", - " -1606275412423975021\n", + " 6831658824673244135\n", + " 6831658824673244137\n", " 0\n", " \n", " \n", @@ -398,8 +400,8 @@ " 14\n", " 2\n", " 816.356299\n", - " 
-1606275412423975023\n", - " -1606275412423975021\n", + " 6831658824673244135\n", + " 6831658824673244137\n", " 0\n", " \n", " \n", @@ -410,8 +412,8 @@ " 14\n", " 2\n", " 816.356299\n", - " -1606275412423975023\n", - " -1606275412423975021\n", + " 6831658824673244135\n", + " 6831658824673244137\n", " 1\n", " \n", " \n", @@ -422,8 +424,8 @@ " 14\n", " 2\n", " 816.356299\n", - " -1606275412423975023\n", - " -1606275412423975021\n", + " 6831658824673244135\n", + " 6831658824673244137\n", " 1\n", " \n", " \n", @@ -434,8 +436,8 @@ " 14\n", " 2\n", " 816.356299\n", - " -1606275412423975023\n", - " -1606275412423975021\n", + " 6831658824673244135\n", + " 6831658824673244137\n", " 1\n", " \n", " \n", @@ -457,36 +459,36 @@ "10 EDAAKMQWECHGAR \n", "11 EDAAKMQWECHGAR \n", "\n", - " mod_sites nAA charge precursor_mz mod_seq_hash \\\n", - "0 0;4;8 9 2 602.747333 -5783464648586361190 \n", - "1 0;4;8 9 2 602.747333 -5783464648586361190 \n", - "2 0;4;8 9 2 602.747333 -5783464648586361190 \n", - "3 0;4;8 9 2 602.747333 -5783464648586361190 \n", - "4 0;4;8 9 2 602.747333 -5783464648586361190 \n", - "5 0;4;8 9 2 602.747333 -5783464648586361190 \n", - "6 14 2 816.356299 -1606275412423975023 \n", - "7 14 2 816.356299 -1606275412423975023 \n", - "8 14 2 816.356299 -1606275412423975023 \n", - "9 14 2 816.356299 -1606275412423975023 \n", - "10 14 2 816.356299 -1606275412423975023 \n", - "11 14 2 816.356299 -1606275412423975023 \n", + " mod_sites nAA charge precursor_mz mod_seq_hash \\\n", + "0 0;4;8 9 2 602.747333 15612371025470493168 \n", + "1 0;4;8 9 2 602.747333 15612371025470493168 \n", + "2 0;4;8 9 2 602.747333 15612371025470493168 \n", + "3 0;4;8 9 2 602.747333 15612371025470493168 \n", + "4 0;4;8 9 2 602.747333 15612371025470493168 \n", + "5 0;4;8 9 2 602.747333 15612371025470493168 \n", + "6 14 2 816.356299 6831658824673244135 \n", + "7 14 2 816.356299 6831658824673244135 \n", + "8 14 2 816.356299 6831658824673244135 \n", + "9 14 2 816.356299 6831658824673244135 \n", + "10 14 2 
816.356299 6831658824673244135 \n", + "11 14 2 816.356299 6831658824673244135 \n", "\n", - " mod_seq_charge_hash decoy \n", - "0 -5783464648586361188 0 \n", - "1 -5783464648586361188 0 \n", - "2 -5783464648586361188 0 \n", - "3 -5783464648586361188 1 \n", - "4 -5783464648586361188 1 \n", - "5 -5783464648586361188 1 \n", - "6 -1606275412423975021 0 \n", - "7 -1606275412423975021 0 \n", - "8 -1606275412423975021 0 \n", - "9 -1606275412423975021 1 \n", - "10 -1606275412423975021 1 \n", - "11 -1606275412423975021 1 " + " mod_seq_charge_hash decoy \n", + "0 15612371025470493170 0 \n", + "1 15612371025470493170 0 \n", + "2 15612371025470493170 0 \n", + "3 15612371025470493170 1 \n", + "4 15612371025470493170 1 \n", + "5 15612371025470493170 1 \n", + "6 6831658824673244137 0 \n", + "7 6831658824673244137 0 \n", + "8 6831658824673244137 0 \n", + "9 6831658824673244137 1 \n", + "10 6831658824673244137 1 \n", + "11 6831658824673244137 1 " ] }, - "execution_count": null, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -499,7 +501,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -534,36 +536,36 @@ " 0\n", " 114.054955\n", " 57.531116\n", - " 1091.439712\n", - " 546.223494\n", + " 1091.439697\n", + " 546.223511\n", " \n", " \n", " 1\n", - " 171.076419\n", - " 86.041848\n", - " 1034.418248\n", - " 517.712762\n", + " 171.076416\n", + " 86.041847\n", + " 1034.418213\n", + " 517.712769\n", " \n", " \n", " 2\n", - " 308.135331\n", - " 154.571303\n", - " 897.359336\n", - " 449.183306\n", + " 308.135345\n", + " 154.571304\n", + " 897.359314\n", + " 449.183319\n", " \n", " \n", " 3\n", - " 468.165979\n", - " 234.586628\n", - " 737.328687\n", - " 369.167982\n", + " 468.165985\n", + " 234.586624\n", + " 737.328674\n", + " 369.167969\n", " \n", " \n", " 4\n", - " 597.208572\n", - " 299.107924\n", - " 608.286094\n", - " 304.646685\n", + " 597.208557\n", + " 299.107910\n", + " 608.286072\n", + " 
304.646698\n", " \n", " \n", " ...\n", @@ -574,38 +576,38 @@ " \n", " \n", " 121\n", - " 1089.466972\n", - " 545.237124\n", - " 543.245626\n", - " 272.126451\n", + " 1089.466919\n", + " 545.237122\n", + " 543.245605\n", + " 272.126465\n", " \n", " \n", " 122\n", - " 1192.476157\n", - " 596.741717\n", - " 440.236442\n", - " 220.621859\n", + " 1192.476196\n", + " 596.741699\n", + " 440.236450\n", + " 220.621857\n", " \n", " \n", " 123\n", - " 1329.535069\n", - " 665.271173\n", - " 303.177530\n", - " 152.092403\n", + " 1329.535034\n", + " 665.271179\n", + " 303.177521\n", + " 152.092407\n", " \n", " \n", " 124\n", - " 1386.556532\n", - " 693.781904\n", - " 246.156066\n", - " 123.581671\n", + " 1386.556519\n", + " 693.781921\n", + " 246.156067\n", + " 123.581673\n", " \n", " \n", " 125\n", - " 1457.593646\n", - " 729.300461\n", - " 175.118952\n", - " 88.063114\n", + " 1457.593628\n", + " 729.300476\n", + " 175.118958\n", + " 88.063118\n", " \n", " \n", "\n", @@ -614,22 +616,22 @@ ], "text/plain": [ " b_z1 b_z2 y_z1 y_z2\n", - "0 114.054955 57.531116 1091.439712 546.223494\n", - "1 171.076419 86.041848 1034.418248 517.712762\n", - "2 308.135331 154.571303 897.359336 449.183306\n", - "3 468.165979 234.586628 737.328687 369.167982\n", - "4 597.208572 299.107924 608.286094 304.646685\n", + "0 114.054955 57.531116 1091.439697 546.223511\n", + "1 171.076416 86.041847 1034.418213 517.712769\n", + "2 308.135345 154.571304 897.359314 449.183319\n", + "3 468.165985 234.586624 737.328674 369.167969\n", + "4 597.208557 299.107910 608.286072 304.646698\n", ".. ... ... ... 
...\n", - "121 1089.466972 545.237124 543.245626 272.126451\n", - "122 1192.476157 596.741717 440.236442 220.621859\n", - "123 1329.535069 665.271173 303.177530 152.092403\n", - "124 1386.556532 693.781904 246.156066 123.581671\n", - "125 1457.593646 729.300461 175.118952 88.063114\n", + "121 1089.466919 545.237122 543.245605 272.126465\n", + "122 1192.476196 596.741699 440.236450 220.621857\n", + "123 1329.535034 665.271179 303.177521 152.092407\n", + "124 1386.556519 693.781921 246.156067 123.581673\n", + "125 1457.593628 729.300476 175.118958 88.063118\n", "\n", "[126 rows x 4 columns]" ] }, - "execution_count": null, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -643,20 +645,9 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-12-21 13:45:10> Speclib with 4 precursors will be reannotated with speclib with 12 precursors and 504 fragments\n", - "2022-12-21 13:45:11> A total of 4 precursors were succesfully annotated, 0 precursors were not matched\n", - "2022-12-21 13:45:11> Speclib with 4 precursors will be reannotated with speclib with 12 precursors and 504 fragments\n", - "2022-12-21 13:45:11> A total of 4 precursors were succesfully annotated, 0 precursors were not matched\n" - ] - } - ], + "outputs": [], "source": [ "\n", "repeat = 3\n", @@ -709,18 +700,9 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-12-21 13:45:11> Speclib with 12 precursors will be reannotated with speclib with 12 precursors and 504 fragments\n", - "2022-12-21 13:45:11> A total of 12 precursors were succesfully annotated, 0 precursors were not matched\n" - ] - } - ], + "outputs": [], "source": [ "\n", "repeat = 3\n", @@ -752,13 +734,208 @@ "\n", "empty_lib = 
annotate_fragments_from_speclib(empty_lib, fragment_lib)" ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# test SpecLibBase.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "repeat = 3\n", + "peptides = ['AGHCEWQMK']*repeat\n", + "mods = ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat\n", + "sites = ['0;4;8']*repeat\n", + "peptides += ['AGHCEWQMKAADER']*repeat\n", + "mods += ['']*repeat\n", + "sites += ['']*repeat\n", + "\n", + "precursor_df = pd.DataFrame({\n", + " 'sequence': peptides,\n", + " 'mods': mods,\n", + " 'mod_sites': sites\n", + "})\n", + "precursor_df['nAA'] = precursor_df['sequence'].str.len()\n", + "precursor_df['charge'] = 2\n", + "spec_lib = SpecLibBase(\n", + " ['b_z1','b_z2','y_z1','y_z2'],\n", + " decoy='pseudo_reverse'\n", + ")\n", + "\n", + "spec_lib._precursor_df = precursor_df\n", + "spec_lib.calc_precursor_mz()\n", + "spec_lib.append_decoy_sequence()\n", + "spec_lib.calc_fragment_mz_df()\n", + "\n", + "spec_lib_copy = spec_lib.copy()\n", + "\n", + "assert len(spec_lib_copy.precursor_df) == len(spec_lib.precursor_df)\n", + "assert len(spec_lib_copy.fragment_mz_df) == len(spec_lib.fragment_mz_df)\n", + "assert len(spec_lib_copy.fragment_intensity_df) == len(spec_lib.fragment_intensity_df)\n", + "\n", + "spec_lib._precursor_df['precursor_mz'] = 0\n", + "assert all(spec_lib_copy._precursor_df['precursor_mz'] != 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "lib1 = spec_lib.copy()\n", + "lib2 = spec_lib.copy()\n", + "lib1.append(lib2)\n", + "assert len(lib1.precursor_df) == len(spec_lib.precursor_df)*2\n", + "assert len(lib1.fragment_mz_df) == len(spec_lib.fragment_mz_df)*2\n", + "assert len(lib1.fragment_intensity_df) == len(spec_lib.fragment_intensity_df)*2\n", + "\n", + "for i, (frag_start, frag_stop) in 
enumerate(zip(lib1._precursor_df['frag_start_idx'].values, lib1._precursor_df['frag_stop_idx'].values)):\n", + " assert frag_start < frag_stop\n", + " fragments = lib1.fragment_mz_df.iloc[frag_start:frag_stop]\n", + " assert len(fragments) == lib1._precursor_df.iloc[i]['nAA']-1" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The columns are not compatible. {'y_z1', 'b_z1', 'b_z2', 'y_z2'} are missing in the dataframe which should be appended.\n" + ] + } + ], + "source": [ + "lib1 = spec_lib.copy()\n", + "lib2 = spec_lib.copy()\n", + "lib2._fragment_mz_df = pd.DataFrame()\n", + "\n", + "error = False\n", + "try:\n", + " lib1.append(lib2)\n", + "except ValueError as e:\n", + " print(e)\n", + " error = True\n", + "assert error" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The columns are not compatible. 
{'sequence'} are missing in the dataframe which should be appended.\n" + ] + } + ], + "source": [ + "lib1 = spec_lib.copy()\n", + "lib2 = spec_lib.copy()\n", + "lib2._precursor_df.drop('sequence', axis=1, inplace=True)\n", + "\n", + "error = False\n", + "try:\n", + " lib1.append(lib2)\n", + "except ValueError as e:\n", + " print(e)\n", + " error = True\n", + "assert error" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/georgwallmann/Documents/git/alphabase/alphabase/spectral_library/base.py:190: UserWarning: Unmatched columns in second dataframe will be dropped: {'sequence'}.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "lib1 = spec_lib.copy()\n", + "lib2 = spec_lib.copy()\n", + "lib1._precursor_df.drop('sequence', axis=1, inplace=True)\n", + "lib1.append(lib2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "repeat = 3\n", + "peptides = ['AGHCEWQMK']*repeat\n", + "mods = ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat\n", + "sites = ['0;4;8']*repeat\n", + "peptides += ['AGHCEWQMKAADER']*repeat\n", + "mods += ['']*repeat\n", + "sites += ['']*repeat\n", + "\n", + "spec_lib = SpecLibBase()\n", + "spec_lib._precursor_df = pd.DataFrame({\n", + " 'sequence': peptides,\n", + " 'mods': mods,\n", + " 'mod_sites': sites\n", + "})\n", + "spec_lib._precursor_df['charge'] = 2\n", + "spec_lib.calc_precursor_mz()\n", + "spec_lib.calc_fragment_mz_df()\n", + "spec_lib._fragment_intensity_df = spec_lib._fragment_mz_df.copy()\n", + "\n", + "# add random intensity\n", + "for col in spec_lib._fragment_intensity_df.columns:\n", + " spec_lib._fragment_intensity_df[col] = np.random.rand(len(spec_lib._fragment_intensity_df)) * np.random.randint(0,2, len(spec_lib._fragment_intensity_df))\n", + "\n", + "# calculate fragment number\n", + "spec_lib.calc_fragment_count()\n", + "\n", + 
"# set maximum number of fragments to random number\n", + "spec_lib._precursor_df['n_fragments_allowed'] = np.random.randint(0, 10, len(spec_lib._precursor_df))\n", + "spec_lib.filter_fragment_number()\n", + "spec_lib.calc_fragment_count()\n", + "\n", + "assert np.all(spec_lib._precursor_df['n_fragments_allowed'].values >= spec_lib._precursor_df['n_fragments'].values)" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3.8.3 ('base')", + "display_name": "alpha", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" } }, "nbformat": 4, diff --git a/nbdev_nbs/spectral_library/library_reader.ipynb b/nbdev_nbs/spectral_library/library_reader.ipynb index 7ba9f908..38c0972b 100644 --- a/nbdev_nbs/spectral_library/library_reader.ipynb +++ b/nbdev_nbs/spectral_library/library_reader.ipynb @@ -17,10 +17,11 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "SWATH/Spectronaut TSV library reader (`SWATHLibraryReader`)." + "SWATH/Spectronaut TSV library reader (`LibraryReaderBase`)." 
] }, { @@ -68,14 +69,14 @@ " \n", " \n", " \n", - " sequence\n", - " charge\n", - " rt\n", " precursor_mz\n", " mobility\n", " mods\n", - " mod_sites\n", + " sequence\n", + " rt\n", " nAA\n", + " mod_sites\n", + " charge\n", " frag_start_idx\n", " frag_stop_idx\n", " rt_norm\n", @@ -85,114 +86,114 @@ " \n", " \n", " 0\n", - " AVVVSPK\n", - " 2\n", - " -22.849740\n", - " 390.206779\n", + " 379.208161\n", " 0.9\n", - " Phospho@S\n", - " 5\n", + " \n", + " DPLAVDK\n", + " -15.087100\n", " 7\n", + " \n", + " 2\n", " 0\n", " 6\n", - " 0.075327\n", - " 366.858877\n", + " -0.430886\n", + " 367.043100\n", " \n", " \n", " 1\n", - " DPLAVDK\n", - " 2\n", - " -15.087100\n", - " 379.208161\n", + " 390.206779\n", " 0.9\n", - " \n", - " \n", + " Phospho@S\n", + " AVVVSPK\n", + " -22.849740\n", " 7\n", + " 5\n", + " 2\n", " 6\n", " 12\n", - " 0.199375\n", - " 367.043100\n", + " -0.652587\n", + " 366.858877\n", " \n", " \n", " 2\n", - " MGSLDSK\n", - " 2\n", - " -27.563500\n", " 409.161712\n", " 0.9\n", " Phospho@S\n", - " 3\n", + " MGSLDSK\n", + " -27.563500\n", " 7\n", + " 3\n", + " 2\n", " 12\n", " 18\n", - " 0.000000\n", + " -0.787211\n", " 366.564438\n", " \n", " \n", " 3\n", - " SVSFSLK\n", - " 1\n", - " 35.014110\n", - " 847.396112\n", + " 431.167001\n", " 0.9\n", - " Phospho@S\n", - " 3\n", + " Phospho@S;Phospho@S\n", + " VSVSPGR\n", + " -23.930850\n", " 7\n", + " 2;4\n", + " 2\n", " 18\n", " 24\n", - " 1.000000\n", - " 183.178171\n", + " -0.683463\n", + " 366.254833\n", " \n", " \n", " 4\n", - " VSVSPGR\n", - " 2\n", - " -23.930850\n", - " 431.167001\n", + " 431.191326\n", " 0.9\n", - " Phospho@S;Phospho@S\n", - " 2;4\n", + " Phospho@S\n", + " YSLSPSK\n", + " -6.428198\n", " 7\n", + " 4\n", + " 2\n", " 24\n", " 30\n", - " 0.058050\n", - " 366.254833\n", + " -0.183589\n", + " 366.254509\n", " \n", " \n", " 5\n", - " YSLSPSK\n", - " 2\n", - " -6.428198\n", - " 431.191326\n", + " 847.396112\n", " 0.9\n", " Phospho@S\n", - " 4\n", + " SVSFSLK\n", + " 35.014110\n", " 7\n", + " 
3\n", + " 1\n", " 30\n", " 36\n", - " 0.337745\n", - " 366.254509\n", + " 1.000000\n", + " 183.178171\n", " \n", " \n", "\n", "" ], "text/plain": [ - " sequence charge rt precursor_mz mobility mods \\\n", - "0 AVVVSPK 2 -22.849740 390.206779 0.9 Phospho@S \n", - "1 DPLAVDK 2 -15.087100 379.208161 0.9 \n", - "2 MGSLDSK 2 -27.563500 409.161712 0.9 Phospho@S \n", - "3 SVSFSLK 1 35.014110 847.396112 0.9 Phospho@S \n", - "4 VSVSPGR 2 -23.930850 431.167001 0.9 Phospho@S;Phospho@S \n", - "5 YSLSPSK 2 -6.428198 431.191326 0.9 Phospho@S \n", + " precursor_mz mobility mods sequence rt nAA \\\n", + "0 379.208161 0.9 DPLAVDK -15.087100 7 \n", + "1 390.206779 0.9 Phospho@S AVVVSPK -22.849740 7 \n", + "2 409.161712 0.9 Phospho@S MGSLDSK -27.563500 7 \n", + "3 431.167001 0.9 Phospho@S;Phospho@S VSVSPGR -23.930850 7 \n", + "4 431.191326 0.9 Phospho@S YSLSPSK -6.428198 7 \n", + "5 847.396112 0.9 Phospho@S SVSFSLK 35.014110 7 \n", "\n", - " mod_sites nAA frag_start_idx frag_stop_idx rt_norm ccs \n", - "0 5 7 0 6 0.075327 366.858877 \n", - "1 7 6 12 0.199375 367.043100 \n", - "2 3 7 12 18 0.000000 366.564438 \n", - "3 3 7 18 24 1.000000 183.178171 \n", - "4 2;4 7 24 30 0.058050 366.254833 \n", - "5 4 7 30 36 0.337745 366.254509 " + " mod_sites charge frag_start_idx frag_stop_idx rt_norm ccs \n", + "0 2 0 6 -0.430886 367.043100 \n", + "1 5 2 6 12 -0.652587 366.858877 \n", + "2 3 2 12 18 -0.787211 366.564438 \n", + "3 2;4 2 18 24 -0.683463 366.254833 \n", + "4 4 2 24 30 -0.183589 366.254509 \n", + "5 3 1 30 36 1.000000 183.178171 " ] }, "execution_count": null, @@ -252,7 +253,7 @@ "2\t_YSLS[Phospho (STY)]PSK_\tYSLSPSK\t-6.428198\t_YSLS[Phospho (STY)]PSK_\t431.1913264\t1(+H2+O)1(+H3+O4+P)\t6\ty\t1\t582.3245847\t5.233977\t0.9\n", "\"\"\"\n", "\n", - "reader = SWATHLibraryReader()\n", + "reader = LibraryReaderBase()\n", "psm_df = reader.import_file(StringIO(tsv_str))\n", "for col in ['sequence','charge','rt','rt_norm','mods','mod_sites','nAA','frag_start_idx','frag_stop_idx']:\n", " 
assert col in psm_df.columns\n", @@ -301,7 +302,7 @@ " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 0.000000\n", + " 0.515072\n", " 0.000000\n", " 0.0\n", " 0.000000\n", @@ -311,38 +312,38 @@ " 1\n", " 0.000000\n", " 0.000000\n", - " 0.450362\n", + " 0.745664\n", " 0.000000\n", " 0.000000\n", " 0.0\n", - " 1.000000\n", + " 0.000000\n", " 0.0\n", " \n", " \n", " 2\n", + " 0.143703\n", " 0.000000\n", - " 0.000000\n", - " 0.106543\n", + " 1.000000\n", " 0.000000\n", " 0.000000\n", " 0.0\n", - " 0.374123\n", + " 0.000000\n", " 0.0\n", " \n", " \n", " 3\n", + " 0.094888\n", " 0.000000\n", - " 0.000000\n", - " 0.069116\n", + " 0.377585\n", " 0.000000\n", " 0.000000\n", " 0.0\n", - " 0.173858\n", + " 0.000000\n", " 0.0\n", " \n", " \n", " 4\n", - " 0.000000\n", + " 0.054980\n", " 0.000000\n", " 0.000000\n", " 0.000000\n", @@ -367,7 +368,7 @@ " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 0.515072\n", + " 0.000000\n", " 0.000000\n", " 0.0\n", " 0.000000\n", @@ -377,38 +378,38 @@ " 7\n", " 0.000000\n", " 0.000000\n", - " 0.745664\n", + " 0.450362\n", " 0.000000\n", " 0.000000\n", " 0.0\n", - " 0.000000\n", + " 1.000000\n", " 0.0\n", " \n", " \n", " 8\n", - " 0.143703\n", " 0.000000\n", - " 1.000000\n", + " 0.000000\n", + " 0.106543\n", " 0.000000\n", " 0.000000\n", " 0.0\n", - " 0.000000\n", + " 0.374123\n", " 0.0\n", " \n", " \n", " 9\n", - " 0.094888\n", " 0.000000\n", - " 0.377585\n", + " 0.000000\n", + " 0.069116\n", " 0.000000\n", " 0.000000\n", " 0.0\n", - " 0.000000\n", + " 0.173858\n", " 0.0\n", " \n", " \n", " 10\n", - " 0.054980\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", " 0.000000\n", @@ -498,42 +499,42 @@ " 18\n", " 0.000000\n", " 0.000000\n", - " 0.000000\n", + " 0.084908\n", " 0.000000\n", " 0.000000\n", " 0.0\n", - " 0.000000\n", + " 0.328738\n", " 0.0\n", " \n", " \n", " 19\n", " 0.000000\n", " 0.000000\n", - " 0.198974\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", " 0.0\n", - " 0.774316\n", + " 0.284129\n", " 0.0\n", " \n", " \n", " 20\n", 
" 0.000000\n", " 0.000000\n", - " 1.000000\n", + " 0.276969\n", " 0.000000\n", - " 0.649715\n", + " 0.057554\n", " 0.0\n", - " 0.000000\n", + " 0.262853\n", " 0.0\n", " \n", " \n", " 21\n", " 0.000000\n", + " 0.087439\n", + " 1.000000\n", " 0.000000\n", - " 0.882733\n", - " 0.000000\n", - " 0.351781\n", + " 0.061627\n", " 0.0\n", " 0.000000\n", " 0.0\n", @@ -544,18 +545,18 @@ " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 0.400474\n", + " 0.000000\n", " 0.0\n", " 0.000000\n", " 0.0\n", " \n", " \n", " 23\n", - " 0.244350\n", " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 0.631000\n", + " 0.000000\n", + " 0.000000\n", " 0.0\n", " 0.000000\n", " 0.0\n", @@ -564,42 +565,42 @@ " 24\n", " 0.000000\n", " 0.000000\n", - " 0.084908\n", + " 0.602346\n", " 0.000000\n", " 0.000000\n", " 0.0\n", - " 0.328738\n", + " 1.000000\n", " 0.0\n", " \n", " \n", " 25\n", " 0.000000\n", " 0.000000\n", - " 0.000000\n", + " 0.141106\n", " 0.000000\n", " 0.000000\n", " 0.0\n", - " 0.284129\n", + " 0.705295\n", " 0.0\n", " \n", " \n", " 26\n", " 0.000000\n", " 0.000000\n", - " 0.276969\n", + " 0.108914\n", + " 0.000000\n", " 0.000000\n", - " 0.057554\n", " 0.0\n", - " 0.262853\n", + " 0.279959\n", " 0.0\n", " \n", " \n", " 27\n", " 0.000000\n", - " 0.087439\n", - " 1.000000\n", " 0.000000\n", - " 0.061627\n", + " 0.492018\n", + " 0.000000\n", + " 0.000000\n", " 0.0\n", " 0.000000\n", " 0.0\n", @@ -630,42 +631,42 @@ " 30\n", " 0.000000\n", " 0.000000\n", - " 0.602346\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", " 0.0\n", - " 1.000000\n", + " 0.000000\n", " 0.0\n", " \n", " \n", " 31\n", " 0.000000\n", " 0.000000\n", - " 0.141106\n", + " 0.198974\n", " 0.000000\n", " 0.000000\n", " 0.0\n", - " 0.705295\n", + " 0.774316\n", " 0.0\n", " \n", " \n", " 32\n", " 0.000000\n", " 0.000000\n", - " 0.108914\n", - " 0.000000\n", + " 1.000000\n", " 0.000000\n", + " 0.649715\n", " 0.0\n", - " 0.279959\n", + " 0.000000\n", " 0.0\n", " \n", " \n", " 33\n", " 0.000000\n", " 0.000000\n", - " 
0.492018\n", - " 0.000000\n", + " 0.882733\n", " 0.000000\n", + " 0.351781\n", " 0.0\n", " 0.000000\n", " 0.0\n", @@ -676,18 +677,18 @@ " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 0.000000\n", + " 0.400474\n", " 0.0\n", " 0.000000\n", " 0.0\n", " \n", " \n", " 35\n", + " 0.244350\n", " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 0.000000\n", - " 0.000000\n", + " 0.631000\n", " 0.0\n", " 0.000000\n", " 0.0\n", @@ -698,17 +699,17 @@ ], "text/plain": [ " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 b_modloss_z2 \\\n", - "0 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", - "1 0.000000 0.000000 0.450362 0.000000 0.000000 0.0 \n", - "2 0.000000 0.000000 0.106543 0.000000 0.000000 0.0 \n", - "3 0.000000 0.000000 0.069116 0.000000 0.000000 0.0 \n", - "4 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", + "0 0.000000 0.000000 0.000000 0.515072 0.000000 0.0 \n", + "1 0.000000 0.000000 0.745664 0.000000 0.000000 0.0 \n", + "2 0.143703 0.000000 1.000000 0.000000 0.000000 0.0 \n", + "3 0.094888 0.000000 0.377585 0.000000 0.000000 0.0 \n", + "4 0.054980 0.000000 0.000000 0.000000 0.000000 0.0 \n", "5 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", - "6 0.000000 0.000000 0.000000 0.515072 0.000000 0.0 \n", - "7 0.000000 0.000000 0.745664 0.000000 0.000000 0.0 \n", - "8 0.143703 0.000000 1.000000 0.000000 0.000000 0.0 \n", - "9 0.094888 0.000000 0.377585 0.000000 0.000000 0.0 \n", - "10 0.054980 0.000000 0.000000 0.000000 0.000000 0.0 \n", + "6 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", + "7 0.000000 0.000000 0.450362 0.000000 0.000000 0.0 \n", + "8 0.000000 0.000000 0.106543 0.000000 0.000000 0.0 \n", + "9 0.000000 0.000000 0.069116 0.000000 0.000000 0.0 \n", + "10 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", "11 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", "12 0.000000 0.000000 0.103734 0.000000 0.000000 0.0 \n", "13 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", @@ -716,36 +717,36 @@ "15 0.000000 0.000000 0.092058 0.000000 
0.000000 0.0 \n", "16 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", "17 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", - "18 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", - "19 0.000000 0.000000 0.198974 0.000000 0.000000 0.0 \n", - "20 0.000000 0.000000 1.000000 0.000000 0.649715 0.0 \n", - "21 0.000000 0.000000 0.882733 0.000000 0.351781 0.0 \n", - "22 0.000000 0.000000 0.000000 0.000000 0.400474 0.0 \n", - "23 0.244350 0.000000 0.000000 0.000000 0.631000 0.0 \n", - "24 0.000000 0.000000 0.084908 0.000000 0.000000 0.0 \n", - "25 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", - "26 0.000000 0.000000 0.276969 0.000000 0.057554 0.0 \n", - "27 0.000000 0.087439 1.000000 0.000000 0.061627 0.0 \n", + "18 0.000000 0.000000 0.084908 0.000000 0.000000 0.0 \n", + "19 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", + "20 0.000000 0.000000 0.276969 0.000000 0.057554 0.0 \n", + "21 0.000000 0.087439 1.000000 0.000000 0.061627 0.0 \n", + "22 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", + "23 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", + "24 0.000000 0.000000 0.602346 0.000000 0.000000 0.0 \n", + "25 0.000000 0.000000 0.141106 0.000000 0.000000 0.0 \n", + "26 0.000000 0.000000 0.108914 0.000000 0.000000 0.0 \n", + "27 0.000000 0.000000 0.492018 0.000000 0.000000 0.0 \n", "28 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", "29 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", - "30 0.000000 0.000000 0.602346 0.000000 0.000000 0.0 \n", - "31 0.000000 0.000000 0.141106 0.000000 0.000000 0.0 \n", - "32 0.000000 0.000000 0.108914 0.000000 0.000000 0.0 \n", - "33 0.000000 0.000000 0.492018 0.000000 0.000000 0.0 \n", - "34 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", - "35 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", + "30 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 \n", + "31 0.000000 0.000000 0.198974 0.000000 0.000000 0.0 \n", + "32 0.000000 0.000000 1.000000 0.000000 0.649715 0.0 \n", + "33 
0.000000 0.000000 0.882733 0.000000 0.351781 0.0 \n", + "34 0.000000 0.000000 0.000000 0.000000 0.400474 0.0 \n", + "35 0.244350 0.000000 0.000000 0.000000 0.631000 0.0 \n", "\n", " y_modloss_z1 y_modloss_z2 \n", "0 0.000000 0.0 \n", - "1 1.000000 0.0 \n", - "2 0.374123 0.0 \n", - "3 0.173858 0.0 \n", + "1 0.000000 0.0 \n", + "2 0.000000 0.0 \n", + "3 0.000000 0.0 \n", "4 0.000000 0.0 \n", "5 0.000000 0.0 \n", "6 0.000000 0.0 \n", - "7 0.000000 0.0 \n", - "8 0.000000 0.0 \n", - "9 0.000000 0.0 \n", + "7 1.000000 0.0 \n", + "8 0.374123 0.0 \n", + "9 0.173858 0.0 \n", "10 0.000000 0.0 \n", "11 0.000000 0.0 \n", "12 1.000000 0.0 \n", @@ -754,21 +755,21 @@ "15 0.000000 0.0 \n", "16 0.000000 0.0 \n", "17 0.000000 0.0 \n", - "18 0.000000 0.0 \n", - "19 0.774316 0.0 \n", - "20 0.000000 0.0 \n", + "18 0.328738 0.0 \n", + "19 0.284129 0.0 \n", + "20 0.262853 0.0 \n", "21 0.000000 0.0 \n", "22 0.000000 0.0 \n", "23 0.000000 0.0 \n", - "24 0.328738 0.0 \n", - "25 0.284129 0.0 \n", - "26 0.262853 0.0 \n", + "24 1.000000 0.0 \n", + "25 0.705295 0.0 \n", + "26 0.279959 0.0 \n", "27 0.000000 0.0 \n", "28 0.000000 0.0 \n", "29 0.000000 0.0 \n", - "30 1.000000 0.0 \n", - "31 0.705295 0.0 \n", - "32 0.279959 0.0 \n", + "30 0.000000 0.0 \n", + "31 0.774316 0.0 \n", + "32 0.000000 0.0 \n", "33 0.000000 0.0 \n", "34 0.000000 0.0 \n", "35 0.000000 0.0 " @@ -823,133 +824,133 @@ " \n", " \n", " 0\n", - " 72.044390\n", - " 36.525833\n", - " 708.369169\n", - " 354.688223\n", + " 116.034219\n", + " 58.520748\n", + " 642.382103\n", + " 321.694690\n", + " 0.000000\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 610.392273\n", - " 305.699775\n", " \n", " \n", " 1\n", - " 171.112804\n", - " 86.060040\n", - " 609.300755\n", - " 305.154016\n", + " 213.086983\n", + " 107.047130\n", + " 545.329339\n", + " 273.168308\n", + " 0.000000\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 511.323860\n", - " 256.165568\n", " \n", " \n", " 2\n", - " 270.181218\n", - " 135.594247\n", - " 
510.232341\n", - " 255.619809\n", + " 326.171047\n", + " 163.589162\n", + " 432.245275\n", + " 216.626276\n", + " 0.000000\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 412.255446\n", - " 206.631361\n", " \n", " \n", " 3\n", - " 369.249632\n", - " 185.128454\n", - " 411.163927\n", - " 206.085602\n", + " 397.208161\n", + " 199.107719\n", + " 361.208161\n", + " 181.107719\n", + " 0.000000\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 313.187032\n", - " 157.097154\n", " \n", " \n", " 4\n", - " 536.247991\n", - " 268.627634\n", - " 244.165568\n", - " 122.586422\n", - " 438.271096\n", - " 219.639186\n", + " 496.276575\n", + " 248.641926\n", + " 262.139747\n", + " 131.573512\n", " 0.000000\n", " 0.000000\n", - " \n", + " 0.000000\n", + " 0.000000\n", + " \n", " \n", " 5\n", - " 633.300755\n", - " 317.154016\n", + " 611.303518\n", + " 306.155397\n", " 147.112804\n", " 74.060040\n", - " 535.323860\n", - " 268.165568\n", + " 0.000000\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", " \n", " \n", " 6\n", - " 116.034219\n", - " 58.520748\n", - " 642.382103\n", - " 321.694690\n", - " 0.000000\n", - " 0.000000\n", + " 72.044390\n", + " 36.525833\n", + " 708.369169\n", + " 354.688223\n", " 0.000000\n", " 0.000000\n", + " 610.392273\n", + " 305.699775\n", " \n", " \n", " 7\n", - " 213.086983\n", - " 107.047130\n", - " 545.329339\n", - " 273.168308\n", - " 0.000000\n", - " 0.000000\n", + " 171.112804\n", + " 86.060040\n", + " 609.300755\n", + " 305.154016\n", " 0.000000\n", " 0.000000\n", + " 511.323860\n", + " 256.165568\n", " \n", " \n", " 8\n", - " 326.171047\n", - " 163.589162\n", - " 432.245275\n", - " 216.626276\n", - " 0.000000\n", - " 0.000000\n", + " 270.181218\n", + " 135.594247\n", + " 510.232341\n", + " 255.619809\n", " 0.000000\n", " 0.000000\n", + " 412.255446\n", + " 206.631361\n", " \n", " \n", " 9\n", - " 397.208161\n", - " 199.107719\n", - " 361.208161\n", - " 181.107719\n", - " 0.000000\n", - " 0.000000\n", + " 369.249632\n", + " 
185.128454\n", + " 411.163927\n", + " 206.085602\n", " 0.000000\n", " 0.000000\n", + " 313.187032\n", + " 157.097154\n", " \n", " \n", " 10\n", - " 496.276575\n", - " 248.641926\n", - " 262.139747\n", - " 131.573512\n", - " 0.000000\n", - " 0.000000\n", + " 536.247991\n", + " 268.627634\n", + " 244.165568\n", + " 122.586422\n", + " 438.271096\n", + " 219.639186\n", " 0.000000\n", " 0.000000\n", " \n", " \n", " 11\n", - " 611.303518\n", - " 306.155397\n", + " 633.300755\n", + " 317.154016\n", " 147.112804\n", " 74.060040\n", - " 0.000000\n", - " 0.000000\n", + " 535.323860\n", + " 268.165568\n", " 0.000000\n", " 0.000000\n", " \n", @@ -1021,72 +1022,6 @@ " \n", " \n", " 18\n", - " 88.039305\n", - " 0.000000\n", - " 760.364084\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 662.387188\n", - " 0.000000\n", - " \n", - " \n", - " 19\n", - " 187.107719\n", - " 0.000000\n", - " 661.295670\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 563.318774\n", - " 0.000000\n", - " \n", - " \n", - " 20\n", - " 354.106078\n", - " 0.000000\n", - " 494.297310\n", - " 0.000000\n", - " 256.129183\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " \n", - " \n", - " 21\n", - " 501.174492\n", - " 0.000000\n", - " 347.228897\n", - " 0.000000\n", - " 403.197596\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " \n", - " \n", - " 22\n", - " 588.206520\n", - " 0.000000\n", - " 260.196868\n", - " 0.000000\n", - " 490.229625\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " \n", - " \n", - " 23\n", - " 701.290584\n", - " 0.000000\n", - " 147.112804\n", - " 0.000000\n", - " 603.313689\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " \n", - " \n", - " 24\n", " 100.075690\n", " 50.541483\n", " 762.258312\n", @@ -1097,7 +1032,7 @@ " 332.644347\n", " \n", " \n", - " 25\n", + " 19\n", " 267.074050\n", " 134.040663\n", " 595.259953\n", @@ -1108,7 +1043,7 @@ " 249.145167\n", " \n", " \n", - " 26\n", + " 20\n", " 366.142464\n", " 183.574870\n", " 
496.191539\n", @@ -1119,7 +1054,7 @@ " 199.610960\n", " \n", " \n", - " 27\n", + " 21\n", " 533.140823\n", " 267.074050\n", " 329.193180\n", @@ -1130,7 +1065,7 @@ " 0.000000\n", " \n", " \n", - " 28\n", + " 22\n", " 630.193587\n", " 315.600432\n", " 232.140416\n", @@ -1141,7 +1076,7 @@ " 0.000000\n", " \n", " \n", - " 29\n", + " 23\n", " 687.215050\n", " 344.111163\n", " 175.118952\n", @@ -1152,7 +1087,7 @@ " 0.000000\n", " \n", " \n", - " 30\n", + " 24\n", " 164.070605\n", " 82.538941\n", " 698.312048\n", @@ -1163,7 +1098,7 @@ " 300.671214\n", " \n", " \n", - " 31\n", + " 25\n", " 251.102633\n", " 126.054955\n", " 611.280020\n", @@ -1174,7 +1109,7 @@ " 257.155200\n", " \n", " \n", - " 32\n", + " 26\n", " 364.186697\n", " 182.596987\n", " 498.195956\n", @@ -1185,7 +1120,7 @@ " 200.613168\n", " \n", " \n", - " 33\n", + " 27\n", " 531.185057\n", " 266.096167\n", " 331.197596\n", @@ -1196,7 +1131,7 @@ " 0.000000\n", " \n", " \n", - " 34\n", + " 28\n", " 628.237821\n", " 314.622548\n", " 234.144833\n", @@ -1207,7 +1142,7 @@ " 0.000000\n", " \n", " \n", - " 35\n", + " 29\n", " 715.269849\n", " 358.138563\n", " 147.112804\n", @@ -1217,86 +1152,152 @@ " 0.000000\n", " 0.000000\n", " \n", + " \n", + " 30\n", + " 88.039305\n", + " 0.000000\n", + " 760.364084\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", + " 662.387188\n", + " 0.000000\n", + " \n", + " \n", + " 31\n", + " 187.107719\n", + " 0.000000\n", + " 661.295670\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", + " 563.318774\n", + " 0.000000\n", + " \n", + " \n", + " 32\n", + " 354.106078\n", + " 0.000000\n", + " 494.297310\n", + " 0.000000\n", + " 256.129183\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", + " \n", + " \n", + " 33\n", + " 501.174492\n", + " 0.000000\n", + " 347.228897\n", + " 0.000000\n", + " 403.197596\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", + " \n", + " \n", + " 34\n", + " 588.206520\n", + " 0.000000\n", + " 260.196868\n", + " 0.000000\n", + " 490.229625\n", + " 
0.000000\n", + " 0.000000\n", + " 0.000000\n", + " \n", + " \n", + " 35\n", + " 701.290584\n", + " 0.000000\n", + " 147.112804\n", + " 0.000000\n", + " 603.313689\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", + " \n", " \n", "\n", "" ], "text/plain": [ " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 \\\n", - "0 72.044390 36.525833 708.369169 354.688223 0.000000 \n", - "1 171.112804 86.060040 609.300755 305.154016 0.000000 \n", - "2 270.181218 135.594247 510.232341 255.619809 0.000000 \n", - "3 369.249632 185.128454 411.163927 206.085602 0.000000 \n", - "4 536.247991 268.627634 244.165568 122.586422 438.271096 \n", - "5 633.300755 317.154016 147.112804 74.060040 535.323860 \n", - "6 116.034219 58.520748 642.382103 321.694690 0.000000 \n", - "7 213.086983 107.047130 545.329339 273.168308 0.000000 \n", - "8 326.171047 163.589162 432.245275 216.626276 0.000000 \n", - "9 397.208161 199.107719 361.208161 181.107719 0.000000 \n", - "10 496.276575 248.641926 262.139747 131.573512 0.000000 \n", - "11 611.303518 306.155397 147.112804 74.060040 0.000000 \n", + "0 116.034219 58.520748 642.382103 321.694690 0.000000 \n", + "1 213.086983 107.047130 545.329339 273.168308 0.000000 \n", + "2 326.171047 163.589162 432.245275 216.626276 0.000000 \n", + "3 397.208161 199.107719 361.208161 181.107719 0.000000 \n", + "4 496.276575 248.641926 262.139747 131.573512 0.000000 \n", + "5 611.303518 306.155397 147.112804 74.060040 0.000000 \n", + "6 72.044390 36.525833 708.369169 354.688223 0.000000 \n", + "7 171.112804 86.060040 609.300755 305.154016 0.000000 \n", + "8 270.181218 135.594247 510.232341 255.619809 0.000000 \n", + "9 369.249632 185.128454 411.163927 206.085602 0.000000 \n", + "10 536.247991 268.627634 244.165568 122.586422 438.271096 \n", + "11 633.300755 317.154016 147.112804 74.060040 535.323860 \n", "12 132.047762 66.527519 686.275663 343.641470 0.000000 \n", "13 189.069225 95.038251 629.254199 315.130738 0.000000 \n", "14 356.067585 178.537431 462.255840 231.631558 258.090689 
\n", "15 469.151649 235.079463 349.171776 175.089526 371.174753 \n", "16 584.178592 292.592934 234.144833 117.576055 486.201696 \n", "17 671.210620 336.108948 147.112804 74.060040 573.233724 \n", - "18 88.039305 0.000000 760.364084 0.000000 0.000000 \n", - "19 187.107719 0.000000 661.295670 0.000000 0.000000 \n", - "20 354.106078 0.000000 494.297310 0.000000 256.129183 \n", - "21 501.174492 0.000000 347.228897 0.000000 403.197596 \n", - "22 588.206520 0.000000 260.196868 0.000000 490.229625 \n", - "23 701.290584 0.000000 147.112804 0.000000 603.313689 \n", - "24 100.075690 50.541483 762.258312 381.632794 0.000000 \n", - "25 267.074050 134.040663 595.259953 298.133615 169.097154 \n", - "26 366.142464 183.574870 496.191539 248.599408 268.165568 \n", - "27 533.140823 267.074050 329.193180 165.100228 435.163927 \n", - "28 630.193587 315.600432 232.140416 116.573846 532.216691 \n", - "29 687.215050 344.111163 175.118952 88.063114 589.238155 \n", - "30 164.070605 82.538941 698.312048 349.659662 0.000000 \n", - "31 251.102633 126.054955 611.280020 306.143648 0.000000 \n", - "32 364.186697 182.596987 498.195956 249.601616 0.000000 \n", - "33 531.185057 266.096167 331.197596 166.102436 433.208161 \n", - "34 628.237821 314.622548 234.144833 117.576055 530.260925 \n", - "35 715.269849 358.138563 147.112804 74.060040 617.292953 \n", + "18 100.075690 50.541483 762.258312 381.632794 0.000000 \n", + "19 267.074050 134.040663 595.259953 298.133615 169.097154 \n", + "20 366.142464 183.574870 496.191539 248.599408 268.165568 \n", + "21 533.140823 267.074050 329.193180 165.100228 435.163927 \n", + "22 630.193587 315.600432 232.140416 116.573846 532.216691 \n", + "23 687.215050 344.111163 175.118952 88.063114 589.238155 \n", + "24 164.070605 82.538941 698.312048 349.659662 0.000000 \n", + "25 251.102633 126.054955 611.280020 306.143648 0.000000 \n", + "26 364.186697 182.596987 498.195956 249.601616 0.000000 \n", + "27 531.185057 266.096167 331.197596 166.102436 433.208161 \n", + "28 
628.237821 314.622548 234.144833 117.576055 530.260925 \n", + "29 715.269849 358.138563 147.112804 74.060040 617.292953 \n", + "30 88.039305 0.000000 760.364084 0.000000 0.000000 \n", + "31 187.107719 0.000000 661.295670 0.000000 0.000000 \n", + "32 354.106078 0.000000 494.297310 0.000000 256.129183 \n", + "33 501.174492 0.000000 347.228897 0.000000 403.197596 \n", + "34 588.206520 0.000000 260.196868 0.000000 490.229625 \n", + "35 701.290584 0.000000 147.112804 0.000000 603.313689 \n", "\n", " b_modloss_z2 y_modloss_z1 y_modloss_z2 \n", - "0 0.000000 610.392273 305.699775 \n", - "1 0.000000 511.323860 256.165568 \n", - "2 0.000000 412.255446 206.631361 \n", - "3 0.000000 313.187032 157.097154 \n", - "4 219.639186 0.000000 0.000000 \n", - "5 268.165568 0.000000 0.000000 \n", - "6 0.000000 0.000000 0.000000 \n", - "7 0.000000 0.000000 0.000000 \n", - "8 0.000000 0.000000 0.000000 \n", - "9 0.000000 0.000000 0.000000 \n", - "10 0.000000 0.000000 0.000000 \n", - "11 0.000000 0.000000 0.000000 \n", + "0 0.000000 0.000000 0.000000 \n", + "1 0.000000 0.000000 0.000000 \n", + "2 0.000000 0.000000 0.000000 \n", + "3 0.000000 0.000000 0.000000 \n", + "4 0.000000 0.000000 0.000000 \n", + "5 0.000000 0.000000 0.000000 \n", + "6 0.000000 610.392273 305.699775 \n", + "7 0.000000 511.323860 256.165568 \n", + "8 0.000000 412.255446 206.631361 \n", + "9 0.000000 313.187032 157.097154 \n", + "10 219.639186 0.000000 0.000000 \n", + "11 268.165568 0.000000 0.000000 \n", "12 0.000000 588.298767 294.653022 \n", "13 0.000000 531.277303 266.142290 \n", "14 129.548983 0.000000 0.000000 \n", "15 186.091015 0.000000 0.000000 \n", "16 243.604486 0.000000 0.000000 \n", "17 287.120500 0.000000 0.000000 \n", - "18 0.000000 662.387188 0.000000 \n", - "19 0.000000 563.318774 0.000000 \n", - "20 0.000000 0.000000 0.000000 \n", - "21 0.000000 0.000000 0.000000 \n", - "22 0.000000 0.000000 0.000000 \n", - "23 0.000000 0.000000 0.000000 \n", - "24 0.000000 664.281417 332.644347 \n", - "25 85.052215 
497.283057 249.145167 \n", - "26 134.586422 398.214643 199.610960 \n", - "27 218.085602 0.000000 0.000000 \n", - "28 266.611984 0.000000 0.000000 \n", - "29 295.122716 0.000000 0.000000 \n", - "30 0.000000 600.335153 300.671214 \n", - "31 0.000000 513.303124 257.155200 \n", - "32 0.000000 400.219060 200.613168 \n", - "33 217.107719 0.000000 0.000000 \n", - "34 265.634101 0.000000 0.000000 \n", - "35 309.150115 0.000000 0.000000 " + "18 0.000000 664.281417 332.644347 \n", + "19 85.052215 497.283057 249.145167 \n", + "20 134.586422 398.214643 199.610960 \n", + "21 218.085602 0.000000 0.000000 \n", + "22 266.611984 0.000000 0.000000 \n", + "23 295.122716 0.000000 0.000000 \n", + "24 0.000000 600.335153 300.671214 \n", + "25 0.000000 513.303124 257.155200 \n", + "26 0.000000 400.219060 200.613168 \n", + "27 217.107719 0.000000 0.000000 \n", + "28 265.634101 0.000000 0.000000 \n", + "29 309.150115 0.000000 0.000000 \n", + "30 0.000000 662.387188 0.000000 \n", + "31 0.000000 563.318774 0.000000 \n", + "32 0.000000 0.000000 0.000000 \n", + "33 0.000000 0.000000 0.000000 \n", + "34 0.000000 0.000000 0.000000 \n", + "35 0.000000 0.000000 0.000000 " ] }, "execution_count": null, @@ -1313,24 +1314,53 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "YSLSPSK 24 30\n", + " PrecursorCharge ModifiedPeptide StrippedPeptide iRT \\\n", + "39 2 _YSLS[Phospho (STY)]PSK_ YSLSPSK -6.428198 \n", + "40 2 _YSLS[Phospho (STY)]PSK_ YSLSPSK -6.428198 \n", + "42 2 _YSLS[Phospho (STY)]PSK_ YSLSPSK -6.428198 \n", + "44 2 _YSLS[Phospho (STY)]PSK_ YSLSPSK -6.428198 \n", + "\n", + " LabeledPeptide PrecursorMz FragmentLossType FragmentNumber \\\n", + "39 _YSLS[Phospho (STY)]PSK_ 431.191326 noloss 3 \n", + "40 _YSLS[Phospho (STY)]PSK_ 431.191326 noloss 4 \n", + "42 _YSLS[Phospho (STY)]PSK_ 431.191326 noloss 5 \n", + "44 _YSLS[Phospho (STY)]PSK_ 431.191326 noloss 6 \n", + "\n", + " 
FragmentType FragmentCharge FragmentMz RelativeIntensity IonMobility \n", + "39 y 1 331.197596 49.20179 0.9 \n", + "40 y 1 498.195955 10.89141 0.9 \n", + "42 y 1 611.280019 14.11057 0.9 \n", + "44 y 1 698.312048 60.23455 0.9 \n" + ] + } + ], "source": [ "#| hide\n", "df = pd.read_csv(StringIO(tsv_str), sep='\\t')\n", "seq = 'YSLSPSK'\n", "seq,start,end = psm_df.loc[psm_df.sequence==seq,['sequence','frag_start_idx','frag_stop_idx']].values[0]\n", + "print(seq,start,end)\n", "y_df = df[(df['StrippedPeptide']==seq)&(df['FragmentLossType']=='noloss')&(df['FragmentType']=='y')]\n", + "print(y_df)\n", "y_ions = np.zeros(len(seq)-1)\n", "y_ions[len(seq)-y_df.FragmentNumber-1] = y_df.RelativeIntensity.values / 100\n", "assert np.allclose(\n", - " reader.fragment_intensity_df.loc[start:end+1,'y_z1'].values,\n", + " reader.fragment_intensity_df.loc[start:end-1,'y_z1'].values,\n", " y_ions\n", ")\n", + "\n", "y_df = df[(df['StrippedPeptide']==seq)&(df['FragmentLossType']=='H3PO4')&(df['FragmentType']=='y')]\n", "y_ions = np.zeros(len(seq)-1)\n", "y_ions[len(seq)-y_df.FragmentNumber-1] = y_df.RelativeIntensity.values / 100\n", + "\n", "assert np.allclose(\n", - " reader.fragment_intensity_df.loc[start:end+1,'y_modloss_z1'].values,\n", + " reader.fragment_intensity_df.loc[start:end-1,'y_modloss_z1'].values,\n", " y_ions\n", ")" ] @@ -1362,17 +1392,17 @@ " \n", " \n", " raw_name\n", - " sequence\n", - " charge\n", - " rt\n", - " precursor_mz\n", " mobility\n", - " proteins\n", + " precursor_mz\n", + " mods\n", + " sequence\n", " uniprot_ids\n", + " rt\n", " genes\n", - " mods\n", - " mod_sites\n", " nAA\n", + " mod_sites\n", + " charge\n", + " proteins\n", " frag_start_idx\n", " frag_stop_idx\n", " rt_norm\n", @@ -1383,199 +1413,199 @@ " \n", " 0\n", " 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph...\n", - " AGSPDVLR\n", - " 2\n", - " -0.797019\n", - " 447.707675\n", - " 0.780\n", - " ZN740_HUMAN\n", - " Q8NDX6\n", - " ZNF740\n", - " Phospho@S\n", - " 3\n", + " 0.758\n", + " 
418.717511\n", + " Phospho@T\n", + " ALVATPGK\n", + " P19338\n", + " -5.032703\n", + " NCL\n", " 8\n", + " 5\n", + " 2\n", + " NUCL_HUMAN\n", " 0\n", " 7\n", - " 0.344252\n", - " 317.236399\n", + " -0.152690\n", + " 308.612143\n", " \n", " \n", " 1\n", " 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph...\n", - " ALVATPGK\n", - " 2\n", - " -5.032703\n", - " 418.717511\n", - " 0.758\n", - " NUCL_HUMAN\n", - " P19338\n", - " NCL\n", - " Phospho@T\n", - " 5\n", + " 0.775\n", + " 470.202225\n", + " Phospho@S\n", + " GGSPDLWK\n", + " Q96JM3\n", + " 30.200720\n", + " CHAMP1\n", " 8\n", + " 3\n", + " 2\n", + " CHAP1_HUMAN\n", " 7\n", " 14\n", - " 0.261972\n", - " 308.612143\n", + " 0.916278\n", + " 314.974129\n", " \n", " \n", " 2\n", " 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph...\n", - " ELIQEYGAQSGGLEK\n", - " 2\n", - " 39.084610\n", - " 851.390200\n", - " 1.081\n", - " A0A075B730_HUMAN\n", - " A0A075B730\n", - " EPPK1\n", + " 0.780\n", + " 447.707675\n", " Phospho@S\n", - " 10\n", - " 15\n", + " AGSPDVLR\n", + " Q8NDX6\n", + " -0.797019\n", + " ZNF740\n", + " 8\n", + " 3\n", + " 2\n", + " ZN740_HUMAN\n", " 14\n", - " 28\n", - " 1.000000\n", - " 436.485165\n", + " 21\n", + " -0.024181\n", + " 317.236399\n", " \n", " \n", " 3\n", " 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph...\n", - " GGSPDLWK\n", - " 2\n", - " 30.200720\n", - " 470.202225\n", - " 0.775\n", - " CHAP1_HUMAN\n", - " Q96JM3\n", - " CHAMP1\n", + " 0.785\n", + " 453.731484\n", " Phospho@S\n", - " 3\n", - " 8\n", - " 28\n", - " 35\n", - " 0.946396\n", - " 314.974129\n", - " \n", - " \n", - " 4\n", - " 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph...\n", - " NLTEDNSQNQDLIAK\n", + " TRLSPPR\n", + " Q96PK6\n", + " -18.518720\n", + " RBM14\n", + " 7\n", + " 4\n", " 2\n", - " 12.650000\n", - " 851.915755\n", - " 1.092\n", - " ATX10_HUMAN\n", - " Q9UBB4\n", - " ATXN10\n", - " \n", - " \n", - " 15\n", - " 35\n", - " 49\n", - " 0.541092\n", - " 440.924535\n", + " RBM14_HUMAN\n", + " 21\n", + " 27\n", + " 
-0.561851\n", + " 319.205696\n", " \n", " \n", - " 5\n", + " 4\n", " 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph...\n", - " SSPLSWR\n", - " 2\n", - " 32.960210\n", - " 456.702393\n", " 0.806\n", - " NAV1_HUMAN\n", + " 456.702393\n", + " Phospho@S\n", + " SSPLSWR\n", " Q8NEY1;Q8NEY1-3\n", + " 32.960210\n", " NAV1\n", - " Phospho@S\n", - " 1\n", " 7\n", - " 49\n", - " 55\n", + " 1\n", + " 2\n", + " NAV1_HUMAN\n", + " 27\n", + " 33\n", " 1.000000\n", " 327.713047\n", " \n", " \n", - " 6\n", + " 5\n", " 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph...\n", - " TLTPPLR\n", - " 2\n", - " 27.716590\n", - " 439.230786\n", " 0.818\n", - " ZC3HD_HUMAN\n", + " 439.230786\n", + " Phospho@T\n", + " TLTPPLR\n", " Q5T200\n", + " 27.716590\n", " ZC3H13\n", - " Phospho@T\n", - " 3\n", " 7\n", - " 55\n", - " 61\n", - " 0.898140\n", + " 3\n", + " 2\n", + " ZC3HD_HUMAN\n", + " 33\n", + " 39\n", + " 0.840911\n", " 332.788837\n", " \n", " \n", + " 6\n", + " 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph...\n", + " 1.081\n", + " 851.390200\n", + " Phospho@S\n", + " ELIQEYGAQSGGLEK\n", + " A0A075B730\n", + " 39.084610\n", + " EPPK1\n", + " 15\n", + " 10\n", + " 2\n", + " A0A075B730_HUMAN\n", + " 39\n", + " 53\n", + " 1.000000\n", + " 436.485165\n", + " \n", + " \n", " 7\n", " 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph...\n", - " TRLSPPR\n", + " 1.092\n", + " 851.915755\n", + " \n", + " NLTEDNSQNQDLIAK\n", + " Q9UBB4\n", + " 12.650000\n", + " ATXN10\n", + " 15\n", + " \n", " 2\n", - " -18.518720\n", - " 453.731484\n", - " 0.785\n", - " RBM14_HUMAN\n", - " Q96PK6\n", - " RBM14\n", - " Phospho@S\n", - " 4\n", - " 7\n", - " 61\n", + " ATX10_HUMAN\n", + " 53\n", " 67\n", - " 0.000000\n", - " 319.205696\n", + " 0.323657\n", + " 440.924535\n", " \n", " \n", "\n", "" ], "text/plain": [ - " raw_name sequence charge \\\n", - "0 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... AGSPDVLR 2 \n", - "1 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... 
ALVATPGK 2 \n", - "2 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... ELIQEYGAQSGGLEK 2 \n", - "3 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... GGSPDLWK 2 \n", - "4 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... NLTEDNSQNQDLIAK 2 \n", - "5 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... SSPLSWR 2 \n", - "6 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... TLTPPLR 2 \n", - "7 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... TRLSPPR 2 \n", + " raw_name mobility precursor_mz \\\n", + "0 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... 0.758 418.717511 \n", + "1 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... 0.775 470.202225 \n", + "2 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... 0.780 447.707675 \n", + "3 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... 0.785 453.731484 \n", + "4 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... 0.806 456.702393 \n", + "5 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... 0.818 439.230786 \n", + "6 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... 1.081 851.390200 \n", + "7 202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phosph... 
1.092 851.915755 \n", "\n", - " rt precursor_mz mobility proteins uniprot_ids \\\n", - "0 -0.797019 447.707675 0.780 ZN740_HUMAN Q8NDX6 \n", - "1 -5.032703 418.717511 0.758 NUCL_HUMAN P19338 \n", - "2 39.084610 851.390200 1.081 A0A075B730_HUMAN A0A075B730 \n", - "3 30.200720 470.202225 0.775 CHAP1_HUMAN Q96JM3 \n", - "4 12.650000 851.915755 1.092 ATX10_HUMAN Q9UBB4 \n", - "5 32.960210 456.702393 0.806 NAV1_HUMAN Q8NEY1;Q8NEY1-3 \n", - "6 27.716590 439.230786 0.818 ZC3HD_HUMAN Q5T200 \n", - "7 -18.518720 453.731484 0.785 RBM14_HUMAN Q96PK6 \n", + " mods sequence uniprot_ids rt genes nAA \\\n", + "0 Phospho@T ALVATPGK P19338 -5.032703 NCL 8 \n", + "1 Phospho@S GGSPDLWK Q96JM3 30.200720 CHAMP1 8 \n", + "2 Phospho@S AGSPDVLR Q8NDX6 -0.797019 ZNF740 8 \n", + "3 Phospho@S TRLSPPR Q96PK6 -18.518720 RBM14 7 \n", + "4 Phospho@S SSPLSWR Q8NEY1;Q8NEY1-3 32.960210 NAV1 7 \n", + "5 Phospho@T TLTPPLR Q5T200 27.716590 ZC3H13 7 \n", + "6 Phospho@S ELIQEYGAQSGGLEK A0A075B730 39.084610 EPPK1 15 \n", + "7 NLTEDNSQNQDLIAK Q9UBB4 12.650000 ATXN10 15 \n", "\n", - " genes mods mod_sites nAA frag_start_idx frag_stop_idx rt_norm \\\n", - "0 ZNF740 Phospho@S 3 8 0 7 0.344252 \n", - "1 NCL Phospho@T 5 8 7 14 0.261972 \n", - "2 EPPK1 Phospho@S 10 15 14 28 1.000000 \n", - "3 CHAMP1 Phospho@S 3 8 28 35 0.946396 \n", - "4 ATXN10 15 35 49 0.541092 \n", - "5 NAV1 Phospho@S 1 7 49 55 1.000000 \n", - "6 ZC3H13 Phospho@T 3 7 55 61 0.898140 \n", - "7 RBM14 Phospho@S 4 7 61 67 0.000000 \n", + " mod_sites charge proteins frag_start_idx frag_stop_idx \\\n", + "0 5 2 NUCL_HUMAN 0 7 \n", + "1 3 2 CHAP1_HUMAN 7 14 \n", + "2 3 2 ZN740_HUMAN 14 21 \n", + "3 4 2 RBM14_HUMAN 21 27 \n", + "4 1 2 NAV1_HUMAN 27 33 \n", + "5 3 2 ZC3HD_HUMAN 33 39 \n", + "6 10 2 A0A075B730_HUMAN 39 53 \n", + "7 2 ATX10_HUMAN 53 67 \n", "\n", - " ccs \n", - "0 317.236399 \n", - "1 308.612143 \n", - "2 436.485165 \n", - "3 314.974129 \n", - "4 440.924535 \n", - "5 327.713047 \n", - "6 332.788837 \n", - "7 319.205696 " + " rt_norm ccs 
\n", + "0 -0.152690 308.612143 \n", + "1 0.916278 314.974129 \n", + "2 -0.024181 317.236399 \n", + "3 -0.561851 319.205696 \n", + "4 1.000000 327.713047 \n", + "5 0.840911 332.788837 \n", + "6 1.000000 436.485165 \n", + "7 0.323657 440.924535 " ] }, "execution_count": null, @@ -1668,7 +1698,7 @@ "202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library15_S4-B3_1_25857\t2\t\t_NLTEDNSQNQDLIAK_\t\tTrue\t_NLTEDNSQNQDLIAK_\tNLTEDNSQNQDLIAK\t12.65\t1.092\t12.65\tQ9UBB4\tFalse\t_NLTEDNSQNQDLIAK_\t_NLTEDNSQNQDLIAK_\t851.915754844857\t0.000129324282170273\t6592\tNH3\t12\ty\t1\t1357.6230149506\t10.41056\tFalse\tsp\tQ9UBB4\tQ9UBB4\tATX10_HUMAN\tAtaxin-10\tHomo sapiens\t\tATXN10\t1\t1\tMCT_human_UP000005640_9606\n", "\"\"\"\n", "\n", - "reader = SWATHLibraryReader()\n", + "reader = LibraryReaderBase()\n", "psm_df = reader.import_file(StringIO(tsv_str))\n", "for col in ['sequence','charge','rt','rt_norm','mods','mod_sites','nAA','frag_start_idx','frag_stop_idx']:\n", " assert col in psm_df.columns\n", @@ -1715,58 +1745,58 @@ " \n", " \n", " 0\n", - " 0.000000\n", " 0.0\n", - " 0.000000\n", " 0.0\n", " 0.000000\n", " 0.0\n", + " 0.0\n", + " 0.0\n", " 0.000000\n", - " 0.270302\n", + " 0.0\n", " \n", " \n", " 1\n", - " 0.000000\n", " 0.0\n", - " 0.000000\n", " 0.0\n", - " 0.000000\n", + " 1.000000\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.504698\n", " 0.0\n", - " 0.000000\n", - " 0.684009\n", " \n", " \n", " 2\n", - " 0.000000\n", " 0.0\n", - " 1.000000\n", " 0.0\n", - " 0.000000\n", + " 0.546010\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.460998\n", " 0.0\n", - " 0.000000\n", - " 0.000000\n", " \n", " \n", " 3\n", - " 0.000000\n", " 0.0\n", - " 0.095609\n", " 0.0\n", " 0.000000\n", " 0.0\n", - " 0.000000\n", - " 0.000000\n", + " 0.0\n", + " 0.0\n", + " 0.263160\n", + " 0.0\n", " \n", " \n", " 4\n", - " 0.000000\n", " 0.0\n", - " 0.524231\n", " 0.0\n", - " 0.000000\n", + " 0.531991\n", + " 0.0\n", + " 0.0\n", " 0.0\n", " 0.000000\n", - " 0.000000\n", + " 0.0\n", " \n", " \n", 
" ...\n", @@ -1781,58 +1811,58 @@ " \n", " \n", " 62\n", - " 0.000000\n", " 0.0\n", - " 0.064688\n", " 0.0\n", - " 0.000000\n", + " 0.318182\n", + " 0.0\n", + " 0.0\n", " 0.0\n", - " 0.206230\n", " 0.000000\n", + " 0.0\n", " \n", " \n", " 63\n", - " 0.000000\n", " 0.0\n", - " 0.000000\n", " 0.0\n", - " 0.000000\n", + " 0.419355\n", + " 0.0\n", + " 0.0\n", " 0.0\n", - " 0.070385\n", " 0.000000\n", + " 0.0\n", " \n", " \n", " 64\n", - " 0.177417\n", " 0.0\n", - " 0.976432\n", " 0.0\n", - " 1.000000\n", + " 0.344575\n", + " 0.0\n", + " 0.0\n", " 0.0\n", " 0.000000\n", - " 0.000000\n", + " 0.0\n", " \n", " \n", " 65\n", - " 0.000000\n", + " 0.0\n", " 0.0\n", " 0.000000\n", " 0.0\n", - " 0.104885\n", + " 0.0\n", " 0.0\n", " 0.000000\n", - " 0.000000\n", + " 0.0\n", " \n", " \n", " 66\n", - " 0.000000\n", " 0.0\n", - " 0.000000\n", " 0.0\n", " 0.000000\n", " 0.0\n", + " 0.0\n", + " 0.0\n", " 0.000000\n", - " 0.000000\n", + " 0.0\n", " \n", " \n", "\n", @@ -1840,31 +1870,31 @@ "" ], "text/plain": [ - " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 b_modloss_z2 y_modloss_z1 \\\n", - "0 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.000000 \n", - "1 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.000000 \n", - "2 0.000000 0.0 1.000000 0.0 0.000000 0.0 0.000000 \n", - "3 0.000000 0.0 0.095609 0.0 0.000000 0.0 0.000000 \n", - "4 0.000000 0.0 0.524231 0.0 0.000000 0.0 0.000000 \n", - ".. ... ... ... ... ... ... ... 
\n", - "62 0.000000 0.0 0.064688 0.0 0.000000 0.0 0.206230 \n", - "63 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.070385 \n", - "64 0.177417 0.0 0.976432 0.0 1.000000 0.0 0.000000 \n", - "65 0.000000 0.0 0.000000 0.0 0.104885 0.0 0.000000 \n", - "66 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.000000 \n", + " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 b_modloss_z2 y_modloss_z1 \\\n", + "0 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 \n", + "1 0.0 0.0 1.000000 0.0 0.0 0.0 0.504698 \n", + "2 0.0 0.0 0.546010 0.0 0.0 0.0 0.460998 \n", + "3 0.0 0.0 0.000000 0.0 0.0 0.0 0.263160 \n", + "4 0.0 0.0 0.531991 0.0 0.0 0.0 0.000000 \n", + ".. ... ... ... ... ... ... ... \n", + "62 0.0 0.0 0.318182 0.0 0.0 0.0 0.000000 \n", + "63 0.0 0.0 0.419355 0.0 0.0 0.0 0.000000 \n", + "64 0.0 0.0 0.344575 0.0 0.0 0.0 0.000000 \n", + "65 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 \n", + "66 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 \n", "\n", " y_modloss_z2 \n", - "0 0.270302 \n", - "1 0.684009 \n", - "2 0.000000 \n", - "3 0.000000 \n", - "4 0.000000 \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", ".. ... 
\n", - "62 0.000000 \n", - "63 0.000000 \n", - "64 0.000000 \n", - "65 0.000000 \n", - "66 0.000000 \n", + "62 0.0 \n", + "63 0.0 \n", + "64 0.0 \n", + "65 0.0 \n", + "66 0.0 \n", "\n", "[67 rows x 8 columns]" ] @@ -1919,54 +1949,54 @@ " 0\n", " 72.044390\n", " 36.525833\n", - " 823.370960\n", - " 412.189118\n", + " 765.390633\n", + " 383.198955\n", " 0.000000\n", " 0.000000\n", - " 725.394064\n", - " 363.200670\n", + " 667.413737\n", + " 334.210507\n", " \n", " \n", " 1\n", - " 129.065854\n", - " 65.036565\n", - " 766.349496\n", - " 383.678386\n", + " 185.128454\n", + " 93.067865\n", + " 652.306569\n", + " 326.656923\n", " 0.000000\n", " 0.000000\n", - " 668.372601\n", - " 334.689939\n", + " 554.329673\n", + " 277.668475\n", " \n", " \n", " 2\n", - " 296.064213\n", - " 148.535745\n", - " 599.351137\n", - " 300.179207\n", - " 198.087318\n", - " 99.547297\n", + " 284.196868\n", + " 142.602072\n", + " 553.238155\n", + " 277.122716\n", " 0.000000\n", " 0.000000\n", + " 455.261259\n", + " 228.134268\n", " \n", " \n", " 3\n", - " 393.116977\n", - " 197.062127\n", - " 502.298373\n", - " 251.652825\n", - " 295.140082\n", - " 148.073679\n", + " 355.233982\n", + " 178.120629\n", + " 482.201041\n", + " 241.604159\n", " 0.000000\n", " 0.000000\n", + " 384.224146\n", + " 192.615711\n", " \n", " \n", " 4\n", - " 508.143920\n", - " 254.575598\n", - " 387.271430\n", - " 194.139353\n", - " 410.167025\n", - " 205.587151\n", + " 536.247991\n", + " 268.627634\n", + " 301.187032\n", + " 151.097154\n", + " 438.271096\n", + " 219.639186\n", " 0.000000\n", " 0.000000\n", " \n", @@ -1983,56 +2013,56 @@ " \n", " \n", " 62\n", - " 258.156066\n", - " 129.581671\n", - " 649.306903\n", - " 325.157090\n", + " 1144.486521\n", + " 572.746899\n", + " 559.344989\n", + " 280.176133\n", + " 0.000000\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 551.330008\n", - " 276.168642\n", " \n", " \n", " 63\n", - " 371.240130\n", - " 186.123703\n", - " 536.222839\n", - " 268.615058\n", + " 
1259.513464\n", + " 630.260370\n", + " 444.318046\n", + " 222.662661\n", + " 0.000000\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 438.245944\n", - " 219.626610\n", " \n", " \n", " 64\n", - " 538.238489\n", - " 269.622883\n", - " 369.224480\n", - " 185.115878\n", - " 440.261594\n", - " 220.634435\n", + " 1372.597528\n", + " 686.802402\n", + " 331.233982\n", + " 166.120629\n", + " 0.000000\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", " \n", " \n", " 65\n", - " 635.291253\n", - " 318.149265\n", - " 272.171716\n", - " 136.589496\n", - " 537.314358\n", - " 269.160817\n", + " 1485.681592\n", + " 743.344434\n", + " 218.149918\n", + " 109.578597\n", + " 0.000000\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", " \n", " \n", " 66\n", - " 732.344017\n", - " 366.675647\n", - " 175.118952\n", - " 88.063114\n", - " 634.367121\n", - " 317.687199\n", + " 1556.718706\n", + " 778.862991\n", + " 147.112804\n", + " 74.060040\n", + " 0.000000\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", " \n", @@ -2042,31 +2072,31 @@ "" ], "text/plain": [ - " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 \\\n", - "0 72.044390 36.525833 823.370960 412.189118 0.000000 \n", - "1 129.065854 65.036565 766.349496 383.678386 0.000000 \n", - "2 296.064213 148.535745 599.351137 300.179207 198.087318 \n", - "3 393.116977 197.062127 502.298373 251.652825 295.140082 \n", - "4 508.143920 254.575598 387.271430 194.139353 410.167025 \n", - ".. ... ... ... ... ... 
\n", - "62 258.156066 129.581671 649.306903 325.157090 0.000000 \n", - "63 371.240130 186.123703 536.222839 268.615058 0.000000 \n", - "64 538.238489 269.622883 369.224480 185.115878 440.261594 \n", - "65 635.291253 318.149265 272.171716 136.589496 537.314358 \n", - "66 732.344017 366.675647 175.118952 88.063114 634.367121 \n", + " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 \\\n", + "0 72.044390 36.525833 765.390633 383.198955 0.000000 \n", + "1 185.128454 93.067865 652.306569 326.656923 0.000000 \n", + "2 284.196868 142.602072 553.238155 277.122716 0.000000 \n", + "3 355.233982 178.120629 482.201041 241.604159 0.000000 \n", + "4 536.247991 268.627634 301.187032 151.097154 438.271096 \n", + ".. ... ... ... ... ... \n", + "62 1144.486521 572.746899 559.344989 280.176133 0.000000 \n", + "63 1259.513464 630.260370 444.318046 222.662661 0.000000 \n", + "64 1372.597528 686.802402 331.233982 166.120629 0.000000 \n", + "65 1485.681592 743.344434 218.149918 109.578597 0.000000 \n", + "66 1556.718706 778.862991 147.112804 74.060040 0.000000 \n", "\n", " b_modloss_z2 y_modloss_z1 y_modloss_z2 \n", - "0 0.000000 725.394064 363.200670 \n", - "1 0.000000 668.372601 334.689939 \n", - "2 99.547297 0.000000 0.000000 \n", - "3 148.073679 0.000000 0.000000 \n", - "4 205.587151 0.000000 0.000000 \n", + "0 0.000000 667.413737 334.210507 \n", + "1 0.000000 554.329673 277.668475 \n", + "2 0.000000 455.261259 228.134268 \n", + "3 0.000000 384.224146 192.615711 \n", + "4 219.639186 0.000000 0.000000 \n", ".. ... ... ... 
\n", - "62 0.000000 551.330008 276.168642 \n", - "63 0.000000 438.245944 219.626610 \n", - "64 220.634435 0.000000 0.000000 \n", - "65 269.160817 0.000000 0.000000 \n", - "66 317.687199 0.000000 0.000000 \n", + "62 0.000000 0.000000 0.000000 \n", + "63 0.000000 0.000000 0.000000 \n", + "64 0.000000 0.000000 0.000000 \n", + "65 0.000000 0.000000 0.000000 \n", + "66 0.000000 0.000000 0.000000 \n", "\n", "[67 rows x 8 columns]" ] @@ -2106,6 +2136,7 @@ "300.150371\t507.223495\tb4^1\tP09651\tHNRNPA1\tEDTEEHHLR\t(UniMod:199)EDTEEHHLR\t4\t544.8236215\t2.957367634819564\t\tb\t1\t4\t\t256.87189917542054\n", "300.150371\t562.320842\ty4^1\tP09651\tHNRNPA1\tEDTEEHHLR\t(UniMod:199)EDTEEHHLR\t4\t758.6205095\t2.957367634819564\t\ty\t1\t4\t\t256.87189917542054\n", "300.150371\t691.363436\ty5^1\tP09651\tHNRNPA1\tEDTEEHHLR\t(UniMod:199)EDTEEHHLR\t4\t386.09158199999996\t2.957367634819564\t\ty\t1\t5\t\t256.87189917542054\n", + "300.150371\t691.363436\ty5^1\tP09651\tHNRNPA1\tKKKEDTEEHHLR\tKKK(unknown)EDTEEHHLR\t4\t386.09158199999996\t2.957367634819564\t\ty\t1\t5\t\t256.87189917542054\n", "300.851044\t175.118953\ty1^1\tP14618\tPKM\tMQHLIAR\t(UniMod:199)MQHLIAR\t3\t6232.253602000001\t10.90490543435342\t\ty\t1\t1\t\t396.233918513634\n", "300.851044\t180.123704\ty3^2\tP14618\tPKM\tMQHLIAR\t(UniMod:199)MQHLIAR\t3\t187.126964\t10.90490543435342\t\ty\t2\t3\t\t396.233918513634\n", "300.851044\t215.11446800000002\tb3^2\tP14618\tPKM\tMQHLIAR\t(UniMod:199)MQHLIAR\t3\t1356.0843869999999\t10.90490543435342\t\tb\t2\t3\t\t396.233918513634\n", @@ -2114,6 +2145,15 @@ "\"\"\"" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(StringIO(tsv_str), sep='\\t')" + ] + }, { "cell_type": "code", "execution_count": null, @@ -2140,207 +2180,63 @@ " \n", " \n", " \n", - " b_z1\n", - " b_z2\n", - " y_z1\n", - " y_z2\n", - " b_modloss_z1\n", - " b_modloss_z2\n", - " y_modloss_z1\n", - " y_modloss_z2\n", + " precursor_mz\n", + " mods\n", + 
" sequence\n", + " rt\n", + " genes\n", + " nAA\n", + " mod_sites\n", + " charge\n", + " proteins\n", + " frag_start_idx\n", + " frag_stop_idx\n", + " rt_norm\n", " \n", " \n", " \n", " \n", " 0\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", + " 300.150371\n", + " Dimethyl:2H(4)@Any N-term\n", + " EDTEEHHLR\n", + " 2.957368\n", + " HNRNPA1\n", + " 9\n", + " 0\n", + " 4\n", + " P09651\n", + " 0\n", + " 8\n", + " 0.271196\n", " \n", " \n", " 1\n", - " 0.521395\n", - " 0.000000\n", - " 0.000000\n", - " 0.065841\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 2\n", - " 0.286104\n", - " 0.000000\n", - " 0.000000\n", - " 0.110450\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 3\n", - " 0.054482\n", - " 0.000000\n", - " 0.038609\n", - " 0.089280\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 4\n", - " 0.000000\n", - " 0.000000\n", - " 0.075862\n", - " 0.826872\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 5\n", - " 0.000000\n", - " 0.000000\n", - " 1.000000\n", - " 0.389981\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 6\n", - " 0.000000\n", - " 0.000000\n", - " 0.778377\n", - " 0.029382\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 7\n", - " 0.000000\n", - " 0.000000\n", - " 0.697581\n", - " 0.000000\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 8\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 9\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 10\n", - " 0.000000\n", - " 0.152581\n", - " 0.000000\n", - " 0.034972\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 11\n", - " 0.000000\n", - " 0.000000\n", 
- " 0.000000\n", - " 0.021055\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 12\n", - " 0.000000\n", - " 0.000000\n", + " 300.851044\n", + " Dimethyl:2H(4)@Any N-term\n", + " MQHLIAR\n", + " 10.904905\n", + " PKM\n", + " 7\n", + " 0\n", + " 3\n", + " P14618\n", + " 8\n", + " 14\n", " 1.000000\n", - " 0.000000\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 13\n", - " 0.000000\n", - " 0.000000\n", - " 0.701227\n", - " 0.000000\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 b_modloss_z2 \\\n", - "0 0.000000 0.000000 0.000000 0.000000 0.0 0.0 \n", - "1 0.521395 0.000000 0.000000 0.065841 0.0 0.0 \n", - "2 0.286104 0.000000 0.000000 0.110450 0.0 0.0 \n", - "3 0.054482 0.000000 0.038609 0.089280 0.0 0.0 \n", - "4 0.000000 0.000000 0.075862 0.826872 0.0 0.0 \n", - "5 0.000000 0.000000 1.000000 0.389981 0.0 0.0 \n", - "6 0.000000 0.000000 0.778377 0.029382 0.0 0.0 \n", - "7 0.000000 0.000000 0.697581 0.000000 0.0 0.0 \n", - "8 0.000000 0.000000 0.000000 0.000000 0.0 0.0 \n", - "9 0.000000 0.000000 0.000000 0.000000 0.0 0.0 \n", - "10 0.000000 0.152581 0.000000 0.034972 0.0 0.0 \n", - "11 0.000000 0.000000 0.000000 0.021055 0.0 0.0 \n", - "12 0.000000 0.000000 1.000000 0.000000 0.0 0.0 \n", - "13 0.000000 0.000000 0.701227 0.000000 0.0 0.0 \n", + " precursor_mz mods sequence rt genes \\\n", + "0 300.150371 Dimethyl:2H(4)@Any N-term EDTEEHHLR 2.957368 HNRNPA1 \n", + "1 300.851044 Dimethyl:2H(4)@Any N-term MQHLIAR 10.904905 PKM \n", "\n", - " y_modloss_z1 y_modloss_z2 \n", - "0 0.0 0.0 \n", - "1 0.0 0.0 \n", - "2 0.0 0.0 \n", - "3 0.0 0.0 \n", - "4 0.0 0.0 \n", - "5 0.0 0.0 \n", - "6 0.0 0.0 \n", - "7 0.0 0.0 \n", - "8 0.0 0.0 \n", - "9 0.0 0.0 \n", - "10 0.0 0.0 \n", - "11 0.0 0.0 \n", - "12 0.0 0.0 \n", - "13 0.0 0.0 " + " nAA mod_sites charge proteins frag_start_idx frag_stop_idx rt_norm \n", + "0 9 0 4 P09651 0 8 0.271196 \n", + "1 7 0 
3 P14618 8 14 1.000000 " ] }, "execution_count": null, @@ -2350,10 +2246,16 @@ ], "source": [ "#| hide\n", - "reader = SWATHLibraryReader()\n", - "reader.import_file(StringIO(tsv_str))\n", - "reader.fragment_intensity_df" + "reader = LibraryReaderBase()\n", + "reader.import_file(StringIO(tsv_str))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/nbdev_nbs/spectral_library/translate.ipynb b/nbdev_nbs/spectral_library/translate.ipynb index 8cc1eb5f..bff5793c 100644 --- a/nbdev_nbs/spectral_library/translate.ipynb +++ b/nbdev_nbs/spectral_library/translate.ipynb @@ -850,7 +850,7 @@ "assert len(ddf) == len(speclib_sdf)\n", "assert ddf.StrippedPeptide.values[0] == speclib_sdf.StrippedPeptide.values[0]\n", "assert ddf.StrippedPeptide.values[-1] == speclib_sdf.StrippedPeptide.values[-1]\n", - "assert ddf.PrecursorCharge.dtype in [np.int,np.int8,np.int32,np.int64]\n", + "assert ddf.PrecursorCharge.dtype in [np.int8, np.int16,np.int32,np.int64]\n", "ddf" ] }, diff --git a/nbdev_nbs/spectral_library/validate.ipynb b/nbdev_nbs/spectral_library/validate.ipynb new file mode 100644 index 00000000..c36df351 --- /dev/null +++ b/nbdev_nbs/spectral_library/validate.ipynb @@ -0,0 +1,165 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from alphabase.spectral_library import validate\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "schema_1 = validate.Schema(\n", + " 'schema_1',\n", + " [\n", + " validate.Required('a', np.int64),\n", + " validate.Required('b', np.int64),\n", + " validate.Required('c', np.int64),\n", + " validate.Required('d', np.int64),\n", + " ]\n", + ")\n", + "\n", + "df_1 = pd.DataFrame({\n", + " 'a': [1, 2, 3],\n", + " 'b': np.array([4,5,6]),\n", + " 'c': 
np.array([7,8,9]).astype(np.int16),\n", + " 'd': np.array([10,11,12]).astype(np.uint32),\n", + " 'e': np.array([13.,14.,15.]),\n", + "})\n", + "\n", + "schema_1(df_1)\n", + "for column in [column.name for column in schema_1.schema]:\n", + " assert np.issubdtype(df_1[column].dtype, np.int64)\n", + "assert np.issubdtype(df_1['e'].dtype, float)\n", + "\n", + "# raise on missing column \n", + "df_1 = pd.DataFrame({\n", + " 'a': [1, 2, 3],\n", + " 'b': np.array([4,5,6]),\n", + " 'c': np.array([7,8,9])\n", + "})\n", + "\n", + "raised = False\n", + "try:\n", + " schema_1(df_1)\n", + "except:\n", + " raised = True\n", + "assert raised\n", + "\n", + "# raise on wrong type\n", + "df_1 = pd.DataFrame({\n", + " 'a': [1, 2, 3],\n", + " 'b': np.array([4,5,6]),\n", + " 'c': np.array([7,8,9]),\n", + " 'd': np.array([10.,11.,12.])\n", + "})\n", + "\n", + "raised = False\n", + "try:\n", + " schema_1(df_1)\n", + "except:\n", + " raised = True\n", + "assert raised" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# convert to correct type\n", + "schema_2 = validate.Schema(\n", + " 'schema_2',\n", + " [\n", + " validate.Required('a', np.int32),\n", + " validate.Optional('b', np.int32),\n", + " validate.Required('c', np.int32),\n", + " ]\n", + ")\n", + "\n", + "df_2 = pd.DataFrame({\n", + " 'a': np.array([1, 2, 3]).astype(np.int16),\n", + " 'b': np.array([4,5,6]).astype(np.uint16),\n", + " 'c': np.array([7,8,9]).astype(np.bool_),\n", + "})\n", + "\n", + "schema_2(df_2)\n", + "for column in [column.name for column in schema_2.schema]:\n", + " assert np.issubdtype(df_2[column].dtype, np.int32)\n", + "\n", + "# raise on impossible conversion\n", + "df_2 = pd.DataFrame({\n", + " 'a': np.array([1, 2, 3]).astype(np.uint32),\n", + " 'b': np.array([4,5,6]).astype(np.int32),\n", + " 'c': np.array([7,8,9]).astype(np.int32),\n", + "})\n", + "\n", + "raised = False\n", + "try:\n", + " schema_2(df_2)\n", + "except:\n", + " raised = 
True\n", + "assert raised\n", + "\n", + "# raise on impossible conversion\n", + "df_2 = pd.DataFrame({\n", + " 'a': [1, 2, 3],\n", + " 'b': np.array([4,5,6]).astype(np.int32),\n", + " 'c': np.array([7,8,9]).astype(np.int32),\n", + "})\n", + "\n", + "raised = False\n", + "try:\n", + " schema_2(df_2)\n", + "except:\n", + " raised = True\n", + "assert raised\n", + "\n", + "# raise on impossible conversion\n", + "df_2 = pd.DataFrame({\n", + " 'a': np.array([1, 2, 3]).astype(np.float32),\n", + " 'b': np.array([4,5,6]).astype(np.int32),\n", + " 'c': np.array([7,8,9]).astype(np.int32),\n", + "})\n", + "\n", + "raised = False\n", + "try:\n", + " schema_2(df_2)\n", + "except:\n", + " raised = True\n", + "assert raised" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "alpha", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/release/one_click_linux_gui/control b/release/one_click_linux_gui/control index b832202b..9cb688fd 100644 --- a/release/one_click_linux_gui/control +++ b/release/one_click_linux_gui/control @@ -1,5 +1,5 @@ Package: AlphaBase -Version: 1.0.2 +Version: 1.0.3 Architecture: all Maintainer: Mann Labs Description: AlphaBase diff --git a/release/one_click_linux_gui/create_installer_linux.sh b/release/one_click_linux_gui/create_installer_linux.sh index 79460518..6be22ed3 100644 --- a/release/one_click_linux_gui/create_installer_linux.sh +++ b/release/one_click_linux_gui/create_installer_linux.sh @@ -17,7 +17,7 @@ python setup.py sdist bdist_wheel # Setting up the local package cd release/one_click_linux_gui # Make sure you include the required extra packages and always use the stable or very-stable options! 
-pip install "../../dist/alphabase-1.0.2-py3-none-any.whl[stable]" +pip install "../../dist/alphabase-1.0.3-py3-none-any.whl[stable]" # Creating the stand-alone pyinstaller folder pip install pyinstaller diff --git a/release/one_click_macos_gui/Info.plist b/release/one_click_macos_gui/Info.plist index 13ec96e7..312334cd 100644 --- a/release/one_click_macos_gui/Info.plist +++ b/release/one_click_macos_gui/Info.plist @@ -9,9 +9,9 @@ CFBundleIconFile alpha_logo.icns CFBundleIdentifier - alphabase.1.0.2 + alphabase.1.0.3 CFBundleShortVersionString - 1.0.2 + 1.0.3 CFBundleInfoDictionaryVersion 6.0 CFBundleName diff --git a/release/one_click_macos_gui/create_installer_macos.sh b/release/one_click_macos_gui/create_installer_macos.sh index 25b254a6..c388ca28 100644 --- a/release/one_click_macos_gui/create_installer_macos.sh +++ b/release/one_click_macos_gui/create_installer_macos.sh @@ -20,7 +20,7 @@ python setup.py sdist bdist_wheel # Setting up the local package cd release/one_click_macos_gui -pip install "../../dist/alphabase-1.0.2-py3-none-any.whl[stable]" +pip install "../../dist/alphabase-1.0.3-py3-none-any.whl[stable]" # Creating the stand-alone pyinstaller folder pip install pyinstaller @@ -40,5 +40,5 @@ cp ../../LICENSE.txt Resources/LICENSE.txt cp ../logos/alpha_logo.png Resources/alpha_logo.png chmod 777 scripts/* -pkgbuild --root dist/alphabase --identifier de.mpg.biochem.alphabase.app --version 1.0.2 --install-location /Applications/AlphaBase.app --scripts scripts AlphaBase.pkg +pkgbuild --root dist/alphabase --identifier de.mpg.biochem.alphabase.app --version 1.0.3 --install-location /Applications/AlphaBase.app --scripts scripts AlphaBase.pkg productbuild --distribution distribution.xml --resources Resources --package-path AlphaBase.pkg dist/alphabase_gui_installer_macos.pkg diff --git a/release/one_click_macos_gui/distribution.xml b/release/one_click_macos_gui/distribution.xml index fc7c6651..2e075417 100644 --- a/release/one_click_macos_gui/distribution.xml 
+++ b/release/one_click_macos_gui/distribution.xml @@ -1,6 +1,6 @@ - AlphaBase 1.0.2 + AlphaBase 1.0.3 diff --git a/release/one_click_windows_gui/alphabase_innoinstaller.iss b/release/one_click_windows_gui/alphabase_innoinstaller.iss index 636364a7..ea52a5a7 100644 --- a/release/one_click_windows_gui/alphabase_innoinstaller.iss +++ b/release/one_click_windows_gui/alphabase_innoinstaller.iss @@ -2,7 +2,7 @@ ; SEE THE DOCUMENTATION FOR DETAILS ON CREATING INNO SETUP SCRIPT FILES! #define MyAppName "AlphaBase" -#define MyAppVersion "1.0.2" +#define MyAppVersion "1.0.3" #define MyAppPublisher "Max Planck Institute of Biochemistry and the University of Copenhagen, Mann Labs" #define MyAppURL "https://github.com/MannLabs/alphabase" #define MyAppExeName "alphabase_gui.exe" diff --git a/release/one_click_windows_gui/create_installer_windows.sh b/release/one_click_windows_gui/create_installer_windows.sh index 19425936..e831b595 100644 --- a/release/one_click_windows_gui/create_installer_windows.sh +++ b/release/one_click_windows_gui/create_installer_windows.sh @@ -17,7 +17,7 @@ python setup.py sdist bdist_wheel # Setting up the local package cd release/one_click_windows_gui # Make sure you include the required extra packages and always use the stable or very-stable options! 
-pip install "../../dist/alphabase-1.0.2-py3-none-any.whl[stable]" +pip install "../../dist/alphabase-1.0.3-py3-none-any.whl[stable]" # Creating the stand-alone pyinstaller folder pip install pyinstaller diff --git a/requirements/requirements.txt b/requirements.txt similarity index 86% rename from requirements/requirements.txt rename to requirements.txt index 9689c083..605fdb3b 100644 --- a/requirements/requirements.txt +++ b/requirements.txt @@ -10,4 +10,4 @@ psutil tqdm scikit-learn regex -pydivsufsort \ No newline at end of file +# pydivsufsort \ No newline at end of file diff --git a/settings.ini b/settings.ini index 43d88d67..48607a1d 100644 --- a/settings.ini +++ b/settings.ini @@ -4,7 +4,7 @@ ### Python library ### repo = alphabase lib_name = alphabase -version = 1.0.2 +version = 1.0.3 min_python = 3.7 license = apache2 diff --git a/setup.py b/setup.py index cedec034..664c3669 100644 --- a/setup.py +++ b/setup.py @@ -24,11 +24,7 @@ def get_requirements(): requirement_file_names = package2install.__extra_requirements__ requirement_file_names[""] = "requirements.txt" for extra, requirement_file_name in requirement_file_names.items(): - full_requirement_file_name = os.path.join( - "requirements", - requirement_file_name, - ) - with open(full_requirement_file_name) as requirements_file: + with open(requirement_file_name) as requirements_file: if extra != "": extra_stable = f"{extra}-stable" else: