libpappsomspp
Library for mass spectrometry
pappso::Enzyme Class Reference

#include <enzyme.h>

Public Member Functions

 Enzyme ()
 build the default enzyme (trypsin) with recognition_site = "([KR])([^P])" More...
 
 Enzyme (const QString &recognition_site)
 build any enzyme given a recognition_site More...
 
 ~Enzyme ()
 
void eat (std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, EnzymeProductInterface &enzyme_product) const
 digest a protein into enzyme products More...
 
void setMiscleavage (unsigned int miscleavage)
 sets the maximum number of missed cleavages allowed in the digestion More...
 
unsigned int getMiscleavage () const
 gets the maximum number of missed cleavages allowed in the digestion More...
 
void setTakeOnlyFirstWildcard (bool take_only_first_wildcard)
 sets m_takeOnlyFirstWildcard: whether to take only the first possibility when the sequence contains wildcards More...
 
void setMaxPeptideVariantListSize (std::size_t max_peptide_variant_list_size)
 if there are wildcards in the protein sequence : restrict the number of possible peptide sequences More...
 
const QRegExp & getQRegExpRecognitionSite () const
 

Private Member Functions

void sanityCheck (EnzymeProductInterface &enzyme_product, std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, const PeptideStr &peptide, unsigned int start, bool is_nter, unsigned int missed_cleavage_number, bool semi_enzyme) const
 
void replaceWildcards (std::vector< std::string > *p_peptide_variant_list) const
 

Private Attributes

QRegExp m_recognitionSite
 example with a kinase == [K,R] More...
 
unsigned int m_miscleavage = 0
 
bool m_takeOnlyFirstWildcard = false
 
std::size_t m_maxPeptideVariantListSize = 100
 
std::vector< char > m_wildCardX
 
std::vector< char > m_wildCardB
 
std::vector< char > m_wildCardZ
 

Detailed Description

Definition at line 31 of file enzyme.h.

Constructor & Destructor Documentation

◆ Enzyme() [1/2]

pappso::Enzyme::Enzyme ( )

build the default enzyme (trypsin) with recognition_site = "([KR])([^P])"

Definition at line 32 of file enzyme.cpp.

33 {
34  m_recognitionSite.setPattern("([KR])([^P])");
35  m_miscleavage = 0;
36 
37 
38  char vv1[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
39  'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
40  m_wildCardX.assign(std::begin(vv1), std::end(vv1));
41 
42  char vv2[] = {'N', 'D'};
43  m_wildCardB.assign(std::begin(vv2), std::end(vv2));
44 
45  char vv3[] = {'Q', 'E'};
46  m_wildCardZ.assign(std::begin(vv3), std::end(vv3));
47 }
std::vector< char > m_wildCardB
Definition: enzyme.h:97
std::vector< char > m_wildCardZ
Definition: enzyme.h:98
std::vector< char > m_wildCardX
Definition: enzyme.h:96
QRegExp m_recognitionSite
example with a kinase == [K,R]
Definition: enzyme.h:89
unsigned int m_miscleavage
Definition: enzyme.h:90

References m_miscleavage, m_recognitionSite, m_wildCardB, m_wildCardX, and m_wildCardZ.

◆ Enzyme() [2/2]

pappso::Enzyme::Enzyme ( const QString &  recognition_site)

build any enzyme given a recognition_site

Parameters
recognition_site is a regular expression that must identify 2 motifs: one on the Nter side, one on the Cter side

Definition at line 49 of file enzyme.cpp.

50 {
51  m_recognitionSite.setPattern(recognition_site);
52  m_miscleavage = 0;
53 
54 
55  char vv1[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
56  'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
57  m_wildCardX.assign(std::begin(vv1), std::end(vv1));
58 
59  char vv2[] = {'N', 'D'};
60  m_wildCardB.assign(std::begin(vv2), std::end(vv2));
61 
62  char vv3[] = {'Q', 'E'};
63  m_wildCardZ.assign(std::begin(vv3), std::end(vv3));
64 }

References m_miscleavage, m_recognitionSite, m_wildCardB, m_wildCardX, and m_wildCardZ.

◆ ~Enzyme()

pappso::Enzyme::~Enzyme ( )

Definition at line 66 of file enzyme.cpp.

67 {
68 }

Member Function Documentation

◆ eat()

void pappso::Enzyme::eat ( std::int8_t  sequence_database_id,
const ProteinSp & protein_sp,
bool  is_decoy,
EnzymeProductInterface & enzyme_product
) const

digest a protein into enzyme products

Parameters
sequence_database_id integer that references the sequence database (file, stream, url...)
protein_sp is the original protein to be digested
is_decoy tells whether the current protein is a decoy (true) or normal (false) protein
enzyme_product is the object that will receive the digestion products

Definition at line 87 of file enzyme.cpp.

91 {
92  /*
93  * for aa in self.aa_to_cut:
94  seq = seq.replace(aa, aa + ' ')
95  seq_stack = []
96  for s in seq.strip().split(' '):
97  seq_stack.append(s)
98  if len(seq_stack) > self.misscleavage + 1:
99  seq_stack.pop(0)
100  s2 = ""
101  for s_miss in seq_stack[::-1]:
102  s2 = s_miss + s2
103  yield s2
104  */
105  qDebug() << "Enzyme::eat begin ";
106  const QString sequence = protein_sp.get()->getSequence();
107  qDebug() << sequence;
108  QStringList peptide_list;
109  int pos = 0;
110  int peptide_start = 0;
111  int peptide_size = sequence.size();
112  while((pos = m_recognitionSite.indexIn(sequence, pos)) != -1)
113  {
114  peptide_size = pos + m_recognitionSite.cap(1).length() - peptide_start;
115  // qDebug() << "pos=" << pos << " peptide_start=" << peptide_start << "
116  // peptide_size=" << peptide_size << " " <<
117  // sequence.mid(peptide_start,peptide_size);
118  if(peptide_size > 0)
119  {
120  peptide_list.append(sequence.mid(peptide_start, peptide_size));
121  }
122  peptide_start += peptide_size;
123  pos = peptide_start; // all peptides MUST be consecutive
124  }
125  peptide_size = sequence.size() - peptide_start;
126  if(peptide_size > 0)
127  {
128  peptide_list.append(sequence.mid(peptide_start, peptide_size));
129  }
130 
131  unsigned int start = 1;
132  bool is_nter = true;
133  foreach(const QString &peptide, peptide_list)
134  {
135  // enzyme_product.setPeptide(sequence_database_id, protein_sp,is_decoy,
136  // peptide, start,is_nter,0, false);
137  sanityCheck(enzyme_product,
138  sequence_database_id,
139  protein_sp,
140  is_decoy,
141  peptide,
142  start,
143  is_nter,
144  0,
145  false);
146  is_nter = false;
147  start += peptide.size();
148  }
149 
150  unsigned int miscleavage_i = 0;
151  while(miscleavage_i < m_miscleavage)
152  {
153  miscleavage_i++;
154  qDebug() << "miscleavage_i=" << miscleavage_i;
155  int chunk_number = miscleavage_i + 1;
156  unsigned int start = 1;
157  bool is_nter = true;
158 
159  for(auto i = 0; i < peptide_list.size(); ++i)
160  {
161  qDebug() << "start=" << start;
162  QStringList peptide_mis_list;
163  for(auto j = 0; (j < chunk_number) && ((i + j) < peptide_list.size());
164  j++)
165  {
166  peptide_mis_list << peptide_list.at(i + j);
167  }
168  if(peptide_mis_list.size() == chunk_number)
169  {
170  // enzyme_product.setPeptide(sequence_database_id,
171  // protein_sp,is_decoy, peptide_mis_list.join(""), start,is_nter,
172  // miscleavage_i, false);
173  sanityCheck(enzyme_product,
174  sequence_database_id,
175  protein_sp,
176  is_decoy,
177  peptide_mis_list.join(""),
178  start,
179  is_nter,
180  miscleavage_i,
181  false);
182  }
183  is_nter = false;
184  start += peptide_list.at(i).size();
185  }
186  }
187 }
void sanityCheck(EnzymeProductInterface &enzyme_product, std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, const PeptideStr &peptide, unsigned int start, bool is_nter, unsigned int missed_cleavage_number, bool semi_enzyme) const
Definition: enzyme.cpp:271

References m_miscleavage, m_recognitionSite, and sanityCheck().

◆ getMiscleavage()

unsigned int pappso::Enzyme::getMiscleavage ( ) const

gets the maximum number of missed cleavages allowed in the digestion

Returns
miscleavage maximum number of missed cleavages to allow (default is 0)

Definition at line 76 of file enzyme.cpp.

77 {
78  return m_miscleavage;
79 }

References m_miscleavage.

◆ getQRegExpRecognitionSite()

const QRegExp & pappso::Enzyme::getQRegExpRecognitionSite ( ) const

Definition at line 348 of file enzyme.cpp.

349 {
350  return m_recognitionSite;
351 }

References m_recognitionSite.

◆ replaceWildcards()

void pappso::Enzyme::replaceWildcards ( std::vector< std::string > *  p_peptide_variant_list) const
private

Definition at line 190 of file enzyme.cpp.

191 {
192  std::string new_peptide = p_peptide_variant_list->at(0);
193  qDebug() << "Enzyme::replaceWildcards begin " << new_peptide.c_str();
194  std::vector<std::string> old_peptide_variant_list;
195  old_peptide_variant_list.assign(p_peptide_variant_list->begin(),
196  p_peptide_variant_list->end());
197 
198 
199  for(char wildcard : {'X', 'B', 'Z'})
200  {
201 
202  std::size_t position = new_peptide.find(wildcard);
203  if(position == std::string::npos)
204  {
205  continue;
206  }
207  else
208  {
209  p_peptide_variant_list->clear();
210  /*
211  new_peptide[position] = 'A';
212  p_peptide_variant_list->push_back(new_peptide);
213  break;
214  */
215 
216  const std::vector<char> *p_x_replace_wildcard = nullptr;
217  if(wildcard == 'X')
218  {
219  p_x_replace_wildcard = &m_wildCardX;
220  }
221  else if(wildcard == 'B')
222  {
223  p_x_replace_wildcard = &m_wildCardB;
224  }
225  else if(wildcard == 'Z')
226  {
227  p_x_replace_wildcard = &m_wildCardZ;
228  }
229 
230  if(p_x_replace_wildcard != nullptr)
231  {
232  for(std::string orig_peptide : old_peptide_variant_list)
233  {
234  for(char replace : *p_x_replace_wildcard)
235  {
236  orig_peptide[position] = replace;
237  p_peptide_variant_list->push_back(orig_peptide);
238  }
239  }
240  }
241  else
242  {
243  throw ExceptionNotPossible(
244  QObject::tr("x_replace_wildcard is empty"));
245  }
246  // new_peptide[position] = 'A';
247  // p_peptide_variant_list->push_back(new_peptide);
248  // p_peptide_variant_list->resize(1);
249  // std::cerr << "Enzyme::replaceWildcards begin
250  // p_peptide_variant_list.size()=" << p_peptide_variant_list->size()
251  // <<
252  // endl;
253  break;
254  }
255  }
256  std::vector<std::string>().swap(
257  old_peptide_variant_list); // clear old_peptide_variant_list reallocating
258 
259 
260  qDebug() << "Enzyme::replaceWildcards end " << new_peptide.c_str();
261 }

References m_wildCardB, m_wildCardX, and m_wildCardZ.

Referenced by sanityCheck().

◆ sanityCheck()

void pappso::Enzyme::sanityCheck ( EnzymeProductInterface & enzyme_product,
std::int8_t  sequence_database_id,
const ProteinSp & protein_sp,
bool  is_decoy,
const PeptideStr & peptide,
unsigned int  start,
bool  is_nter,
unsigned int  missed_cleavage_number,
bool  semi_enzyme 
) const
private

Definition at line 271 of file enzyme.cpp.

280 {
281  if(peptide.contains('X') || peptide.contains('B') || peptide.contains('Z'))
282  {
283 
284  std::vector<std::string> peptide_variant_list;
285  peptide_variant_list.push_back(peptide.toStdString());
286 
287  while((peptide_variant_list.at(0).find('X') != std::string::npos) ||
288  (peptide_variant_list.at(0).find('B') != std::string::npos) ||
289  (peptide_variant_list.at(0).find('Z') != std::string::npos))
290  {
291  replaceWildcards(&peptide_variant_list);
292  if(peptide_variant_list.size() > m_maxPeptideVariantListSize)
293  {
294  peptide_variant_list.resize(m_maxPeptideVariantListSize);
295  peptide_variant_list.shrink_to_fit();
296  }
297  }
298 
299  // peptide_variant_list.resize(2);
300  if(m_takeOnlyFirstWildcard)
301  {
302  enzyme_product.setPeptide(sequence_database_id,
303  protein_sp,
304  is_decoy,
305  QString(peptide_variant_list.at(0).c_str()),
306  start,
307  is_nter,
308  missed_cleavage_number,
309  semi_enzyme);
310  }
311  else
312  {
313  std::string peptide_variant = peptide_variant_list.back();
314  while(peptide_variant_list.size() > 0)
315  {
316  enzyme_product.setPeptide(sequence_database_id,
317  protein_sp,
318  is_decoy,
319  QString(peptide_variant.c_str()),
320  start,
321  is_nter,
322  missed_cleavage_number,
323  semi_enzyme);
324  peptide_variant_list.pop_back();
325  if(peptide_variant_list.size() > 0)
326  {
327  peptide_variant = peptide_variant_list.back();
328  }
329  }
330  }
331  std::vector<std::string>().swap(
332  peptide_variant_list); // clear peptide_variant_list reallocating
333  }
334  else
335  {
336  enzyme_product.setPeptide(sequence_database_id,
337  protein_sp,
338  is_decoy,
339  peptide,
340  start,
341  is_nter,
342  missed_cleavage_number,
343  semi_enzyme);
344  }
345 }
std::size_t m_maxPeptideVariantListSize
Definition: enzyme.h:93
void replaceWildcards(std::vector< std::string > *p_peptide_variant_list) const
Definition: enzyme.cpp:190
bool m_takeOnlyFirstWildcard
Definition: enzyme.h:91

References m_maxPeptideVariantListSize, m_takeOnlyFirstWildcard, replaceWildcards(), and pappso::EnzymeProductInterface::setPeptide().

Referenced by eat().

◆ setMaxPeptideVariantListSize()

void pappso::Enzyme::setMaxPeptideVariantListSize ( std::size_t  max_peptide_variant_list_size)

if there are wildcards in the protein sequence : restrict the number of possible peptide sequences

Parameters
max_peptide_variant_list_size maximum number of peptide variants (default is 100)

Definition at line 81 of file enzyme.cpp.

82 {
83  m_maxPeptideVariantListSize = max_peptide_variant_list_size;
84 }

References m_maxPeptideVariantListSize.

◆ setMiscleavage()

void pappso::Enzyme::setMiscleavage ( unsigned int  miscleavage)

sets the maximum number of missed cleavages allowed in the digestion

Parameters
miscleavage maximum number of missed cleavages to allow (default is 0)

Definition at line 71 of file enzyme.cpp.

72 {
73  m_miscleavage = miscleavage;
74 }

References m_miscleavage.

◆ setTakeOnlyFirstWildcard()

void pappso::Enzyme::setTakeOnlyFirstWildcard ( bool  take_only_first_wildcard)

sets m_takeOnlyFirstWildcard: whether to take only the first possibility when the sequence contains wildcards

Parameters
take_only_first_wildcard true: take only the first possibility if there are X, B or Z wildcards in the sequence

Definition at line 264 of file enzyme.cpp.

265 {
266  m_takeOnlyFirstWildcard = take_only_first_wildcard;
267 }

References m_takeOnlyFirstWildcard.

Member Data Documentation

◆ m_maxPeptideVariantListSize

std::size_t pappso::Enzyme::m_maxPeptideVariantListSize = 100
private

Definition at line 93 of file enzyme.h.

Referenced by sanityCheck(), and setMaxPeptideVariantListSize().

◆ m_miscleavage

unsigned int pappso::Enzyme::m_miscleavage = 0
private

Definition at line 90 of file enzyme.h.

Referenced by Enzyme(), eat(), getMiscleavage(), and setMiscleavage().

◆ m_recognitionSite

QRegExp pappso::Enzyme::m_recognitionSite
private

example with a kinase == [K,R]

Definition at line 89 of file enzyme.h.

Referenced by Enzyme(), eat(), and getQRegExpRecognitionSite().

◆ m_takeOnlyFirstWildcard

bool pappso::Enzyme::m_takeOnlyFirstWildcard = false
private

Definition at line 91 of file enzyme.h.

Referenced by sanityCheck(), and setTakeOnlyFirstWildcard().

◆ m_wildCardB

std::vector<char> pappso::Enzyme::m_wildCardB
private

Definition at line 97 of file enzyme.h.

Referenced by Enzyme(), and replaceWildcards().

◆ m_wildCardX

std::vector<char> pappso::Enzyme::m_wildCardX
private

Definition at line 96 of file enzyme.h.

Referenced by Enzyme(), and replaceWildcards().

◆ m_wildCardZ

std::vector<char> pappso::Enzyme::m_wildCardZ
private

Definition at line 98 of file enzyme.h.

Referenced by Enzyme(), and replaceWildcards().


The documentation for this class was generated from the following files: