NCBI C++ ToolKit
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* ===========================================================================
2  *
4  * National Center for Biotechnology Information
5  *
6  * This software/database is a "United States Government Work" under the
7  * terms of the United States Copyright Act. It was written as part of
8  * the author's official duties as a United States Government employee and
9  * thus cannot be copyrighted. This software/database is freely available
10  * to the public for use. The National Library of Medicine and the U.S.
11  * Government have not placed any restriction on its use or reproduction.
12  *
13  * Although all reasonable efforts have been taken to ensure the accuracy
14  * and reliability of the software and data, the NLM and the U.S.
15  * Government do not and cannot warrant the performance or results that
16  * may be obtained by using this software or data. The NLM and the U.S.
17  * Government disclaim all warranties, express or implied, including
18  * warranties of performance, merchantability or fitness for any particular
19  * purpose.
20  *
21  * Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  * Author: Christiam Camacho
26  *
27  */
29 /** @file pssm_engine.cpp
30  * Implementation of the C++ API for the PSI-BLAST PSSM generation engine.
31  */
33 #include <ncbi_pch.hpp>
34 #include <sstream>
38 #include "blast_setup.hpp"
40 // Object includes
49 // Core BLAST includes
53 #include "../core/blast_psi_priv.h"
55 /** @addtogroup AlgoBlast
56  *
57  * @{
58  */
62 BEGIN_SCOPE(blast)
64 /// This function makes sure that none of the required data is returned as NULL
65 /// or "empty"
66 /// @param pssm_input_msa interface which provides the data [in]
67 /// @throw CPssmEngineException in case of validation failure
// Checks performed, in order: the interface pointer itself, GetOptions(),
// GetQuery() and GetQueryLength(). Every failure is reported with the
// eNullInputData error code, including the zero-length case.
// NOTE(review): the line carrying the function name and parameter list is
// absent from this listing (numbering jumps from 68 to 70); restore it from
// the original file before compiling.
68 static void
70 {
 // Reject a NULL interface before it is dereferenced by the checks below.
71  if ( !pssm_input_msa ) {
72  NCBI_THROW(CPssmEngineException, eNullInputData,
73  "IPssmInputData is NULL");
74  }
 // The PSI-BLAST options must be available.
76  if ( !pssm_input_msa->GetOptions() ) {
77  NCBI_THROW(CPssmEngineException, eNullInputData,
78  "IPssmInputData returns NULL PSIBlastOptions");
79  }
 // The query sequence must be available.
81  if ( !pssm_input_msa->GetQuery() ) {
82  NCBI_THROW(CPssmEngineException, eNullInputData,
83  "IPssmInputData returns NULL query sequence");
84  }
 // A zero-length query is treated the same way as missing data.
86  if (pssm_input_msa->GetQueryLength() == 0) {
87  NCBI_THROW(CPssmEngineException, eNullInputData,
88  "Query length provided by IPssmInputData is 0");
89  }
90 }
92 /// This function makes sure that none of the required data is returned as NULL
93 /// or "empty"
94 /// @param pssm_input_freqratios interface which provides the data [in]
95 /// @throw CPssmEngineException in case of validation failure
// NULL pointers are reported with eNullInputData; a zero query length or a
// frequency-ratio matrix of the wrong dimensions is reported with
// eInvalidInputData.
// NOTE(review): the line carrying the function name and parameter list is
// absent from this listing (numbering jumps from 96 to 98).
// NOTE(review): several messages below read "IPssmInputFreqRatiosFreqRatios";
// this looks like a search-and-replace slip for "IPssmInputFreqRatios" —
// confirm before changing, since the text is emitted at runtime.
96 static void
98 {
 // Reject a NULL interface before it is dereferenced by the checks below.
99  if ( !pssm_input_freqratios ) {
100  NCBI_THROW(CPssmEngineException, eNullInputData,
101  "IPssmInputFreqRatios is NULL");
102  }
 // The query sequence must be available.
104  if ( !pssm_input_freqratios->GetQuery() ) {
105  NCBI_THROW(CPssmEngineException, eNullInputData,
106  "IPssmInputFreqRatiosFreqRatios returns NULL query sequence");
107  }
 // Cache the query length: it is also the expected number of matrix columns.
109  const unsigned int kQueryLength = pssm_input_freqratios->GetQueryLength();
110  if (kQueryLength == 0) {
111  NCBI_THROW(CPssmEngineException, eInvalidInputData,
112  "Query length provided by IPssmInputFreqRatiosFreqRatios is 0");
113  }
 // The matrix must have one column per query position ...
115  if (pssm_input_freqratios->GetData().GetCols() != kQueryLength) {
116  NCBI_THROW(CPssmEngineException, eInvalidInputData,
117  "Number of columns returned by IPssmInputFreqRatiosFreqRatios does "
118  "not match query length");
119  }
 // ... and exactly BLASTAA_SIZE rows.
120  if (pssm_input_freqratios->GetData().GetRows() != BLASTAA_SIZE) {
121  NCBI_THROW(CPssmEngineException, eInvalidInputData,
122  "Number of rows returned by IPssmInputFreqRatiosFreqRatios differs "
123  "from " + NStr::IntToString(BLASTAA_SIZE));
124  }
125 }
127 /// Performs validation on data provided before invoking the CORE PSSM
128 /// engine. Should be called after invoking Process() on its argument
129 /// @throws CPssmEngineException if validation fails
130 static void
131 s_Validate(IPssmInputData* pssm_input_msa)
132 {
133  _ASSERT(pssm_input_msa);
135  if ( !pssm_input_msa->GetData() ) {
136  NCBI_THROW(CPssmEngineException, eNullInputData,
137  "IPssmInputData returns NULL multiple sequence alignment");
138  }
140  Blast_Message* errors = NULL;
141  if (PSIBlastOptionsValidate(pssm_input_msa->GetOptions(), &errors) != 0) {
142  string msg("IPssmInputData returns invalid PSIBlastOptions: ");
143  msg += string(errors->message);
144  errors = Blast_MessageFree(errors);
145  NCBI_THROW(CBlastException, eInvalidOptions, msg);
146  }
147 }
149 /// Performs validation on data provided before invoking the CORE PSSM
150 /// engine. Should be called after invoking Process() on its argument
151 /// @throws CPssmEngineException if validation fails
// Besides CPssmEngineException (NULL data), the code below also throws
// CBlastException with eInvalidOptions when the options do not validate.
// NOTE(review): the line carrying the function name and parameter list is
// absent from this listing (numbering jumps from 152 to 154); the parameter
// is referred to as pssm_input in the body.
// NOTE(review): both messages mention "IPssmInputData"; presumably copied
// from the sibling overload — confirm against the parameter's real type.
152 static void
154 {
155  _ASSERT(pssm_input);
 // The input data produced by Process() must be available.
157  if ( !pssm_input->GetData() ) {
158  NCBI_THROW(CPssmEngineException, eNullInputData,
159  "IPssmInputData returns NULL multiple sequence alignment");
160  }
 // Validate the options; the core reports details through 'errors'.
162  Blast_Message* errors = NULL;
163  if (PSIBlastOptionsValidate(pssm_input->GetOptions(), &errors) != 0) {
164  string msg("IPssmInputData returns invalid PSIBlastOptions: ");
 // NOTE(review): 'errors' is dereferenced without a NULL check; verify that
 // the validator always sets it when it returns non-zero.
165  msg += string(errors->message);
 // Release the core message before throwing so it is not leaked.
166  errors = Blast_MessageFree(errors);
167  NCBI_THROW(CBlastException, eInvalidOptions, msg);
168  }
169 }
172 /// Performs validation on data provided before invoking the CORE PSSM
173 /// engine. Should be called after invoking Process() on its argument
174 /// @throws CPssmEngineException if validation fails
// The only condition checked here is that no matrix element is negative;
// the failure is reported with eInvalidInputData.
// NOTE(review): the line carrying the function name and parameter list is
// absent from this listing (numbering jumps from 175 to 177); the parameter
// is referred to as pssm_input_fr in the body.
175 static void
177 {
178  _ASSERT(pssm_input_fr);
 // Walk every element of the frequency-ratio matrix returned by GetData().
180  ITERATE(CNcbiMatrix<double>, itr, pssm_input_fr->GetData()) {
 // Stop at the first negative value found.
181  if (*itr < 0.0) {
182  NCBI_THROW(CPssmEngineException, eInvalidInputData,
183  "PSSM frequency ratios cannot have negative values");
184  }
185  }
186 }
189  : m_PssmInput(input), m_PssmInputFreqRatios(NULL)
190 {
194 }
197  : m_PssmInput(NULL), m_PssmInputFreqRatios(input)
198 {
202 }
205  m_PssmInputFreqRatios(NULL),
206  m_PssmInputCdd(input)
207 {
208  x_InitializeScoreBlock(input->GetQuery(), input->GetQueryLength(),
209  input->GetMatrixName(), input->GetGapExistence(),
210  input->GetGapExtension());
211 }
214 {
215 }
// Translates a status code into a human-readable message and returns it by
// value. Codes without a dedicated case fall into 'default', which embeds
// the numeric value in the message.
// NOTE(review): the function-name line (218) and every case label except
// PSI_SUCCESS and 'default' are absent from this listing; each
// "retval = ..." / "break;" pair below belongs to one missing label.
217 string
219 {
220  string retval;
222  switch (error_code) {
223  case PSI_SUCCESS:
224  retval = "No error detected";
225  break;
228  retval = "Bad argument to function detected";
229  break;
232  retval = "Out of memory";
233  break;
236  retval = "Error computing sequence weights";
237  break;
240  retval = "No matrix frequency ratios were found for requested matrix";
241  break;
244  retval = "PSSM has positive average score";
245  break;
 // This message is assembled from two pieces.
248  retval = "No sequences left after purging biased sequences in ";
249  retval += "multiple sequence alignment";
250  break;
253  retval = "Gap found in query sequence";
254  break;
257  retval = "Found column with no sequences aligned in it";
258  break;
261  retval = "Found column with only GAP residues";
262  break;
265  retval = "Found flanking gap at start of alignment";
266  break;
269  retval = "Found flanking gap at end of alignment";
270  break;
273  retval = "Errors in conserved domain profile";
274  break;
 // Any other value: report the raw code so it can be looked up.
276  default:
277  retval = "Unknown error code returned from PSSM engine: " +
278  NStr::IntToString(error_code);
279  }
281  return retval;
282 }
// Dispatches to the creation routine that matches whichever input strategy
// pointer is set, testing m_PssmInput, then m_PssmInputFreqRatios, then
// m_PssmInputCdd; throws CPssmEngineException (eNullInputData) when all three
// are null.
// NOTE(review): the signature lines are absent from this listing; presumably
// this is the body of CPssmEngine::Run() — confirm against the original file.
286 {
287  if (m_PssmInput) {
288  return x_CreatePssmFromMsa();
289  }
 // NOTE(review): the statement inside this branch (line 292) is absent from
 // the listing; presumably it returns x_CreatePssmFromFreqRatios().
291  if (m_PssmInputFreqRatios) {
293  }
295  if (m_PssmInputCdd) {
296  return x_CreatePssmFromCDD();
297  }
 // No input strategy was supplied.
299  NCBI_THROW(CPssmEngineException, eNullInputData, "All pointers to pre-"
300  "processing input data strategies are null");
301 }
303 /// Auxiliary class to convert from a CNcbiMatrix into a double** as
304 /// required by the C API. Used only by CPssmEngine::x_CreatePssmFromFreqRatios
// The copy is indexed column-first: m_Data[c][r] holds m(r, c).
// NOTE(review): the struct header, constructor signature and destructor
// signature lines (305, 309, 322) are absent from this listing.
// NOTE(review): no copy constructor or assignment operator is visible here;
// if none is declared, copying an instance would free m_Data twice — confirm.
306 {
307  /// Constructor
308  /// @param m standard c++ toolkit matrix
310  : m_NumCols(m.GetCols())
311  {
 // One outer slot per matrix column ...
312  m_Data = new double*[m.GetCols()];
313  for (size_t c = 0; c < m.GetCols(); c++) {
 // ... each holding one value per matrix row.
 // NOTE(review): if this allocation throws, the arrays allocated so far
 // do not appear to be released — confirm.
314  m_Data[c] = new double[m.GetRows()];
315  for (size_t r = 0; r < m.GetRows(); r++) {
 // Indices are swapped relative to the source matrix.
316  m_Data[c][r] = m(r, c);
317  }
318  }
319  }
321  /// Destructor
 // Free each per-column array, then the outer array of pointers.
323  for (size_t c = 0; c < m_NumCols; c++) {
324  delete [] m_Data[c];
325  }
326  delete [] m_Data;
327  }
329  /// Retrieves data in the format expected by the C CORE APIs
 // The returned pointer refers to m_Data, which the destructor frees.
330  operator double**() { return m_Data; }
332 private:
333  /// double** representation of a CNcbiMatrix
334  double** m_Data;
335  /// number of columns in the matrix (for deallocation)
336  size_t m_NumCols;
337 };
// Builds a PSSM from frequency ratios: calls into the core engine, turns a
// non-success status into a CBlastException (eCoreBlastError), and attaches
// the query to the returned object when one is available.
// NOTE(review): the signature, the declarations of 'freq_ratios', 'retval'
// and 'query', the name of the core function being called and some of its
// arguments are absent from this listing; presumably this is
// CPssmEngine::x_CreatePssmFromFreqRatios() — confirm against the original.
341 {
 // Receives the matrix produced by the core call below.
347  CPSIMatrix pssm;
350  int status =
354  m_ScoreBlk,
355  freq_ratios,
357  //kPSSM_NoImpalaScaling,
358  &pssm);
 // Any status other than PSI_SUCCESS is converted to text and thrown.
359  if (status != PSI_SUCCESS) {
360  string msg = x_ErrorCodeToString(status);
361  NCBI_THROW(CBlastException, eCoreBlastError, msg);
362  }
364  // Convert core BLAST matrix structure into ASN.1 score matrix object
 // Attach the query sequence only when the input provided one.
368  if (query.NotEmpty()) {
369  retval->SetQuery().SetSeq(*query);
370  }
372  return retval;
373 }
// Builds a PSSM from a multiple sequence alignment: runs the input
// strategy's Process(), calls into the core engine, turns a non-success
// status into a CBlastException (eCoreBlastError), converts the result with
// x_PSIMatrix2Asn1() and attaches the query when one is available.
// NOTE(review): the signature, the declarations of 'retval' and 'query', the
// name of the core function being called and some of its arguments are
// absent from this listing; presumably this is
// CPssmEngine::x_CreatePssmFromMsa() — confirm against the original file.
377 {
 // Let the strategy object prepare its data first.
380  m_PssmInput->Process();
 // Receive the matrix and the diagnostics produced by the core call below.
383  CPSIMatrix pssm;
384  CPSIDiagnosticsResponse diagnostics;
385  int status =
388  m_ScoreBlk,
390  &pssm,
391  &diagnostics);
392  if (status != PSI_SUCCESS) {
393  // FIXME: need to use core level perror-like facility
394  string msg = x_ErrorCodeToString(status);
395  NCBI_THROW(CBlastException, eCoreBlastError, msg);
396  }
398  // Convert core BLAST matrix structure into ASN.1 score matrix object
400  retval = x_PSIMatrix2Asn1(pssm, m_PssmInput->GetMatrixName(),
401  m_PssmInput->GetOptions(), diagnostics);
 // Attach the query sequence only when the input provided one.
403  if (query.NotEmpty()) {
404  retval->SetQuery().SetSeq(*query);
405  }
407  return retval;
408 }
// Builds a PSSM from conserved-domain input: calls into the core engine,
// turns a non-success status into a CBlastException (eCoreBlastError),
// converts the result with x_PSIMatrix2Asn1() and attaches the query when one
// is available.
// NOTE(review): the signature, the statements before line 419, the
// declarations of 'retval' and 'query', the name of the core function being
// called and some of its arguments are absent from this listing; presumably
// this is CPssmEngine::x_CreatePssmFromCDD() — confirm against the original.
413 {
 // Receive the matrix and the diagnostics produced by the core call below.
419  CPSIMatrix pssm;
420  CPSIDiagnosticsResponse diagnostics;
421  int status =
424  m_ScoreBlk,
426  &pssm,
427  &diagnostics);
429  if (status != PSI_SUCCESS) {
430  // FIXME: need to use core level perror-like facility
431  string msg = x_ErrorCodeToString(status);
432  NCBI_THROW(CBlastException, eCoreBlastError, msg);
433  }
435  // Convert core BLAST matrix structure into ASN.1 score matrix object
437  retval = x_PSIMatrix2Asn1(pssm, m_PssmInputCdd->GetMatrixName(),
438  m_PssmInputCdd->GetOptions(), diagnostics);
 // Attach the query sequence only when the input provided one.
441  if (query.NotEmpty()) {
442  retval->SetQuery().SetSeq(*query);
443  }
445  return retval;
446 }
// Returns a malloc()-allocated copy of 'query' that is two bytes longer than
// the input: a sentinel byte is written at index 0 and at index
// query_length + 1, and the query_length input bytes sit in between.
// Throws CBlastSystemException (eOutOfMemory) when the allocation fails.
// The buffer comes from malloc(); x_InitializeScoreBlock() takes charge of it
// through a TAutoUint1Ptr.
// NOTE(review): the line carrying the function name and first parameter
// (449) is absent from this listing.
448 unsigned char*
450  unsigned int query_length)
451 {
452  _ASSERT(query);
454  unsigned char* retval = NULL;
 // Two extra bytes: one sentinel on each side of the sequence.
 // NOTE(review): 'query_length + 2' is computed in unsigned int and would
 // wrap for lengths near the type's maximum — confirm such inputs cannot occur.
455  retval = (unsigned char*) malloc(sizeof(unsigned char)*(query_length + 2));
456  if ( !retval ) {
457  NCBI_THROW(CBlastSystemException, eOutOfMemory, "Query with sentinels");
458  }
 // First and last bytes receive the protein sentinel value.
460  retval[0] = retval[query_length+1] = GetSentinelByte(eBlastEncodingProtein);
 // The sequence itself starts at offset 1.
461  memcpy((void*) &retval[1], (void*) query, query_length);
462  return retval;
463 }
// Allocates a BlastQueryInfo for exactly one query via
// BlastQueryInfoNew(eBlastTypeBlastp, 1), then sets the first context to
// offset 0 and length 'query_length' and sets max_length to the same value.
// Throws CBlastSystemException (eOutOfMemory) when the allocation returns
// NULL; otherwise returns the raw pointer.
// NOTE(review): the return-type line (465) is absent from this listing.
466 CPssmEngine::x_InitializeQueryInfo(unsigned int query_length)
467 {
468  const int kNumQueries = 1;
469  BlastQueryInfo* retval = BlastQueryInfoNew(eBlastTypeBlastp, kNumQueries);
471  if ( !retval ) {
472  NCBI_THROW(CBlastSystemException, eOutOfMemory, "BlastQueryInfo");
473  }
 // Single context: the query starts at offset 0 and spans query_length.
475  retval->contexts[0].query_offset = 0;
476  retval->contexts[0].query_length = query_length;
 // With only one query, the longest query is that query.
477  retval->max_length = query_length;
479  return retval;
480 }
// Copies Lambda, K and H from the ancillary data's PSI ungapped and PSI
// gapped Karlin blocks into element 0 of m_ScoreBlk->kbp_psi and
// m_ScoreBlk->kbp_gap_psi respectively. Each group is copied only when the
// corresponding source block is non-NULL.
// NOTE(review): the line carrying the function name (483) and lines 489,
// 494, 500-501 and 505 are absent from this listing; in particular the
// left-hand side of the gapped Lambda assignment is missing.
482 void
484  ancillary_data)
485 {
 // Both the score block and the ancillary data must already exist.
486  _ASSERT(m_ScoreBlk.Get() != NULL);
487  _ASSERT(ancillary_data.NotEmpty());
 // Ungapped parameters.
488  if (ancillary_data->GetPsiUngappedKarlinBlk()) {
490  m_ScoreBlk->kbp_psi[0]->Lambda =
491  ancillary_data->GetPsiUngappedKarlinBlk()->Lambda;
492  m_ScoreBlk->kbp_psi[0]->K =
493  ancillary_data->GetPsiUngappedKarlinBlk()->K;
495  m_ScoreBlk->kbp_psi[0]->H =
496  ancillary_data->GetPsiUngappedKarlinBlk()->H;
497  }
 // Gapped parameters.
499  if (ancillary_data->GetPsiGappedKarlinBlk()) {
502  ancillary_data->GetPsiGappedKarlinBlk()->Lambda;
503  m_ScoreBlk->kbp_gap_psi[0]->K =
504  ancillary_data->GetPsiGappedKarlinBlk()->K;
506  m_ScoreBlk->kbp_gap_psi[0]->H =
507  ancillary_data->GetPsiGappedKarlinBlk()->H;
508  }
509 }
// Builds the BlastScoreBlk stored in m_ScoreBlk. Steps, in order: wrap the
// query in sentinel bytes, create scoring options with the requested matrix
// and gap costs, create a sequence block that takes over the guarded query,
// create a single-query BlastQueryInfo, and call BlastSetup_ScoreBlkInit().
// Throws CBlastSystemException (eOutOfMemory) when an allocation step fails
// and CBlastException (eCoreBlastError) when score block setup fails.
// NOTE(review): the line carrying the function name and first parameter
// (512), the declaration of 'opts' (527) and the final argument of the
// BlastSetup_ScoreBlkInit() call (565) are absent from this listing.
511 void
513  unsigned int query_length,
514  const char* matrix_name,
515  int gap_existence,
516  int gap_extension)
517 {
518  _ASSERT(query);
519  _ASSERT(matrix_name);
521  const EBlastProgramType kProgramType = eBlastTypePsiBlast;
522  short status = 0;
 // Sentinel-wrapped copy of the query, held by a guard until ownership is
 // handed to the sequence block further down.
524  TAutoUint1Ptr guarded_query(x_GuardProteinQuery(query, query_length));
526  // Setup the scoring options
528  status = BlastScoringOptionsNew(kProgramType, &opts);
529  if (status != 0) {
530  NCBI_THROW(CBlastSystemException, eOutOfMemory, "BlastScoringOptions");
531  }
 // NOTE(review): the return value of BlastScoringOptionsSetMatrix() is not
 // inspected here.
532  BlastScoringOptionsSetMatrix(opts, matrix_name);
533  opts->gap_open = gap_existence;
534  opts->gap_extend = gap_extension;
536  // Setup the sequence block structure
537  CBLAST_SequenceBlk query_blk;
538  status = BlastSeqBlkNew(&query_blk);
539  if (status != 0) {
540  NCBI_THROW(CBlastSystemException, eOutOfMemory, "BLAST_SequenceBlk");
541  }
543  // Populate the sequence block structure, transferring ownership of the
544  // guarded protein sequence
545  status = BlastSeqBlkSetSequence(query_blk, guarded_query.release(),
546  query_length);
547  if (status != 0) {
548  // should never happen, previous function only performs assignments
549  abort();
550  }
552  // Setup the query info structure
553  CBlastQueryInfo query_info(x_InitializeQueryInfo(query_length));
 // Out-parameters for the score block and any message from the core.
555  BlastScoreBlk* retval = NULL;
556  Blast_Message* errors = NULL;
557  const double kScaleFactor = 1.0;
558  status = BlastSetup_ScoreBlkInit(query_blk,
559  query_info,
560  opts,
561  kProgramType,
562  &retval,
563  kScaleFactor,
564  &errors,
 // On failure: free whatever was built, then throw with the core's message
 // when one was provided, or with a generic message otherwise.
566  if (status != 0) {
567  retval = BlastScoreBlkFree(retval);
568  if (errors) {
569  string msg(errors->message);
570  errors = Blast_MessageFree(errors);
571  NCBI_THROW(CBlastException, eCoreBlastError, msg);
572  } else {
573  NCBI_THROW(CBlastException, eCoreBlastError,
574  "Unknown error when setting up BlastScoreBlk");
575  }
576  }
 // Sanity checks on the freshly initialized score block.
578  _ASSERT(retval->kbp_ideal);
579  _ASSERT(retval->kbp == retval->kbp_psi);
580  _ASSERT(retval->kbp_gap == retval->kbp_gap_psi);
 // Store the new score block in the member wrapper.
582  m_ScoreBlk.Reset(retval);
583 }
585 unsigned char*
587 {
588  return (m_PssmInput ?
590 }
592 unsigned int
594 {
595  return (m_PssmInput ?
598 }
600 const char*
602 {
603  return (m_PssmInput ?
606 }
608 int
610 {
611  return (m_PssmInput ?
614 }
616 int
618 {
619  return (m_PssmInput ?
622 }
// Converts a core PSIMatrix (plus optional options and diagnostics) into the
// ASN.1 object held in 'retval'. The matrix name is stored upper-cased, the
// pseudocount is stored when 'opts' is given, scores and Karlin parameters
// are copied from 'pssm', and each diagnostics array that is non-NULL is
// copied into the intermediate data. Returns early, without intermediate
// data, when 'diagnostics' is NULL.
// NOTE(review): the start of the signature (lines 624-625), the declaration
// of 'retval' (631-633) and the declarations of 'info_content', 'res_freqs',
// 'wres_freqs', 'gcw' and 'sigma' are absent from this listing.
626  const char* matrix_name,
627  const PSIBlastOptions* opts,
628  const PSIDiagnosticsResponse* diagnostics)
629 {
630  _ASSERT(pssm);
634  // Record the parameters
 // NOTE(review): matrix_name is passed to the string constructor without a
 // NULL check.
635  string mtx(matrix_name);
636  mtx = NStr::ToUpper(mtx); // save the matrix name in all capital letters
637  retval->SetParams().SetRpsdbparams().SetMatrixName(mtx);
638  if (opts) {
639  retval->SetParams().SetPseudocount(opts->pseudo_count);
640  }
642  CPssm& asn1_pssm = retval->SetPssm();
643  asn1_pssm.SetIsProtein(true);
644  // number of rows is alphabet size
645  asn1_pssm.SetNumRows(pssm->nrows);
646  // number of columns is query length
647  asn1_pssm.SetNumColumns(pssm->ncols);
648  asn1_pssm.SetByRow(false); // this is the default
 // Gapped and ungapped statistical parameters are copied as-is.
650  asn1_pssm.SetLambda(pssm->lambda);
651  asn1_pssm.SetKappa(pssm->kappa);
652  asn1_pssm.SetH(pssm->h);
653  asn1_pssm.SetLambdaUngapped(pssm->ung_lambda);
654  asn1_pssm.SetKappaUngapped(pssm->ung_kappa);
655  asn1_pssm.SetHUngapped(pssm->ung_h);
 // Scores are flattened into one list. With ByRow == false the outer loop
 // runs over columns, so all rows of column 0 come first, then column 1, ...
 // NOTE(review): ByRow was set to false just above, so the 'else' branches
 // in this function are presumably never taken.
656  if (asn1_pssm.GetByRow() == false) {
657  for (unsigned int i = 0; i < pssm->ncols; i++) {
658  for (unsigned int j = 0; j < pssm->nrows; j++) {
659  asn1_pssm.SetFinalData().SetScores().
660  push_back(pssm->pssm[i][j]);
661  }
662  }
663  } else {
664  for (unsigned int i = 0; i < pssm->nrows; i++) {
665  for (unsigned int j = 0; j < pssm->ncols; j++) {
666  asn1_pssm.SetFinalData().SetScores().
667  push_back(pssm->pssm[j][i]);
668  }
669  }
670  }
 // Record the scaling factor only when it differs from
 // kPSSM_NoImpalaScaling; the value is converted to int.
671  if (opts && opts->impala_scaling_factor != kPSSM_NoImpalaScaling) {
672  asn1_pssm.SetFinalData().
673  SetScalingFactor(static_cast<int>(opts->impala_scaling_factor));
674  }
676  /********** Collect information from diagnostics structure ************/
 // Nothing more to do when no diagnostics were supplied.
677  if ( !diagnostics ) {
678  return retval;
679  }
 // The diagnostics must have the same dimensions as the matrix.
681  _ASSERT(pssm->nrows == diagnostics->alphabet_size);
682  _ASSERT(pssm->ncols == diagnostics->query_length);
 // One value per query position.
684  if (diagnostics->information_content) {
686  asn1_pssm.SetIntermediateData().SetInformationContent();
687  for (Uint4 i = 0; i < diagnostics->query_length; i++) {
688  info_content.push_back(diagnostics->information_content[i]);
689  }
690  }
 // Two-dimensional data below is flattened in the same order as the scores.
692  if (diagnostics->residue_freqs) {
694  asn1_pssm.SetIntermediateData().SetResFreqsPerPos();
695  if (asn1_pssm.GetByRow() == false) {
696  for (unsigned int i = 0; i < pssm->ncols; i++) {
697  for (unsigned int j = 0; j < pssm->nrows; j++) {
698  res_freqs.push_back(diagnostics->residue_freqs[i][j]);
699  }
700  }
701  } else {
702  for (unsigned int i = 0; i < pssm->nrows; i++) {
703  for (unsigned int j = 0; j < pssm->ncols; j++) {
704  res_freqs.push_back(diagnostics->residue_freqs[j][i]);
705  }
706  }
707  }
708  }
710  if (diagnostics->weighted_residue_freqs) {
712  asn1_pssm.SetIntermediateData().SetWeightedResFreqsPerPos();
713  if (asn1_pssm.GetByRow() == false) {
714  for (unsigned int i = 0; i < pssm->ncols; i++) {
715  for (unsigned int j = 0; j < pssm->nrows; j++) {
716  wres_freqs.
717  push_back(diagnostics->weighted_residue_freqs[i][j]);
718  }
719  }
720  } else {
721  for (unsigned int i = 0; i < pssm->nrows; i++) {
722  for (unsigned int j = 0; j < pssm->ncols; j++) {
723  wres_freqs.
724  push_back(diagnostics->weighted_residue_freqs[j][i]);
725  }
726  }
727  }
728  }
730  if (diagnostics->frequency_ratios) {
731  CPssmIntermediateData::TFreqRatios& freq_ratios =
732  asn1_pssm.SetIntermediateData().SetFreqRatios();
733  if (asn1_pssm.GetByRow() == false) {
734  for (unsigned int i = 0; i < pssm->ncols; i++) {
735  for (unsigned int j = 0; j < pssm->nrows; j++) {
736  freq_ratios.push_back(diagnostics->frequency_ratios[i][j]);
737  }
738  }
739  } else {
740  for (unsigned int i = 0; i < pssm->nrows; i++) {
741  for (unsigned int j = 0; j < pssm->ncols; j++) {
742  freq_ratios.push_back(diagnostics->frequency_ratios[j][i]);
743  }
744  }
745  }
746  }
 // The remaining arrays hold one value per query position.
748  if (diagnostics->gapless_column_weights) {
750  asn1_pssm.SetIntermediateData().SetGaplessColumnWeights();
751  for (Uint4 i = 0; i < diagnostics->query_length; i++) {
752  gcw.push_back(diagnostics->gapless_column_weights[i]);
753  }
754  }
756  if (diagnostics->sigma) {
758  asn1_pssm.SetIntermediateData().SetSigma();
759  for (Uint4 i = 0; i < diagnostics->query_length; i++) {
760  sigma.push_back(diagnostics->sigma[i]);
761  }
762  }
764  if (diagnostics->interval_sizes) {
765  CPssmIntermediateData::TIntervalSizes& interval_sizes =
766  asn1_pssm.SetIntermediateData().SetIntervalSizes();
767  for (Uint4 i = 0; i < diagnostics->query_length; i++) {
768  interval_sizes.push_back(diagnostics->interval_sizes[i]);
769  }
770  }
772  if (diagnostics->num_matching_seqs) {
773  CPssmIntermediateData::TNumMatchingSeqs& num_matching_seqs =
774  asn1_pssm.SetIntermediateData().SetNumMatchingSeqs();
775  for (Uint4 i = 0; i < diagnostics->query_length; i++) {
776  num_matching_seqs.push_back(diagnostics->num_matching_seqs[i]);
777  }
778  }
780  if (diagnostics->independent_observations) {
781  CPssmIntermediateData::TNumIndeptObsr& num_indept_obsr =
782  asn1_pssm.SetIntermediateData().SetNumIndeptObsr();
783  for (Uint4 i = 0; i < diagnostics->query_length; i++) {
784  num_indept_obsr.push_back(diagnostics->independent_observations[i]);
785  }
786  }
788  return retval;
789 }
791 END_SCOPE(blast)
794 /* @} */
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Contains C++ wrapper classes to structures in algo/blast/core as well as some auxiliary functions to ...
Blast_Message * Blast_MessageFree(Blast_Message *blast_msg)
Deallocates message memory.
Definition: blast_message.c:80
The structures and functions in blast_options.
Int2 PSIBlastOptionsValidate(const PSIBlastOptions *psi_options, Blast_Message **blast_msg)
Validates the PSI BLAST options so that they have sane values.
Int2 BlastScoringOptionsNew(EBlastProgramType program, BlastScoringOptions **options)
Allocate memory for BlastScoringOptions and fill with default values.
Int2 BlastScoringOptionsSetMatrix(BlastScoringOptions *opts, const char *matrix_name)
Resets matrix name option.
const double kPSSM_NoImpalaScaling
Value used to indicate that no IMPALA-style scaling should be performed when scaling a PSSM.
Definition: blast_options.c:43
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypePsiBlast
Definition: blast_program.h:82
@ eBlastTypeBlastp
Definition: blast_program.h:73
int PSICreatePssmFromCDD(const PSICdMsa *cd_msa, const PSIBlastOptions *options, BlastScoreBlk *sbp, const PSIDiagnosticsRequest *request, PSIMatrix **pssm, PSIDiagnosticsResponse **diagnostics)
Main entry point to core PSSM engine for computing CDD-based PSSMs.
Definition: blast_psi.c:229
int PSICreatePssmFromFrequencyRatios(const Uint1 *query, Uint4 query_length, BlastScoreBlk *sbp, double **freq_ratios, double impala_scaling_factor, PSIMatrix **pssm)
Top-level function to create a PSSM given a matrix of frequency ratios and perform scaling on the res...
Definition: blast_psi.c:344
int PSICreatePssmWithDiagnostics(const PSIMsa *msap, const PSIBlastOptions *options, BlastScoreBlk *sbp, const PSIDiagnosticsRequest *request, PSIMatrix **pssm, PSIDiagnosticsResponse **diagnostics)
Main entry point to core PSSM engine which allows to request diagnostics information.
Definition: blast_psi.c:105
Bad parameter used in function.
Found flanking gap at end of alignment.
Found an entire column full of GAP residues.
Out of memory.
Errors in conserved domain profile.
Positive average score found when scaling matrix.
After purge stage of PSSM creation, no sequences are left.
No frequency ratios were found for the given scoring matrix.
Found flanking gap at start of alignment.
Sequence weights do not add to 1.
Successful operation.
Found an entire column with no participating sequences.
GAP residue found in query sequence.
BlastQueryInfo * BlastQueryInfoNew(EBlastProgramType program, int num_queries)
Allocate memory for query information structure.
Utilities initialize/setup BLAST.
Int2 BlastSetup_ScoreBlkInit(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, const BlastScoringOptions *scoring_options, EBlastProgramType program_number, BlastScoreBlk **sbpp, double scale_factor, Blast_Message **blast_message, GET_MATRIX_PATH get_path)
Initializes the score block structure.
Definition: blast_setup.c:456
Internal auxiliary setup classes/functions for C++ BLAST APIs.
Definitions and prototypes used by blast_stat.c to calculate BLAST statistics.
BlastScoreBlk * BlastScoreBlkFree(BlastScoreBlk *sbp)
Deallocates BlastScoreBlk as well as all associated structures.
Definition: blast_stat.c:965
Int2 BlastSeqBlkSetSequence(BLAST_SequenceBlk *seq_blk, const Uint1 *sequence, Int4 seqlen)
Stores the sequence in the sequence block structure.
Definition: blast_util.c:147
Int2 BlastSeqBlkNew(BLAST_SequenceBlk **retval)
Allocates a new sequence block structure.
Definition: blast_util.c:133
Wrapper class for BLAST_SequenceBlk .
Definition: blast_aux.hpp:309
Defines BLAST error codes (user errors included)
Wrapper class for BlastQueryInfo .
Definition: blast_aux.hpp:311
Wrapper class for BlastScoringOptions .
Definition: blast_aux.hpp:334
Defines system exceptions that occurred while running BLAST.
size_t GetRows() const
get the number of rows in this matrix
Definition: matrix.hpp:298
size_t GetCols() const
get the number of columns in this matrix
Definition: matrix.hpp:305
Wrapper class for PSIDiagnosticsResponse .
Definition: blast_aux.hpp:348
Wrapper class for PSIMatrix .
Definition: blast_aux.hpp:346
Exception class for the CPssmEngine class.
Definition: pssm_engine.hpp:63
CSeq_entry & SetQuery()
Retrieve the query sequence.
Definition: Pssm.hpp:55
void SetHUngapped(double val)
Definition: Pssm.cpp:188
void SetH(double val)
Definition: Pssm.cpp:170
void SetLambdaUngapped(double val)
Definition: Pssm.cpp:176
void SetKappa(double val)
Definition: Pssm.cpp:164
void SetKappaUngapped(double val)
Definition: Pssm.cpp:182
void SetLambda(double val)
Definition: Pssm.cpp:158
Interface for strategy to pre-process multiple alignment of conserved domains matches as input data f...
static tds_mutex mtx
Definition: condition.c:43
virtual void Process()=0
Algorithm to produce multiple sequence alignment structure should be implemented in this method.
static void s_CheckAgainstNullData(IPssmInputData *pssm_input_msa)
This function makes sure that none of the required data is returned as NULL or "empty".
Definition: pssm_engine.cpp:69
CRef< objects::CPssmWithParameters > x_CreatePssmFromMsa()
Using IPssmInputData as a delegate to provide input data in the form of a multiple sequence alignment...
CRef< objects::CPssmWithParameters > x_CreatePssmFromFreqRatios()
Using IPssmInputFreqRatios as a delegate to provide the input PSSM's frequency ratios,...
CBlastScoreBlk m_ScoreBlk
Blast score block structure.
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
const Blast_KarlinBlk * GetPsiGappedKarlinBlk() const
Retrieve PSI-BLAST gapped Karlin parameters.
const char * x_GetMatrixName() const
Private interface to retrieve matrix name from its data source interface.
virtual void Process(void)=0
Pre-process CDs used for PSSM computation.
static unsigned char * x_GuardProteinQuery(const unsigned char *query, unsigned int query_length)
Copies query sequence and adds protein sentinel bytes at the beginning and at the end of the sequence...
virtual const char * GetMatrixName()
Obtain the name of the underlying matrix to use when building the PSSM.
Definition: pssm_input.hpp:68
CRef< objects::CPssmWithParameters > x_CreatePssmFromCDD()
Using IPssmInputCdd as a delegate to provide data in the form of multiple alignment of CDs,...
static CRef< objects::CPssmWithParameters > x_PSIMatrix2Asn1(const PSIMatrix *pssm, const char *matrix_name, const PSIBlastOptions *opts=NULL, const PSIDiagnosticsResponse *diagnostics=NULL)
Converts the PSIMatrix structure into an ASN.1 CPssmWithParameters object.
Size of aminoacid alphabet.
Default constructor available for derived test classes.
size_t m_NumCols
number of columns in the matrix (for deallocation)
virtual unsigned char * GetQuery()=0
Get the query sequence used as master for the multiple sequence alignment in ncbistdaa encoding.
virtual const PSIDiagnosticsRequest * GetDiagnosticsRequest(void)
Get diagnostics options.
void x_InitializeScoreBlock(const unsigned char *query, unsigned int query_length, const char *matrix_name, int gap_existence, int gap_extension)
Initializes the BlastScoreBlk data member required to run the PSSM engine.
BlastScoreBlk * Get() const
Definition: blast_aux.hpp:333
virtual const PSIDiagnosticsRequest * GetDiagnosticsRequest()
Obtain the diagnostics data that is requested from the PSSM engine. Its results will be populated in t...
Definition: pssm_input.hpp:123
virtual double GetImpalaScaleFactor()
Definition: pssm_input.hpp:144
int x_GetGapExtension() const
Private interface to retrieve gap extension cost from data source.
virtual int GetGapExistence()
Obtain the gap existence value for the underlying matrix used to build the PSSM.
Definition: pssm_input.hpp:73
virtual unsigned int GetQueryLength()=0
Get the query's length.
int x_GetGapExistence() const
Private interface to retrieve gap existence cost from data source.
IPssmInputFreqRatios * m_PssmInputFreqRatios
Pointer to input data to create PSSM from frequency ratios.
const Blast_KarlinBlk * GetPsiUngappedKarlinBlk() const
Retrieve PSI-BLAST ungapped Karlin parameters.
IPssmInputData * m_PssmInput
Handle to strategy to process raw PSSM input data.
virtual int GetGapExtension()
Obtain the gap extension value for the underlying matrix used to build the PSSM.
Definition: pssm_input.hpp:78
virtual const PSIBlastOptions * GetOptions()=0
Obtain the options for the PSSM engine.
virtual const PSIBlastOptions * GetOptions(void)=0
Get CDD-related PSI-BLAST options.
SNcbiMatrix2DoubleMatrix(const CNcbiMatrix< double > &m)
static void s_Validate(IPssmInputData *pssm_input_msa)
Performs validation on data provided before invoking the CORE PSSM engine.
IPssmInputCdd * m_PssmInputCdd
Pointer to strategy to process raw PSSM input data. Note: only one m_PssmInput* should be non-NULL.
char * BlastFindMatrixPath(const char *matrix_name, Boolean is_prot)
Returns the path to a specified matrix.
virtual const CNcbiMatrix< double > & GetData()=0
Obtain a matrix of frequency ratios with this->GetQueryLength() columns and BLASTAA_SIZE rows.
virtual PSICdMsa * GetData(void)=0
Get CD data for PSSM computation.
unsigned int x_GetQueryLength() const
Private interface to retrieve query length from its data source interface.
void Reset(BlastScoreBlk *p=NULL)
Definition: blast_aux.hpp:333
void SetUngappedStatisticalParams(CConstRef< CBlastAncillaryData > ancillary_data)
Sets the Karlin & Altschul parameters in the BlastScoreBlk to be used in PSSM generation.
virtual void Process()=0
Algorithm to produce the PSSM's frequency ratios should be implemented in this method.
virtual PSIMsa * GetData()=0
Obtain the multiple sequence alignment structure.
static std::string x_ErrorCodeToString(int error_code)
Convert a PSSM return status into a string.
virtual CRef< objects::CBioseq > GetQueryForPssm()
Get a CBioseq object for attachment into the CPssmWithParameters that CPssmEngine produces (only atta...
Definition: pssm_input.hpp:88
unsigned char * x_GetQuery() const
Private interface to retrieve query sequence from its data source interface.
double ** m_Data
double** representation of a CNcbiMatrix
Uint1 GetSentinelByte(EBlastEncoding encoding) THROWS((CBlastException))
Convenience function to centralize the knowledge of which sentinel bytes we use for supported encodin...
BlastQueryInfo * x_InitializeQueryInfo(unsigned int query_length)
Initializes the core BlastQueryInfo structure for a single protein sequence.
@ eBlastEncodingProtein
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
element_type * release(void)
Release will release ownership of pointer to caller.
Definition: ncbimisc.hpp:472
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define USING_SCOPE(ns)
Use the specified namespace.
Definition: ncbistl.hpp:78
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
void SetParams(TParams &value)
Assign a value to Params data member.
void SetIsProtein(TIsProtein value)
Assign a value to IsProtein data member.
Definition: Pssm_.hpp:551
void SetByRow(TByRow value)
Assign a value to ByRow data member.
Definition: Pssm_.hpp:741
void SetPssm(TPssm &value)
Assign a value to Pssm data member.
void SetIntermediateData(TIntermediateData &value)
Assign a value to IntermediateData data member.
Definition: Pssm_.cpp:99
void SetFinalData(TFinalData &value)
Assign a value to FinalData data member.
Definition: Pssm_.cpp:116
TByRow GetByRow(void) const
Get the ByRow member data.
Definition: Pssm_.hpp:735
void SetNumColumns(TNumColumns value)
Assign a value to NumColumns data member.
Definition: Pssm_.hpp:666
void SetNumRows(TNumRows value)
Assign a value to NumRows data member.
Definition: Pssm_.hpp:619
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
static const int kScaleFactor
Definition: hyperclust.cpp:176
static int input()
int i
void abort()
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
C++ API for the PSI-BLAST PSSM engine.
Int4 query_length
Length of this query, strand or frame.
Int4 query_offset
Offset of this query, strand or frame in the concatenated super-query.
The query related information.
BlastContextInfo * contexts
Information per context.
Uint4 max_length
Length of the longest among the concatenated queries.
Structure used for scoring calculations.
Definition: blast_stat.h:177
Blast_KarlinBlk ** kbp
Karlin-Altschul parameters.
Definition: blast_stat.h:207
Blast_KarlinBlk ** kbp_psi
K-A parameters for position-based alignments.
Definition: blast_stat.h:213
Blast_KarlinBlk ** kbp_gap
K-A parameters for gapped alignments.
Definition: blast_stat.h:208
Blast_KarlinBlk * kbp_ideal
Ideal values (for query with average database composition).
Definition: blast_stat.h:216
Blast_KarlinBlk ** kbp_gap_psi
K-A parameters for psi alignments.
Definition: blast_stat.h:215
Int4 gap_open
Extra penalty for starting a gap.
Int4 gap_extend
Penalty for each gap residue.
double K
K value used in statistics.
Definition: blast_stat.h:68
double Lambda
Lambda value used in statistics.
Definition: blast_stat.h:67
double H
H value used in statistics.
Definition: blast_stat.h:70
double logK
natural log of K value used in statistics
Definition: blast_stat.h:69
Structure to hold a message from the core of the BLAST engine.
Definition: blast_message.h:70
char * message
User message to be saved.
Definition: blast_message.h:73
Abstract base class to encapsulate the source(s) and pre-processing of PSSM input data as well as opt...
Definition: pssm_input.hpp:106
Interface used to retrieve the PSSM frequency ratios to allow for "restart" processing in PSI-BLAST: ...
Definition: pssm_input.hpp:131
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
double impala_scaling_factor
Scaling factor as used in IMPALA to do the matrix rescaling.
Int4 pseudo_count
Pseudocount constant.
This structure contains the diagnostics information requested using the PSIDiagnosticsRequest structu...
Definition: blast_psi.h:201
double * information_content
position information content (query_length elements)
Definition: blast_psi.h:202
Uint4 ** residue_freqs
observed residue frequencies per position of the PSSM (Dimensions are query_length by alphabet_size)
Definition: blast_psi.h:204
double ** weighted_residue_freqs
Weighted observed residue frequencies per position of the PSSM.
Definition: blast_psi.h:208
Uint4 * interval_sizes
interval sizes of aligned regions (query_length elements)
Definition: blast_psi.h:218
Uint4 alphabet_size
Specifies length of alphabet.
Definition: blast_psi.h:225
Uint4 query_length
Specifies the number of positions in the PSSM.
Definition: blast_psi.h:223
double * gapless_column_weights
Weights for columns without gaps (query_length elements)
Definition: blast_psi.h:215
double * independent_observations
Effective number of observations per column.
Definition: blast_psi.h:227
Uint4 * num_matching_seqs
number of matching sequences per query position (query_length elements)
Definition: blast_psi.h:220
double * sigma
sigma (query_length elements)
Definition: blast_psi.h:217
double ** frequency_ratios
PSSM's frequency ratios (Dimensions are query_length by alphabet_size)
Definition: blast_psi.h:212
This is the main return value from the PSSM engine.
Definition: blast_psi.h:150
double ung_lambda
Ungapped Lambda Karlin-Altschul parameter.
Definition: blast_psi.h:157
double kappa
Kappa Karlin-Altschul parameter.
Definition: blast_psi.h:155
int ** pssm
Position-specific score matrix.
Definition: blast_psi.h:153
double ung_kappa
Ungapped Kappa Karlin-Altschul parameter.
Definition: blast_psi.h:158
Uint4 ncols
Number of columns in PSSM (query_length)
Definition: blast_psi.h:151
double ung_h
Ungapped H Karlin-Altschul parameter.
Definition: blast_psi.h:159
double lambda
Lambda Karlin-Altschul parameter.
Definition: blast_psi.h:154
Uint4 nrows
Number of rows in PSSM (alphabet_size)
Definition: blast_psi.h:152
double h
H Karlin-Altschul parameter.
Definition: blast_psi.h:156
Auxiliary class to convert from a CNcbiMatrix into a double** as required by the C API.
static string query
#define _ASSERT
voidp malloc(uInt size)
Modified on Wed Apr 24 14:13:38 2024 by rev. 669887