NCBI C++ ToolKit
redoalignment_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: redoalignment_unit_test.cpp 73100 2016-06-20 15:45:40Z boratyng $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Tom Madden
27 *
28 * File Description:
29 * Unit test module for traceback calculation
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/test_boost.hpp>
35 
36 #include <corelib/ncbitime.hpp>
38 #include <objmgr/scope.hpp>
40 #include <objmgr/util/sequence.hpp>
41 
46 #include <serial/serial.hpp>
47 #include <serial/iterator.hpp>
48 #include <serial/objostr.hpp>
49 
52 #include <blast_objmgr_priv.hpp>
53 #include <blast_psi_priv.h>
54 
60 
63 
66 
69 
70 #include "test_objmgr.hpp"
71 #include "blast_test_util.hpp"
72 
73 using namespace std;
74 using namespace ncbi;
75 using namespace ncbi::objects;
76 using namespace ncbi::blast;
77 
79 {
82 
83  static BlastHSPList* setUpHSPList(int num_hsps,
84  const int query_offset[],
85  const int query_end[],
86  const int subject_offset[],
87  const int subject_end[],
88  const int query_gapped_start[],
89  const int subject_gapped_start[],
90  const int score[],
91  const double evalue[] = NULL,
92  const int num_ident[] = NULL)
93  {
94  const int kQueryContext = 0;
95  const int kSubjectFrame = 0;
96  const int kQueryFrame = 0;
97 
98  BlastHSPList* retval = Blast_HSPListNew(0);
99  if ( !retval ) {
100  return NULL;
101  }
102 
103  for (int i = 0; i < num_hsps; i++) {
104  BlastHSP* hsp = NULL;
105  Blast_HSPInit(query_offset[i], query_end[i],
106  subject_offset[i], subject_end[i],
107  query_gapped_start[i],
108  subject_gapped_start[i],
109  kQueryContext, kQueryFrame, kSubjectFrame, score[i],
110  NULL, &hsp);
111  if (num_ident) {
112  hsp->num_ident = num_ident[i];
113  }
114  if (evalue) {
115  hsp->evalue = evalue[i];
116  }
117  Blast_HSPListSaveHSP(retval, hsp);
118  }
119 
120  return retval;
121  }
122 
123  static int** loadPssmFromFile(const string& filename,
124  unsigned int query_length)
125  {
126  ifstream in(filename.c_str());
127  if ( !in ) {
128  throw runtime_error(filename + " could not be found");
129  }
130 
131  int** retval = (int**) _PSIAllocateMatrix(query_length,
132  BLASTAA_SIZE,
133  sizeof(int));
134  try {
135  for (unsigned int i = 0; i < query_length; i++) {
136  for (unsigned int j = 0; j < BLASTAA_SIZE; j++) {
137  in >> retval[i][j];
138  }
139  }
140  } catch (...) {
141  retval = (int**) _PSIDeallocateMatrix((void**)retval, query_length);
142  throw runtime_error("Error reading from " + filename);
143  }
144 
145  return retval;
146  }
147 
149  unsigned int qlen)
150  {
151  if ( !sbp ) {
152  throw runtime_error("NULL BlastScoreBlk*!");
153  }
154 
155  int** pssm =
156  loadPssmFromFile("data/aa.129295.pssm.txt",
157  qlen);
158  sbp->psi_matrix = SPsiBlastScoreMatrixNew(qlen);
159  _PSICopyMatrix_int(sbp->psi_matrix->pssm->data, pssm,
160  qlen, BLASTAA_SIZE);
161  pssm = (int**)_PSIDeallocateMatrix((void**) pssm, qlen);
162 
163  /* FIXME: Should offer a function that allows the setting of all
164  * PSI-BLAST settings in the BlastScoreBlk */
165  sbp->kbp = sbp->kbp_psi;
166  sbp->kbp_gap = sbp->kbp_gap_psi;
167  }
168 
169  // Core function which executes the unit tests
170  static void runRedoAlignmentCoreUnitTest(EBlastProgramType program,
171  CSeq_id& qid,
172  CSeq_id& sid,
173  BlastHSPList* init_hsp_list,
174  const BlastHSPList* ending_hsp_list,
175  Int8 effective_searchsp,
176  ECompoAdjustModes compositonBasedStatsMode,
177  bool doSmithWaterman,
178  double evalue_threshold =
180  int hit_list_size = BLAST_HITLIST_SIZE);
181 
182  // Core function which executes the unit tests
183  // FIXME: refactor so that blastOptions are passed in!
184  static void runRedoAlignmentCoreUnitTest(EBlastProgramType program,
185  SSeqLoc& qsl,
186  SSeqLoc& ssl,
187  BlastHSPList* init_hsp_list,
188  const BlastHSPList* ending_hsp_list,
189  Int8 effective_searchsp,
190  ECompoAdjustModes compositonBasedStatsMode,
191  bool doSmithWaterman,
192  double evalue_threshold =
194  int hit_list_size = BLAST_HITLIST_SIZE);
195 };
196 
199  CSeq_id& qid,
200  CSeq_id& sid,
201  BlastHSPList* init_hsp_list,
202  const BlastHSPList* ending_hsp_list,
203  Int8 effective_searchsp,
204  ECompoAdjustModes compositonBasedStatsMode,
205  bool doSmithWaterman,
206  double evalue_threshold,
207  int hit_list_size)
208 {
209 
210  auto_ptr<SSeqLoc> qsl(CTestObjMgr::Instance().CreateSSeqLoc(qid));
211  auto_ptr<SSeqLoc> ssl(CTestObjMgr::Instance().CreateSSeqLoc(sid));
212 
213  runRedoAlignmentCoreUnitTest(program, *qsl, *ssl, init_hsp_list,
214  ending_hsp_list, effective_searchsp, compositonBasedStatsMode,
215  doSmithWaterman, evalue_threshold, hit_list_size);
216 
217 }
218 
219 
222  SSeqLoc& qsl,
223  SSeqLoc& ssl,
224  BlastHSPList* init_hsp_list,
225  const BlastHSPList* ending_hsp_list,
226  Int8 effective_searchsp,
227  ECompoAdjustModes compositonBasedStatsMode,
228  bool doSmithWaterman,
229  double evalue_threshold,
230  int hit_list_size)
231 {
232 
233  char* program_buffer = NULL;
234  Int2 rv = BlastNumber2Program(program, &program_buffer);
235  BOOST_REQUIRE_MESSAGE(rv == (Int2)0, "BlastNumber2Program failed");
236  blast::EProgram prog = blast::ProgramNameToEnum(string(program_buffer));
237  sfree(program_buffer);
238  CBl2Seq blaster(qsl, ssl, prog);
239 
240  CBlastQueryInfo query_info;
241  CBLAST_SequenceBlk query_blk;
242  TSearchMessages blast_msg;
243 
244  const CBlastOptions& kOpts = blaster.GetOptionsHandle().GetOptions();
245  EBlastProgramType core_prog = kOpts.GetProgramType();
246  ENa_strand strand_opt = kOpts.GetStrandOption();
247 
248  SetupQueryInfo(const_cast<TSeqLocVector&>(blaster.GetQueries()),
249  core_prog, strand_opt,&query_info);
250  SetupQueries(const_cast<TSeqLocVector&>(blaster.GetQueries()),
251  query_info, &query_blk, core_prog, strand_opt, blast_msg);
252  ITERATE(TSearchMessages, m, blast_msg) {
253  BOOST_REQUIRE(m->empty());
254  }
255 
256  CBlastScoringOptions scoring_opts;
257  rv = BlastScoringOptionsNew(program, &scoring_opts);
258  BOOST_REQUIRE(rv == 0);
259 
260  CBlastExtensionOptions ext_opts;
261  BlastExtensionOptionsNew(program, &ext_opts,
262  scoring_opts->gapped_calculation);
263 
264  CBlastHitSavingOptions hitsaving_opts;
265  rv = BlastHitSavingOptionsNew(program, &hitsaving_opts,
266  scoring_opts->gapped_calculation);
267  hitsaving_opts->expect_value = evalue_threshold;
268  hitsaving_opts->hitlist_size = hit_list_size;
269  BOOST_REQUIRE(rv == 0);
270 
271  // FIXME: how to deal with this in case of PSI-BLAST?
272  // FIXME: GetQueryEncoding/GetSubjectEncoding for PSI-BLAST
273  // FIXME: Figure out what is needed to set up a PSI-BLAST search. Is
274  // lookup table different than eAaLookupTable? See
275  // PsiBlastOptionsHandle class also
276  CBlastSeqSrc seq_src
278  const_cast<TSeqLocVector&>(blaster.GetSubjects()),
281 
282  BlastExtensionOptions* ext_options=NULL;
283  BlastExtensionOptionsNew(program, &ext_options, true);
284  ext_options->compositionBasedStats = compositonBasedStatsMode;
285  if (doSmithWaterman)
286  ext_options->eTbackExt = eSmithWatermanTbck;
287 
290  hitsaving_opts, ext_options->compositionBasedStats,
291  scoring_opts->gapped_calculation));
292 
293  BlastHSPWriter* writer = BlastHSPWriterNew(&writer_info, NULL, NULL);
294  BOOST_REQUIRE(writer_info == NULL);
295 
296  BlastHSPStream* hsp_stream = BlastHSPStreamNew(
297  program, ext_options, FALSE, query_info->num_queries, writer);
298 
299  Blast_HSPListSortByScore(init_hsp_list);
300  BlastHSPStreamWrite(hsp_stream, &init_hsp_list);
301 
302  Blast_Message* blast_message=NULL;
303  BlastScoreBlk* sbp;
304  Int2 status;
305  /* FIXME: modularize this code */
306  const double k_rps_scale_factor = 1.0;
307  status =
308  BlastSetup_ScoreBlkInit(query_blk, query_info, scoring_opts,
309  program, &sbp, k_rps_scale_factor,
310  &blast_message, &BlastFindMatrixPath);
311  if (status > 0) {
312  return;
313  }
314 
315  if (program == eBlastTypePsiBlast) {
316  setupPositionBasedBlastScoreBlk(sbp,
317  sequence::GetLength(*(qsl.seqloc->GetId()), qsl.scope));
318  }
319 
320 
321  BlastEffectiveLengthsOptions* eff_len_opts = NULL;
322  BlastEffectiveLengthsOptionsNew(&eff_len_opts);
323  BLAST_FillEffectiveLengthsOptions(eff_len_opts, 0, 0,
324  &effective_searchsp, 1);
325  BlastEffectiveLengthsParameters* eff_len_params = NULL;
326  BlastEffectiveLengthsParametersNew(eff_len_opts, 0, 0, &eff_len_params);
327 
328  status = BLAST_CalcEffLengths(program, scoring_opts,
329  eff_len_params, sbp, query_info, NULL);
330  if (status > 0) {
331  return;
332  }
333 
334  BOOST_REQUIRE(query_info->contexts[0].eff_searchsp == effective_searchsp);
335 
336  eff_len_opts = BlastEffectiveLengthsOptionsFree(eff_len_opts);
337  BOOST_REQUIRE(eff_len_opts == NULL);
338  eff_len_params = BlastEffectiveLengthsParametersFree(eff_len_params);
339  BOOST_REQUIRE(eff_len_params == NULL);
340 
341  BlastExtensionParameters* ext_params=NULL;
342  BlastExtensionParametersNew(program, ext_options, sbp, query_info,
343  &ext_params);
344 
345  const int kAvgSubjLen = 0;
346  BlastHitSavingParameters* hit_params=NULL;
347  BlastHitSavingParametersNew(program, hitsaving_opts,
348  sbp, query_info, kAvgSubjLen,
349  2, &hit_params);
350 
351  BlastScoringParameters* scoring_params=NULL;
352  BlastScoringParametersNew(scoring_opts, sbp, &scoring_params);
353 
354  PSIBlastOptions* psi_options=NULL;
355  PSIBlastOptionsNew(&psi_options);
356 
357  BlastHSPResults* results =
358  Blast_HSPResultsNew(query_info->num_queries);
359  BOOST_REQUIRE(results);
360 
361  rv = Blast_RedoAlignmentCore(program, query_blk,
362  query_info, sbp, NULL, seq_src,
363  BLAST_GENETIC_CODE, NULL, hsp_stream,
364  scoring_params,
365  ext_params, hit_params, psi_options,
366  results);
367  BOOST_REQUIRE_MESSAGE(rv == (Int2)0, "Blast_RedoAlignmentCore failed!");
368 
369  hsp_stream = BlastHSPStreamFree(hsp_stream);
370  BOOST_REQUIRE(hsp_stream == NULL);
371  ext_params = BlastExtensionParametersFree(ext_params);
372  BOOST_REQUIRE(ext_params == NULL);
373  ext_options = BlastExtensionOptionsFree(ext_options);
374  BOOST_REQUIRE(ext_options == NULL);
375  hit_params = BlastHitSavingParametersFree(hit_params);
376  BOOST_REQUIRE(hit_params == NULL);
377  scoring_params = BlastScoringParametersFree(scoring_params);
378  BOOST_REQUIRE(scoring_params == NULL);
379  psi_options = PSIBlastOptionsFree(psi_options);
380  BOOST_REQUIRE(psi_options == NULL);
381  sbp = BlastScoreBlkFree(sbp);
382  BOOST_REQUIRE(sbp == NULL);
383 
384  BOOST_REQUIRE(results && results->num_queries == 1);
385  BOOST_REQUIRE(*results->hitlist_array != NULL);
386  BOOST_REQUIRE(results->hitlist_array[0]->hsplist_count > 0);
387  BlastHSPList* hsp_list = results->hitlist_array[0]->hsplist_array[0];
388 
389  BOOST_REQUIRE_EQUAL(ending_hsp_list->hspcnt, hsp_list->hspcnt);
390 #if 0
391 
392  if ( ending_hsp_list->hspcnt != hsp_list->hspcnt) {
393  cout << "Expected num hsps=" << ending_hsp_list->hspcnt;
394  cout << " Actual num hsps=" << hsp_list->hspcnt << endl;
395  }
396 #endif
397  for (int index=0; index<hsp_list->hspcnt; index++)
398  {
399  BlastHSP* expected_hsp = ending_hsp_list->hsp_array[index];
400  BlastHSP* actual_hsp = hsp_list->hsp_array[index];
401 
402 #if 0
403  cout << index << ": query_offset="
404  << actual_hsp->query.offset << endl;
405  cout << index << ": query_end="
406  << actual_hsp->query.end << endl;
407  cout << index << ": subject_offset="
408  << actual_hsp->subject.offset << endl;
409  cout << index << ": subject_end="
410  << actual_hsp->subject.end << endl;
411  cout << index << ": score="
412  << actual_hsp->score << endl;
413  cout << index << ": bit_score="
414  << actual_hsp->bit_score << endl;
415  cout << index << ": evalue="
416  << actual_hsp->evalue << endl;
417  cout << index << ": num_ident="
418  << actual_hsp->num_ident << endl;
419 #endif
420  BOOST_REQUIRE_EQUAL(expected_hsp->query.offset,
421  actual_hsp->query.offset);
422  BOOST_REQUIRE_EQUAL(expected_hsp->query.end,
423  actual_hsp->query.end);
424  BOOST_REQUIRE_EQUAL(expected_hsp->subject.offset,
425  actual_hsp->subject.offset);
426  BOOST_REQUIRE_EQUAL(expected_hsp->subject.end,
427  actual_hsp->subject.end);
428  BOOST_REQUIRE_EQUAL(expected_hsp->score,
429  actual_hsp->score);
430  BOOST_REQUIRE_EQUAL(expected_hsp->num_ident,
431  actual_hsp->num_ident);
432 #if 0
433  double diff = fabs((expected_hsp->evalue-actual_hsp->evalue));
434  cerr << "Diff in evalues for " << index << "=" << diff << endl;
435 #endif
436  BOOST_REQUIRE_CLOSE(expected_hsp->evalue, actual_hsp->evalue, 10.0);
437  // cout << "HSP " << index << " OK" << endl;
438  }
439 
440  results = Blast_HSPResultsFree(results);
441  BOOST_REQUIRE(results == NULL);
442 }
443 
444 BOOST_FIXTURE_TEST_SUITE(RedoAlignment, CRedoAlignmentTestFixture)
445 
446 // Start modularized code
447 BOOST_AUTO_TEST_CASE(testRedoAlignmentWithCompBasedStats) {
448  const EBlastProgramType kProgram = eBlastTypeBlastp;
449  const int k_num_hsps_start = 3;
450  const int k_num_hsps_end = 2;
451  CSeq_id query_id("gi|3091");
452  CSeq_id subj_id("gi|402871");
453 
454  // It seems that the last hsp in this set was manually constructed and
455  // expected to be dropped (please note that the first 2 hsps we
456  // constructed using blastall (thus filtering on), but the sequence
457  // passed to RedoAlignmentCore was unfiltered (default for blastpgp)
458  const int query_offset[k_num_hsps_start] = { 28, 46, 463};
459  const int query_end[k_num_hsps_start] = { 485, 331, 488};
460  const int subject_offset[k_num_hsps_start] = { 36, 327, 320};
461  const int subject_end[k_num_hsps_start] = { 512, 604, 345};
462  const int score[k_num_hsps_start] = { 554, 280, 28};
463  const int query_gapped_start[k_num_hsps_start] = { 431, 186, 480};
464  const int subject_gapped_start[k_num_hsps_start] = { 458, 458, 337};
465 
466  // This is freed by the HSPStream interface
467  BlastHSPList* init_hsp_list =
468  setUpHSPList(k_num_hsps_start,
469  query_offset, query_end,
470  subject_offset, subject_end,
471  query_gapped_start,
472  subject_gapped_start,
473  score);
474 
475  const int query_offset_final[k_num_hsps_end] = { 2, 46};
476  const int query_end_final[k_num_hsps_end] = { 485, 331};
477  const int subject_offset_final[k_num_hsps_end] = { 9, 327};
478  const int subject_end_final[k_num_hsps_end] = { 512, 604};
479  const int score_final[k_num_hsps_end] = { 510, 282};
480  const double evalue_final[k_num_hsps_end] = {7.0065e-61, 1.6958e-30};
481  const int num_idents_final[k_num_hsps_end] = { 171, 94 };
482 
483  BlastHSPList* ending_hsp_list =
484  setUpHSPList(k_num_hsps_end,
485  query_offset_final,
486  query_end_final,
487  subject_offset_final,
488  subject_end_final,
489  query_offset_final,
490  subject_offset_final,
491  score_final,
492  evalue_final,
493  num_idents_final);
494 
495  const Int8 kEffSearchSp = 500000;
496  const bool kSmithWaterman = false;
497 
498  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
499  init_hsp_list, ending_hsp_list,
500  kEffSearchSp, eCompositionBasedStats,
501  kSmithWaterman);
502 
503  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
504  BOOST_REQUIRE(ending_hsp_list == NULL);
505 }
506 
507 BOOST_AUTO_TEST_CASE(testRedoAlignmentWithConditionalAdjust) {
508  const EBlastProgramType kProgram = eBlastTypeBlastp;
509  const int k_num_hsps_start = 3;
510  const int k_num_hsps_end = 2;
511  CSeq_id query_id("gi|3091");
512  CSeq_id subj_id("gi|402871");
513 
514  // It seems that the last hsp in this set was manually constructed and
515  // expected to be dropped (please note that the first 2 hsps we
516  // constructed using blastall (thus filtering on), but the sequence
517  // passed to RedoAlignmentCore was unfiltered (default for blastpgp)
518  const int query_offset[k_num_hsps_start] = { 28, 46, 463};
519  const int query_end[k_num_hsps_start] = { 485, 331, 488};
520  const int subject_offset[k_num_hsps_start] = { 36, 327, 320};
521  const int subject_end[k_num_hsps_start] = { 512, 604, 345};
522  const int score[k_num_hsps_start] = { 554, 280, 28};
523  const int query_gapped_start[k_num_hsps_start] = { 431, 186, 480};
524  const int subject_gapped_start[k_num_hsps_start] = { 458, 458, 337};
525 
526  // This is freed by the HSPStream interface
527  BlastHSPList* init_hsp_list =
528  setUpHSPList(k_num_hsps_start,
529  query_offset, query_end,
530  subject_offset, subject_end,
531  query_gapped_start,
532  subject_gapped_start,
533  score);
534 
535  const int query_offset_final[k_num_hsps_end] = { 2, 46};
536  const int query_end_final[k_num_hsps_end] = { 517, 331};
537  const int subject_offset_final[k_num_hsps_end] = { 9, 327};
538  const int subject_end_final[k_num_hsps_end] = { 546, 604};
539  const int score_final[k_num_hsps_end] = { 537, 298};
540  const double evalue_final[k_num_hsps_end] = {1.1954e-64, 1.5494e-32};
541  const int num_idents_final[k_num_hsps_end] = { 177, 95 };
542 
543  BlastHSPList* ending_hsp_list =
544  setUpHSPList(k_num_hsps_end,
545  query_offset_final,
546  query_end_final,
547  subject_offset_final,
548  subject_end_final,
549  query_offset_final,
550  subject_offset_final,
551  score_final,
552  evalue_final,
553  num_idents_final);
554 
555  const Int8 kEffSearchSp = 500000;
556  const bool kSmithWaterman = false;
557 
558  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
559  init_hsp_list, ending_hsp_list,
560  kEffSearchSp, eCompositionMatrixAdjust,
561  kSmithWaterman);
562 
563  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
564  BOOST_REQUIRE(ending_hsp_list == NULL);
565 }
566 
567 BOOST_AUTO_TEST_CASE(testPSIRedoAlignmentWithCompBasedStats) {
568  const EBlastProgramType kProgram = eBlastTypePsiBlast;
569  const int k_num_hsps_start = 6;
570  const int k_num_hsps_end = 2;
571  CSeq_id query_id("gi|129295");
572  CSeq_id subj_id("gi|7450545");
573 
574  const int query_offset[k_num_hsps_start] = { 24, 99, 16, 84, 6, 223 };
575  const int query_end[k_num_hsps_start] = { 62, 128, 24, 114, 25, 231 };
576  const int subject_offset[k_num_hsps_start] =
577  { 245, 0, 198, 86, 334, 151 };
578  const int subject_end[k_num_hsps_start] =
579  { 287, 29, 206, 119, 353, 159 };
580  const int score[k_num_hsps_start] = { 37, 26, 25, 25, 24, 24 };
581  const int query_gapped_start[k_num_hsps_start] =
582  { 29, 104, 20, 91, 19, 227 };
583  const int subject_gapped_start[k_num_hsps_start] =
584  { 250, 5, 202, 93, 347, 155 };
585 
586  // No gaps were found in these alignments. This is freed by the
587  // HSPStream interface
588  BlastHSPList* init_hsp_list =
589  setUpHSPList(k_num_hsps_start,
590  query_offset, query_end,
591  subject_offset, subject_end,
592  query_gapped_start,
593  subject_gapped_start,
594  score);
595 
596  const int query_offset_final[k_num_hsps_end] = { 24, 18 };
597  const int query_end_final[k_num_hsps_end] = { 30, 31 };
598  const int subject_offset_final[k_num_hsps_end] = { 245, 200 };
599  const int subject_end_final[k_num_hsps_end] = { 251, 210 };
600  const int score_final[k_num_hsps_end] = { 29, 24 };
601  const double evalue_final[k_num_hsps_end] =
602  { 1.361074 , 6.425098 };
603  const int ident_final[k_num_hsps_end] = { 3, 6};
604 
605 
606  BlastHSPList* ending_hsp_list =
607  setUpHSPList(k_num_hsps_end,
608  query_offset_final,
609  query_end_final,
610  subject_offset_final,
611  subject_end_final,
612  query_offset_final,
613  subject_offset_final,
614  score_final,
615  evalue_final,
616  ident_final);
617 
618 
619  const Int8 kEffSearchSp = 84660;
620  const bool kSmithWaterman = false;
621 
622  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
623  init_hsp_list, ending_hsp_list,
624  kEffSearchSp, eCompositionBasedStats,
625  kSmithWaterman);
626  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
627  BOOST_REQUIRE(ending_hsp_list == NULL);
628 }
629 
630 BOOST_AUTO_TEST_CASE(testRedoAlignmentWithCompBasedStatsBadlyBiasedSequence) {
631  const EBlastProgramType kProgram = eBlastTypeBlastp;
632  const int k_num_hsps_start = 6;
633  const int k_num_hsps_end = 5;
634  // CSeq_id query_id("gi|129295");
635  CSeq_id subj_id("gb|AAA22059|");
636  auto_ptr<SSeqLoc> ssl(CTestObjMgr::Instance().CreateSSeqLoc(subj_id));
637 
638  CNcbiIfstream infile("data/biased.fsa");
639  const bool is_protein(true);
640  CBlastInputSourceConfig iconfig(is_protein);
642  (new CBlastFastaInputSource(infile, iconfig));
643  CRef<CBlastInput> input(new CBlastInput(&*fasta_src));
644  CRef<CScope> scope = CBlastScopeSource(is_protein).NewScope();
645 
646  blast::TSeqLocVector query_seqs = input->GetAllSeqLocs(*scope);
647 
648  const int query_offset[k_num_hsps_start] = { 3, 1, 4, 3, 0, 1 };
649  const int query_end[k_num_hsps_start] = { 236, 232, 236, 235, 226, 233 };
650  const int subject_offset[k_num_hsps_start] =
651  { 1, 1, 6, 6, 12, 22 };
652  const int subject_end[k_num_hsps_start] =
653  { 238, 238, 238, 238, 238, 254 };
654  const int score[k_num_hsps_start] = { 345, 344, 343, 339, 332, 320 };
655  const int query_gapped_start[k_num_hsps_start] =
656  { 32, 194, 9, 8, 104, 9 };
657  const int subject_gapped_start[k_num_hsps_start] =
658  { 30, 200, 11, 11, 116, 30 };
659 
660  // No gaps were found in these alignments. This is freed by the
661  // HSPStream interface
662  BlastHSPList* init_hsp_list =
663  setUpHSPList(k_num_hsps_start,
664  query_offset, query_end,
665  subject_offset, subject_end,
666  query_gapped_start,
667  subject_gapped_start,
668  score);
669 
670  const int query_offset_final[k_num_hsps_end] = { 4, 3, 3, 0, 0};
671  const int query_end_final[k_num_hsps_end] = { 236, 235, 220, 226, 232};
672  const int subject_offset_final[k_num_hsps_end] = { 6, 6, 1, 12, 6};
673  const int subject_end_final[k_num_hsps_end] = { 238, 238, 218, 238, 238};
674  const int score_final[k_num_hsps_end] = { 73, 72, 69, 68, 66};
675  const double evalue_final[k_num_hsps_end] =
676  { 1.26e-05 , 1.7e-5 , 4.0e-5, 5.1e-5, 0.0000775};
677  const int num_idents_final[k_num_hsps_end] = { 87, 85, 81, 84, 85 };
678 
679 
680  BlastHSPList* ending_hsp_list =
681  setUpHSPList(k_num_hsps_end,
682  query_offset_final,
683  query_end_final,
684  subject_offset_final,
685  subject_end_final,
686  query_offset_final,
687  subject_offset_final,
688  score_final,
689  evalue_final,
690  num_idents_final);
691 
692 
693  const Int8 kEffSearchSp = 84660;
694  const bool kSmithWaterman = false;
695 
696  runRedoAlignmentCoreUnitTest(kProgram, query_seqs[0], *ssl,
697  init_hsp_list, ending_hsp_list,
698  kEffSearchSp, eCompositionMatrixAdjust,
699  kSmithWaterman);
700  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
701  BOOST_REQUIRE(ending_hsp_list == NULL);
702 }
703 
704 BOOST_AUTO_TEST_CASE(testRedoAlignmentWithSW) {
705  const EBlastProgramType kProgram = eBlastTypeBlastp;
706  const int k_num_hsps_start = 3;
707  const int k_num_hsps_end = 5;
708  CSeq_id query_id("gi|3091");
709  CSeq_id subj_id("gi|402871");
710 
711  // It seems that the last hsp in this set was manually constructed and
712  // expected to be dropped (please note that the first 2 hsps we
713  // constructed using blastall (thus filtering on), but the sequence
714  // passed to RedoAlignmentCore was unfiltered (default for blastpgp)
715  const int query_offset[k_num_hsps_start] = { 28, 46, 463 };
716  const int query_end[k_num_hsps_start] = { 485, 331, 488 };
717  const int subject_offset[k_num_hsps_start] = { 36, 327, 320 };
718  const int subject_end[k_num_hsps_start] = { 512, 604, 345 };
719  const int score[k_num_hsps_start] = { 554, 280, 28 };
720  const int query_gapped_start[k_num_hsps_start] = { 431, 186, 480 };
721  const int subject_gapped_start[k_num_hsps_start] = { 458, 458, 337 };
722 
723  // This is freed by the HSPStream interface
724  BlastHSPList* init_hsp_list =
725  setUpHSPList(k_num_hsps_start,
726  query_offset, query_end,
727  subject_offset, subject_end,
728  query_gapped_start,
729  subject_gapped_start,
730  score);
731 
732  const int query_offset_final[k_num_hsps_end] = { 2, 250, 494, 67, 2 };
733  const int query_end_final[k_num_hsps_end] = { 485, 331, 530, 86, 24 };
734  const int subject_offset_final[k_num_hsps_end] = { 9, 523, 261, 585, 570 };
735  const int subject_end_final[k_num_hsps_end] = { 512, 604, 297, 604, 592 };
736  const int score_final[k_num_hsps_end] = { 591, 39, 37, 33, 32 };
737  const double evalue_final[k_num_hsps_end] = { 2.3451e-72, 0.387,
738  0.6692, 1.9988, 2.6256 };
739  const int num_idents_final[k_num_hsps_end] = { 172, 22, 9, 8, 7 };
740 
741  BlastHSPList* ending_hsp_list =
742  setUpHSPList(k_num_hsps_end,
743  query_offset_final,
744  query_end_final,
745  subject_offset_final,
746  subject_end_final,
747  query_offset_final,
748  subject_offset_final,
749  score_final,
750  evalue_final,
751  num_idents_final);
752 
753  const Int8 kEffSearchSp = 500000;
754  const bool kSmithWaterman = true;
755 
756  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
757  init_hsp_list, ending_hsp_list,
758  kEffSearchSp, eNoCompositionBasedStats,
759  kSmithWaterman);
760  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
761  BOOST_REQUIRE(ending_hsp_list == NULL);
762 }
763 
764 BOOST_AUTO_TEST_CASE(testRedoAlignmentWithCompBasedStatsAndSW) {
765  const EBlastProgramType kProgram = eBlastTypeBlastp;
766  const int k_num_hsps_start = 3;
767  const int k_num_hsps_end = 3;
768  CSeq_id query_id("gi|3091");
769  CSeq_id subj_id("gi|402871");
770 
771  const int query_offset[k_num_hsps_start] = { 28, 46, 463};
772  const int query_end[k_num_hsps_start] = { 485, 331, 488};
773  const int subject_offset[k_num_hsps_start] = { 36, 327, 320};
774  const int subject_end[k_num_hsps_start] = { 512, 604, 345};
775  const int score[k_num_hsps_start] = { 554, 280, 28};
776  const int query_gapped_start[k_num_hsps_start] = { 431, 186, 480};
777  const int subject_gapped_start[k_num_hsps_start] = { 458, 458, 337};
778 
779  // This is freed by the HSPStream interface
780  BlastHSPList* init_hsp_list =
781  setUpHSPList(k_num_hsps_start,
782  query_offset, query_end,
783  subject_offset, subject_end,
784  query_gapped_start,
785  subject_gapped_start,
786  score);
787 
788  const int query_offset_final[k_num_hsps_end] = { 2, 250, 67 };
789  const int query_end_final[k_num_hsps_end] = { 485, 331, 86};
790  const int subject_offset_final[k_num_hsps_end] = { 9, 523, 585};
791  const int subject_end_final[k_num_hsps_end] = { 512, 604, 604};
792  const int score_final[k_num_hsps_end] = { 510, 34, 31};
793  const double evalue_final[k_num_hsps_end] = {7.0065e-61, 1.349, 3.7944};
794  const int num_idents_final[k_num_hsps_end] = { 171, 22, 8 };
795 
796  BlastHSPList* ending_hsp_list =
797  setUpHSPList(k_num_hsps_end,
798  query_offset_final,
799  query_end_final,
800  subject_offset_final,
801  subject_end_final,
802  query_offset_final,
803  subject_offset_final,
804  score_final,
805  evalue_final,
806  num_idents_final);
807 
808  const Int8 kEffSearchSp = 500000;
809  const bool kSmithWaterman = true;
810 
811  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
812  init_hsp_list, ending_hsp_list,
813  kEffSearchSp, eCompositionBasedStats,
814  kSmithWaterman);
815 
816  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
817  BOOST_REQUIRE(ending_hsp_list == NULL);
818 }
819 
820 BOOST_AUTO_TEST_CASE(testPSIRedoAlignmentWithCompBasedStatsAndSW) {
821  const EBlastProgramType kProgram = eBlastTypePsiBlast;
822  const int k_num_hsps_start = 6;
823  const int k_num_hsps_end = 8;
824  CSeq_id query_id("gi|129295");
825  CSeq_id subj_id("gi|7450545");
826 
827  const int query_offset[k_num_hsps_start] =
828  { 24, 99, 16, 84, 6, 223 };
829  const int query_end[k_num_hsps_start] =
830  { 62, 128, 24, 114, 25, 231 };
831  const int subject_offset[k_num_hsps_start] =
832  { 245, 0, 198, 86, 334, 151 };
833  const int subject_end[k_num_hsps_start] =
834  { 287, 29, 206, 119, 353, 159 };
835  const int score[k_num_hsps_start] =
836  { 37, 26, 25, 25, 24, 24 };
837  const int query_gapped_start[k_num_hsps_start] =
838  { 29, 104, 20, 91, 19, 227 };
839  const int subject_gapped_start[k_num_hsps_start] =
840  { 250, 5, 202, 93, 347, 155 };
841 
842 
843  // No gaps were found in these alignments. This is freed by the
844  // HSPStream interface
845  BlastHSPList* init_hsp_list =
846  setUpHSPList(k_num_hsps_start,
847  query_offset, query_end,
848  subject_offset, subject_end,
849  query_gapped_start,
850  subject_gapped_start,
851  score);
852 
853  const int query_offset_final[k_num_hsps_end] =
854  { 24, 140, 126, 10, 137, 198, 18, 137 };
855  const int query_end_final[k_num_hsps_end] =
856  { 30, 171, 205, 35, 157, 208, 31, 152 };
857  const int subject_offset_final[k_num_hsps_end] =
858  { 245, 408, 212, 130, 339, 388, 200, 186 };
859  const int subject_end_final[k_num_hsps_end] =
860  { 251, 439, 287, 155, 359, 398, 210, 201 };
861  const int score_final[k_num_hsps_end] =
862  { 29, 28, 28, 28, 25, 24, 24, 22 };
863  const double evalue_final[k_num_hsps_end] =
864  { 1.361074, 1.837947, 2.118044, 2.153685, 4.198304, 5.529096,
865  6.425098, 8.532644 };
866  const int ident_final[k_num_hsps_end] =
867  { 3, 8, 23, 10, 6, 5, 6, 5};
868 
869  BlastHSPList* ending_hsp_list =
870  setUpHSPList(k_num_hsps_end,
871  query_offset_final,
872  query_end_final,
873  subject_offset_final,
874  subject_end_final,
875  query_offset_final,
876  subject_offset_final,
877  score_final,
878  evalue_final,
879  ident_final);
880 
881 
882  const Int8 kEffSearchSp = 84660;
883  const bool kSmithWaterman = true;
884 
885  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
886  init_hsp_list, ending_hsp_list,
887  kEffSearchSp, eCompositionBasedStats,
888  kSmithWaterman);
889 
890  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
891  BOOST_REQUIRE(ending_hsp_list == NULL);
892 }
893 
894 // Tests following changes from Mike Gertz and Alejandro Schaffer (rev 1.14
895 // of blast_kappa.c):
896 //
897 // For secondary alignments with Smith-Waterman on, use the E-value from
898 // the X-drop alignment computed by ALIGN that has more flexibility in
899 // which positions are allowed in the dynamic program.
900 //
901 // Add a sort of alignments for the same query-subject pair because the
902 // use of X-drop alignments occasionally reorders such alignments.
903 BOOST_AUTO_TEST_CASE(testRedoAlignmentUseXdropEvalue) {
904  const EBlastProgramType kProgram = eBlastTypeBlastp;
905  const int k_num_hsps_start = 4;
906  const int k_num_hsps_end = 4;
907  CSeq_id query_id("gi|48100936");
908  CSeq_id subj_id("gi|7301132");
909 
910  const int query_offset[k_num_hsps_start] = { 995, 1004, 995, 973};
911  const int query_end[k_num_hsps_start] = { 1314, 1314, 1403, 1316};
912  const int subject_offset[k_num_hsps_start] = { 61, 36, 61, 106};
913  const int subject_end[k_num_hsps_start] = { 384, 384, 455, 420};
914  const int score[k_num_hsps_start] = { 341, 327, 314, 301};
915  const int query_gapped_start[k_num_hsps_start] = { 1233, 1017, 1310,
916  1228};
917  const int subject_gapped_start[k_num_hsps_start] = { 303, 49, 347, 331};
918 
919  // This is freed by the HSPStream interface
920  BlastHSPList* init_hsp_list =
921  setUpHSPList(k_num_hsps_start,
922  query_offset, query_end,
923  subject_offset, subject_end,
924  query_gapped_start,
925  subject_gapped_start,
926  score);
927  const int query_offset_final[k_num_hsps_end] =
928  { 995, 1261, 1025, 1210};
929  const int query_end_final[k_num_hsps_end] =
930  { 1314, 1341, 1125, 1243};
931  const int subject_offset_final[k_num_hsps_end] =
932  { 61, 1, 387, 17};
933  const int subject_end_final[k_num_hsps_end] =
934  { 384, 115, 482, 50};
935  const int score_final[k_num_hsps_end] =
936  { 323, 78, 69, 60};
937  const double evalue_final[k_num_hsps_end] =
938  { 2.712e-34, 3.6003e-05, 0.00048334, 0.00441};
939  const int num_idents_final[k_num_hsps_end] = { 108, 31, 30, 12 };
940 
941  BlastHSPList* ending_hsp_list =
942  setUpHSPList(k_num_hsps_end,
943  query_offset_final,
944  query_end_final,
945  subject_offset_final,
946  subject_end_final,
947  query_offset_final,
948  subject_offset_final,
949  score_final,
950  evalue_final,
951  num_idents_final);
952 
953  const Int8 kEffSearchSp = 1000*1000;
954  const bool kSmithWaterman = true;
955 
956  const int kHitListSize = 1;
957  const double kEvalueThreshold = 0.005;
958 
959  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
960  init_hsp_list, ending_hsp_list,
961  kEffSearchSp, eCompositionBasedStats,
962  kSmithWaterman, kEvalueThreshold,
963  kHitListSize);
964 
965  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
966  BOOST_REQUIRE(ending_hsp_list == NULL);
967 }
968 
969 // N.B.: the absence of a testPSIRedoAlignmentWithSW is due to the fact
970 // that blastpgp reads frequency rations from a checkpoint file and scales
971 // the PSSM and K parameter, which we cannot do unless we support reading
972 // checkpoint files. QA should be added later
973 
974 
975 BOOST_AUTO_TEST_CASE(CheckLowerCaseMatrix)
976 {
977  BOOST_REQUIRE(Blast_FrequencyDataIsAvailable("blosum62") == 1);
978 }
979 
981 
982 /*
983 * ===========================================================================
984 *
985 * $Log: redoalignment-cppunit.cpp,v $
986 * Revision 1.59 2009/04/27 15:15:41 camacho
987 * Bring up to date with latest filtering strategy changes JIRA SB-170
988 *
989 * Revision 1.55 2009/03/13 18:15:23 maning
990 * Use the writer/pipe framework to construct hsp_stream. Removed queue test.
991 *
992 * Revision 1.58 2009/03/13 19:25:56 maning
993 * Previous commit messed up. Roll back again.
994 *
995 * Revision 1.56 2009/03/13 18:45:15 maning
996 * Roll back to previous version.
997 *
998 * Revision 1.54 2008/11/21 21:13:27 madden
999 * tests for num_ident fix
1000 *
1001 * Revision 1.53 2008/10/27 17:00:12 camacho
1002 * Fix include paths to deprecated headers
1003 *
1004 * Revision 1.52 2008/02/13 21:39:12 camacho
1005 * Re-enable choice to sort by score to meet pre-condition of composition-based
1006 * statistics code.
1007 *
1008 * Revision 1.51 2007/11/19 18:48:34 camacho
1009 * Bring up to date with changes in SVN revision 114090
1010 *
1011 * Revision 1.50 2007/11/08 17:09:08 camacho
1012 * Bring up to date with changes in SVN revision 113729
1013 *
1014 * Revision 1.49 2007/10/22 19:16:10 madden
1015 * BlastExtensionOptionsNew has Boolean gapped arg
1016 *
1017 * Revision 1.48 2007/07/27 18:04:34 papadopo
1018 * change signature of HSPListCollector_Init
1019 *
1020 * Revision 1.47 2007/05/08 22:41:31 papadopo
1021 * change signature of RedoAlignmentCore
1022 *
1023 * Revision 1.46 2007/03/22 14:34:44 camacho
1024 * + support for auto-detection of genetic codes
1025 *
1026 * Revision 1.45 2007/03/20 14:54:02 camacho
1027 * changes related to addition of multiple genetic code specification
1028 *
1029 * Revision 1.44 2006/11/21 17:47:50 papadopo
1030 * use enum for lookup table type
1031 *
1032 * Revision 1.43 2006/09/25 19:34:31 madden
1033 * Changed values to reflect the results of computations done with
1034 * more precise frequencies and frequency ratios.
1035 * [from Mike Gertz]
1036 *
1037 * Revision 1.42 2006/06/29 16:25:24 camacho
1038 * Changed BlastHitSavingOptionsNew signature
1039 *
1040 * Revision 1.41 2006/06/05 13:34:05 madden
1041 * Changes to remove [GS]etMatrixPath and use callback instead
1042 *
1043 * Revision 1.40 2006/05/18 16:32:03 papadopo
1044 * change signature of BLAST_CalcEffLengths
1045 *
1046 * Revision 1.39 2006/01/23 19:57:52 camacho
1047 * Allow new varieties of composition based statistics
1048 *
1049 * Revision 1.38 2005/12/16 20:51:50 camacho
1050 * Diffuse the use of CSearchMessage, TQueryMessages, and TSearchMessages
1051 *
1052 * Revision 1.37 2005/12/01 15:13:52 madden
1053 * replaced Kappa_RedoAlignmentCore with Blast_RedoAlignmentCore
1054 *
1055 * Revision 1.36 2005/10/14 13:47:32 camacho
1056 * Fixes to pacify icc compiler
1057 *
1058 * Revision 1.35 2005/06/09 20:37:06 camacho
1059 * Use new private header blast_objmgr_priv.hpp
1060 *
1061 * Revision 1.34 2005/05/24 20:05:17 camacho
1062 * Changed signature of SetupQueries and SetupQueryInfo
1063 *
1064 * Revision 1.33 2005/05/23 15:53:19 dondosha
1065 * Special case for preliminary hitlist size in RPS BLAST - hence no need for 2 extra parameters in SBlastHitsParametersNew
1066 *
1067 * Revision 1.32 2005/05/19 17:10:04 madden
1068 * Adjust an expect value, use BOOST_REQUIRE_CLOSE when appropriate
1069 *
1070 * Revision 1.31 2005/05/16 12:29:15 madden
1071 * use SBlastHitsParameters in Blast_HSPListCollectorInit and Blast_HSPListCollectorInit[MT]
1072 *
1073 * Revision 1.30 2005/04/27 20:08:40 dondosha
1074 * PHI-blast boolean argument has been removed from BlastSetup_ScoreBlkInit
1075 *
1076 * Revision 1.29 2005/04/06 21:27:18 dondosha
1077 * Use EBlastProgramType instead of EProgram in internal functions
1078 *
1079 * Revision 1.28 2005/04/06 16:27:46 camacho
1080 * Fix to previous commit
1081 *
1082 * Revision 1.27 2005/03/31 20:46:44 camacho
1083 * Changes to avoid making CBlastRedoAlignmentTest a friend of CBlastOptions
1084 *
1085 * Revision 1.26 2005/03/31 13:45:58 camacho
1086 * BLAST options API clean-up
1087 *
1088 * Revision 1.25 2005/03/04 17:20:45 bealer
1089 * - Command line option support.
1090 *
1091 * Revision 1.24 2005/02/16 14:52:41 camacho
1092 * Fix memory leak
1093 *
1094 * Revision 1.23 2005/02/14 14:17:17 camacho
1095 * Changes to use SBlastScoreMatrix
1096 *
1097 * Revision 1.22 2005/01/06 15:43:25 camacho
1098 * Make use of modified signature to blast::SetupQueries
1099 *
1100 * Revision 1.21 2004/12/14 21:16:31 dondosha
1101 * Query frame argument added to Blast_HSPInit; renamed all constants according to toolkit convention
1102 *
1103 * Revision 1.20 2004/12/09 15:24:10 dondosha
1104 * BlastSetup_GetScoreBlock renamed to BlastSetup_ScoreBlkInit
1105 *
1106 * Revision 1.19 2004/12/02 16:12:49 bealer
1107 * - Change multiple-arrays to array-of-struct in BlastQueryInfo
1108 *
1109 * Revision 1.18 2004/11/23 21:51:34 camacho
1110 * Update call to Kappa_RedoAlignmentCore as its signature has changed.
1111 * num_ident field of HSP structure is no longer populated in
1112 * Kappa_RedoAlignmentCore.
1113 *
1114 * Revision 1.17 2004/11/17 21:02:01 camacho
1115 * Add error checking to BlastSeqSrc initialization
1116 *
1117 * Revision 1.16 2004/11/09 15:16:26 camacho
1118 * Minor change in auxiliary function
1119 *
1120 * Revision 1.15 2004/11/02 22:08:30 camacho
1121 * Refactored main body of unit test to be reused by multiple tests
1122 *
1123 * Revision 1.14 2004/11/02 18:28:52 madden
1124 * BlastHitSavingParametersNew no longer requires BlastExtensionParameters
1125 *
1126 * Revision 1.13 2004/11/01 18:38:30 madden
1127 * Change call to BLAST_FillHitSavingOptions
1128 *
1129 * Revision 1.12 2004/10/29 14:19:20 camacho
1130 * Make BlastHSPResults allocation function follow naming conventions
1131 *
1132 * Revision 1.11 2004/10/14 17:14:28 madden
1133 * New parameter in BlastHitSavingParametersNew
1134 *
1135 * Revision 1.10 2004/07/19 15:04:22 dondosha
1136 * Renamed multiseq_src to seqsrc_multiseq, seqdb_src to seqsrc_seqdb
1137 *
1138 * Revision 1.9 2004/07/06 15:58:45 dondosha
1139 * Use EBlastProgramType enumeration type for program when calling C functions
1140 *
1141 * Revision 1.8 2004/06/22 16:46:19 camacho
1142 * Changed the blast_type_* definitions for the EBlastProgramType enumeration.
1143 *
1144 * Revision 1.7 2004/06/21 20:29:07 camacho
1145 * Fix memory leaks, others remain
1146 *
1147 * Revision 1.6 2004/06/21 14:23:32 madden
1148 * Add test testRedoAlignmentUseXdropEvalue that tests recent changes by Mike Gertz to kappa.c ported to blast_kappa.c
1149 *
1150 * Revision 1.5 2004/06/21 13:06:51 madden
1151 * Add check for expect value
1152 *
1153 * Revision 1.4 2004/06/21 12:54:49 camacho
1154 * CVS Id tag fix
1155 *
1156 * Revision 1.3 2004/06/08 15:24:34 dondosha
1157 * Use BlastHSPStream interface
1158 *
1159 * Revision 1.2 2004/05/19 17:10:35 madden
1160 * Fix memory leaks
1161 *
1162 * Revision 1.1 2004/05/18 17:17:45 madden
1163 * Tests for composition-based stats and smith-waterman
1164 *
1165 *
1166 *
1167 * ===========================================================================
1168 */
Int2 BlastHitSavingParametersNew(EBlastProgramType program_number, const BlastHitSavingOptions *options, const BlastScoreBlk *sbp, const BlastQueryInfo *query_info, Int4 avg_subject_length, Int4 compositionBasedStats, BlastHitSavingParameters **parameters)
Allocate memory and initialize the BlastHitSavingParameters structure.
static int ** loadPssmFromFile(const string &filename, unsigned int query_length)
Structures and API used for saving BLAST hits.
BOOST_AUTO_TEST_CASE(testRedoAlignmentWithCompBasedStats)
Options for setting up effective lengths and search spaces.
const TSeqLocVector & GetQueries() const
Retrieve a vector of query sequences.
Definition: bl2seq.hpp:244
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
ADT definition of BlastHSPWriter.
static BlastHSPList * setUpHSPList(int num_hsps, const int query_offset[], const int query_end[], const int subject_offset[], const int subject_end[], const int query_gapped_start[], const int subject_gapped_start[], const int score[], const double evalue[]=NULL, const int num_ident[]=NULL)
void _PSICopyMatrix_int(int **dest, int **src, unsigned int ncols, unsigned int nrows)
Copies src matrix into dest matrix, both of which must be int matrices with dimensions ncols by nrows...
Int4 hsplist_count
Filled size of the HSP lists array.
Definition: blast_hits.h:170
BlastHSPCollectorParams * BlastHSPCollectorParamsNew(const BlastHitSavingOptions *hit_options, Int4 compositionBasedStats, Boolean gapped_calculation)
Sets up parameter set for use by collector.
static void setupPositionBasedBlastScoreBlk(BlastScoreBlk *sbp, unsigned int qlen)
Declares the CBl2Seq (BLAST 2 Sequences) class.
Header file for composition-based statistics.
BlastHSPList * Blast_HSPListFree(BlastHSPList *hsp_list)
Deallocate memory for an HSP list structure as well as all it's components.
Definition: blast_hits.c:1517
#define BLAST_EXPECT_VALUE
Default parameters for saving hits.
Int4 score
This HSP's raw score.
Definition: blast_hits.h:127
Smith-waterman finds optimal scores, then ALIGN_EX to find alignment.
Int2 Blast_HSPListSaveHSP(BlastHSPList *hsp_list, BlastHSP *hsp)
Saves HSP information into a BlastHSPList structure.
Definition: blast_hits.c:1729
Class that centralizes the configuration data for sequences to be converted.
Definition: blast_input.hpp:48
EProgram
This enumeration is to evolve into a task/program specific list that specifies sets of default parame...
Definition: blast_types.hpp:56
Runs the BLAST algorithm between 2 sequences.
Definition: bl2seq.hpp:57
Class representing a text file containing sequences in fasta format.
Wrapper class for BlastExtensionOptions .
Definition: blast_aux.hpp:297
void ** _PSIAllocateMatrix(unsigned int ncols, unsigned int nrows, unsigned int data_type_sz)
Generic 2 dimensional matrix allocator.
int num_queries
Number of query sequences.
Blast_KarlinBlk ** kbp_psi
K-A parameters for position-based alignments.
Definition: blast_stat.h:212
CConstRef< objects::CSeq_loc > seqloc
Seq-loc describing the sequence to use as query/subject to BLAST The types of Seq-loc currently suppo...
Definition: sseqloc.hpp:50
Implementation of the BlastSeqSrc interface for a vector of sequence locations.
BlastHSP ** hsp_array
Array of pointers to individual HSPs.
Definition: blast_hits.h:157
EBlastTbackExt eTbackExt
type of traceback extension.
Structure used for scoring calculations.
Definition: blast_stat.h:177
const TSeqLocVector & GetSubjects() const
Retrieve a vector of subject sequences.
Definition: bl2seq.hpp:272
Blast_KarlinBlk ** kbp_gap
K-A parameters for gapped alignments.
Definition: blast_stat.h:208
BlastHitList ** hitlist_array
Array of results for individual query sequences.
Definition: blast_hits.h:185
Implementation of a number of BlastHSPWriters to save hits from a BLAST search, and subsequently retu...
STL namespace.
Generalized converter from an abstract source of biological sequence data to collections of blast inp...
void CheckForBlastSeqSrcErrors(const BlastSeqSrc *seqsrc)
BlastContextInfo * contexts
Information per context.
signed NCBI_INT8_TYPE Int8
Signed 8 byte sized integer.
Definition: ncbitype.h:143
const NCBI_NS_NCBI::CEnumeratedTypeValues *ENUM_METHOD_NAME() ENa_strand(void)
Access to ENa_strand's attributes (values, names) as defined in spec.
Wrapper class for BlastScoringOptions .
Definition: blast_aux.hpp:308
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
#define NULL
Definition: ncbistd.hpp:225
void SetupQueryInfo(TSeqLocVector &queries, EBlastProgramType prog, objects::ENa_strand strand_opt, BlastQueryInfo **qinfo)
Allocates the query information structure and fills the context offsets, in case of multiple queries...
The structure to hold all HSPs for a given sequence after the gapped alignment.
Definition: blast_hits.h:153
Int2 BlastScoringParametersNew(const BlastScoringOptions *options, BlastScoreBlk *sbp, BlastScoringParameters **parameters)
Calculate scaled cutoff scores and gap penalties.
Int2 Blast_RedoAlignmentCore(EBlastProgramType program_number, BLAST_SequenceBlk *queryBlk, const BlastQueryInfo *query_info, BlastScoreBlk *sbp, BLAST_SequenceBlk *subjectBlk, const BlastSeqSrc *seqSrc, Int4 db_genetic_code, BlastHSPList *thisMatch, BlastHSPStream *hsp_stream, BlastScoringParameters *scoringParams, const BlastExtensionParameters *extendParams, const BlastHitSavingParameters *hitParams, const PSIBlastOptions *psiOptions, BlastHSPResults *results)
Top level routine to recompute alignments for each match found by the gapped BLAST algorithm (single-...
Definition: blast_kappa.c:2934
Interface for reading SRA sequences into blast input.
Int4 hitlist_size
Maximal number of database sequences to return results for.
BlastExtensionParameters * BlastExtensionParametersFree(BlastExtensionParameters *parameters)
Deallocate memory for BlastExtensionParameters.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:795
BlastHSPWriterInfo * BlastHSPCollectorInfoNew(BlastHSPCollectorParams *params)
WriterInfo to create a default writer: the collecter.
int i
Int2 PSIBlastOptionsNew(PSIBlastOptions **psi_options)
Initialize default options for PSI BLAST.
Declaration of ADT to save and retrieve lists of HSPs in the BLAST engine.
int BlastHSPStreamWrite(BlastHSPStream *hsp_stream, BlastHSPList **hsp_list)
Invokes the user-specified write function for this BlastHSPStream implementation. ...
Int2 Blast_HSPInit(Int4 query_start, Int4 query_end, Int4 subject_start, Int4 subject_end, Int4 query_gapped_start, Int4 subject_gapped_start, Int4 query_context, Int2 query_frame, Int2 subject_frame, Int4 score, GapEditScript **gap_edit, BlastHSP **ret_hsp)
Allocates BlastHSP and inits with information from input.
Definition: blast_hits.c:126
Definition: testodbc.c:30
Defines: CTimeFormat - storage class for time format.
#define BLAST_GENETIC_CODE
Default genetic code for query and/or database.
BlastExtensionOptions * BlastExtensionOptionsFree(BlastExtensionOptions *options)
Deallocate memory for BlastExtensionOptions.
Wrapper class for BlastSeqSrc .
Definition: blast_aux.hpp:324
Int2 BlastExtensionParametersNew(EBlastProgramType blast_program, const BlastExtensionOptions *options, BlastScoreBlk *sbp, BlastQueryInfo *query_info, BlastExtensionParameters **parameters)
Calculate the raw values for the X-dropoff parameters.
Int8 eff_searchsp
Effective search space for this context.
Wrapper class for BLAST_SequenceBlk .
Definition: blast_aux.hpp:283
Int2 BlastHitSavingOptionsNew(EBlastProgramType program, BlastHitSavingOptions **options, Boolean gapped_calculation)
Allocate memory for BlastHitSavingOptions.
Structure holding all information about an HSP.
Definition: blast_hits.h:126
BlastScoreBlk * BlastScoreBlkFree(BlastScoreBlk *sbp)
Deallocates BlastScoreBlk as well as all associated structures.
Definition: blast_stat.c:965
Int2 BlastSetup_ScoreBlkInit(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, const BlastScoringOptions *scoring_options, EBlastProgramType program_number, BlastScoreBlk **sbpp, double scale_factor, Blast_Message **blast_message, GET_MATRIX_PATH get_path)
Initializes the score block structure.
Definition: blast_setup.c:456
Composition-based score adjustment as in Bioinformatics 21:902-911, 2005, conditioned on sequence pro...
Don't use composition based statistics.
Blast_KarlinBlk ** kbp_gap_psi
K-A parameters for psi alignments.
Definition: blast_stat.h:212
Wrapper class for BlastQueryInfo .
Definition: blast_aux.hpp:285
BlastHSPList * Blast_HSPListNew(Int4 hsp_max)
Creates HSP list structure with a default size HSP array.
Definition: blast_hits.c:1533
BlastSeg query
Query sequence info.
Definition: blast_hits.h:131
Int2 BlastScoringOptionsNew(EBlastProgramType program, BlastScoringOptions **options)
Allocate memory for BlastScoringOptions and fill with default values.
double evalue
This HSP's e-value.
Definition: blast_hits.h:130
Utility stuff for more convenient using of Boost.Test library.
Definitions used to get joint probabilities for a scoring matrix.
BlastSeqSrc * MultiSeqBlastSeqSrcInit(TSeqLocVector &seq_vector, EBlastProgramType program, bool dbscan_mode=false)
Initialize the sequence source structure.
SBlastScoreMatrix * pssm
position-specific score matrix
Definition: blast_stat.h:150
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm. ...
Definition: blast_program.h:72
BlastHSPStream * BlastHSPStreamNew(EBlastProgramType program, const BlastExtensionOptions *extn_opts, Boolean sort_on_read, Int4 num_queries, BlastHSPWriter *writer)
Initialize the HSP stream.
CRef< objects::CScope > NewScope()
Create a new, properly configured CScope.
Scoring parameters block Contains scoring-related information that is actually used for the blast sea...
BlastHSPWriter * BlastHSPWriterNew(BlastHSPWriterInfo **writer_info, BlastQueryInfo *query_info, BLAST_SequenceBlk *query)
A generic function to create writer.
double expect_value
The expect value cut-off threshold for an HSP, or a combined hit if sum statistics is used...
const CBlastOptionsHandle & GetOptionsHandle() const
Retrieve the options handle.
Definition: bl2seq.hpp:285
Boolean gapped_calculation
gap-free search if FALSE
char * BlastFindMatrixPath(const char *matrix_name, Boolean is_prot)
Returns the path to a specified matrix.
The Object manager core.
BlastHitSavingParameters * BlastHitSavingParametersFree(BlastHitSavingParameters *parameters)
Deallocate memory for BlastHitSavingOptions*.
Structure to hold the a message from the core of the BLAST engine.
Definition: blast_message.h:70
void Blast_HSPListSortByScore(BlastHSPList *hsp_list)
Sort the HSPs in an HSP list by score.
Definition: blast_hits.c:1349
ECompoAdjustModes
An collection of constants that specify all permissible modes of composition adjustment.
int Blast_FrequencyDataIsAvailable(const char *matrix_name)
Retrieve the background letter probabilities implicitly used in constructing the score matrix matrix_...
Magic spell ;-) needed for some weird compilers... very empiric.
TSeqLocVector GetAllSeqLocs(CScope &scope)
Read and convert all the sequences from the source.
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
void SetupQueries(TSeqLocVector &queries, BlastQueryInfo *qinfo, BLAST_SequenceBlk **seqblk, EBlastProgramType prog, objects::ENa_strand strand_opt, TSearchMessages &messages)
Populates BLAST_SequenceBlk with sequence data for use in CORE BLAST.
SPsiBlastScoreMatrix * psi_matrix
PSSM and associated data.
Definition: blast_stat.h:186
Default implementation of BlastHSPStream.
Int2 BlastEffectiveLengthsOptionsNew(BlastEffectiveLengthsOptions **options)
Allocate memory for BlastEffectiveLengthsOptions* and fill with default values.
Parameter block that contains a pointer to BlastHitSavingOptions and the values derived from it...
Composition-based statistics as in NAR 29:2994-3005, 2001.
Constants used in compositional score matrix adjustment.
#define BLAST_HITLIST_SIZE
Number of database sequences to save hits for.
Private interface for Position Iterated BLAST API, contains the PSSM generation engine.
Int4 end
End of hsp.
Definition: blast_hits.h:99
static void runRedoAlignmentCoreUnitTest(EBlastProgramType program, CSeq_id &qid, CSeq_id &sid, BlastHSPList *init_hsp_list, const BlastHSPList *ending_hsp_list, Int8 effective_searchsp, ECompoAdjustModes compositonBasedStatsMode, bool doSmithWaterman, double evalue_threshold=BLAST_EXPECT_VALUE, int hit_list_size=BLAST_HITLIST_SIZE)
EBlastProgramType GetProgramType() const
Returns the CORE BLAST notion of program type.
The structure to contain all BLAST results, for multiple queries.
Definition: blast_hits.h:183
static char * prog
Definition: mdb_load.c:33
void ** _PSIDeallocateMatrix(void **matrix, unsigned int ncols)
Generic 2 dimensional matrix deallocator.
Int4 num_ident
Number of identical base pairs in this HSP.
Definition: blast_hits.h:128
objects::ENa_strand GetStrandOption() const
Int4 compositionBasedStats
mode of compositional adjustment to use; if zero then compositional adjustment is not used ...
BlastEffectiveLengthsParameters * BlastEffectiveLengthsParametersFree(BlastEffectiveLengthsParameters *parameters)
Deallocate memory for BlastEffectiveLengthsParameters*.
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
Auxiliary class to convert a string into an argument count and vector.
Wrapper class for BlastHitSavingOptions .
Definition: blast_aux.hpp:300
Int2 BlastExtensionOptionsNew(EBlastProgramType program, BlastExtensionOptions **options, Boolean gapped)
Allocate memory for BlastExtensionOptions and fill with default values.
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
Int4 num_queries
Number of query sequences.
Definition: blast_hits.h:184
Encapsulates ALL the BLAST algorithm's options.
static CTestObjMgr & Instance()
Definition: test_objmgr.cpp:71
double bit_score
Bit score, calculated from score.
Definition: blast_hits.h:129
typedef for the messages for an entire BLAST search, which could be comprised of multiple query seque...
Definitions which are dependant on the NCBI C++ Object Manager.
BlastHSPList ** hsplist_array
Array of HSP lists for individual database hits.
Definition: blast_hits.h:176
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
Computed values used as parameters for gapped alignments.
int ** data
actual scoring matrix data, stored in row-major form
Definition: blast_stat.h:140
Blast_KarlinBlk ** kbp
Karlin-Altschul parameters.
Definition: blast_stat.h:207
Utilities initialize/setup BLAST.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:245
BlastScoringParameters * BlastScoringParametersFree(BlastScoringParameters *parameters)
Deallocate memory for BlastScoringParameters.
BlastHSPResults * Blast_HSPResultsFree(BlastHSPResults *results)
Deallocate memory for BLAST results.
Definition: blast_hits.c:3251
EProgram ProgramNameToEnum(const std::string &program_name)
Map a string into an element of the ncbi::blast::EProgram enumeration (except eBlastProgramMax).
Definition: blast_aux.cpp:757
Declares the CBlastProteinOptionsHandle class.
#define BLASTAA_SIZE
Size of aminoacid alphabet.
Int2 BlastNumber2Program(EBlastProgramType number, char **program)
Return string name for program given a number.
Definition: blast_util.c:312
BlastEffectiveLengthsOptions * BlastEffectiveLengthsOptionsFree(BlastEffectiveLengthsOptions *options)
Deallocate memory for BlastEffectiveLengthsOptions*.
A wrap of data structure used to create a writer.
BlastHSPStream * BlastHSPStreamFree(BlastHSPStream *hsp_stream)
Frees the BlastHSPStream structure by invoking the destructor function set by the user-defined constr...
Int2 BlastEffectiveLengthsParametersNew(const BlastEffectiveLengthsOptions *options, Int8 db_length, Int4 num_seqs, BlastEffectiveLengthsParameters **parameters)
Allocate memory for BlastEffectiveLengthsParameters.
#define fabs(v)
Definition: ncbi_dispd.c:46
Int4 hspcnt
Number of HSPs saved.
Definition: blast_hits.h:158
std::istream & in(std::istream &in_, double &x_)
Int4 offset
Start of hsp.
Definition: blast_hits.h:98
Class whose purpose is to create CScope objects which have data loaders added with different prioriti...
CRef< objects::CScope > scope
Scope where the sequence referenced can be found by the toolkit's object manager. ...
Definition: sseqloc.hpp:54
BlastSeg subject
Subject sequence info.
Definition: blast_hits.h:132
Parameters for setting up effective lengths and search spaces.
Options used for gapped extension These include: a.
SPsiBlastScoreMatrix * SPsiBlastScoreMatrixNew(size_t ncols)
Allocates a new SPsiBlastScoreMatrix structure of dimensions ncols by BLASTAA_SIZE.
Definition: blast_stat.c:805
signed short Int2
Alias for signed short.
Definition: ncbitype.h:118
Int2 BLAST_CalcEffLengths(EBlastProgramType program_number, const BlastScoringOptions *scoring_options, const BlastEffectiveLengthsParameters *eff_len_params, const BlastScoreBlk *sbp, BlastQueryInfo *query_info, Blast_Message **blast_message)
Function to calculate effective query length and db length as well as effective search space...
Definition: blast_setup.c:699
PSIBlastOptions * PSIBlastOptionsFree(PSIBlastOptions *psi_options)
Deallocate PSI BLAST options.
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
BlastHSPResults * Blast_HSPResultsNew(Int4 num_queries)
Initialize the results structure.
Definition: blast_hits.c:3231
Int2 BLAST_FillEffectiveLengthsOptions(BlastEffectiveLengthsOptions *options, Int4 dbseq_num, Int8 db_length, Int8 *searchsp_eff, Int4 num_searchsp)
Fill the non-default values in the BlastEffectiveLengthsOptions structure.
static int input()
Interface for converting sources of sequence data into blast sequence input.
Modified on Sun Dec 10 16:48:02 2017 by modify_doxy.py rev. 546573