NCBI C++ ToolKit
split_query_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: split_query_unit_test.cpp 89942 2020-04-30 13:01:21Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Christiam Camacho
27 *
28 * File Description:
29 * Unit test module for code to split query sequences
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/test_boost.hpp>
35 #include "test_objmgr.hpp"
36 
37 #include <blast_objmgr_priv.hpp>
39 #include "blast_aux_priv.hpp"
40 #include "split_query_aux_priv.hpp"
42 #include "split_query.hpp"
45 #include <util/random_gen.hpp>
47 
48 /* IMPORTANT NOTE: If you have made changes to the query splitting code, the
49  * data in the configuration file (split_query.ini) might need to be updated.
50  * To aid in this, the xblast library supports tracing messages that output the
51  * internal data structure's contents to facilitate updating this file. To
52  * enable this, please run the unit_test application with the DIAG_TRACE
53  * environment variable set.
54  */
55 
56 typedef vector<vector<Uint4> > TSplitQueryChunkMap;
57 
58 using namespace std;
59 using namespace ncbi;
60 using namespace ncbi::objects;
61 using namespace ncbi::blast;
62 
63 /// Calculate and assign the maximum length field in the BlastQueryInfo
64 /// structure
65 static void s_CalculateMaxLength(BlastQueryInfo* query_info)
66 {
67  query_info->max_length = 0;
68  for (int i = query_info->first_context; i <= query_info->last_context; i++)
69  {
70  BOOST_REQUIRE(query_info->contexts[i].query_length >= 0);
71  query_info->max_length =
72  max<Uint4>(query_info->max_length,
73  query_info->contexts[i].query_length);
74  }
75 }
76 
77 /// Pair for gis and their length (in that order)
78 typedef pair<TIntId, size_t> TGiLenPair;
79 /// Vector containing pairs of gis and their length
80 typedef vector<TGiLenPair> TGiLengthVector;
81 
82 /// Convert a vector of GIs with its lengths into a TSeqLocVector
83 /// @param gi_length vector of TGiLenPair containing GIs and their length [in]
84 /// @param retval the return value of this function [out]
85 /// @param tot_length total length of sequence data contained in gi_length
86 /// (optional) [in]
87 /// @param strands vector of strands to use (optional), if provided it must
88 /// match the size of the gi_length vector [in]
89 /// @param masks vector of masks (optional), if provided it must match the size
90 /// of the gi_length vector [in]
91 static void
93  TSeqLocVector& retval,
94  size_t* tot_length = NULL,
95  vector<ENa_strand>* strands = NULL,
96  const TSeqLocInfoVector* masks = NULL)
97 {
98  if (tot_length) {
99  *tot_length = 0;
100  }
101  retval.clear();
102  retval.reserve(gi_length.size());
103 
104  if (strands) {
105  BOOST_REQUIRE(strands->size() == gi_length.size());
106  }
107  if (masks) {
108  BOOST_REQUIRE(masks->size() == gi_length.size());
109  }
110 
111  for (size_t i = 0; i < gi_length.size(); i++) {
112  CRef<CSeq_loc> loc(new CSeq_loc());
113  if (strands) {
114  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, gi_length[i].first));
115  loc->SetInt().SetFrom(0);
116  loc->SetInt().SetTo(gi_length[i].second-1);
117  loc->SetId(*id);
118  loc->SetStrand((*strands)[i]);
119  } else {
120  loc->SetWhole().SetGi(GI_FROM(TIntId, gi_length[i].first));
121  }
123  retval.push_back(SSeqLoc(loc, &*scope));
124  if (tot_length) {
125  *tot_length += gi_length[i].second;
126  }
127  }
128 
129  if (masks == NULL) {
130  return;
131  }
132 
133  for (size_t i = 0; i < masks->size(); i++) {
134  const TMaskedQueryRegions& single_query_masks = (*masks)[i];
135  // FIXME: don't make the distinction between single and multiple masks
136  CRef<CSeq_loc> m(new CSeq_loc);
137 
138  if (single_query_masks.size() == 1) {
139  const CSeq_interval& interval =
140  single_query_masks.front()->GetInterval();
141  m->SetInt(const_cast<CSeq_interval&>(interval));
142  } else {
143  ITERATE(TMaskedQueryRegions, mask, single_query_masks) {
144  const CSeq_interval& interval = (*mask)->GetInterval();
145  m->SetPacked_int().AddInterval(interval);
146  }
147  }
148  BOOST_REQUIRE(m->IsInt() || m->IsPacked_int());
149  retval[i].mask = m;
150  }
151 }
152 
154 public:
155  /// This represents the split_query.ini configuration file
157  /// Default value used when a field is not present in the config file
158  static const int kDefaultIntValue = -1;
159 
161  // read the configuration file if it hasn't been read yet
162  if (m_Config.Empty()) {
163  const IRegistry::TFlags flags =
168 
169  const string fname("data/split_query.ini");
170  ifstream config_file(fname.c_str());
171  m_Config.Reset(new CNcbiRegistry(config_file, flags));
172 
173  if (m_Config->Empty()) {
174  throw runtime_error("Failed to read configuration file" +
175  fname);
176  }
177  }
178  }
179 
181  BOOST_REQUIRE(m_Config.NotEmpty());
182  }
183 
184  /// Populate a BLAST_SequenceBlk and BlastQueryInfo structures out of an
185  /// array of GIs
186  /// @param gis array of GIs, last element must be -1 indicating the end of
187  /// the array [in]
188  /// @param program program for which the query data will be created [in]
189  /// @param seq_blk BLAST_SequenceBlk structure to populate [out]
190  /// @param qinfo BlastQueryInfo structure to populate [out]
191  /// @param strand strand to use (optional) [in]
193  EProgram program,
194  BLAST_SequenceBlk** seq_blk,
195  BlastQueryInfo** qinfo,
196  ENa_strand* strand = NULL)
197  {
198  BOOST_REQUIRE(seq_blk);
199  BOOST_REQUIRE(qinfo);
200  TSeqLocVector queries;
201 
202  for (int i = 0; gis[i] != -1; i++) {
203  CRef<CSeq_loc> loc(new CSeq_loc());
204  loc->SetWhole().SetGi(GI_FROM(TIntId, gis[i]));
205  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
206  scope->AddDefaults();
207  queries.push_back(SSeqLoc(loc, scope));
208  }
209 
211 
212  TSearchMessages msgs;
213 
214  const CBlastOptions& kOpts = opts->GetOptions();
216  ENa_strand strand_opt = (strand != NULL)
217  ? *strand : kOpts.GetStrandOption();
218 
219  SetupQueryInfo(queries, prog, strand_opt, qinfo);
220  SetupQueries(queries, *qinfo, seq_blk,
221  prog, strand_opt, msgs);
222  BOOST_REQUIRE(msgs.HasMessages() == false);
223  }
224 
226  size_t chunk_size,
227  size_t num_chunks,
228  blast::EProgram program,
229  vector< vector<int> >& starting_chunks,
230  vector< vector<int> >& absolute_contexts,
231  vector< vector<size_t> >* context_offsets,
232  ENa_strand strand,
233  vector<ENa_strand>* query_strands = NULL) {
234 
235  if (query_strands) {
236  BOOST_REQUIRE_EQUAL(gi_length.size(), query_strands->size());
237  }
238 
239  size_t tot_length;
240  TSeqLocVector queries;
241  s_ConvertToBlastQueries(gi_length, queries, &tot_length, query_strands);
242 
243  size_t nc = SplitQuery_CalculateNumChunks(
244  EProgramToEBlastProgramType(program),
245  &chunk_size, tot_length, queries.size());
246  BOOST_REQUIRE_EQUAL(num_chunks, nc);
247 
248  CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(queries));
250  CRef<CBlastOptions> opts(&opts_h->SetOptions());
251  if ( !query_strands ) {
252  opts->SetStrandOption(strand);
253  }
254  CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
255 
256  CAutoEnvironmentVariable tmp_env("CHUNK_SIZE",
259  CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
260  CRef<CSplitQueryBlk> sqb = splitter->Split();
261 
262  BOOST_REQUIRE_EQUAL((size_t)splitter->GetNumberOfChunks(), num_chunks);
263 
264  CContextTranslator ctx_translator(*sqb);
265 
266  ostringstream os;
267  for (size_t chunk_num = 0; chunk_num < num_chunks; chunk_num++) {
268  // Test the starting chunks
269  vector<int>& st_chunks = starting_chunks[chunk_num];
270  for (size_t context_in_chunk = 0;
271  context_in_chunk < st_chunks.size();
272  context_in_chunk++) {
273  os.str("");
274  os << "Starting chunks: ";
275  os << "Chunk " << chunk_num << ", context " << context_in_chunk;
276  int sc = ctx_translator.GetStartingChunk(chunk_num,
277  context_in_chunk);
278  BOOST_REQUIRE_MESSAGE(st_chunks[context_in_chunk]==sc,os.str());
279  }
280 
281  // Test the absolute contexts
282  vector<int>& abs_ctxts = absolute_contexts[chunk_num];
283  for (size_t context_in_chunk = 0;
284  context_in_chunk < abs_ctxts.size();
285  context_in_chunk++) {
286  os.str("");
287  os << "Absolute contexts: ";
288  os << "Chunk " << chunk_num << ", context " << context_in_chunk;
289  int abs_ctx =
290  ctx_translator.GetAbsoluteContext(chunk_num,
291  context_in_chunk);
292  BOOST_REQUIRE_MESSAGE(abs_ctxts[context_in_chunk]==abs_ctx,os.str());
293  }
294  }
295 
296  // Check the context offsets
297  if ( !context_offsets ) {
298  return;
299  }
300 
301  const BLAST_SequenceBlk* global_seq = query_data->GetSequenceBlk();
302  const BlastQueryInfo* global_qinfo = query_data->GetQueryInfo();
303  CRef<CSplitQueryBlk> split_query_blk = splitter->m_SplitBlk;
304  for (size_t chunk_num = 0; chunk_num < num_chunks; chunk_num++) {
305  vector<size_t> test_ctx_off =
306  split_query_blk->GetContextOffsets(chunk_num);
307  const vector<size_t>& ref_ctx_off = (*context_offsets)[chunk_num];
308 
309  os.str("");
310  os << "Number of context offsets in chunk " << chunk_num;
311  BOOST_REQUIRE_MESSAGE(ref_ctx_off.size()==test_ctx_off.size(),os.str());
312 
313  CRef<IQueryFactory> chunk_qf =
314  splitter->GetQueryFactoryForChunk(chunk_num);
315  CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(opts));
316  const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
317  const BlastQueryInfo* chunk_qinfo = chunk_qd->GetQueryInfo();
318 
319  for (size_t i = 0; i < ref_ctx_off.size(); i++) {
320  size_t correction = ref_ctx_off[i];
321  os.str("");
322  os << "Context correction in chunk " << chunk_num
323  << ", context " << i << " value now " << test_ctx_off[i]
324  << " not " << correction;
325  BOOST_REQUIRE_MESSAGE(correction==test_ctx_off[i],os.str());
326 
327  int absolute_context =
328  ctx_translator.GetAbsoluteContext(chunk_num, i);
329  if (absolute_context == kInvalidContext) {
330  continue;
331  }
332 
333  int global_offset =
334  global_qinfo->contexts[absolute_context].query_offset +
335  correction;
336  int chunk_offset = chunk_qinfo->contexts[i].query_offset;
337  int num_bases2compare =
338  min(10, chunk_qinfo->contexts[i].query_length);
339 
340  os.str("");
341  os << "Sequence data in chunk " << chunk_num
342  << ", context " << i;
343  bool rv =
344  x_CmpSequenceData(&global_seq->sequence[global_offset],
345  &chunk_seq->sequence[chunk_offset],
346  num_bases2compare);
347  BOOST_REQUIRE_MESSAGE(rv,os.str());
348  }
349 
350  }
351  }
352 
353  /** Auxiliary function that compares bytes of sequence data to validate the
354  * context offset corrections
355  * @param global global query sequence data [in]
356  * @param chunk sequence data for chunk [in]
357  * @param len length of the data to compare [in]
358  * @return true if sequence data is identical, false otherwise
359  */
360  bool x_CmpSequenceData(const Uint1* global, const Uint1* chunk, size_t len)
361  {
362  for (size_t i = 0; i < len; i++) {
363  if (global[i] != chunk[i]) {
364  return false;
365  }
366  }
367  return true;
368  }
369 
371  ENa_strand strand)
372  {
374  CSeq_id id(CSeq_id::e_Gi, 112422322); // 122347 bases long
375  query.AddQuery(CTestObjMgr::Instance().CreateBlastSearchQuery(id));
376 
379  CRef<CBlastOptions> opts(&opts_h->SetOptions());
380  opts->SetStrandOption(strand);
381  CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
382 
383  CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
384  CRef<CSplitQueryBlk> sqb = splitter->Split();
385 
386  CQuerySplitter::TSplitQueryVector split_query_vector;
387  x_ReadQueryBoundsPerChunk(kTestName, sqb, split_query_vector);
388  x_ValidateQuerySeqLocsPerChunk(splitter, split_query_vector);
389 
390  x_ValidateChunkBounds(splitter->GetChunkSize(),
391  query_data->GetSumOfSequenceLengths(),
392  *sqb, opts->GetProgramType());
393 
394  const size_t kNumChunks = (size_t)m_Config->GetInt(kTestName,
395  "NumChunks",
396  kDefaultIntValue);
397  BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
398  BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
399 
400  vector< vector<size_t> > queries_per_chunk;
401  x_ReadVectorOfVectorsForTest(kTestName, "Queries", queries_per_chunk);
402  x_ValidateQueriesPerChunkAssignment(*sqb, queries_per_chunk);
403 
404  vector< vector<int> > ctxs_per_chunk;
405  x_ReadVectorOfVectorsForTest(kTestName, "Contexts", ctxs_per_chunk);
406  x_ValidateQueryContextsPerChunkAssignment(*sqb, ctxs_per_chunk);
407 
408  vector< vector<size_t> > ctx_offsets_per_chunk;
409  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
410  ctx_offsets_per_chunk);
411  x_ValidateContextOffsetsPerChunkAssignment(*sqb, ctx_offsets_per_chunk);
412 
413  vector<BlastQueryInfo*> split_query_info;
414  x_ReadSplitQueryInfoForTest(kTestName, opts->GetProgramType(),
415  split_query_info);
416  x_ValidateLocalQueryData(splitter, &*opts, split_query_info);
417  NON_CONST_ITERATE(vector<BlastQueryInfo*>, itr, split_query_info) {
418  *itr = BlastQueryInfoFree(*itr);
419  }
420  }
421 
423  ENa_strand strand,
424  vector<ENa_strand>*
425  query_strands = NULL)
426  {
427  TGiLengthVector gi_length;
428  gi_length.push_back(make_pair<int, size_t>(112258880, 362959));
429  gi_length.push_back(make_pair<int, size_t>(112253843, 221853));
430  gi_length.push_back(make_pair<int, size_t>(112193060, 194837));
431  gi_length.push_back(make_pair<int, size_t>(112193059, 204796));
432  if (query_strands) {
433  BOOST_REQUIRE_EQUAL(gi_length.size(), query_strands->size());
434  }
435 
436  size_t tot_length;
437  TSeqLocVector queries;
438  s_ConvertToBlastQueries(gi_length, queries, &tot_length, query_strands);
439 
440  CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(queries));
442  CRef<CBlastOptions> opts(&opts_h->SetOptions());
443  if ( !query_strands ) {
444  opts->SetStrandOption(strand);
445  }
446  CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
447 
448  CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
449  CRef<CSplitQueryBlk> sqb = splitter->Split();
450 
451  CQuerySplitter::TSplitQueryVector split_query_vector;
452  x_ReadQueryBoundsPerChunk(kTestName, sqb, split_query_vector);
453  x_ValidateQuerySeqLocsPerChunk(splitter, split_query_vector);
454 
455  x_ValidateChunkBounds(splitter->GetChunkSize(),
456  query_data->GetSumOfSequenceLengths(),
457  *sqb, opts->GetProgramType());
458 
459  const size_t kNumChunks = (size_t)m_Config->GetInt(kTestName,
460  "NumChunks",
461  kDefaultIntValue);
462  BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
463  BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
464 
465  vector< vector<size_t> > queries_per_chunk;
466  x_ReadVectorOfVectorsForTest(kTestName, "Queries", queries_per_chunk);
467  x_ValidateQueriesPerChunkAssignment(*sqb, queries_per_chunk);
468 
469  vector< vector<int> > ctxs_per_chunk;
470  x_ReadVectorOfVectorsForTest(kTestName, "Contexts", ctxs_per_chunk);
471  x_ValidateQueryContextsPerChunkAssignment(*sqb, ctxs_per_chunk);
472 
473  vector< vector<size_t> > ctx_offsets_per_chunk;
474  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
475  ctx_offsets_per_chunk);
476  x_ValidateContextOffsetsPerChunkAssignment(*sqb, ctx_offsets_per_chunk);
477 
478  vector<BlastQueryInfo*> split_query_info;
479  x_ReadSplitQueryInfoForTest(kTestName, opts->GetProgramType(),
480  split_query_info);
481  x_ValidateLocalQueryData(splitter, &*opts, split_query_info);
482  NON_CONST_ITERATE(vector<BlastQueryInfo*>, itr, split_query_info) {
483  *itr = BlastQueryInfoFree(*itr);
484  }
485  }
486 
488  ENa_strand strand)
489  {
490  const size_t kLength = 122347; // length of the sequence below
491  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, 63122693));
492  TSeqRange range(0, kLength);
494  query.push_back(*CTestObjMgr::Instance().
495  CreateSSeqLoc(*id, range, strand));
496 
499  CRef<CBlastOptions> opts(&opts_h->SetOptions());
500  CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
501 
502  CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
503  CRef<CSplitQueryBlk> sqb = splitter->Split();
504 
505  BOOST_REQUIRE_EQUAL(m_Config->GetInt(kTestName, "ChunkSize",
506  kDefaultIntValue),
507  (int)splitter->GetChunkSize());
508 
509  x_ValidateChunkBounds(splitter->GetChunkSize(),
510  query_data->GetSumOfSequenceLengths(),
511  *sqb, opts->GetProgramType());
512 
513  const size_t kNumChunks = (size_t)m_Config->GetInt(kTestName,
514  "NumChunks",
515  kDefaultIntValue);
516  BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
517  BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
518 
519  vector< vector<size_t> > queries_per_chunk;
520  x_ReadVectorOfVectorsForTest(kTestName, "Queries", queries_per_chunk);
521  x_ValidateQueriesPerChunkAssignment(*sqb, queries_per_chunk);
522 
523  vector< vector<int> > ctxs_per_chunk;
524  x_ReadVectorOfVectorsForTest(kTestName, "Contexts", ctxs_per_chunk);
525  x_ValidateQueryContextsPerChunkAssignment(*sqb, ctxs_per_chunk);
526 
527  vector< vector<size_t> > ctx_offsets_per_chunk;
528  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
529  ctx_offsets_per_chunk);
530  x_ValidateContextOffsetsPerChunkAssignment(*sqb, ctx_offsets_per_chunk);
531 
532  vector<BlastQueryInfo*> split_query_info;
533  x_ReadSplitQueryInfoForTest(kTestName, opts->GetProgramType(),
534  split_query_info);
535  x_ValidateLocalQueryData(splitter, &*opts, split_query_info);
536  NON_CONST_ITERATE(vector<BlastQueryInfo*>, itr, split_query_info) {
537  *itr = BlastQueryInfoFree(*itr);
538  }
539  }
540 
542  ENa_strand strand,
543  vector<ENa_strand>*
544  query_strands = NULL)
545  {
546  TGiLengthVector gi_length;
547  gi_length.push_back(make_pair<int, size_t>(112817621, 5567));
548  gi_length.push_back(make_pair<int, size_t>(112585373, 5987));
549  gi_length.push_back(make_pair<int, size_t>(112585216, 5531));
550  gi_length.push_back(make_pair<int, size_t>(112585119, 5046));
551  if (query_strands) {
552  BOOST_REQUIRE_EQUAL(gi_length.size(), query_strands->size());
553  }
554 
555  size_t tot_length;
556  TSeqLocVector queries;
557  s_ConvertToBlastQueries(gi_length, queries, &tot_length, query_strands);
558 
559  CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(queries));
561  CRef<CBlastOptions> opts(&opts_h->SetOptions());
562  if ( !query_strands ) {
563  opts->SetStrandOption(strand);
564  }
565  CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
566 
567  CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
568  CRef<CSplitQueryBlk> sqb = splitter->Split();
569 
570  BOOST_REQUIRE_EQUAL(m_Config->GetInt(kTestName, "ChunkSize",
571  kDefaultIntValue),
572  (int)splitter->GetChunkSize());
573 
574  BOOST_REQUIRE_EQUAL(tot_length, query_data->GetSumOfSequenceLengths());
575  x_ValidateChunkBounds(splitter->GetChunkSize(),
576  query_data->GetSumOfSequenceLengths(),
577  *sqb, opts->GetProgramType());
578 
579  const size_t kNumChunks = (size_t)m_Config->GetInt(kTestName,
580  "NumChunks",
581  kDefaultIntValue);
582  BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
583  BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
584 
585  vector< vector<size_t> > queries_per_chunk;
586  x_ReadVectorOfVectorsForTest(kTestName, "Queries", queries_per_chunk);
587  x_ValidateQueriesPerChunkAssignment(*sqb, queries_per_chunk);
588 
589  vector< vector<int> > ctxs_per_chunk;
590  x_ReadVectorOfVectorsForTest(kTestName, "Contexts", ctxs_per_chunk);
591  x_ValidateQueryContextsPerChunkAssignment(*sqb, ctxs_per_chunk);
592 
593  vector< vector<size_t> > ctx_offsets_per_chunk;
594  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
595  ctx_offsets_per_chunk);
596  x_ValidateContextOffsetsPerChunkAssignment(*sqb, ctx_offsets_per_chunk);
597 
598  vector<BlastQueryInfo*> split_query_info;
599  x_ReadSplitQueryInfoForTest(kTestName, opts->GetProgramType(),
600  split_query_info);
601  x_ValidateLocalQueryData(splitter, &*opts, split_query_info);
602  NON_CONST_ITERATE(vector<BlastQueryInfo*>, itr, split_query_info) {
603  *itr = BlastQueryInfoFree(*itr);
604  }
605  }
606 
607  /************ Auxiliary functions **********************************/
608 
609  /// Incrementally compute the query chunk bounds. This will have a direct
610  /// impact on the success of x_ValidateChunkBounds. This function assumes
611  /// that the chunk size doesn't vary between each invocation and that the
612  /// first time this function is called, the chunk_range is initialized with
613  /// its default constructor (e.g.: TChunkRange::GetEmpty())
614  /// @param chunk_range range of the query chunk [in|out]
615  /// @param chunk_size size of the chunk [in]
616  /// @param concatenated_query_length length of the full query [in]
617  /// @param overlap length of the overlap region between each chunk [in]
619  size_t chunk_size,
620  size_t concatenated_query_length,
621  size_t overlap)
622  {
623  if (chunk_range == TChunkRange::GetEmpty()) {
624  chunk_range.SetFrom(0);
625  chunk_range.SetLength(chunk_size);
626  } else {
627  const TSeqPos kIncrement = chunk_size - overlap;
628  chunk_range.SetFrom(chunk_range.GetFrom() + kIncrement);
629  chunk_range.SetToOpen(chunk_range.GetToOpen() + kIncrement);
630  }
631  BOOST_REQUIRE(chunk_range.NotEmpty());
632 
633  if (chunk_range.GetToOpen() > concatenated_query_length) {
634  chunk_range.SetToOpen(concatenated_query_length);
635  }
636  }
637 
638  /// This function reads values in the split_query.ini file with the format
639  /// ChunkNQueryM (where N is the chunk number and M is the query number).
640  /// Each of these entries should have 3 comma-separated elements: the
641  /// query's starting offset, ending offset, and its strand's enumeration
642  /// value.
643  /// @param kTestName name of the test to read data for [in]
644  /// @param sqb CSplitQueryBlk object from which to get query indices for
645  /// each chunk [in]
646  /// @param split_query_vector query vector where the data from config file
647  /// will be read [out]
650  CQuerySplitter::TSplitQueryVector& split_query_vector)
651  {
653  TMaskedQueryRegions empty_mask;
654  split_query_vector.clear();
655 
656  ostringstream os;
657 
658  const int kNumChunks = m_Config->GetInt(kTestName, "NumChunks",
659  kDefaultIntValue);
660  if (kNumChunks == kDefaultIntValue) {
661  throw runtime_error("Invalid number of chunks in " + kTestName);
662  }
663 
664  split_query_vector.assign(kNumChunks, CRef<CBlastQueryVector>());
665 
666  for (int i = 0; i < kNumChunks; i++) {
667  os.str("");
668  os << "Chunk" << i;
669  const vector<size_t> kQueryIndices = sqb->GetQueryIndices(i);
670 
671  BOOST_REQUIRE( !kQueryIndices.empty() );
672  split_query_vector[i].Reset(new CBlastQueryVector);
673 
674  ITERATE(vector<size_t>, itr, kQueryIndices) {
675  ostringstream out;
676  out << "Query" << *itr;
677 
678  const string& value = m_Config->Get(kTestName,
679  os.str() + out.str());
680  // This data corresponds to entries in split_query.ini of the
681  // form ChunkNQueryM, and each line should contain 3 elements:
682  // the start and stop for each query in each chunk and the
683  // strand's enumeration value
684  vector<size_t> query_data;
685  x_ParseConfigLine(value, query_data);
686  BOOST_REQUIRE_MESSAGE((size_t)3==query_data.size(),os.str() + out.str());
687 
688  CRef<CSeq_loc> sl(new CSeq_loc);
689  sl->SetInt().SetFrom(query_data[0]);
690  sl->SetInt().SetTo(query_data[1]);
691  sl->SetStrand(static_cast<ENa_strand>(query_data[2]));
693  *scope,
694  empty_mask));
695  split_query_vector[i]->AddQuery(bsq);
696  }
697  }
698  }
699 
700  /// Compare the query data (start, stop, strand) for each chunk computed by
701  /// the splitter vs. the data read from the split_query.ini file
702  /// @param splitter object which performs query splitting [in]
703  /// @param split_query_vector data instantiated from what was read from the
704  /// split_query.ini file
705  /// @param splitter CQuerySplitter object to test [in]
706  /// @param split_query_vector data read from config file to test against
707  /// [in]
709  const CQuerySplitter::TSplitQueryVector& split_query_vector)
710  {
711  if (split_query_vector.empty()) {
712  return;
713  }
714 
715  ostringstream os;
716  os << "Different split query vector sizes";
717 
718  BOOST_REQUIRE_MESSAGE(split_query_vector.size()==(size_t)splitter->m_NumChunks,os.str());
719 
720  for (size_t i = 0; i < splitter->m_NumChunks; i++) {
721  CRef<CBlastQueryVector> ref_qvector = split_query_vector[i];
722  CRef<CBlastQueryVector> test_qvector =
723  splitter->m_SplitQueriesInChunk[i];
724 
725  os.str("");
726  os << "Different split query vector sizes for chunk " << i;
727  BOOST_REQUIRE_MESSAGE(ref_qvector->Size()==test_qvector->Size(),os.str());
728 
729  for (size_t j = 0; j < ref_qvector->Size(); j++) {
730  CConstRef<CSeq_loc> ref_qloc = ref_qvector->GetQuerySeqLoc(j);
731  CConstRef<CSeq_loc> test_qloc = test_qvector->GetQuerySeqLoc(j);
732  CSeq_loc::TRange ref_query_range = ref_qloc->GetTotalRange();
733  CSeq_loc::TRange test_query_range = test_qloc->GetTotalRange();
734 
735  os.str("");
736  os << "Starting offset for query " << j << " in chunk " << i << " is now " << test_query_range.GetFrom() << " and not " << ref_query_range.GetFrom();
737  BOOST_REQUIRE_MESSAGE(ref_query_range.GetFrom()==test_query_range.GetFrom(),os.str());
738  os.str("");
739  os << "Ending offset for query " << j << " in chunk " << i << " is now " << test_query_range.GetToOpen() << " and not " << ref_query_range.GetTo();
740  BOOST_REQUIRE_MESSAGE(ref_query_range.GetTo()==test_query_range.GetToOpen(),os.str());
741  os.str("");
742  os << "Strand for query " << j << " in chunk " << i << " is now "
743  << (int)test_qloc->GetStrand() << " and not " << (int)ref_qloc->GetStrand();
744  BOOST_REQUIRE_MESSAGE(ref_qloc->GetStrand()==test_qloc->GetStrand(),os.str());
745  }
746  }
747  }
748 
749  /// Reads data to populate multiple BlastQueryInfo structures. This data is
750  /// formatted in the config file as
751  /// BlastQueryInfoN.X[.Y] where N is the chunk number, X is the field of
752  /// the BlastQueryInfo structure and Y is the field of the BlastContextInfo
753  /// structure (only applicable if X has the value contextM, where M is the
754  /// context number)
755  /// @param kTestName name of the test to read data for [in]
756  /// @param program blast program [in]
757  /// @param retval vector of BlastQueryInfo structures, there will be as
758  /// many elements as there are chunks for this test. Caller is
759  /// responsible for deallocating the contents of this vector [out]
761  EBlastProgramType program,
762  vector<BlastQueryInfo*>& retval)
763  {
764  ostringstream os, errors;
765 
766  const int kNumChunks = m_Config->GetInt(kTestName, "NumChunks",
767  kDefaultIntValue);
768  if (kNumChunks == kDefaultIntValue) {
769  throw runtime_error("Invalid number of chunks in " + kTestName);
770  }
771 
772  retval.clear();
773  retval.reserve(kNumChunks);
774  retval.assign(kNumChunks, static_cast<BlastQueryInfo*>(0));
775 
776  for (int i = 0; i < kNumChunks; i++) {
777  os.str("");
778  os << "BlastQueryInfo" << i << ".";
779  const string kPrefix(os.str());
780  errors.str("Chunk ");
781  errors << i << ": ";
782  const int kNumQueries = m_Config->GetInt(kTestName,
783  kPrefix + "num_queries",
784  kDefaultIntValue);
785  if (kNumQueries == kDefaultIntValue) {
786  string msg("Invalid BlastQueryInfo::num_queries in ");
787  msg += kTestName + " or value not specified";
788 return; // FIXME
789  //throw runtime_error(msg);
790  }
791 
792  retval[i] = BlastQueryInfoNew(program, kNumQueries);
793  errors << "Failed to allocate BlastQueryInfo structure"
794  << " (Number of queries=" << kNumQueries << ")";
795  BOOST_REQUIRE_MESSAGE(retval[i],errors.str());
796 
797  retval[i]->first_context = m_Config->GetInt(kTestName,
798  kPrefix +
799  "first_context",
800  kDefaultIntValue);
801  errors.str("Chunk ");
802  errors << i;
803  BOOST_REQUIRE_MESSAGE(retval[i]->first_context >= 0,errors.str());
804 
805  retval[i]->last_context = m_Config->GetInt(kTestName,
806  kPrefix +
807  "last_context",
808  kDefaultIntValue);
809  BOOST_REQUIRE_MESSAGE(retval[i]->last_context >= 0,errors.str());
810  BOOST_REQUIRE_MESSAGE(retval[i]->first_context <= retval[i]->last_context,errors.str());
811 
812  for (int c = retval[i]->first_context;
813  c <= retval[i]->last_context;
814  c++) {
815 
816  errors.str("");
817  errors << "Chunk " << i << ", BlastQueryInfo::context " << c;
818 
819  ostringstream ctx;
820  ctx << kPrefix << "context" << c << ".";
821 
822  retval[i]->contexts[c].query_offset =
823  m_Config->GetInt(kTestName, ctx.str() +
824  "query_offset", kDefaultIntValue);
825  BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].query_offset >= 0,
826  errors.str() + " query_offset >= 0");
827 
828  retval[i]->contexts[c].query_length =
829  m_Config->GetInt(kTestName, ctx.str() +
830  "query_length", kDefaultIntValue);
831  BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].query_length >= 0,
832  errors.str() + " query_length >= 0");
833 
834  retval[i]->contexts[c].eff_searchsp =
835  m_Config->GetInt(kTestName, ctx.str() +
836  "eff_searchsp", kDefaultIntValue);
837  BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].eff_searchsp >= 0,
838  errors.str() + " eff_searchsp >= 0");
839 
840  retval[i]->contexts[c].length_adjustment =
841  m_Config->GetInt(kTestName, ctx.str() +
842  "length_adjustment", kDefaultIntValue);
843  BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].length_adjustment >= 0,
844  errors.str() + " length_adjustment >= 0");
845 
846  retval[i]->contexts[c].query_index =
847  m_Config->GetInt(kTestName, ctx.str() +
848  "query_index", kDefaultIntValue);
849  BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].query_index >= 0,
850  errors.str() + " query_index");
851 
852  retval[i]->contexts[c].frame =
853  m_Config->GetInt(kTestName, ctx.str() +
854  "frame", kDefaultIntValue);
855  BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].frame == 1
856  || retval[i]->contexts[c].frame == 2
857  || retval[i]->contexts[c].frame == 3
858  || retval[i]->contexts[c].frame == -1
859  || retval[i]->contexts[c].frame == -2
860  || retval[i]->contexts[c].frame == -3
861  || retval[i]->contexts[c].frame == 0,
862  errors.str() + " frame");
863 
864  retval[i]->contexts[c].is_valid =
865  m_Config->GetBool(kTestName, ctx.str() +
866  "is_valid", false);
867  BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].is_valid,
868  errors.str() + " is_valid");
869  }
870  s_CalculateMaxLength(retval[i]);
871  }
872  }
873 
874  /// Reads one config file entry per chunk and parses each one into a vector.
875  /// Entries are named ChunkNX, where N is the chunk number and X is the value of data_to_read
876  /// @param kTestName name of the test to read data for [in]
877  /// @param data_to_read suffix of the per-chunk entry name to read [in]
878  /// @param retval vector of vectors where the data will be returned. The
879  /// outer vector will contain as many elements as there are chunks, and
880  /// the contained vectors will contain as many elements as there are items
881  /// on the config file (comma separated values) [out]
     /// @throw runtime_error if the NumChunks value read for kTestName equals
     /// kDefaultIntValue
882  template <class T>
     // NOTE(review): listing line 883 is absent from this copy of the file. It
     // presumably declares x_ReadVectorOfVectorsForTest together with its first
     // parameter kTestName (judging by the call sites in the test cases) --
     // confirm against the repository version.
884  const char* data_to_read,
885  vector< vector<T> >& retval)
886  {
887  ostringstream os;
888 
     // kDefaultIntValue is what GetInt is asked to return when it has no
     // NumChunks value, so getting it back is treated as a configuration error
889  const int kNumChunks = m_Config->GetInt(kTestName, "NumChunks",
890  kDefaultIntValue);
891  if (kNumChunks == kDefaultIntValue) {
892  throw runtime_error("Invalid number of chunks in " + kTestName);
893  }
894 
     // Discard whatever the caller passed in; one inner vector per chunk
895  retval.clear();
896  retval.resize(kNumChunks);
897 
898  for (int i = 0; i < kNumChunks; i++) {
     // Build the entry name "Chunk<i><data_to_read>", reusing the stream
899  os.str("");
900  os << "Chunk" << i << data_to_read;
901 
902  const string& value = m_Config->Get(kTestName, os.str());
903  x_ParseConfigLine(value, retval[i]);
904  }
905  }
906 
907  /// Tokenizes a string containing comma-separated values into a vector of
908  /// values
909  /// @param input string to tokenize [in]
910  /// @param retval vector containing elements found in input string [out]
911  template <class T>
912  void x_ParseConfigLine(const string& input, vector<T>& retval)
913  {
914  retval.clear();
915  vector<string> tokens;
916  NStr::Split(input, ",", tokens);
917  retval.reserve(tokens.size());
918  ITERATE(vector<string>, token, tokens) {
919  retval.push_back(NStr::StringToInt(NStr::TruncateSpaces(*token)));
920  }
921  }
922 
923  /***************** Generic validation methods ****************************/
924 
925  /// Auxiliary method to validate the chunk bounds calculated by the
926  /// CSplitQueryBlk object and the x_ComputeQueryChunkBounds method.
     /// Each chunk's bounds are checked three ways: the range produced by
     /// x_ComputeQueryChunkBounds, the range stored in sqb, and a closed-form
     /// computation done here must all agree.
927  /// @param kChunkSize size of the chunk [in]
928  /// @param kQuerySize size of the full query [in]
929  /// @param sqb the CSplitQueryBlk object to test [in]
930  /// @param p the program type, used to look up the chunk overlap size [in]
     // NOTE(review): listing lines 931 and 934 are absent from this copy of
     // the file; they presumably carry the function name with the kChunkSize
     // parameter and the declaration of p -- confirm against the repository.
932  size_t kQuerySize,
933  const CSplitQueryBlk& sqb,
935  {
936  const size_t kNumChunks(sqb.GetNumChunks());
937  const size_t kQueryChunkOverlapSize = SplitQuery_GetOverlapChunkSize(p);
938 
     // The same range object is handed to x_ComputeQueryChunkBounds once per
     // iteration; presumably each call advances it to the next chunk -- confirm
939  TChunkRange expected_chunk_range(TChunkRange::GetEmpty());
940  for (size_t i = 0; i < kNumChunks; i++) {
941  x_ComputeQueryChunkBounds(expected_chunk_range, kChunkSize,
942  kQuerySize, kQueryChunkOverlapSize);
     // 1) The bounds stored in sqb must match the computed range
943  TChunkRange chunk_range = sqb.GetChunkBounds(i);
944  BOOST_REQUIRE_EQUAL(expected_chunk_range.GetFrom(),
945  chunk_range.GetFrom());
946  BOOST_REQUIRE_EQUAL(expected_chunk_range.GetToOpen(),
947  chunk_range.GetToOpen());
     // 2) Closed form: chunk i starts at i * (chunk size - overlap) and
     // ends kChunkSize later, clipped to the query size
948  TSeqPos chunk_start = i*kChunkSize - (i*kQueryChunkOverlapSize);
949  TSeqPos chunk_end = chunk_start + kChunkSize > kQuerySize
950  ? kQuerySize
951  : chunk_start + kChunkSize;
952  BOOST_REQUIRE_EQUAL(expected_chunk_range.GetFrom(), chunk_start);
953  BOOST_REQUIRE_EQUAL(expected_chunk_range.GetToOpen(), chunk_end);
954  TSeqPos chunk_length = chunk_end - chunk_start;
955  BOOST_REQUIRE_EQUAL(chunk_length,
956  expected_chunk_range.GetLength());
957  }
958  }
959 
960  /// Validates the query sequences (by index) assigned to all the chunks.
961  /// This compares the data calculated by the sqb parameter to the data read
962  /// from the config file in queries_per_chunk
963  /// @param sqb CSplitQueryBlk object to test [in]
964  /// @param queries_per_chunk expected query indices, one vector per chunk [in]
     // NOTE(review): listing line 965 is absent from this copy of the file; it
     // presumably declares x_ValidateQueriesPerChunkAssignment and the sqb
     // parameter (see the call sites in the blastp tests) -- confirm.
966  const vector< vector<size_t> >&
967  queries_per_chunk)
968  {
     // The expected data must describe exactly as many chunks as sqb holds
969  const size_t kNumChunks = sqb.GetNumChunks();
970  BOOST_REQUIRE_EQUAL(kNumChunks, queries_per_chunk.size());
971 
972  for (size_t i = 0; i < kNumChunks; i++) {
973  ostringstream os;
974  os << "Chunk number " << i << " has an invalid number of queries";
975 
     // Sizes are checked first so the element-wise loop below cannot index
     // past the end of queries_per_chunk[i]
976  vector<size_t> data2test = sqb.GetQueryIndices(i);
977  BOOST_REQUIRE_MESSAGE(queries_per_chunk[i].size()==data2test.size(),os.str());
978 
979  for (size_t j = 0; j < data2test.size(); j++) {
980  os.str("");
981  os << "Query index mismatch in chunk number " << i
982  << " entry number " << j;
983  BOOST_REQUIRE_MESSAGE(queries_per_chunk[i][j]==data2test[j],os.str());
984  }
985  }
986  }
987 
988  /// Validates the query contexts assigned to all the chunks by comparing
     /// what sqb reports for each chunk with the expected values, element by
     /// element
989  /// @param sqb CSplitQueryBlk object to test [in]
990  /// @param contexts_per_chunk expected contexts read from the config file,
     /// one vector per chunk [in]
     // NOTE(review): listing line 991 is absent from this copy of the file; it
     // presumably declares x_ValidateQueryContextsPerChunkAssignment and the
     // sqb parameter (see the call sites in the blastp tests) -- confirm.
992  const vector< vector<int> >&
993  contexts_per_chunk)
994  {
995  const size_t kNumChunks = sqb.GetNumChunks();
996 
     // The expected data must describe exactly as many chunks as sqb holds
997  BOOST_REQUIRE_EQUAL(kNumChunks, contexts_per_chunk.size());
998  for (size_t i = 0; i < kNumChunks; i++) {
999  ostringstream os;
1000  os << "Chunk number " << i << " has an invalid number of contexts";
1001 
      // Sizes are checked first so the element-wise loop below cannot index
      // past the end of contexts_per_chunk[i]
1002  vector<int> data2test = sqb.GetQueryContexts(i);
1003  BOOST_REQUIRE_MESSAGE(contexts_per_chunk[i].size()==data2test.size(),os.str());
1004 
1005  for (size_t j = 0; j < data2test.size(); j++) {
1006  os.str("");
1007  os << "Context index mismatch in chunk number " << i
1008  << " entry number " << j;
1009  BOOST_REQUIRE_MESSAGE(contexts_per_chunk[i][j]==data2test[j],os.str());
1010  }
1011  }
1012  }
1013 
1014  /// Validates the context offsets assigned to all the chunks by comparing
      /// what sqb reports for each chunk with the expected values, element by
      /// element
1015  /// @param sqb CSplitQueryBlk object to test [in]
1016  /// @param contexts_offsets_per_chunk expected offsets read from the config
      /// file, one vector per chunk [in]
      // NOTE(review): listing line 1017 is absent from this copy of the file;
      // it presumably declares x_ValidateContextOffsetsPerChunkAssignment and
      // the sqb parameter (see the call sites in the blastp tests) -- confirm.
1018  const vector< vector<size_t> >&
1019  contexts_offsets_per_chunk)
1020  {
1021  const size_t kNumChunks(sqb.GetNumChunks());
      // The expected data must describe exactly as many chunks as sqb holds
1022  BOOST_REQUIRE_EQUAL(kNumChunks, contexts_offsets_per_chunk.size());
1023  for (size_t i = 0; i < kNumChunks; i++) {
1024  ostringstream os;
1025  os << "Chunk number " << i
1026  << " has an invalid number of context offsets";
1027 
      // Sizes are checked first so the element-wise loop below cannot index
      // past the end of contexts_offsets_per_chunk[i]
1028  vector<size_t> data2test = sqb.GetContextOffsets(i);
1029  BOOST_REQUIRE_MESSAGE(contexts_offsets_per_chunk[i].size()==data2test.size(),os.str());
1030 
1031  for (size_t j = 0; j < data2test.size(); j++) {
      // The failure message reports both the actual and the expected value
1032  os.str("");
1033  os << "Context offset mismatch in chunk number " << i
1034  << " entry number " << j << " value now " << data2test[j]
1035  << " not " << contexts_offsets_per_chunk[i][j];
1036 // TLM cerr << "data2test " << data2test[j] << " ";
1037  BOOST_REQUIRE_MESSAGE(contexts_offsets_per_chunk[i][j]==data2test[j],os.str());
1038  }
1039 // TLM cerr << endl;
1040  }
1041  }
1042 
1043  /// Validate the query info structure generated (test) against the expected
1044  /// one (reference) (N.B.: this is called from x_ValidateLocalQueryData)
1045  /// @param reference The "good" BlastQueryInfo structure [in]
1046  /// @param test the BlastQueryInfo structure to test [in]
1047  /// @param chunk_num the chunk number being tested, this is needed for error
1048  /// reporting purposes [in]
      // NOTE(review): listing line 1049 is absent from this copy of the file;
      // it presumably declares x_ValidateQueryInfoForChunk and the reference
      // parameter -- confirm against the repository version. The only visible
      // call to this method (in x_ValidateLocalQueryData) is commented out.
1050  const BlastQueryInfo* test,
1051  size_t chunk_num)
1052  {
1053  ostringstream os;
1054 
      // Structure-level fields; os is reset before each message is built
1055  os << "Chunk " << chunk_num << ": BlastQueryInfo::first_context";
1056  BOOST_REQUIRE_MESSAGE(reference->first_context==test->first_context,os.str());
1057 
1058  os.str("");
1059  os << "Chunk " << chunk_num << ": BlastQueryInfo::last_context";
1060  BOOST_REQUIRE_MESSAGE(reference->last_context==test->last_context,os.str());
1061 
1062  os.str("");
1063  os << "Chunk " << chunk_num << ": BlastQueryInfo::num_queries";
1064  BOOST_REQUIRE_MESSAGE(reference->num_queries==test->num_queries,os.str());
1065 
1066  os.str("");
1067  os << "Chunk " << chunk_num << ": BlastQueryInfo::max_length";
1068  BOOST_REQUIRE_MESSAGE(reference->max_length==test->max_length,os.str());
1069 
      // pattern_info is compared as a member value (==), not field by field
1070  os.str("");
1071  os << "Chunk " << chunk_num << ": BlastQueryInfo::pattern_info";
1072  BOOST_REQUIRE_MESSAGE(reference->pattern_info==test->pattern_info,os.str());
1073 
      // Per-context fields, over the reference's [first_context, last_context]
      // range (already checked above to equal the test structure's range).
      // length_adjustment is not among the fields compared here.
1074  for (Int4 ctx = reference->first_context;
1075  ctx <= reference->last_context;
1076  ctx++) {
1077 
1078  os.str("");
1079  os << "Chunk " << chunk_num << ", context " << ctx;
1080  BOOST_REQUIRE_MESSAGE(reference->contexts[ctx].query_offset==test->contexts[ctx].query_offset,
1081  os.str() + " query_offset");
1082  BOOST_REQUIRE_MESSAGE(reference->contexts[ctx].query_length==test->contexts[ctx].query_length,
1083  os.str() + " query_length");
1084  BOOST_REQUIRE_MESSAGE(reference->contexts[ctx].eff_searchsp==test->contexts[ctx].eff_searchsp,
1085  os.str() + " eff_searchsp");
1086  BOOST_REQUIRE_MESSAGE(reference->contexts[ctx].query_index==test->contexts[ctx].query_index,
1087  os.str() + " query_index");
1088  BOOST_REQUIRE_MESSAGE((int)reference->contexts[ctx].frame==(int)test->contexts[ctx].frame,
1089  os.str() + " frame");
1090  BOOST_REQUIRE_MESSAGE(reference->contexts[ctx].is_valid==test->contexts[ctx].is_valid,
1091  os.str() + " is_valid");
1092 
1093  }
1094  }
1095 
1096  /// Validate the local query data for all chunks, comparing data produced
1097  /// by the CQuerySplitter object and the BlastQueryInfo structures read
1098  /// from the config file (BLAST_SequenceBlk's are not tested).
      /// As written, only the number of queries per chunk is checked; the
      /// BlastQueryInfo comparison is commented out (see FIXME below).
1099  /// @param splitter object to test [in]
1100  /// @param options BLAST options [in]
1101  /// @param split_query_info_structs the data to compare to (reference) [in]
      // NOTE(review): listing lines 1102 and 1117 are absent from this copy of
      // the file. 1102 presumably declares x_ValidateLocalQueryData and the
      // splitter parameter; 1117 presumably declares qf, the query factory for
      // chunk i that the loop body uses -- confirm against the repository.
1103  const CBlastOptions* options,
1104  vector<BlastQueryInfo*>
1105  split_query_info_structs)
1106  {
1107  ostringstream os;
1108  BOOST_REQUIRE(options);
1109  const size_t kNumChunks(splitter->GetNumberOfChunks());
1110 
      // There must be one reference structure per chunk
1111  CRef<CSplitQueryBlk> sqb = splitter->Split();
1112  BOOST_REQUIRE_EQUAL(kNumChunks, split_query_info_structs.size());
1113 
1114  for (size_t i = 0; i < kNumChunks; i++) {
      // os holds the "Chunk <i>: " prefix shared by this iteration's messages
1115  os.str("");
1116  os << "Chunk " << i << ": ";
1118  BOOST_REQUIRE_MESSAGE(qf.NotEmpty(),os.str() + "NULL query factory");
1119  CRef<ILocalQueryData> qd = qf->MakeLocalQueryData(options);
1120  BOOST_REQUIRE_MESSAGE(qd.NotEmpty(),os.str() + "NULL local query data");
1121 
      // The query count recorded in sqb must match the chunk's local query data
1122  os << "Different number of queries";
1123  BOOST_REQUIRE_MESSAGE((size_t)sqb->GetNumQueriesForChunk(i)==(size_t)qd->GetNumQueries(),os.str());
1124 
1125  // FIXME: turned off for now
1126  // Validate the query info structure
1127  //x_ValidateQueryInfoForChunk(split_query_info_structs[i],
1128  // qd->GetQueryInfo(), i);
1129 
1130  //x_ValidateSequenceBlkForChunk();
1131 
1132  // Validate that query in this chunk is indeed valid
1133  //for (int qindex = 0; qindex < qd->GetNumQueries(); qindex++) {
1134  // os.str("Chunk ");
1135  // os << i << ": query " << qindex << " is invalid";
1136  // BOOST_REQUIRE_MESSAGE(qd->IsValidQuery(qindex),os.str());
1137  //}
1138 
1139  }
1140 
1141  }
1142 };
1143 
1144 BOOST_FIXTURE_TEST_SUITE(split_query, CSplitQueryTestFixture)
1145 
1146 /*********** Actual unit tests ***************************************/
/// Exercises the SplitQueryBlk C API through a CSplitQueryBlk: adding to an
/// invalid chunk must fail, and the queries/contexts added to chunk 0 must be
/// reported back in the order they were added.
// NOTE(review): listing lines 1148, 1168, 1179 and 1189 are absent from this
// copy of the file. They presumably create sqb and start the three calls that
// fill query_indices, query_contexts and num_queries -- confirm against the
// repository version.
1147 BOOST_AUTO_TEST_CASE(SplitQueriesIn1Chunk) {
1149  Int2 rv;
1150 
      // Adding a query to chunk 2 is expected to be rejected
1151  rv = SplitQueryBlk_AddQueryToChunk(sqb->GetCStruct(), 41, 2);
1152  BOOST_REQUIRE_EQUAL(kBadParameter, rv);
1153 
1154  /// This will be reused for both query indices and contexts
1155  vector<Int4> query_indices_expected;
1156  query_indices_expected.push_back(45);
1157  query_indices_expected.push_back(0);
1158  query_indices_expected.push_back(7);
1159 
      // Each value is added to chunk 0 both as a query index and as a context
1160  ITERATE(vector<Int4>, qi, query_indices_expected) {
1161  rv = SplitQueryBlk_AddQueryToChunk(sqb->GetCStruct(), *qi, 0);
1162  BOOST_REQUIRE_EQUAL((Int2)0, rv);
1163  rv = SplitQueryBlk_AddContextToChunk(sqb->GetCStruct(), *qi, 0);
1164  BOOST_REQUIRE_EQUAL((Int2)0, rv);
1165  }
1166 
1167  Uint4* query_indices = NULL;
1169  &query_indices);
1170  BOOST_REQUIRE_EQUAL((Int2)0, rv);
      // The returned array is walked until its UINT4_MAX terminator.
      // NOTE(review): query_indices_expected[i] is not bounds-checked here, so
      // more returned entries than expected would index past its end.
1171  for (int i = 0; query_indices[i] != UINT4_MAX; i++) {
1172  BOOST_REQUIRE_EQUAL(query_indices_expected[i],
1173  (Int4)query_indices[i]);
1174  }
1175  sfree(query_indices);
1176 
1177  Int4* query_contexts = NULL;
1178  Uint4 num_query_contexts = 0;
1180  &query_contexts,
1181  &num_query_contexts);
1182  BOOST_REQUIRE_EQUAL((Int2)0, rv);
      // Contexts come back with an explicit count rather than a terminator
1183  for (Uint4 i = 0; i < num_query_contexts; i++) {
1184  BOOST_REQUIRE_EQUAL(query_indices_expected[i], query_contexts[i]);
1185  }
1186  sfree(query_contexts);
1187 
1188  size_t num_queries(0);
1190  &num_queries);
1191  BOOST_REQUIRE_EQUAL((Int2)0, rv);
1192  BOOST_REQUIRE_EQUAL(query_indices_expected.size(), num_queries);
1193 }
1194 
/// Builds a random chunk -> query-index assignment (1 to 100 chunks, 1 to 365
/// queries per chunk), loads it into a CSplitQueryBlk and checks that the
/// SplitQueryBlk C API reports the same assignment back.
// NOTE(review): listing lines 1214 and 1224 are absent from this copy of the
// file; they presumably declare rv and start the calls that add *qi to a chunk
// and that fill query_indices -- confirm against the repository version.
1195 BOOST_AUTO_TEST_CASE(SplitQueriesRandomly) {
      // NOTE(review): the generator is seeded from the current time and the
      // seed is not reported, so a failing data set cannot be reproduced.
1196  CRandom random((CRandom::TValue)time(0));
1197  const Uint4 kNumChunks(random.GetRand(1, 100));
1198  TSplitQueryChunkMap map;
1199  map.resize(kNumChunks);
1200  Uint4 query_index = 0;
1201 
1202  // Set up the artificial data
      // query_index keeps increasing across chunks, so every query index is
      // assigned to exactly one chunk
1203  for (Uint4 chunk_num = 0; chunk_num < kNumChunks; chunk_num++) {
1204  const Uint4 kQueriesPerChunk(random.GetRand(1, 365));
1205  for (Uint4 i = 0; i < kQueriesPerChunk; i++) {
1206  map[chunk_num].push_back(query_index++);
1207  }
1208  }
1209 
1210  // Set up the SplitQueryBlk structure
1211  CRef<CSplitQueryBlk> sqb(new CSplitQueryBlk(kNumChunks));
1212  for (size_t chunk_num = 0; chunk_num < map.size(); chunk_num++) {
1213  ITERATE(vector<Uint4>, qi, map[chunk_num]) {
1215  chunk_num);
1216  BOOST_REQUIRE_EQUAL((Int2)0, rv);
1217  }
1218  }
1219 
      // Read every chunk back and compare it with the data generated above
1220  for (Uint4 chunk_num = 0; chunk_num < kNumChunks; chunk_num++) {
1221  vector<Uint4> query_indices_expected = map[chunk_num];
1222 
1223  Uint4* query_indices = NULL;
1225  chunk_num,
1226  &query_indices);
1227  BOOST_REQUIRE_EQUAL((Int2)0, rv);
1228  BOOST_REQUIRE(query_indices != NULL);
1229 
      // i is declared outside the loop so that, afterwards, it indexes the
      // slot right after the last expected entry: the UINT4_MAX terminator
1230  size_t i;
1231  for (i = 0; i < query_indices_expected.size(); i++) {
1232  BOOST_REQUIRE_EQUAL(query_indices_expected[i],
1233  query_indices[i]);
1234  }
1235  BOOST_REQUIRE_EQUAL((Uint4)UINT4_MAX, query_indices[i]);
1236  sfree(query_indices);
1237 
1238  size_t num_queries(0);
1239  rv = SplitQueryBlk_GetNumQueriesForChunk(sqb->GetCStruct(), chunk_num,
1240  &num_queries);
1241  BOOST_REQUIRE_EQUAL((Int2)0, rv);
1242  BOOST_REQUIRE_EQUAL(query_indices_expected.size(), num_queries);
1243  }
1244 }
1245 
/// Assigns four queries to three chunks (queries 0 and 1 to chunk 0, query 2
/// to chunk 1, query 3 to chunk 2) and checks that the SplitQueryBlk C API
/// reports the same assignment back.
// NOTE(review): listing line 1269 is absent from this copy of the file; it
// presumably declares rv and starts the call that fills query_indices --
// confirm against the repository version.
1246 BOOST_AUTO_TEST_CASE(Split4QueriesIn3Chunks) {
1247  const Uint4 kNumChunks = 3;
1248  TSplitQueryChunkMap map;
1249  map.resize(kNumChunks);
1250  map[0].push_back(0);
1251  map[0].push_back(1);
1252  map[1].push_back(2);
1253  map[2].push_back(3);
1254 
1255  CRef<CSplitQueryBlk> sqb(new CSplitQueryBlk(kNumChunks));
1256 
      // Load the assignment into the SplitQueryBlk structure
1257  for (Uint4 chunk_num = 0; chunk_num < map.size(); chunk_num++) {
1258  ITERATE(vector<Uint4>, qi, map[chunk_num]) {
1259  Int2 rv = SplitQueryBlk_AddQueryToChunk(sqb->GetCStruct(), *qi,
1260  chunk_num);
1261  BOOST_REQUIRE_EQUAL((Int2)0, rv);
1262  }
1263  }
1264 
      // Read every chunk back and compare it with the assignment above
1265  for (Uint4 chunk_num = 0; chunk_num < kNumChunks; chunk_num++) {
1266  vector<Uint4> query_indices_expected = map[chunk_num];
1267 
1268  Uint4* query_indices = NULL;
1270  chunk_num,
1271  &query_indices);
1272  BOOST_REQUIRE_EQUAL((Int2)0, rv);
1273  BOOST_REQUIRE(query_indices != NULL);
1274 
      // i is declared outside the loop so that, afterwards, it indexes the
      // slot right after the last expected entry: the UINT4_MAX terminator
1275  size_t i;
1276  for (i = 0; i < query_indices_expected.size(); i++) {
1277  BOOST_REQUIRE_EQUAL(query_indices_expected[i],
1278  query_indices[i]);
1279  }
1280  BOOST_REQUIRE_EQUAL((Uint4)UINT4_MAX, query_indices[i]);
1281  sfree(query_indices);
1282 
1283  size_t num_queries(0);
1284  rv = SplitQueryBlk_GetNumQueriesForChunk(sqb->GetCStruct(), chunk_num,
1285  &num_queries);
1286  BOOST_REQUIRE_EQUAL((Int2)0, rv);
1287  BOOST_REQUIRE_EQUAL(query_indices_expected.size(), num_queries);
1288  }
1289 }
1290 
1291 /// Tests query splitting for blastn of both strands of a single query into
1292 /// multiple chunks
1293 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnSingleQueryMultiChunk_BothStrands) {
1294  CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1295  const string
1296  kTestName("QuerySplitter_BlastnSingleQueryMultiChunk_BothStrands");
1297 
1298  QuerySplitter_BlastnSingleQueryMultiChunk(kTestName, eNa_strand_both);
1299 }
1300 
1301 /// Tests query splitting for blastn of the plus strands of a single query
1302 /// into multiple chunks
1303 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnSingleQueryMultiChunk_PlusStrand) {
1304  CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1305  const string
1306  kTestName("QuerySplitter_BlastnSingleQueryMultiChunk_PlusStrand");
1307 
1308  QuerySplitter_BlastnSingleQueryMultiChunk(kTestName, eNa_strand_plus);
1309 }
1310 
1311 /// Tests query splitting for blastn of the minus strands of a single query
1312 /// into multiple chunks
1313 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnSingleQueryMultiChunk_MinusStrand) {
1314  CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1315  const string
1316  kTestName("QuerySplitter_BlastnSingleQueryMultiChunk_MinusStrand");
1317 
1318  QuerySplitter_BlastnSingleQueryMultiChunk(kTestName, eNa_strand_minus);
1319 }
1320 
1321 /// Tests query splitting for blastn of the plus strands of multiple queries
1322 /// into multiple chunks
1323 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnMultiQueryMultiChunk_PlusStrand) {
1324  CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1325  const string
1326  kTestName("QuerySplitter_BlastnMultiQueryMultiChunk_PlusStrand");
1327 
1328  QuerySplitter_BlastnMultiQueryMultiChunk(kTestName, eNa_strand_plus);
1329 }
1330 
1331 /// Tests query splitting for blastn of the minus strands of multiple
1332 /// queries into multiple chunks
1333 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnMultiQueryMultiChunk_MinusStrand) {
1334  CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1335  const string
1336  kTestName("QuerySplitter_BlastnMultiQueryMultiChunk_MinusStrand");
1337 
1338  QuerySplitter_BlastnMultiQueryMultiChunk(kTestName, eNa_strand_minus);
1339 }
1340 
1341 /// Tests query splitting for blastn of both strands of multiple
1342 /// queries into multiple chunks
1343 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnMultiQueryMultiChunk_BothStrands) {
1344  CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1345  const string
1346  kTestName("QuerySplitter_BlastnMultiQueryMultiChunk_BothStrands");
1347  QuerySplitter_BlastnMultiQueryMultiChunk(kTestName, eNa_strand_both);
1348 }
1349 
1350 /// Tests query splitting for blastn with multiple queries in multiple
1351 /// chunks, with each query using a different strand (plus, both, minus,
     /// unknown, in that order)
     // NOTE(review): listing line 1364 is absent from this copy of the file;
     // it presumably carries the second argument of the call below (a single
     // strand value, as in the sibling tests) -- confirm against the
     // repository version.
1352 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnMultiQueryMultiChunk_MixedStrands) {
1353  CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1354  const string
1355  kTestName("QuerySplitter_BlastnMultiQueryMultiChunk_MixedStrands");
      // One strand per query, passed to the helper by address
1356  vector<ENa_strand> query_strands;
1357  query_strands.reserve(4);
1358  query_strands.push_back(eNa_strand_plus);
1359  query_strands.push_back(eNa_strand_both);
1360  query_strands.push_back(eNa_strand_minus);
1361  query_strands.push_back(eNa_strand_unknown);
1362 
1363  QuerySplitter_BlastnMultiQueryMultiChunk(kTestName,
1365  &query_strands);
1366 }
1367 
1368 /********* This functionality has not been implemented **************/
     // Everything up to the matching #endif is compiled out. The test cases are
     // kept as a template for blastx query splitting tests; the helpers they
     // call (QuerySplitter_BlastxSingleQueryMultiChunk and
     // QuerySplitter_BlastxMultiQueryMultiChunk) are not visible in this part
     // of the file.
1369 #if 0
1370 /// Tests blastx of both strands of a single query into multiple chunks
1371 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxSingleQueryMultiChunk_BothStrands) {
1372  const string
1373  kTestName("QuerySplitter_BlastxSingleQueryMultiChunk_BothStrands");
1374 
1375  QuerySplitter_BlastxSingleQueryMultiChunk(kTestName, eNa_strand_both);
1376 }
1377 
1378 /// Tests blastx of the plus strand of a single query into multiple chunks
1379 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxSingleQueryMultiChunk_PlusStrand) {
1380  const string
1381  kTestName("QuerySplitter_BlastxSingleQueryMultiChunk_PlusStrand");
1382 
1383  QuerySplitter_BlastxSingleQueryMultiChunk(kTestName, eNa_strand_plus);
1384 }
1385 
1386 /// Tests blastx of the minus strand of a single query into multiple chunks
1387 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxSingleQueryMultiChunk_MinusStrand) {
1388  const string
1389  kTestName("QuerySplitter_BlastxSingleQueryMultiChunk_MinusStrand");
1390 
1391  QuerySplitter_BlastxSingleQueryMultiChunk(kTestName, eNa_strand_minus);
1392 }
1393 
1394 
1395 /// Tests blastx of the plus strand of multiple queries into multiple chunks
1396 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxMultiQueryMultiChunk_PlusStrand) {
1397  const string
1398  kTestName("QuerySplitter_BlastxMultiQueryMultiChunk_PlusStrand");
1399 
1400  QuerySplitter_BlastxMultiQueryMultiChunk(kTestName, eNa_strand_plus);
1401 }
1402 
1403 /// Tests blastx of the minus strand of multiple queries into multiple
1404 /// chunks
1405 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxMultiQueryMultiChunk_MinusStrand) {
1406  const string
1407  kTestName("QuerySplitter_BlastxMultiQueryMultiChunk_MinusStrand");
1408 
1409  QuerySplitter_BlastxMultiQueryMultiChunk(kTestName, eNa_strand_minus);
1410 }
1411 
1412 /// Tests blastx of both strands of multiple queries into multiple
1413 /// chunks
1414 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxMultiQueryMultiChunk_BothStrands) {
1415  const string
1416  kTestName("QuerySplitter_BlastxMultiQueryMultiChunk_BothStrands");
1417 
1418  QuerySplitter_BlastxMultiQueryMultiChunk(kTestName, eNa_strand_both);
1419 }
1420 
     /// Tests blastx with multiple queries in multiple chunks, with each query
     /// using a different strand (unknown, plus, both, minus, in that order)
1421 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxMultiQueryMultiChunk_MixedStrands) {
1422  const string
1423  kTestName("QuerySplitter_BlastxMultiQueryMultiChunk_MixedStrands");
      // One strand per query, passed to the helper by address
1424  vector<ENa_strand> query_strands;
1425  query_strands.reserve(4);
1426  query_strands.push_back(eNa_strand_unknown);
1427  query_strands.push_back(eNa_strand_plus);
1428  query_strands.push_back(eNa_strand_both);
1429  query_strands.push_back(eNa_strand_minus);
1430 
1431  QuerySplitter_BlastxMultiQueryMultiChunk(kTestName, eNa_strand_unknown,
1432  &query_strands);
1433 }
1434 
1435 #endif
1436 
1437 /// Tests blastp of a single query into multiple chunks. The expected chunk
     /// size, chunk count and per-chunk data are read from m_Config under
     /// kTestName.
     // NOTE(review): listing lines 1442, 1446 and 1447 are absent from this
     // copy of the file; they presumably declare query, qf and opts_h, which
     // the visible code uses -- confirm against the repository version.
1438 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastpSingleQueryMultiChunk) {
1439  const string kTestName("QuerySplitter_BlastpSingleQueryMultiChunk");
1440 
1441  const size_t kLength = 33423; // query length
1443  CSeq_id id(CSeq_id::e_Gi, 110349719);
1444  query.AddQuery(CTestObjMgr::Instance().CreateBlastSearchQuery(id));
1445 
1448  CRef<CBlastOptions> opts(&opts_h->SetOptions());
1449  CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
1450 
1451  CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
1452  CRef<CSplitQueryBlk> sqb = splitter->Split();
1453 
      // The splitter's chunk size must match the configured one
1454  BOOST_REQUIRE_EQUAL(m_Config->GetInt(kTestName, "ChunkSize",
1455  kDefaultIntValue),
1456  (int)splitter->GetChunkSize());
1457 
1458  CQuerySplitter::TSplitQueryVector split_query_vector;
1459  x_ReadQueryBoundsPerChunk(kTestName, sqb, split_query_vector);
1460  x_ValidateQuerySeqLocsPerChunk(splitter, split_query_vector);
1461 
      // Sanity check on the input before validating the chunk bounds
1462  BOOST_REQUIRE_EQUAL(kLength, query_data->GetSumOfSequenceLengths());
1463  x_ValidateChunkBounds(splitter->GetChunkSize(),
1464  query_data->GetSumOfSequenceLengths(),
1465  *sqb, opts->GetProgramType());
1466 
      // Splitter and split block must both agree with the configured count
1467  const size_t kNumChunks = (size_t)m_Config->GetInt(kTestName,
1468  "NumChunks",
1469  kDefaultIntValue);
1470  BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
1471  BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
1472 
      // Per-chunk expectations: each is read from the config, then validated
1473  vector< vector<size_t> > queries_per_chunk;
1474  x_ReadVectorOfVectorsForTest(kTestName, "Queries", queries_per_chunk);
1475  x_ValidateQueriesPerChunkAssignment(*sqb, queries_per_chunk);
1476 
1477  vector< vector<int> > ctxs_per_chunk;
1478  x_ReadVectorOfVectorsForTest(kTestName, "Contexts", ctxs_per_chunk);
1479  x_ValidateQueryContextsPerChunkAssignment(*sqb, ctxs_per_chunk);
1480 
1481  vector< vector<size_t> > ctx_offsets_per_chunk;
1482  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1483  ctx_offsets_per_chunk);
1484  x_ValidateContextOffsetsPerChunkAssignment(*sqb, ctx_offsets_per_chunk);
1485 
1486  vector<BlastQueryInfo*> split_query_info;
1487  x_ReadSplitQueryInfoForTest(kTestName, opts->GetProgramType(),
1488  split_query_info);
1489  x_ValidateLocalQueryData(splitter, &*opts, split_query_info);
      // Release the reference structures read above.
      // NOTE(review): presumably skipped if a BOOST_REQUIRE in
      // x_ValidateLocalQueryData aborts the test first -- confirm.
1490  NON_CONST_ITERATE(vector<BlastQueryInfo*>, itr, split_query_info) {
1491  *itr = BlastQueryInfoFree(*itr);
1492  }
1493 }
1494 
1495 /// Tests blastp of multiple queries into multiple chunks. The expected
     /// chunk size, chunk count and per-chunk data are read from m_Config under
     /// kTestName.
     // NOTE(review): listing line 1510 is absent from this copy of the file; it
     // presumably declares opts_h, which the visible code uses -- confirm
     // against the repository version.
1496 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastpMultiQueryMultiChunk) {
1497  const string kTestName("QuerySplitter_BlastpMultiQueryMultiChunk");
1498 
      // Four queries; presumably (GI, sequence length) pairs -- confirm
1499  TGiLengthVector gi_length;
1500  gi_length.push_back(make_pair<int, size_t>(33624848, 6883));
1501  gi_length.push_back(make_pair<int, size_t>(4758794, 6669));
1502  gi_length.push_back(make_pair<int, size_t>(66821305, 6061));
1503  gi_length.push_back(make_pair<int, size_t>(109075552, 5007));
1504 
1505  size_t tot_length;
1506  TSeqLocVector queries;
1507  s_ConvertToBlastQueries(gi_length, queries, &tot_length);
1508 
1509  CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(queries));
1511  CRef<CBlastOptions> opts(&opts_h->SetOptions());
1512  CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
1513 
1514  CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
1515  CRef<CSplitQueryBlk> sqb = splitter->Split();
1516 
      // The splitter's chunk size must match the configured one
1517  BOOST_REQUIRE_EQUAL(m_Config->GetInt(kTestName, "ChunkSize",
1518  kDefaultIntValue),
1519  (int)splitter->GetChunkSize());
1520 
1521  CQuerySplitter::TSplitQueryVector split_query_vector;
1522  x_ReadQueryBoundsPerChunk(kTestName, sqb, split_query_vector);
1523  x_ValidateQuerySeqLocsPerChunk(splitter, split_query_vector);
1524 
      // Sanity check on the input before validating the chunk bounds
1525  BOOST_REQUIRE_EQUAL(tot_length, query_data->GetSumOfSequenceLengths());
1526  x_ValidateChunkBounds(splitter->GetChunkSize(),
1527  query_data->GetSumOfSequenceLengths(),
1528  *sqb, opts->GetProgramType());
1529 
      // Splitter and split block must both agree with the configured count
1530  const size_t kNumChunks = (size_t)m_Config->GetInt(kTestName,
1531  "NumChunks",
1532  kDefaultIntValue);
1533  BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
1534  BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
1535 
      // Per-chunk expectations: each is read from the config, then validated
1536  vector< vector<size_t> > queries_per_chunk;
1537  x_ReadVectorOfVectorsForTest(kTestName, "Queries", queries_per_chunk);
1538  x_ValidateQueriesPerChunkAssignment(*sqb, queries_per_chunk);
1539 
1540  vector< vector<int> > ctxs_per_chunk;
1541  x_ReadVectorOfVectorsForTest(kTestName, "Contexts", ctxs_per_chunk);
1542  x_ValidateQueryContextsPerChunkAssignment(*sqb, ctxs_per_chunk);
1543 
1544  vector< vector<size_t> > ctx_offsets_per_chunk;
1545  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1546  ctx_offsets_per_chunk);
1547  x_ValidateContextOffsetsPerChunkAssignment(*sqb, ctx_offsets_per_chunk);
1548 
1549  vector<BlastQueryInfo*> split_query_info;
1550  x_ReadSplitQueryInfoForTest(kTestName, opts->GetProgramType(),
1551  split_query_info);
1552  x_ValidateLocalQueryData(splitter, &*opts, split_query_info);
      // Release the reference structures read above.
      // NOTE(review): presumably skipped if a BOOST_REQUIRE in
      // x_ValidateLocalQueryData aborts the test first -- confirm.
1553  NON_CONST_ITERATE(vector<BlastQueryInfo*>, itr, split_query_info) {
1554  *itr = BlastQueryInfoFree(*itr);
1555  }
1556 }
1557 
1558 /// Tests the CContextTranslator class for blastn of both strands of
1559 /// multiple queries
1560 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastnMultiQuery_BothStrands) {
1561  const string
1562  kTestName("TestCContextTranslator_BlastnMultiQuery_BothStrands");
1563  TGiLengthVector gi_length;
1564  gi_length.push_back(make_pair<int, size_t>(107784911, 1000));
1565  gi_length.push_back(make_pair<int, size_t>(115354032, 250));
1566  gi_length.push_back(make_pair<int, size_t>(115381005, 2551));
1567 
1568  const size_t chunk_size = 500;
1569  const size_t num_chunks = 9;
1570 
1571  vector< vector<int> > starting_chunks(num_chunks);
1572  vector< vector<int> > absolute_contexts(num_chunks);
1573  vector< vector<size_t> > context_offset_corrections(num_chunks);
1574 
1575  x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1576  starting_chunks);
1577  x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1578  absolute_contexts);
1579  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1580  context_offset_corrections);
1581 
1582  x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastn,
1583  starting_chunks, absolute_contexts,
1584  &context_offset_corrections,
1585  eNa_strand_both);
1586 }
1587 
1588 /// Tests the CContextTranslator class for blastn of the plus strand of
1589 /// multiple queries
1590 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastnMultiQuery_PlusStrand) {
1591  const string
1592  kTestName("TestCContextTranslator_BlastnMultiQuery_PlusStrand");
1593  TGiLengthVector gi_length;
1594  gi_length.push_back(make_pair<int, size_t>(107784911, 1000));
1595  gi_length.push_back(make_pair<int, size_t>(115354032, 250));
1596  gi_length.push_back(make_pair<int, size_t>(115381005, 2551));
1597 
1598  const size_t chunk_size = 500;
1599  const size_t num_chunks = 9;
1600 
1601  vector< vector<int> > starting_chunks(num_chunks);
1602  vector< vector<int> > absolute_contexts(num_chunks);
1603  vector< vector<size_t> > context_offset_corrections(num_chunks);
1604 
1605  x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1606  starting_chunks);
1607  x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1608  absolute_contexts);
1609  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1610  context_offset_corrections);
1611 
1612  x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastn,
1613  starting_chunks, absolute_contexts,
1614  &context_offset_corrections,
1615  eNa_strand_plus);
1616 }
1617 
1618 /// Tests the CContextTranslator class for blastn of the minus strand of
1619 /// multiple queries. The expected starting chunks, absolute contexts and
     /// context offsets are read from the config file under kTestName.
     // NOTE(review): listing line 1645 is absent from this copy of the file; it
     // presumably carries the final (strand) argument of the
     // x_TestCContextTranslator call -- confirm against the repository version.
1620 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastnMultiQuery_MinusStrand) {
1621  const string
1622  kTestName("TestCContextTranslator_BlastnMultiQuery_MinusStrand");
      // Three input queries; presumably (GI, sequence length) pairs -- confirm
1623  TGiLengthVector gi_length;
1624  gi_length.push_back(make_pair<int, size_t>(107784911, 1000));
1625  gi_length.push_back(make_pair<int, size_t>(115354032, 250));
1626  gi_length.push_back(make_pair<int, size_t>(115381005, 2551));
1627 
1628  const size_t chunk_size = 500;
1629  const size_t num_chunks = 9;
1630 
1631  vector< vector<int> > starting_chunks(num_chunks);
1632  vector< vector<int> > absolute_contexts(num_chunks);
1633  vector< vector<size_t> > context_offset_corrections(num_chunks);
1634 
      // Expected values, one config entry per chunk for each kind of data
1635  x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1636  starting_chunks);
1637  x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1638  absolute_contexts);
1639  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1640  context_offset_corrections);
1641 
1642  x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastn,
1643  starting_chunks, absolute_contexts,
1644  &context_offset_corrections,
1646 }
1647 
1648 /// Tests the CContextTranslator class for blastx of both strands of
1649 /// a single query with length divisible by CODON_LENGTH
1650 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxSingleQuery_BothStrands_0) {
1651  const string
1652  kTestName("TestCContextTranslator_BlastxSingleQuery_BothStrands_0");
1653  TGiLengthVector gi_length;
1654  gi_length.push_back(make_pair<int, size_t>(116001669, 33));
1655 
1656  const size_t chunk_size = 15;
1657  const size_t num_chunks = 3;
1658  CAutoEnvironmentVariable tmp_env("OVERLAP_CHUNK_SIZE", "6");
1659 
1660  vector< vector<int> > starting_chunks(num_chunks);
1661  vector< vector<int> > absolute_contexts(num_chunks);
1662  vector< vector<size_t> > context_offset_corrections(num_chunks);
1663 
1664  x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1665  starting_chunks);
1666  x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1667  absolute_contexts);
1668  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1669  context_offset_corrections);
1670 
1671  x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastx,
1672  starting_chunks, absolute_contexts,
1673  &context_offset_corrections,
1674  eNa_strand_both);
1675 }
1676 
1677 /// Tests the CContextTranslator class for blastx of both strands of
1678 /// a single query with length not divisible by CODON_LENGTH, instead, the
1679 /// (query length % CODON_LENGTH == 1)
1680 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxSingleQuery_BothStrands_1) {
1681  const string
1682  kTestName("TestCContextTranslator_BlastxSingleQuery_BothStrands_1");
1683  TGiLengthVector gi_length;
1684  gi_length.push_back(make_pair<int, size_t>(116001673, 34));
1685 
1686  const size_t chunk_size = 15;
1687  const size_t num_chunks = 3;
1688  CAutoEnvironmentVariable tmp_env("OVERLAP_CHUNK_SIZE", "6");
1689 
1690  vector< vector<int> > starting_chunks(num_chunks);
1691  vector< vector<int> > absolute_contexts(num_chunks);
1692  vector< vector<size_t> > context_offset_corrections(num_chunks);
1693 
1694  x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1695  starting_chunks);
1696  x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1697  absolute_contexts);
1698  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1699  context_offset_corrections);
1700 
1701  x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastx,
1702  starting_chunks, absolute_contexts,
1703  &context_offset_corrections,
1704  eNa_strand_both);
1705 }
1706 
1707 /// Tests the CContextTranslator class for blastx of both strands of
1708 /// a single query with length not divisible by CODON_LENGTH, instead, the
1709 /// (query length % CODON_LENGTH == 2)
1710 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxSingleQuery_BothStrands_2) {
1711  const string
1712  kTestName("TestCContextTranslator_BlastxSingleQuery_BothStrands_2");
1713  TGiLengthVector gi_length;
1714  gi_length.push_back(make_pair<int, size_t>(116001668, 35));
1715 
1716  const size_t chunk_size = 15;
1717  const size_t kNumChunks = m_Config->GetInt(kTestName, "NumChunks",
1718  kDefaultIntValue);
1719  CAutoEnvironmentVariable tmp_env("OVERLAP_CHUNK_SIZE", "6");
1720 
1721  vector< vector<int> > starting_chunks(kNumChunks);
1722  vector< vector<int> > absolute_contexts(kNumChunks);
1723  vector< vector<size_t> > context_offset_corrections(kNumChunks);
1724 
1725  x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1726  starting_chunks);
1727  x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1728  absolute_contexts);
1729  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1730  context_offset_corrections);
1731 
1732  x_TestCContextTranslator(gi_length, chunk_size, kNumChunks, eBlastx,
1733  starting_chunks, absolute_contexts,
1734  &context_offset_corrections,
1735  eNa_strand_both);
1736 }
1737 
1738 /********* This functionality has not been implemented **************/
1739 #if 0
1740 
1741 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxMultiQuery_BothStrands) {
1742  const string
1743  kTestName("TestCContextTranslator_BlastxMultiQuery_BothStrands");
1744  TGiLengthVector gi_length;
1745  gi_length.push_back(make_pair<int, size_t>(107784911, 1000));
1746  gi_length.push_back(make_pair<int, size_t>(115354032, 250));
1747  gi_length.push_back(make_pair<int, size_t>(115381005, 2551));
1748 
1749  const size_t chunk_size = 501;
1750  const size_t num_chunks = 10;
1751 
1752  vector< vector<int> > starting_chunks(num_chunks);
1753  vector< vector<int> > absolute_contexts(num_chunks);
1754  vector< vector<size_t> > context_offset_corrections(num_chunks);
1755 
1756  x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1757  starting_chunks);
1758  x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1759  absolute_contexts);
1760  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1761  context_offset_corrections);
1762 
1763  x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastx,
1764  starting_chunks, absolute_contexts,
1765  &context_offset_corrections,
1766  eNa_strand_both);
1767 }
1768 
1769 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxMultiQuery_PlusStrand) {
1770  const string
1771  kTestName("TestCContextTranslator_BlastxMultiQuery_PlusStrand");
1772  TGiLengthVector gi_length;
1773  gi_length.push_back(make_pair<int, size_t>(107784911, 1000));
1774  gi_length.push_back(make_pair<int, size_t>(115354032, 250));
1775  gi_length.push_back(make_pair<int, size_t>(115381005, 2551));
1776 
1777  const size_t chunk_size = 500;
1778  const size_t num_chunks = 10;
1779 
1780  vector< vector<int> > starting_chunks(num_chunks);
1781  vector< vector<int> > absolute_contexts(num_chunks);
1782  vector< vector<size_t> > context_offset_corrections(num_chunks);
1783 
1784  x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1785  starting_chunks);
1786  x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1787  absolute_contexts);
1788  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1789  context_offset_corrections);
1790 
1791  x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastx,
1792  starting_chunks, absolute_contexts,
1793  &context_offset_corrections,
1794  eNa_strand_plus);
1795 }
1796 
/// CContextTranslator check for a blastx search over three queries. The test
/// is currently compiled out by the enclosing "#if 0" region. The per-chunk
/// data handed to x_TestCContextTranslator is read from the configuration
/// section named by kTestName.
// NOTE(review): this listing jumps from line 1821 to line 1823, so the last
// argument of the x_TestCContextTranslator call (the strand) is not visible
// here; presumably eNa_strand_minus, going by the test name - confirm against
// the original file.
1797 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxMultiQuery_MinusStrand) {
1798  const string
1799  kTestName("TestCContextTranslator_BlastxMultiQuery_MinusStrand");
1800  TGiLengthVector gi_length; // (GI, length) pairs, one per query
1801  gi_length.push_back(make_pair<int, size_t>(107784911, 1000));
1802  gi_length.push_back(make_pair<int, size_t>(115354032, 250));
1803  gi_length.push_back(make_pair<int, size_t>(115381005, 2551));
1804 
1805  const size_t chunk_size = 500;
1806  const size_t num_chunks = 10;
1807 
1808  vector< vector<int> > starting_chunks(num_chunks); // one inner vector per chunk
1809  vector< vector<int> > absolute_contexts(num_chunks);
1810  vector< vector<size_t> > context_offset_corrections(num_chunks);
1811 
1812  x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1813  starting_chunks);
1814  x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1815  absolute_contexts);
1816  x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1817  context_offset_corrections);
1818 
1819  x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastx,
1820  starting_chunks, absolute_contexts,
1821  &context_offset_corrections,
1823 }
1824 #endif
1825 
1826 
1827 /// Tests the CQuerySplitter class when no splitting should occur
/// CHUNK_SIZE is set to "40000" through CAutoEnvironmentVariable and the
/// query is built from GI 555. The expected chunk size and number of chunks
/// are read from the "QuerySplitter_NoSplit" configuration section.
// NOTE(review): this listing skips lines 1831 and 1835-1836, so the
// declarations of `query`, `qf` and `opts_h` used below are not visible here;
// check the original file before relying on their types.
1828 BOOST_AUTO_TEST_CASE(QuerySplitter_NoSplit) {
1829  CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1830  const string kTestName("QuerySplitter_NoSplit");
1832  CSeq_id id(CSeq_id::e_Gi, 555);
1833  query.AddQuery(CTestObjMgr::Instance().CreateBlastSearchQuery(id));
1834 
1837  CRef<CBlastOptions> opts(&opts_h->SetOptions());
1838 
1839  const size_t kNumChunks = m_Config->GetInt(kTestName, "NumChunks",
1840  kDefaultIntValue);
1841  CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
1842 
// Before Split() is called: the splitter must report "not split" and its
// chunk size / number of chunks must match the configured values
1843  BOOST_REQUIRE_EQUAL(false, splitter->IsQuerySplit());
1844  BOOST_REQUIRE_EQUAL(m_Config->GetInt(kTestName, "ChunkSize",
1845  kDefaultIntValue),
1846  (int)splitter->GetChunkSize());
1847  BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
1848 
// After Split(): still "not split", and the returned block holds the same
// number of chunks
1849  CRef<CSplitQueryBlk> sqb = splitter->Split();
1850  BOOST_REQUIRE_EQUAL(false, splitter->IsQuerySplit());
1851  BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
1852 
// The test requires a runtime_error here; reaching BOOST_REQUIRE(false)
// means the call returned instead of throwing
1853  try {
1854  // try passing an out-of-range index
1855  (void)sqb->GetNumQueriesForChunk(kNumChunks + 8);
1856  BOOST_REQUIRE(false);
1857  } catch (const runtime_error&) {
1858  BOOST_REQUIRE(true);
1859  }
1860 
// The query factory returned for chunk 0 must compare equal to qf
1861  CRef<IQueryFactory> chunk_query_factory =
1862  splitter->GetQueryFactoryForChunk(0);
1863  BOOST_REQUIRE_EQUAL(qf, chunk_query_factory);
1864 }
1865 
1866 /// Tests the CQuerySplitter class for retrieval of IQueryFactory objects
1867 /// for given chunks
/// CHUNK_SIZE is set to "30000" through CAutoEnvironmentVariable and two
/// queries (lengths 35000 and 35580) are used. The test expects chunks 0 and
/// 1 to exist and index 2 to be rejected.
// NOTE(review): this listing skips line 1878, so the declaration of `opts_h`
// (and therefore the program the options are created for) is not visible
// here; presumably blastn, going by the test name - confirm against the
// original file.
1868 BOOST_AUTO_TEST_CASE(QuerySplitter_ValidateQueryFactoriesBlastn) {
1869  CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "30000");
1870  TGiLengthVector gi_length; // (GI, length) pairs, one per query
1871  gi_length.push_back(make_pair<int, size_t>(95116755, 35000));
1872  gi_length.push_back(make_pair<int, size_t>(112123020, 35580));
1873 
1874  TSeqLocVector queries;
1875  s_ConvertToBlastQueries(gi_length, queries);
1876 
1877  CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(queries));
1879  CRef<CBlastOptions> opts(&opts_h->SetOptions());
1880 
1881  CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
1882  const size_t kNumChunks(2);
1883 
// Asking for chunk index kNumChunks must throw out_of_range; reaching
// BOOST_REQUIRE(false) means the call returned instead of throwing
1884  try {
1885  (void)splitter->GetQueryFactoryForChunk(kNumChunks);
1886  BOOST_REQUIRE(false);
1887  } catch (const out_of_range& ) {
1888  BOOST_REQUIRE(true);
1889  }
1890 
1891  CRef<IQueryFactory> chunk_0 = splitter->GetQueryFactoryForChunk(0);
1892  CRef<IQueryFactory> chunk_1 = splitter->GetQueryFactoryForChunk(1);
1893 
// Each chunk's factory must differ from the original factory...
1894  BOOST_REQUIRE(chunk_0 != qf);
1895  BOOST_REQUIRE(chunk_1 != qf);
1896 
// ...and must be non-empty
1897  BOOST_REQUIRE(chunk_0.NotEmpty());
1898  BOOST_REQUIRE(chunk_1.NotEmpty());
1899 }
1900 
/// Checks the value returned by SplitQuery_CalculateNumChunks for four
/// concatenated query lengths, always with a single query (last argument 1).
// NOTE(review): this listing skips lines 1903 and 1909, so the declaration of
// `program` and the opening of the second SplitQuery_CalculateNumChunks call
// (which continues on line 1910) are not visible here; check the original
// file.
// NOTE(review): chunk_size is passed by pointer and is not const, so the
// callee may adjust it between calls - TODO confirm.
1901 BOOST_AUTO_TEST_CASE(CalculateNumberChunks)
1902 {
1904  size_t chunk_size = 10002;
// Query length 10240000: 1055 chunks expected
1905  Uint4 retval = SplitQuery_CalculateNumChunks(program,
1906  &chunk_size, 10240000, 1);
1907  BOOST_REQUIRE_EQUAL(1055, retval);
1908 
// Query length of half a chunk: a single chunk expected
1910  &chunk_size, chunk_size/2, 1);
1911 
1912  BOOST_REQUIRE_EQUAL(1, retval);
1913 
// Query length of three chunk sizes minus two overlaps: 3 chunks expected
1914  retval = SplitQuery_CalculateNumChunks(program,
1915  &chunk_size,
1916  3*chunk_size-2*SplitQuery_GetOverlapChunkSize(program), 1);
1917 
1918  BOOST_REQUIRE_EQUAL(3, retval);
1919 
// Query length of two chunk sizes plus one overlap plus 1: 2 chunks expected
1920  retval = SplitQuery_CalculateNumChunks(program,
1921  &chunk_size,
1922  1+2*chunk_size+SplitQuery_GetOverlapChunkSize(program), 1);
1923 
1924  BOOST_REQUIRE_EQUAL(2, retval);
1925 }
1926 
/// Test case run with CHUNK_SIZE set to "40000" through
/// CAutoEnvironmentVariable.
// NOTE(review): this listing skips line 1930, which holds the only other
// statement of the test; what is actually checked is not visible here
// (presumably that this chunk size is rejected for blastx, going by the test
// name) - confirm against the original file.
1927 BOOST_AUTO_TEST_CASE(InvalidChunkSizeBlastx)
1928 {
1929  CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1931 }
1932 
/// Test case run with CHUNK_SIZE set to "40000" through
/// CAutoEnvironmentVariable.
// NOTE(review): this listing skips line 1936, which holds the only other
// statement of the test; what is actually checked is not visible here
// (presumably that this chunk size is rejected for tblastx, going by the test
// name) - confirm against the original file.
1933 BOOST_AUTO_TEST_CASE(InvalidChunkSizeTblastx)
1934 {
1935  CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1937 }
1938 
Auxiliary functions for BLAST.
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
Definitions which are dependent on the NCBI C++ Object Manager.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypeBlastx
Definition: blast_program.h:75
BlastQueryInfo * BlastQueryInfoFree(BlastQueryInfo *query_info)
Deallocate memory for query information structure.
BlastQueryInfo * BlastQueryInfoNew(EBlastProgramType program, int num_queries)
Allocate memory for query information structure.
EProgram
This enumeration is to evolve into a task/program specific list that specifies sets of default parame...
Definition: blast_types.hpp:56
@ eTblastx
Translated nucl-Translated nucl.
Definition: blast_types.hpp:62
@ eBlastn
Nucl-Nucl (traditional blastn)
Definition: blast_types.hpp:58
@ eBlastp
Protein-Protein.
Definition: blast_types.hpp:59
@ eBlastx
Translated nucl-Protein.
Definition: blast_types.hpp:60
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
ncbi::TMaskedQueryRegions mask
CAutoEnvironmentVariable –.
Definition: ncbienv.hpp:179
Defines BLAST error codes (user errors included)
Encapsulates ALL the BLAST algorithm's options.
Query Vector.
Definition: sseqloc.hpp:276
size_type Size() const
Returns the number of queries found in this query vector.
Definition: sseqloc.hpp:305
CConstRef< objects::CSeq_loc > GetQuerySeqLoc(size_type i) const
Get the query Seq-loc for a query by index.
Definition: sseqloc.hpp:313
Search Query.
Definition: sseqloc.hpp:147
CConstRef –.
Definition: ncbiobj.hpp:1266
Auxiliary class to provide convenient and efficient access to conversions between contexts local to q...
CNcbiRegistry –.
Definition: ncbireg.hpp:913
NCBI C++ Object Manager-dependent implementation of IQueryFactory.
Class responsible for splitting query sequences and providing data to the BLAST search class to searc...
Definition: split_query.hpp:66
CRandom::
Definition: random_gen.hpp:66
CScope –.
Definition: scope.hpp:92
static CRef< CScope > NewScope(bool with_defaults=true)
Return a new scope, possibly (by default) with default loaders, which will include the Genbank loader...
Definition: simple_om.cpp:202
Wrapper class around SSplitQueryBlk structure.
CRef< CNcbiRegistry > m_Config
This represents the split_query.ini configuration file.
void QuerySplitter_BlastxSingleQueryMultiChunk(const string &kTestName, ENa_strand strand)
void x_ValidateQueryContextsPerChunkAssignment(const CSplitQueryBlk &sqb, const vector< vector< int > > &contexts_per_chunk)
Validates the query contexts assigned to all the chunks.
void x_ValidateLocalQueryData(CRef< CQuerySplitter > splitter, const CBlastOptions *options, vector< BlastQueryInfo * > split_query_info_structs)
Validate the local query data for all chunks, comparing data produced by the CQuerySplitter object an...
void x_ReadVectorOfVectorsForTest(const string &kTestName, const char *data_to_read, vector< vector< T > > &retval)
This method reads entries in the config file of the format ChunkNX, here N is the chunk number and X ...
void x_ValidateChunkBounds(size_t kChunkSize, size_t kQuerySize, const CSplitQueryBlk &sqb, EBlastProgramType p)
Auxiliary method to validate the chunk bounds calculated by the CSplitQueryBlk object and the x_Compu...
void x_ReadQueryBoundsPerChunk(const string &kTestName, CConstRef< CSplitQueryBlk > sqb, CQuerySplitter::TSplitQueryVector &split_query_vector)
This function reads values in the split_query.ini file with the format ChunkNQueryM (where N is the c...
void x_ValidateQueryInfoForChunk(const BlastQueryInfo *reference, const BlastQueryInfo *test, size_t chunk_num)
Validate the query info structure generated (test) against the expected one (reference) (N....
void x_TestCContextTranslator(TGiLengthVector &gi_length, size_t chunk_size, size_t num_chunks, blast::EProgram program, vector< vector< int > > &starting_chunks, vector< vector< int > > &absolute_contexts, vector< vector< size_t > > *context_offsets, ENa_strand strand, vector< ENa_strand > *query_strands=NULL)
void x_ValidateQueriesPerChunkAssignment(const CSplitQueryBlk &sqb, const vector< vector< size_t > > &queries_per_chunk)
Validates the query sequences (by index) assigned to all the chunks. This compares the data calculated...
void QuerySplitter_BlastnSingleQueryMultiChunk(const string &kTestName, ENa_strand strand)
void x_ParseConfigLine(const string &input, vector< T > &retval)
Tokenizes a string containing comma-separated values into a vector of values.
void x_PrepareBlastQueryStructures(TIntId gis[], EProgram program, BLAST_SequenceBlk **seq_blk, BlastQueryInfo **qinfo, ENa_strand *strand=NULL)
Populate a BLAST_SequenceBlk and BlastQueryInfo structures out of an array of GIs.
void QuerySplitter_BlastxMultiQueryMultiChunk(const string &kTestName, ENa_strand strand, vector< ENa_strand > *query_strands=NULL)
bool x_CmpSequenceData(const Uint1 *global, const Uint1 *chunk, size_t len)
Auxiliary function that compares bytes of sequence data to validate the context offset corrections.
void x_ComputeQueryChunkBounds(TChunkRange &chunk_range, size_t chunk_size, size_t concatenated_query_length, size_t overlap)
Incrementally compute the query chunk bounds.
void x_ReadSplitQueryInfoForTest(const string &kTestName, EBlastProgramType program, vector< BlastQueryInfo * > &retval)
Reads data to populate multiple BlastQueryInfo structures.
void x_ValidateQuerySeqLocsPerChunk(CRef< CQuerySplitter > splitter, const CQuerySplitter::TSplitQueryVector &split_query_vector)
Compare the query data (start, stop, strand) for each chunk computed by the splitter vs.
void x_ValidateContextOffsetsPerChunkAssignment(const CSplitQueryBlk &sqb, const vector< vector< size_t > > &contexts_offsets_per_chunk)
Validates the context offsets assigned to all the chunks.
void QuerySplitter_BlastnMultiQueryMultiChunk(const string &kTestName, ENa_strand strand, vector< ENa_strand > *query_strands=NULL)
static CTestObjMgr & Instance()
Definition: test_objmgr.cpp:69
Collection of masked regions for a single query sequence.
Definition: seqlocinfo.hpp:113
typedef for the messages for an entire BLAST search, which could be comprised of multiple query seque...
static const int chunk_size
static bool is_valid(const char *num, int type, CONV_RESULT *cr)
static uch flags
std::ofstream out("events_result.xml")
main entry point for tests
CS_CONTEXT * ctx
Definition: t0006.c:12
#define test(a, b, c, d, e)
Definition: numeric.c:170
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
bool HasMessages() const
Definition: blast_aux.cpp:1002
size_t SplitQuery_GetOverlapChunkSize(EBlastProgramType program)
Size of the region that overlaps in between each query chunk.
void SetStrandOption(objects::ENa_strand s)
size_t GetChunkSize() const
Returns the number of bases/residues that make up a query chunk.
Definition: split_query.hpp:82
TChunkRange GetChunkBounds(size_t chunk_num) const
Get the boundaries of a chunk in the concatenated query.
SSplitQueryBlk * GetCStruct() const
Returns the C structure managed by objects of this class.
vector< CRef< CBlastQueryVector > > TSplitQueryVector
Definition of a vector of CBlastQueryVectors, each element corresponds to a query chunk.
Definition: split_query.hpp:73
bool IsQuerySplit() const
Determines whether the query sequence(s) are split or not.
Definition: split_query.hpp:88
size_t GetNumQueriesForChunk(size_t chunk_num) const
Get the number of queries in a given chunk.
size_t GetNumChunks() const
Retrieve the number of chunks.
int GetAbsoluteContext(size_t chunk_num, Int4 context_in_chunk) const
Get the context number in the absolute (i.e.
void SetupQueries(TSeqLocVector &queries, BlastQueryInfo *qinfo, BLAST_SequenceBlk **seqblk, EBlastProgramType prog, objects::ENa_strand strand_opt, TSearchMessages &messages)
Populates BLAST_SequenceBlk with sequence data for use in CORE BLAST.
objects::ENa_strand GetStrandOption() const
virtual BLAST_SequenceBlk * GetSequenceBlk()=0
Accessor for the BLAST_SequenceBlk structure.
CRef< ILocalQueryData > MakeLocalQueryData(const CBlastOptions *opts)
Creates and caches an ILocalQueryData.
Definition: query_data.cpp:52
static CBlastOptionsHandle * Create(EProgram program, EAPILocality locality=CBlastOptions::eLocal)
Creates an options handle object configured with default options for the requested program,...
CBlastOptions & SetOptions()
Returns a reference to the internal options class which this object is a handle for.
Uint4 m_NumChunks
Number of chunks, if this is 1, no splitting occurs.
EBlastProgramType EProgramToEBlastProgramType(EProgram p)
Convert EProgram to EBlastProgramType.
Definition: blast_aux.cpp:709
size_t SplitQuery_GetChunkSize(EProgram program)
Returns the optimal chunk size for a given task.
Definition: local_blast.cpp:54
EBlastProgramType GetProgramType() const
Returns the CORE BLAST notion of program type.
virtual BlastQueryInfo * GetQueryInfo()=0
Accessor for the BlastQueryInfo structure.
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
CRef< IQueryFactory > GetQueryFactoryForChunk(Uint4 chunk_num)
Returns a IQueryFactory suitable to be executed by a BLAST search class.
virtual size_t GetNumQueries()=0
Get the number of queries.
void SetupQueryInfo(TSeqLocVector &queries, EBlastProgramType prog, objects::ENa_strand strand_opt, BlastQueryInfo **qinfo)
Allocates the query information structure and fills the context offsets, in case of multiple queries,...
vector< size_t > GetQueryIndices(size_t chunk_num) const
Get the indices of the queries contained in a given chunk.
CRef< CSplitQueryBlk > Split()
Split the query sequence(s)
int GetStartingChunk(size_t curr_chunk, Int4 context_in_chunk) const
Get the chunk number where context_in_chunk starts (i.e.
vector< int > GetQueryContexts(size_t chunk_num) const
Get the contexts of the queries contained in a given chunk.
Uint4 SplitQuery_CalculateNumChunks(EBlastProgramType program, size_t *chunk_size, size_t concatenated_query_length, size_t num_queries)
Calculate the number of chunks that a query will be split into based upon query length,...
size_t GetSumOfSequenceLengths()
Compute the sum of all the sequence's lengths.
Definition: query_data.cpp:107
Uint4 GetNumberOfChunks() const
Returns the number of chunks the query/queries will be split into.
Definition: split_query.hpp:85
CRef< CSplitQueryBlk > m_SplitBlk
Split query block structure.
vector< size_t > GetContextOffsets(size_t chunk_num) const
Get the context offsets (corrections) of the queries contained in a given chunk.
TSplitQueryVector m_SplitQueriesInChunk
Vector of split queries.
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
Int8 TIntId
Definition: ncbimisc.hpp:999
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
void SetPacked_int(TPacked_int &v)
Definition: Seq_loc.hpp:984
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3474
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
void SetStrand(ENa_strand strand)
Set the strand for all of the location's ranges.
Definition: Seq_loc.cpp:5196
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
Uint4 TValue
Type of the generated integer value and/or the seed value.
Definition: random_gen.hpp:69
TValue GetRand(void)
Get the next random number in the interval [0..GetMax()] (inclusive)
Definition: random_gen.hpp:238
position_type GetLength(void) const
Definition: range.hpp:158
TThisType & SetFrom(position_type from)
Definition: range.hpp:170
bool NotEmpty(void) const
Definition: range.hpp:152
TThisType & SetToOpen(position_type toOpen)
Definition: range.hpp:175
position_type GetToOpen(void) const
Definition: range.hpp:138
position_type GetFrom(void) const
Definition: range.hpp:134
static TThisType GetEmpty(void)
Definition: range.hpp:306
TThisType & SetLength(position_type length)
Definition: range.hpp:194
int TFlags
Binary OR of "EFlags".
Definition: ncbireg.hpp:107
virtual bool GetBool(const string &section, const string &name, bool default_value, TFlags flags=0, EErrAction err_action=eThrow) const
Get boolean value of specified parameter name.
Definition: ncbireg.cpp:391
virtual const string & Get(const string &section, const string &name, TFlags flags=0) const
Get the parameter value.
Definition: ncbireg.cpp:262
virtual int GetInt(const string &section, const string &name, int default_value, TFlags flags=0, EErrAction err_action=eThrow) const
Get integer value of specified parameter name.
Definition: ncbireg.cpp:362
bool Empty(TFlags flags=fAllLayers) const
Verify if Registry is empty.
Definition: ncbireg.cpp:162
@ fTruncate
Leading, trailing blanks can be truncated.
Definition: ncbireg.hpp:87
@ fNoOverride
Cannot change existing value.
Definition: ncbireg.hpp:86
@ fTransient
Transient – not saved by default.
Definition: ncbireg.hpp:83
@ fNotJustCore
Include auxiliary subregistries.
Definition: ncbireg.hpp:90
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3186
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
bool IsPacked_int(void) const
Check if variant Packed_int is selected.
Definition: Seq_loc_.hpp:534
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
static string kPrefix
Definition: id2info.cpp:146
static int input()
int i
int len
Main class to perform a BLAST search on the local machine.
static char * prog
Definition: mdb_load.c:33
const size_t kChunkSize
Definition: na_utils.cpp:587
const string kTestName
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
Magic spell ;-) needed for some weird compilers... very empiric.
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
#define UINT4_MAX
largest number represented by unsigned int.
Definition: ncbi_std.h:136
T min(T x_, T y_)
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
vector< TMaskedQueryRegions > TSeqLocInfoVector
Collection of masked regions for all queries in a BLAST search.
Definition: seqlocinfo.hpp:139
Int2 SplitQueryBlk_GetQueryIndicesForChunk(const SSplitQueryBlk *squery_blk, Uint4 chunk_num, Uint4 **query_indices)
Retrieve an array of query indices for the requested chunk.
Definition: split_query.c:235
Int2 SplitQueryBlk_GetQueryContextsForChunk(const SSplitQueryBlk *squery_blk, Uint4 chunk_num, Int4 **query_contexts, Uint4 *num_query_contexts)
Retrieve an array of query contexts for the requested chunk.
Definition: split_query.c:262
Int2 SplitQueryBlk_AddQueryToChunk(SSplitQueryBlk *squery_blk, Uint4 query_index, Uint4 chunk_num)
Add a query index to a given chunk.
Definition: split_query.c:201
Int2 SplitQueryBlk_GetNumQueriesForChunk(const SSplitQueryBlk *squery_blk, Uint4 chunk_num, size_t *num_queries)
Retrieve the number of queries that correspond to chunk number chunk_num.
Definition: split_query.c:189
Int2 SplitQueryBlk_AddContextToChunk(SSplitQueryBlk *squery_blk, Int4 ctx_index, Uint4 chunk_num)
Add a query context index to a given chunk.
Definition: split_query.c:211
const Int4 kInvalidContext
Value to represent an invalid context.
Definition: split_query.c:39
const Int2 kBadParameter
Invalid parameter used in a function call.
Definition: split_query.c:37
Declares CQuerySplitter, a class to split the query sequence(s)
Auxiliary functions and classes to assist in query splitting.
static void s_ConvertToBlastQueries(const TGiLengthVector &gi_length, TSeqLocVector &retval, size_t *tot_length=NULL, vector< ENa_strand > *strands=NULL, const TSeqLocInfoVector *masks=NULL)
Convert a vector of GIs with its lengths into a TSeqLocVector.
static void s_CalculateMaxLength(BlastQueryInfo *query_info)
Calculate and assign the maximum length field in the BlastQueryInfo structure.
vector< TGiLenPair > TGiLengthVector
Vector containing pairs of gis and their length.
pair< TIntId, size_t > TGiLenPair
Pair for gis and their length (in that order)
BOOST_AUTO_TEST_CASE(SplitQueriesIn1Chunk)
vector< vector< Uint4 > > TSplitQueryChunkMap
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
Structure to hold a sequence.
Definition: blast_def.h:242
Uint1 * sequence
Sequence used for search (could be translation).
Definition: blast_def.h:243
Int4 query_length
Length of this query, strand or frame.
Boolean is_valid
Determine if this context is valid or not.
Int4 query_offset
Offset of this query, strand or frame in the concatenated super-query.
Int8 eff_searchsp
Effective search space for this context.
Int4 query_index
Index of query (same for all frames)
Int1 frame
Frame number (-1, -2, -3, 0, 1, 2, or 3)
The query related information.
Int4 first_context
Index of the first element of the context array.
BlastContextInfo * contexts
Information per context.
int num_queries
Number of query sequences.
struct SPHIQueryInfo * pattern_info
Counts of PHI BLAST pattern occurrences, used in PHI BLAST only.
Int4 last_context
Index of the last element of the context array.
Uint4 max_length
Length of the longest among the concatenated queries.
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
static string query
Utility stuff for more convenient using of Boost.Test library.
Modified on Wed Apr 24 14:16:55 2024 by modify_doxy.py rev. 669887