NCBI C++ ToolKit
blastfilter_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blastfilter_unit_test.cpp 95564 2021-11-26 14:52:02Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Ilya Dondoshansky
27  *
28  * File Description:
29  * Unit test for low complexity filtering
30  *
31  * ===========================================================================
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/test_boost.hpp>
36 #include <corelib/ncbitime.hpp>
40 #include <serial/iterator.hpp>
41 #include <util/random_gen.hpp>
42 #include <objmgr/util/sequence.hpp>
43 
44 
46 #include "blast_objmgr_priv.hpp"
47 
51 
53 
54 // For repeats and dust filtering only
57 #include "winmask_filter.hpp"
58 #include "dust_filter.hpp"
61 
62 #include "test_objmgr.hpp"
63 #include "blast_test_util.hpp"
64 
65 using namespace std;
66 using namespace ncbi;
67 using namespace ncbi::objects;
68 using namespace ncbi::blast;
69 
70 typedef vector<TSeqRange> TRangeVector;
71 
72 static BlastSeqLoc*
74 {
75  BlastSeqLoc* retval = NULL;
76 
77  if (rv.empty()) {
78  return retval;
79  }
80 
81  BlastSeqLoc* tail = NULL;
82  ITERATE(TRangeVector, itr, rv) {
83  tail = BlastSeqLocNew(tail ? &tail : &retval,
84  itr->GetFrom(),
85  itr->GetTo());
86  }
87 
88  return retval;
89 }
90 
92  size_t num_seqs)
93 {
94  const string kProgName(Blast_ProgramNameFromType(program));
95  typedef vector< CRef<CSeq_id> > TSeqIds;
96  TSeqIds seqid_v(num_seqs);
97  generate(seqid_v.begin(), seqid_v.end(),
99  CPacked_seqint seqintervals;
100  ITERATE(TSeqIds, seqid, seqid_v) {
101  seqintervals.AddInterval(**seqid, 0, 100000);
102  }
103 
104  const size_t kNumContexts(GetNumberOfContexts(program));
105  CBlastMaskLoc mask(BlastMaskLocNew(num_seqs*kNumContexts));
106 
107  // Fill the masks
108  const TSeqPos kOffsetLength(30);
109  for (int index = 0; index < mask->total_size; ++index) {
110  mask->seqloc_array[index] = BlastSeqLocNew(NULL, index,
111  index+kOffsetLength);
112  }
113  TSeqLocInfoVector mask_v;
114  Blast_GetSeqLocInfoVector(program, seqintervals, mask, mask_v);
115  BOOST_REQUIRE_EQUAL(num_seqs, mask_v.size());
116 
117  unsigned int qindex(0); // query index
118  ITERATE(TSeqLocInfoVector, query_masks_list, mask_v) {
119  const size_t kNumMasks = program == eBlastTypeBlastn
120  ? 1 : kNumContexts;
121  BOOST_REQUIRE_MESSAGE( kNumMasks == query_masks_list->size(),
122  "Failed on " + kProgName);
123  size_t context = 0;
124  ITERATE(TMaskedQueryRegions, itr, *query_masks_list) {
125  CNcbiOstrstream ss;
126  ss << "Error in query number " << qindex << ", context "
127  << context << " ('" << kProgName << "')";
128  // Validate the frame
129  int frame = program == eBlastTypeBlastn
131  : BLAST_ContextToFrame(program, context);
132  BOOST_REQUIRE_MESSAGE(frame == (*itr)->GetFrame(),
133  (string)CNcbiOstrstreamToString(ss));
134 
135  // Validate the artificially built offsets of the mask
136  const BlastSeqLoc* loc =
137  mask->seqloc_array[kNumContexts*qindex+context];
138  BOOST_REQUIRE(loc != NULL);
139  TSeqRange offsets(loc->ssr->left, loc->ssr->right);;
140  BOOST_REQUIRE_MESSAGE
141  (offsets.GetFrom() == (*itr)->GetInterval().GetFrom(),
142  (string)CNcbiOstrstreamToString(ss));
143  BOOST_REQUIRE_MESSAGE
144  (offsets.GetTo() == (*itr)->GetInterval().GetTo(),
145  (string)CNcbiOstrstreamToString(ss));
146  ++context;
147  }
148  BOOST_REQUIRE_EQUAL(kNumMasks, context);
149  ++qindex;
150  }
151 }
152 
153 // Returns true if *all* bases in the range provided are masked
154 static bool x_AreAllBasesMasked(const Uint1* sequence, int start, int stop)
155 {
156  BOOST_CHECK(start <= stop);
157  for (int i = start; i < stop; i++) {
158  if (sequence[i] != kNuclMask) {
159  return false;
160  }
161  }
162  return true;
163 }
164 
166 public:
168  bool ignore_strand_in_mask)
169  {
170  const int kNumLcaseLocs = 11;
171  const int kLcaseStarts[kNumLcaseLocs] =
172  { 0, 78, 217, 380, 694, 1018, 1128, 2817, 3084, 3428, 3782 };
173  const int kLcaseEnds[kNumLcaseLocs] =
174  { 75, 208, 316, 685, 1004, 1122, 1298, 2952, 3409, 3733, 3916 };
175 
176  int i = 0; // loop index
177  const int kQuerySize = 9180;
178  vector<int> kLcaseStartsNegStrand, kLcaseEndsNegStrand;
179  kLcaseStartsNegStrand.reserve(kNumLcaseLocs);
180  kLcaseEndsNegStrand.reserve(kNumLcaseLocs);
181  for (i = 0; i < kNumLcaseLocs; i++) {
182  int start = kQuerySize - 1 - kLcaseEnds[i];
183  int stop = kQuerySize - 1 - kLcaseStarts[i];
184  kLcaseStartsNegStrand.push_back(start);
185  kLcaseEndsNegStrand.push_back(stop);
186  }
187 
188  CSeq_id id("gi|1945388");
189  unique_ptr<SSeqLoc> qsl(
190  CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_both));
191  // Fill the lower case mask into the SSeqLoc
192  CSeq_loc* seqloc = new CSeq_loc();
193  for (int index = 0; index < kNumLcaseLocs; ++index) {
194  seqloc->SetPacked_int().AddInterval(id, kLcaseStarts[index],
195  kLcaseEnds[index]);
196  BOOST_CHECK(!seqloc->GetPacked_int().Get().back()->CanGetStrand());
197  seqloc->SetPacked_int().Set().back()->SetStrand(strand);
198  }
199  qsl->mask.Reset(seqloc);
200  qsl->ignore_strand_in_mask = ignore_strand_in_mask;
201 
202  TSeqLocVector query_v;
203  query_v.push_back(*qsl);
205  nucl_handle->SetDustFiltering(false);
206  nucl_handle->SetMaskAtHash(false);
207 
208  // Run a self hit BLAST search, discard the return value, and get the
209  // masked query regions
210  blast::CBl2Seq blaster(*qsl.get(), *qsl.get(), *nucl_handle);
211  (void) blaster.Run();
212 
213  // check that the actual query sequence was masked at the proper
214  // locations
215  BOOST_CHECK_EQUAL(false, nucl_handle->GetMaskAtHash());
216  for (i = 0; i < kNumLcaseLocs; i++) {
217  const pair<int, int> range_plus(kLcaseStarts[i], kLcaseEnds[i]);
218  const pair<int, int> range_minus(kLcaseStartsNegStrand[i],
219  kLcaseEndsNegStrand[i]);
220  int starting_offset = 0;
221 
222  if (ignore_strand_in_mask || strand == eNa_strand_both) {
223  starting_offset =
224  blaster.m_Blast->m_InternalData->m_QueryInfo->contexts[0].query_offset;
225  BOOST_CHECK(x_AreAllBasesMasked
226  (blaster.m_Blast->m_InternalData->m_Queries->sequence,
227  starting_offset + range_plus.first,
228  starting_offset + range_plus.second));
229 
230  starting_offset =
231  blaster.m_Blast->m_InternalData->m_QueryInfo->contexts[1].query_offset;
232  BOOST_CHECK(x_AreAllBasesMasked
233  (blaster.m_Blast->m_InternalData->m_Queries->sequence,
234  starting_offset + range_minus.first,
235  starting_offset + range_minus.second));
236  } else {
237 
238  if (strand == eNa_strand_plus) {
239  starting_offset =
240  blaster.m_Blast->m_InternalData->m_QueryInfo->contexts[0].query_offset;
241  BOOST_CHECK(x_AreAllBasesMasked
242  (blaster.m_Blast->m_InternalData->m_Queries->sequence,
243  starting_offset + range_plus.first,
244  starting_offset + range_plus.second));
245 
246  starting_offset =
247  blaster.m_Blast->m_InternalData->m_QueryInfo->contexts[1].query_offset;
248  BOOST_CHECK(!x_AreAllBasesMasked
249  (blaster.m_Blast->m_InternalData->m_Queries->sequence,
250  starting_offset + range_minus.first,
251  starting_offset + range_minus.second));
252  } else if (strand == eNa_strand_minus) {
253  starting_offset =
254  blaster.m_Blast->m_InternalData->m_QueryInfo->contexts[0].query_offset;
255  BOOST_CHECK(!x_AreAllBasesMasked
256  (blaster.m_Blast->m_InternalData->m_Queries->sequence,
257  starting_offset + range_plus.first,
258  starting_offset + range_plus.second));
259  starting_offset =
260  blaster.m_Blast->m_InternalData->m_QueryInfo->contexts[1].query_offset;
261  BOOST_CHECK(x_AreAllBasesMasked
262  (blaster.m_Blast->m_InternalData->m_Queries->sequence,
263  starting_offset + range_minus.first,
264  starting_offset + range_minus.second));
265  } else {
266  abort();
267  }
268  }
269  }
270 
271  // Check that the masked regions (returned as part of the original
272  // SSeqLoc.mask field or from CBl2Seq::GetFilteredQueryRegions) are
273  // those on the plus strand only
274  TSeqLocInfoVector masked_regions_vector =
275  blaster.GetFilteredQueryRegions();
276  BOOST_CHECK(masked_regions_vector.size() == 1);
277  BOOST_CHECK_EQUAL(masked_regions_vector.front().size(),
278  (size_t)kNumLcaseLocs);
279 
280  BOOST_CHECK(query_v[0].mask->IsPacked_int());
281  BOOST_CHECK_EQUAL(query_v[0].mask->GetPacked_int().Get().size(),
282  masked_regions_vector.front().size());
283  int loc_index = 0;
284  ITERATE(list< CRef<CSeq_interval> >, itr,
285  query_v[0].mask->GetPacked_int().Get()) {
286  BOOST_CHECK_EQUAL(kLcaseStarts[loc_index], (int)(*itr)->GetFrom());
287  BOOST_CHECK_EQUAL(kLcaseEnds[loc_index], (int)(*itr)->GetTo());
288  ++loc_index;
289  }
290  BOOST_CHECK_EQUAL(kNumLcaseLocs, loc_index);
291 
292  loc_index = 0;
293  ITERATE(TMaskedQueryRegions, itr, masked_regions_vector[0]) {
294  const CSeq_interval& intv = (*itr)->GetInterval();
295  BOOST_CHECK_EQUAL(kLcaseStarts[loc_index], (int)intv.GetFrom());
296  BOOST_CHECK_EQUAL(kLcaseEnds[loc_index], (int)intv.GetTo());
297  BOOST_CHECK(!intv.CanGetStrand());
298  BOOST_CHECK_EQUAL((*itr)->GetFrame(),
300  loc_index++;
301  }
302 
303  BOOST_CHECK_EQUAL(kNumLcaseLocs, loc_index);
304  }
305 };
306 
307 BOOST_AUTO_TEST_SUITE(blastfilter)
308 
310  typedef vector< pair<TSeqPos, TSeqPos> > TSegments;
311  TSegments masked_offsets;
312  masked_offsets.push_back(make_pair(298U, 305U));
313  masked_offsets.push_back(make_pair(875U, 882U));
314  masked_offsets.push_back(make_pair(1018U, 1115U));
315  masked_offsets.push_back(make_pair(1449U, 1479U));
316  masked_offsets.push_back(make_pair(3113U, 3133U));
317  masked_offsets.push_back(make_pair(3282U, 3298U));
318  masked_offsets.push_back(make_pair(3428U, 3441U));
319  masked_offsets.push_back(make_pair(3598U, 3606U));
320  masked_offsets.push_back(make_pair(4704U, 4710U));
321  masked_offsets.push_back(make_pair(6364U, 6373U));
322  masked_offsets.push_back(make_pair(6512U, 6573U));
323  masked_offsets.push_back(make_pair(7600U, 7672U));
324  masked_offsets.push_back(make_pair(7766U, 7772U));
325  masked_offsets.push_back(make_pair(8873U, 8880U));
326  masked_offsets.push_back(make_pair(9109U, 9179U));
327 
328  const size_t kNumQueries(1);
329  const size_t kNumLocs(masked_offsets.size());
330  size_t index(0);
331 
332  CSeq_id id("gi|1945388");
333  unique_ptr<SSeqLoc> qsl(
334  CTestObjMgr::Instance().CreateSSeqLoc(id, strand));
335  TSeqLocVector query_reference(kNumQueries, *qsl);
336  TSeqLocVector query_test(kNumQueries, *qsl);
338 
339  // Filter the query regions using the C++ APIs
340  Blast_FindDustFilterLoc(query_reference, &(*nucl_handle));
341  BOOST_CHECK(query_reference[0].mask->IsPacked_int());
342  const CPacked_seqint::Tdata& seqinterval_list =
343  query_reference[0].mask->GetPacked_int().Get();
344  BOOST_CHECK_EQUAL(kNumLocs, seqinterval_list.size());
345  // CSeq_loc_mapper returns intervals sorted in reverse order if on minus strand.
346  bool reverse = IsReverse(query_reference[0].mask->GetStrand());
347  index = reverse ? masked_offsets.size() - 1 : 0;
348  ITERATE(CPacked_seqint::Tdata, itr, seqinterval_list) {
349  BOOST_CHECK_EQUAL(masked_offsets[index].first,
350  (*itr)->GetFrom());
351  BOOST_CHECK_EQUAL(masked_offsets[index].second,
352  (*itr)->GetTo());
353  reverse ? index-- : index++;
354  }
355 
356  // Run a self hit BLAST search, discard the return value, and get the
357  // masked query regions
358  blast::CBl2Seq blaster(query_test, query_test, *nucl_handle);
359  (void) blaster.Run();
360  TSeqLocInfoVector masked_regions_vector =
361  blaster.GetFilteredQueryRegions();
362 
363  BOOST_CHECK_EQUAL(kNumQueries, query_reference.size());
364  BOOST_CHECK_EQUAL(kNumQueries, query_test.size());
365  BOOST_CHECK_EQUAL(kNumQueries, masked_regions_vector.size());
366 
367  TMaskedQueryRegions& masked_regions = *masked_regions_vector.begin();
368  BOOST_CHECK_EQUAL(kNumLocs, masked_regions.size());
369  index = 0;
370  ITERATE(TMaskedQueryRegions, itr, masked_regions) {
371  BOOST_CHECK_EQUAL(masked_offsets[index].first,
372  (*itr)->GetInterval().GetFrom());
373  BOOST_CHECK_EQUAL(masked_offsets[index].second,
374  (*itr)->GetInterval().GetTo());
375  index++;
376  }
377 }
378 
// Builds one Seq-loc per (Seq-id, range) pair, each with an explicit
// interval, and requires that every interval found in packed_seqint matches
// the Seq-id and the from/to positions of the corresponding input, in order.
// NOTE(review): the statement that creates packed_seqint is not visible in
// this listing (presumably it converts `input`) -- confirm against the
// original file.
379 BOOST_AUTO_TEST_CASE(TSeqLocVector2Packed_seqint_TestIntervals) {
380 
381  vector< CRef<CSeq_id> > gis;
382  gis.push_back(CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Gi, 6)));
383  gis.push_back(CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Gi, 129295)));
384  gis.push_back(CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Gi, 15606659)));
385 
// ranges[i] is the interval requested on gis[i]
386  vector<TSeqRange> ranges;
387  ranges.push_back(TSeqRange(10, 100));
388  ranges.push_back(TSeqRange(100, 200));
389  ranges.push_back(TSeqRange(50, 443));
390 
// The two tables are walked in parallel below, so they must be equally long
391  BOOST_REQUIRE(gis.size() == ranges.size());
392  TSeqLocVector input(gis.size());
393  size_t i(0);
394  for (i = 0; i < gis.size(); i++) {
395  CRef<CSeq_loc> seqloc(new CSeq_loc(*gis[i],
396  ranges[i].GetFrom(),
397  ranges[i].GetTo()));
398  input[i] = SSeqLoc(seqloc, CSimpleOM::NewScope());
399  }
400 
// Restart the index: it now tracks the position inside packed_seqint
402  i = 0;
403  ITERATE(CPacked_seqint::Tdata, query_interval, packed_seqint->Get()) {
404  BOOST_REQUIRE(gis[i]->Match((*query_interval)->GetId()));
405  BOOST_REQUIRE_EQUAL(ranges[i].GetFrom(),
406  (*query_interval)->GetFrom());
407  BOOST_REQUIRE_EQUAL(ranges[i].GetTo(),
408  (*query_interval)->GetTo());
409  i++;
410  }
411 }
412 
// Builds whole-sequence Seq-locs (no explicit interval) for three GIs and
// requires that every interval found in packed_seqint carries the matching
// GI, starts at position 0 and ends at the value stored next to that GI.
// NOTE(review): the declarations of `input` and `packed_seqint` are not
// visible in this listing -- confirm against the original file.
413 BOOST_AUTO_TEST_CASE(TSeqLocVector2Packed_seqint_TestNoIntervals) {
// first: GI; second: value the test expects from GetTo() for that GI
414  typedef pair<TGi, TSeqPos> TGiLength;
415  vector<TGiLength> gis;
416  gis.push_back(make_pair(GI_CONST(6), 342U));
417  gis.push_back(make_pair(GI_CONST(129295), 232U));
418  gis.push_back(make_pair(GI_CONST(15606659), 443U));
419 
421  input.reserve(gis.size());
422  ITERATE(vector<TGiLength>, gi, gis) {
423  CRef<CSeq_loc> seqloc(new CSeq_loc);
424  seqloc->SetWhole().SetGi(gi->first);
425  input.push_back(SSeqLoc(seqloc, CSimpleOM::NewScope()));
426  }
427 
// i indexes gis in step with the intervals of packed_seqint
429  int i(0);
430  const TSeqPos kStartingPosition(0);
431  ITERATE(CPacked_seqint::Tdata, query_interval, packed_seqint->Get()) {
432  const TGiLength& kGiLength = gis[i++];
433  const CSeq_id kTargetId(CSeq_id::e_Gi, kGiLength.first);
434  BOOST_REQUIRE(kTargetId.Match((*query_interval)->GetId()));
435  BOOST_REQUIRE_EQUAL(kStartingPosition,
436  (*query_interval)->GetFrom());
437  BOOST_REQUIRE_EQUAL(kGiLength.second,
438  (*query_interval)->GetTo());
439  }
440 }
441 
// Requires that `retval` is empty.
// NOTE(review): the statements that produce `retval` are not visible in this
// listing -- confirm against the original file.
442 BOOST_AUTO_TEST_CASE(TSeqLocVector2Packed_seqint_TestEmptyInput) {
445  BOOST_REQUIRE(retval.Empty());
446 }
447 
449  const CBlastOptions& kOpts,
450  BLAST_SequenceBlk** query_blk,
451  BlastQueryInfo** qinfo)
452 {
453  TSearchMessages blast_msg;
454 
456  ENa_strand strand_opt = kOpts.GetStrandOption();
457 
458  SetupQueryInfo(query_vector, prog, strand_opt, qinfo);
459  CBlastQueryInfo qi_tmp(*qinfo);
460  SetupQueries(query_vector, qi_tmp, query_blk,
461  prog, strand_opt, blast_msg);
462  qi_tmp.Release();
463  ITERATE(TSearchMessages, m, blast_msg) {
464  BOOST_REQUIRE(m->empty());
465  }
466 }
467 
469  const int kNumLocs = 3;
470  const int kSegStarts[kNumLocs] = { 15, 55, 495 };
471  const int kSegEnds[kNumLocs] = { 27, 68, 513 };
472  CSeq_id id("gi|3091");
473  unique_ptr<SSeqLoc> qsl(CTestObjMgr::Instance().CreateSSeqLoc(id));
474  TSeqLocVector query_v;
475  query_v.push_back(*qsl);
476  CBlastQueryInfo query_info;
477  CBLAST_SequenceBlk query_blk;
479 
480  setupQueryStructures(query_v, opts->GetOptions(),
481  &query_blk, &query_info);
482 
483  BlastSeqLoc *filter_slp = NULL, *loc;
484  SBlastFilterOptions* filtering_options;
485  SBlastFilterOptionsNew(&filtering_options, eSeg);
486  Int2 status = BlastSetUp_Filter(opts->GetOptions().GetProgramType(),
487  query_blk->sequence,
488  query_info->contexts[0].query_length,
489  0,
490  filtering_options,
491  & filter_slp, NULL);
492  filtering_options = SBlastFilterOptionsFree(filtering_options);
493  BOOST_REQUIRE(filtering_options == NULL);
494  BOOST_REQUIRE(status == 0);
495 
496  Int4 loc_index;
497  SSeqRange* di;
498  for (loc_index=0, loc = filter_slp; loc; loc = loc->next, ++loc_index) {
499  di = loc->ssr;
500  BOOST_REQUIRE_EQUAL(kSegStarts[loc_index], di->left);
501  BOOST_REQUIRE_EQUAL(kSegEnds[loc_index], di->right);
502  }
503  BlastSeqLocFree(filter_slp);
504 
505  BOOST_REQUIRE_EQUAL(kNumLocs, loc_index);
506 }
507 
508 BOOST_AUTO_TEST_CASE(RepeatsFilter) {
509  const size_t kNumLocs = 4;
510  const TSeqPos kRepeatStarts[kNumLocs] = { 0, 380, 2851, 3113 };
511  const TSeqPos kRepeatEnds[kNumLocs] = { 212, 1297, 2953, 3764 };
512  CSeq_id id("gi|1945388");
513  unique_ptr<SSeqLoc> qsl(
514  CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_both));
515  TSeqLocVector query_v;
516  query_v.push_back(*qsl);
517 
518  CBlastNucleotideOptionsHandle nucl_handle;
519  nucl_handle.SetRepeatFiltering(true);
520  Blast_FindRepeatFilterLoc(query_v, &nucl_handle);
521 
522  BOOST_REQUIRE(query_v[0].mask.NotEmpty());
523  BOOST_REQUIRE(query_v[0].mask->IsPacked_int());
524  const CPacked_seqint::Tdata& seqinterval_list =
525  query_v[0].mask->GetPacked_int().Get();
526 
527  size_t loc_index = 0;
528  BOOST_REQUIRE_EQUAL(kNumLocs, seqinterval_list.size());
529  ITERATE(CPacked_seqint::Tdata, itr, seqinterval_list) {
530 // cerr << (*itr)->GetFrom() << " " << (*itr)->GetTo() << endl;
531  BOOST_REQUIRE_EQUAL(kRepeatStarts[loc_index], (*itr)->GetFrom());
532  BOOST_REQUIRE_EQUAL(kRepeatEnds[loc_index], (*itr)->GetTo());
533  BOOST_REQUIRE(!(*itr)->CanGetStrand());
534  ++loc_index;
535  }
536 
537  BOOST_REQUIRE_EQUAL(kNumLocs, loc_index);
538 }
539 
540 BOOST_AUTO_TEST_CASE(WindowMasker)
541 {
542  int pair_size = sizeof(TSeqPos) * 2;
543 
544  const TSeqPos intervals[] =
545  { 0, 79,
546  100, 122,
547  146, 169,
548  225, 247,
549  286, 329,
550  348, 366,
551  373, 688,
552  701, 1303,
553  1450, 1485,
554  2858, 2887,
555  3103, 3212,
556  3217, 3735,
557  4142, 4162,
558  5423, 5443,
559  5797, 5817,
560  6333, 6383,
561  6458, 6477,
562  6519, 6539,
563  7043, 7063,
564  7170, 7189,
565  7604, 7623,
566  8454, 8476,
567  8829, 8851,
568  8860, 8889
569  };
570 
571  size_t num_locs = sizeof(intervals) / pair_size;
572  BOOST_REQUIRE(0 == (sizeof(intervals) % pair_size));
573 
574  CSeq_id id("gi|1945388");
575  unique_ptr<SSeqLoc>
576  qsl(CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_both));
577 
578  TSeqLocVector query_v;
579  query_v.push_back(*qsl);
580 
581  CBlastNucleotideOptionsHandle nucl_handle;
582  nucl_handle.SetWindowMaskerTaxId(9606);
583  Blast_FindWindowMaskerLoc(query_v, &nucl_handle);
584 
585  BOOST_REQUIRE(query_v[0].mask.NotEmpty());
586  BOOST_REQUIRE(query_v[0].mask->IsPacked_int());
587  const CPacked_seqint::Tdata& seqinterval_list =
588  query_v[0].mask->GetPacked_int().Get();
589 
590  size_t loc_index = 0;
591  BOOST_REQUIRE_EQUAL(num_locs, seqinterval_list.size());
592 
593  ITERATE(CPacked_seqint::Tdata, itr, seqinterval_list) {
594  //cout << (*itr)->GetFrom() << " " << (*itr)->GetTo() << endl;
595  BOOST_REQUIRE_EQUAL(intervals[loc_index], (*itr)->GetFrom());
596  BOOST_REQUIRE_EQUAL(intervals[loc_index+1], (*itr)->GetTo());
597  BOOST_REQUIRE(! (*itr)->CanGetStrand());
598  loc_index += 2;
599  }
600 
601  BOOST_REQUIRE_EQUAL(num_locs*2, loc_index);
602 }
603 
604 BOOST_AUTO_TEST_CASE(RepeatsFilter_OnSeqInterval) {
605  vector<TSeqRange> masked_regions;
606  masked_regions.push_back(TSeqRange(85028, 85528));
607  masked_regions.push_back(TSeqRange(85539, 85736));
608  masked_regions.push_back(TSeqRange(86334, 86461));
609  masked_regions.push_back(TSeqRange(86487, 86585));
610  masked_regions.push_back(TSeqRange(86730, 87050));
611  masked_regions.push_back(TSeqRange(87313, 87370));
612  masked_regions.push_back(TSeqRange(88134, 88140));
613  masked_regions.push_back(TSeqRange(88171, 88483));
614  masked_regions.push_back(TSeqRange(89032, 89152));
615  masked_regions.push_back(TSeqRange(91548, 91704));
616  masked_regions.push_back(TSeqRange(92355, 92539));
617  masked_regions.push_back(TSeqRange(92550, 92973));
618  masked_regions.push_back(TSeqRange(92983, 93283));
619  masked_regions.push_back(TSeqRange(93296, 93384));
620  masked_regions.push_back(TSeqRange(93472, 93642));
621  masked_regions.push_back(TSeqRange(93685, 94026));
622  masked_regions.push_back(TSeqRange(94435, 94545));
623 
624  CSeq_id id("gi|20196551");
625  unique_ptr<SSeqLoc> qsl(
626  CTestObjMgr::Instance().CreateSSeqLoc(id,
627  make_pair<TSeqPos, TSeqPos>(84999, 94637),
628  eNa_strand_both));
629  TSeqLocVector query_v;
630  query_v.push_back(*qsl);
631 
632  CBlastNucleotideOptionsHandle nucl_handle;
633  nucl_handle.SetDustFiltering(true);
634  nucl_handle.SetRepeatFiltering(true);
635  Blast_FindDustFilterLoc(query_v, &nucl_handle);
636  Blast_FindRepeatFilterLoc(query_v, &nucl_handle);
637 
638  BOOST_REQUIRE(query_v[0].mask->IsPacked_int());
639  const CPacked_seqint::Tdata& seqinterval_list =
640  query_v[0].mask->GetPacked_int().Get();
641 
642  size_t loc_index = 0;
643  BOOST_REQUIRE_EQUAL(masked_regions.size(), seqinterval_list.size());
644  ITERATE(CPacked_seqint::Tdata, itr, seqinterval_list) {
645 // cerr << (*itr)->GetFrom() << " " << (*itr)->GetTo() << endl;
646  BOOST_REQUIRE_EQUAL(masked_regions[loc_index].GetFrom(),
647  (*itr)->GetFrom());
648  BOOST_REQUIRE_EQUAL(masked_regions[loc_index].GetTo(),
649  (*itr)->GetTo());
650  BOOST_REQUIRE(!(*itr)->CanGetStrand());
651  ++loc_index;
652  }
653 
654  BOOST_REQUIRE_EQUAL(masked_regions.size(), loc_index);
655 }
656 
// Requires that `a` and `b` compare equal, and that they compare unequal
// once the frame of `b` has been set to 2.
// NOTE(review): the declarations of `a` and `b` are not visible in this
// listing (presumably both are built from `id` and `r`) -- confirm against
// the original file.
657 BOOST_AUTO_TEST_CASE(CSeqLocInfo_EqualityOperators)
658 {
659  CSeq_id id("gi|197670657");
660  TSeqRange r(1, 100);
663  BOOST_REQUIRE(a == b);
664 
665  b.SetFrame(2);
666  BOOST_REQUIRE(a != b);
667 }
668 
// Attaches a plus-frame mask covering 2-299 to a query, runs dust filtering
// on the query vector, and checks the masked regions reported for query 0:
// there must be exactly two, the first equal to the mask that was attached
// and carrying frame 1, the last carrying frame -1.
// NOTE(review): the statements that create `query` are not visible in this
// listing -- confirm against the original file.
669 BOOST_AUTO_TEST_CASE(CombineDustAndLowerCaseMasking_WithBlastQueryVector) {
670  CSeq_id id("gi|197670657");
671  TSeqRange r(2, 299);
672  CRef<CSeqLocInfo> lower_case_mask
673  (new CSeqLocInfo(id, r, (int)CSeqLocInfo::eFramePlus1));
676  query->AddMask(lower_case_mask);
677  CBlastQueryVector queries;
678  queries.AddQuery(query);
679 
// Dust parameters are taken from the options handle rather than hard-coded
680  CBlastNucleotideOptionsHandle nucl_handle;
681  nucl_handle.SetDustFiltering(true);
682  Blast_FindDustFilterLoc(queries,
683  nucl_handle.GetDustFilteringLevel(),
684  nucl_handle.GetDustFilteringWindow(),
685  nucl_handle.GetDustFilteringLinker());
686  TMaskedQueryRegions mqr = queries.GetMaskedRegions(0);
687 
688  BOOST_REQUIRE( !mqr.empty() );
// If GetMasks() throws a CBlastException it must be eNotSupported with a
// message containing "lossy direction"; if it does not throw, nothing is
// checked here.
689  try { CRef<CSeq_loc> masks = queries.GetMasks(0); }
690  catch (const CBlastException& e) {
691  BOOST_REQUIRE(e.GetErrCode() == CBlastException::eNotSupported);
692  BOOST_REQUIRE(e.GetMsg().find("lossy direction") != NPOS);
693  }
694 
695  CRef<CSeqLocInfo> sli = mqr.front();
696  BOOST_REQUIRE(sli.NotEmpty());
697  BOOST_REQUIRE(*sli == *lower_case_mask);
698  BOOST_REQUIRE_EQUAL((int)2, (int)mqr.size());
699  BOOST_REQUIRE(mqr.front()->GetFrame() == 1);
700  BOOST_REQUIRE(mqr.back()->GetFrame() == -1);
701 }
702 
703 
// Builds two independent queries for the same sequence (gi|197333738),
// applies dust then repeat filtering to the first and repeat then dust
// filtering to the second, and compares the two resulting masks with
// sequence::Compare().
// NOTE(review): the remaining arguments of the final BOOST_REQUIRE_EQUAL
// (including the expected comparison result) are not visible in this
// listing -- confirm against the original file.
704 BOOST_AUTO_TEST_CASE(RepeatsAndDustFilter) {
705 
706  CSeq_id id1("gi|197333738");
707  unique_ptr<SSeqLoc> qsl1(CTestObjMgr::Instance().CreateSSeqLoc(id1));
708  TSeqLocVector query_v1;
709  query_v1.push_back(*qsl1);
710 
711  CSeq_id id2("gi|197333738");
712  unique_ptr<SSeqLoc> qsl2(CTestObjMgr::Instance().CreateSSeqLoc(id2));
713  TSeqLocVector query_v2;
714  query_v2.push_back(*qsl2);
715 
716  CBlastNucleotideOptionsHandle nucl_handle;
717  nucl_handle.SetDustFiltering(true);
718  nucl_handle.SetRepeatFiltering(true);
719 
// First query: dust, then repeats
720  Blast_FindDustFilterLoc(query_v1, &nucl_handle);
721  Blast_FindRepeatFilterLoc(query_v1, &nucl_handle);
722 
723 
// Second query: same filters in the opposite order
724  Blast_FindRepeatFilterLoc(query_v2, &nucl_handle);
725  Blast_FindDustFilterLoc(query_v2, &nucl_handle);
726 
727  BOOST_REQUIRE_EQUAL(sequence::Compare(*(query_v1[0].mask), *(query_v2[0].mask),
729 }
730 
// Builds two independent queries for the same sequence (gi|197333738),
// applies dust then WindowMasker (tax id 9606) to the first and WindowMasker
// then dust to the second, and compares the two resulting masks with
// sequence::Compare().
// NOTE(review): the remaining arguments of the final BOOST_REQUIRE_EQUAL
// (including the expected comparison result) are not visible in this
// listing -- confirm against the original file.
731 BOOST_AUTO_TEST_CASE(WindowMaskerAndDustFilter) {
732 
733  CSeq_id id1("gi|197333738");
734  unique_ptr<SSeqLoc> qsl1(CTestObjMgr::Instance().CreateSSeqLoc(id1));
735  TSeqLocVector query_v1;
736  query_v1.push_back(*qsl1);
737 
738  CSeq_id id2("gi|197333738");
739  unique_ptr<SSeqLoc> qsl2(CTestObjMgr::Instance().CreateSSeqLoc(id2));
740  TSeqLocVector query_v2;
741  query_v2.push_back(*qsl2);
742 
743  CBlastNucleotideOptionsHandle nucl_handle;
744  nucl_handle.SetDustFiltering(true);
745  nucl_handle.SetWindowMaskerTaxId(9606);
746 
// First query: dust, then WindowMasker
747  Blast_FindDustFilterLoc(query_v1, &nucl_handle);
748  Blast_FindWindowMaskerLoc(query_v1, &nucl_handle);
749 
750 
// Second query: same filters in the opposite order
751  Blast_FindWindowMaskerLoc(query_v2, &nucl_handle);
752  Blast_FindDustFilterLoc(query_v2, &nucl_handle);
753 
754  BOOST_REQUIRE_EQUAL(sequence::Compare(*(query_v1[0].mask), *(query_v2[0].mask),
756 }
757 
758 BOOST_AUTO_TEST_CASE(WindowMasker_OnSeqInterval)
759 {
760  // these are from window masker and dust
761  vector<TSeqRange> masked_regions;
762  masked_regions.push_back(TSeqRange(85019, 85172));
763  masked_regions.push_back(TSeqRange(85190, 85345));
764  masked_regions.push_back(TSeqRange(85385, 85452));
765  masked_regions.push_back(TSeqRange(85483, 85505));
766  masked_regions.push_back(TSeqRange(85511, 85533));
767  masked_regions.push_back(TSeqRange(85575, 85596));
768  masked_regions.push_back(TSeqRange(85673, 85694));
769  masked_regions.push_back(TSeqRange(85725, 85745));
770 
771  CSeq_id id("gi|20196551");
772  unique_ptr<SSeqLoc>
773  qsl(CTestObjMgr::Instance().CreateSSeqLoc
774  (id, make_pair<TSeqPos, TSeqPos>(85000, 86200), eNa_strand_both));
775 
776  TSeqLocVector query_v;
777  query_v.push_back(*qsl);
778 
779  CBlastNucleotideOptionsHandle nucl_handle;
780  nucl_handle.SetDustFiltering(true);
781  nucl_handle.SetWindowMaskerTaxId(9606);
782 
783  Blast_FindDustFilterLoc(query_v, &nucl_handle);
784  Blast_FindWindowMaskerLoc(query_v, &nucl_handle);
785 
786  BOOST_REQUIRE(query_v[0].mask->IsPacked_int());
787  const CPacked_seqint::Tdata& seqinterval_list =
788  query_v[0].mask->GetPacked_int().Get();
789 
790  size_t loc_index = 0;
791  BOOST_REQUIRE_EQUAL(masked_regions.size(), seqinterval_list.size());
792 
793  ITERATE(CPacked_seqint::Tdata, itr, seqinterval_list) {
794  BOOST_REQUIRE_EQUAL(masked_regions[loc_index].GetFrom(),
795  (*itr)->GetFrom());
796  BOOST_REQUIRE_EQUAL(masked_regions[loc_index].GetTo(),
797  (*itr)->GetTo());
798  BOOST_REQUIRE(!(*itr)->CanGetStrand());
799  ++loc_index;
800  }
801 
802  BOOST_REQUIRE_EQUAL(masked_regions.size(), loc_index);
803 }
804 
805 BOOST_AUTO_TEST_CASE(RepeatsFilter_NoHitsFound) {
806  CSeq_id id("gi|33079743");
807  unique_ptr<SSeqLoc> qsl(
808  CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_both));
809  TSeqLocVector query_v;
810  query_v.push_back(*qsl);
811 
812  CBlastNucleotideOptionsHandle nucl_handle;
813  nucl_handle.SetRepeatFiltering(true);
814  nucl_handle.SetRepeatFilteringDB("repeat/repeat_9606");
815  Blast_FindRepeatFilterLoc(query_v, &nucl_handle);
816 
817  BOOST_REQUIRE(query_v[0].mask.Empty());
818 }
819 
820 BOOST_AUTO_TEST_CASE(WindowMasker_NoHitsFound) {
821  CSeq_id id("gi|33079743");
822  unique_ptr<SSeqLoc> qsl
823  (CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_both));
824 
825  TSeqLocVector query_v;
826  query_v.push_back(*qsl);
827 
828  CBlastNucleotideOptionsHandle nucl_handle;
829  nucl_handle.SetWindowMaskerTaxId(9606);
830 
831  Blast_FindRepeatFilterLoc(query_v, &nucl_handle);
832 
833  BOOST_REQUIRE(query_v[0].mask.Empty());
834 }
835 
// Sets a filter string whose repeats option ("R -d ") has no database
// argument and requires that Blast_FindRepeatFilterLoc() throws.
// NOTE(review): the exception type passed to BOOST_REQUIRE_THROW is not
// visible in this listing -- confirm against the original file.
836 BOOST_AUTO_TEST_CASE(RepeatsFilterWithMissingParameter) {
837  CSeq_id id("gi|1945388");
838  unique_ptr<SSeqLoc> qsl(CTestObjMgr::Instance().CreateSSeqLoc(id));
839  TSeqLocVector query_v;
840  query_v.push_back(*qsl);
841 
842  CBlastNucleotideOptionsHandle nucl_handle;
843  // note the missing argument to the repeats database
844  nucl_handle.SetFilterString("m L; R -d ");/* NCBI_FAKE_WARNING */
845  BOOST_REQUIRE_THROW(Blast_FindRepeatFilterLoc(query_v, &nucl_handle),
847 }
848 
// Sets a filter string whose "W -d " option has no argument and requires
// that Blast_FindWindowMaskerLoc() throws.
// NOTE(review): the exception type passed to BOOST_REQUIRE_THROW is not
// visible in this listing -- confirm against the original file.
849 BOOST_AUTO_TEST_CASE(WindowMaskerWithMissingParameter) {
850  CSeq_id id("gi|1945388");
851  unique_ptr<SSeqLoc> qsl(CTestObjMgr::Instance().CreateSSeqLoc(id));
852  TSeqLocVector query_v;
853  query_v.push_back(*qsl);
854 
855  CBlastNucleotideOptionsHandle nucl_handle;
856  // note the missing argument to the repeats database
857  nucl_handle.SetFilterString("m L; W -d ");/* NCBI_FAKE_WARNING */
858  BOOST_REQUIRE_THROW(Blast_FindWindowMaskerLoc(query_v, &nucl_handle),
860 }
861 
862 /// Test the conversion of a BlastMaskLoc internal structure to the
863 /// TSeqLocInfoVector type, used in formatting.
864 BOOST_AUTO_TEST_CASE(TestGetFilteredQueryRegions_BothStrandsOneQuery) {
866 }
867 BOOST_AUTO_TEST_CASE(TestGetFilteredQueryRegions_PlusStrandsOneQuery) {
869 }
870 BOOST_AUTO_TEST_CASE(TestGetFilteredQueryRegions_MinusStrandsOneQuery) {
872 }
873 
// Fills `mqr` with one strand-less masked region per entry of `masks` (all
// on gi 1945388) and checks RestrictToSeqInt() against three restrictions:
// 0-624, 1000-2000 and 10000-20000.
// NOTE(review): the declaration of `mqr` and the remaining constructor
// arguments of each CSeqLocInfo are not visible in this listing -- confirm
// against the original file.
874 BOOST_AUTO_TEST_CASE(RestrictLowerCaseMask) {
875  vector<TSeqRange> masks;
876  masks.push_back(TSeqRange(0, 75));
877  masks.push_back(TSeqRange(78, 208));
878  masks.push_back(TSeqRange(217, 316));
879  masks.push_back(TSeqRange(380, 685));
880  masks.push_back(TSeqRange(694, 1004));
881  masks.push_back(TSeqRange(1018, 1122));
882  masks.push_back(TSeqRange(1128, 1298));
883  masks.push_back(TSeqRange(2817, 2952));
// NOTE(review): 2084 starts before the preceding interval (2817-2952) and
// overlaps it; the lower-case start table used elsewhere in this file has
// 3084 at this position -- confirm whether 2084 is intended.
884  masks.push_back(TSeqRange(2084, 3409));
885  masks.push_back(TSeqRange(3428, 3733));
886  masks.push_back(TSeqRange(3782, 3916));
887 
889  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, 1945388));
890  ITERATE(vector<TSeqRange>, range, masks) {
891  CRef<CSeq_interval> intv(new CSeq_interval(*id,
892  range->GetFrom(),
893  range->GetTo()));
894  // N.B.: this is deliberate, because of this the return value of
895  // TMaskedQueryRegions::RestrictToSeqInt() will have its strand
896  // unset (see CSeq_interval parametrized constructor for that)
897  BOOST_REQUIRE(intv->CanGetStrand() == false);
898  CRef<CSeqLocInfo> sli(new CSeqLocInfo(intv,
900  mqr.push_back(sli);
901  }
902 
// Restriction 0-624: four regions are expected, the last one ending at 624;
// the first keeps the original Seq-id and has no readable strand.
903  // N.B.: even a different Seq-id will work!
904  CSeq_id other_id(CSeq_id::e_Gi, 555);
905  CSeq_interval restriction(other_id, 0, 624);
906  TMaskedQueryRegions restricted_mask;
907  restricted_mask = mqr.RestrictToSeqInt(restriction);
908  BOOST_REQUIRE_EQUAL((size_t)4, restricted_mask.size());
909  BOOST_REQUIRE_EQUAL((TSeqPos)624,
910  restricted_mask.back()->GetInterval().GetTo());
911  BOOST_REQUIRE_EQUAL(CSeq_id::e_YES, id->Compare
912  (restricted_mask.front()->GetInterval().GetId()));
913  BOOST_REQUIRE(!(restricted_mask.front()->GetInterval().CanGetStrand()));
914 
// Restriction 1000-2000: exactly three regions are expected, starting at
// 1000, 1018 and 1128; each keeps the original Seq-id, has no readable
// strand and reports eFrameNotSet.
915  restriction.SetFrom(1000);
916  restriction.SetTo(2000);
917  restriction.SetStrand(eNa_strand_plus); // this is irrelevant
918  restricted_mask = mqr.RestrictToSeqInt(restriction);
919  BOOST_REQUIRE_EQUAL((size_t)3, restricted_mask.size());
920  TMaskedQueryRegions::iterator itr = restricted_mask.begin();
921 
922  BOOST_REQUIRE_EQUAL((TSeqPos)1000, (*itr)->GetInterval().GetFrom());
923  BOOST_REQUIRE_EQUAL((TSeqPos)1004, (*itr)->GetInterval().GetTo()-1);
924  BOOST_REQUIRE(id->Match((*itr)->GetInterval().GetId()));
925  BOOST_REQUIRE(!(*itr)->GetInterval().CanGetStrand());
926  BOOST_REQUIRE_EQUAL((int)CSeqLocInfo::eFrameNotSet, (*itr)->GetFrame());
927  ++itr;
928  BOOST_REQUIRE_EQUAL((TSeqPos)1018, (*itr)->GetInterval().GetFrom());
929  BOOST_REQUIRE_EQUAL((TSeqPos)1122, (*itr)->GetInterval().GetTo()-1);
930  BOOST_REQUIRE(id->Match((*itr)->GetInterval().GetId()));
931  BOOST_REQUIRE(!(*itr)->GetInterval().CanGetStrand());
932  BOOST_REQUIRE_EQUAL((int)CSeqLocInfo::eFrameNotSet, (*itr)->GetFrame());
933  ++itr;
934  BOOST_REQUIRE_EQUAL((TSeqPos)1128, (*itr)->GetInterval().GetFrom());
935  BOOST_REQUIRE_EQUAL((TSeqPos)1298, (*itr)->GetInterval().GetTo()-1);
936  BOOST_REQUIRE(id->Match((*itr)->GetInterval().GetId()));
937  BOOST_REQUIRE(!(*itr)->GetInterval().CanGetStrand());
938  BOOST_REQUIRE_EQUAL((int)CSeqLocInfo::eFrameNotSet, (*itr)->GetFrame());
939  ++itr;
940  BOOST_REQUIRE(itr == restricted_mask.end());
941 
// Restriction 10000-20000: no region is expected to remain
942  restriction.SetFrom(10000);
943  restriction.SetTo(20000);
944  restricted_mask = mqr.RestrictToSeqInt(restriction);
945  BOOST_REQUIRE(restricted_mask.empty());
946 }
947 
948 // Inspired by JIRA SB-264
// Builds a list of mask regions from hard-coded ranges and checks the frame
// bookkeeping of the `bqff` object: it must be non-empty, report multiple
// frames, hold a non-NULL entry for every listed frame, and report NUM_FRAMES
// frames in total.
949 BOOST_AUTO_TEST_CASE(BlastxLowerCaseMask) {
950  vector<TSeqRange> masks;
951  masks.push_back(TSeqRange(0, 75));
952  masks.push_back(TSeqRange(78, 208));
953  masks.push_back(TSeqRange(217, 316));
954  masks.push_back(TSeqRange(380, 685));
955  masks.push_back(TSeqRange(694, 1004));
956  masks.push_back(TSeqRange(1018, 1122));
957  masks.push_back(TSeqRange(1128, 1298));
958  masks.push_back(TSeqRange(2817, 2952));
959  masks.push_back(TSeqRange(2084, 3409));
960  masks.push_back(TSeqRange(3428, 3733));
961  masks.push_back(TSeqRange(3782, 3916));
962 
// NOTE(review): listing line 963 is absent; `mqr` is presumably declared there.
964  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, 1945388));
965  ITERATE(vector<TSeqRange>, range, masks) {
966  CRef<CSeq_interval> intv(new CSeq_interval(*id,
967  range->GetFrom(),
968  range->GetTo()));
969  CRef<CSeqLocInfo> sli(new CSeqLocInfo(intv,
// NOTE(review): listing line 970 (the rest of this constructor call, which
// would carry the frame argument) is absent.
971  mqr.push_back(sli);
972  }
// NOTE(review): listing line 973 is absent; `bqff` is presumably constructed
// from `mqr` there -- confirm the type and constructor arguments.
974  BOOST_REQUIRE(!bqff.Empty());
975  BOOST_REQUIRE(bqff.QueryHasMultipleFrames());
976  const set<CSeqLocInfo::ETranslationFrame>& frames = bqff.ListFrames();
977  ITERATE(set<CSeqLocInfo::ETranslationFrame>, fr, frames) {
978  BOOST_REQUIRE(bqff[*fr] != NULL);
979  }
980  BOOST_REQUIRE(bqff.GetNumFrames() == NUM_FRAMES);
981 }
982 
983 // Inspired by SB-597
// Registers a single nucleotide mask (0..75) twice per range (`sli_plus` and
// `sli_minus`), switches `bqff` to protein coordinates for a sequence of
// length 9180, and checks the resulting left/right offsets for each of the
// six translation frames.
984 BOOST_AUTO_TEST_CASE(BlastxLowerCaseMaskProteinLocations)
985 {
986  vector<TSeqRange> masks;
987  masks.push_back(TSeqRange(0, 75));
988 
// NOTE(review): listing line 989 is absent; `mqr` is presumably declared there.
990  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, 1945388));
991  ITERATE(vector<TSeqRange>, range, masks) {
992  CRef<CSeq_interval> intv(new CSeq_interval(*id,
993  range->GetFrom(),
994  range->GetTo()));
995  CRef<CSeqLocInfo> sli_plus(new CSeqLocInfo(intv,
// NOTE(review): listing line 996 (frame argument for the plus entry) is absent.
997  mqr.push_back(sli_plus);
998  CRef<CSeqLocInfo> sli_minus(new CSeqLocInfo(intv,
// NOTE(review): listing line 999 (frame argument for the minus entry) is absent.
1000  mqr.push_back(sli_minus);
1001  }
// NOTE(review): listing line 1002 is absent; `bqff` is presumably constructed
// from `mqr` there -- confirm.
1003  bqff.UseProteinCoords(9180); // 9180 is length of GI|1945388
1004 
// Plus frames: every mask starts at 0; frame +1 is expected to end one
// position later (25) than frames +2 and +3 (24).
1005  BlastSeqLoc* bsl = *bqff[CSeqLocInfo::eFramePlus1];
1006  BOOST_REQUIRE_EQUAL(bsl->ssr->left, 0);
1007  BOOST_REQUIRE_EQUAL(bsl->ssr->right, 25);
1008 
1009  bsl = *bqff[CSeqLocInfo::eFramePlus2];
1010  BOOST_REQUIRE_EQUAL(bsl->ssr->left, 0);
1011  BOOST_REQUIRE_EQUAL(bsl->ssr->right, 24);
1012 
1013  bsl = *bqff[CSeqLocInfo::eFramePlus3];
1014  BOOST_REQUIRE_EQUAL(bsl->ssr->left, 0);
1015  BOOST_REQUIRE_EQUAL(bsl->ssr->right, 24);
1016 
// Minus frames: the mask is expected near the end of the translated sequence
// (left 3034 in all three frames; right 3059 for frame -1, 3058 otherwise).
1017  bsl = *bqff[CSeqLocInfo::eFrameMinus1];
1018  BOOST_REQUIRE_EQUAL(bsl->ssr->left, 3034);
1019  BOOST_REQUIRE_EQUAL(bsl->ssr->right, 3059);
1020 
1021  bsl = *bqff[CSeqLocInfo::eFrameMinus2];
1022  BOOST_REQUIRE_EQUAL(bsl->ssr->left, 3034);
1023  BOOST_REQUIRE_EQUAL(bsl->ssr->right, 3058);
1024 
1025  bsl = *bqff[CSeqLocInfo::eFrameMinus3];
1026  BOOST_REQUIRE_EQUAL(bsl->ssr->left, 3034);
1027  BOOST_REQUIRE_EQUAL(bsl->ssr->right, 3058);
1028 }
1029 
1030 // Inspired by SB-285
// Registers one mask entry and checks the frame bookkeeping of `bqff`:
// GetNumFrames() is expected to report 2, while ListFrames() is expected to
// contain exactly one frame (the discrepancy is what the "NOTE!!" markers
// below point at).
1031 BOOST_AUTO_TEST_CASE(BlastnLowerCaseMask_SingleStrand) {
1032  TSeqRange mask(TSeqRange(0, 75));
1033 
1034  TMaskedQueryRegions mqr;
1035  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, 1945388));
1036  CRef<CSeq_interval> intv(new CSeq_interval(*id,
1037  mask.GetFrom(),
1038  mask.GetTo()));
1039  CRef<CSeqLocInfo> sli(new CSeqLocInfo(intv,
// NOTE(review): listing line 1040 (the frame argument of this constructor
// call) is absent.
1041  mqr.push_back(sli);
1042 
// NOTE(review): listing line 1043 is absent; `bqff` is presumably constructed
// from `mqr` there -- confirm.
1044  BOOST_REQUIRE(!bqff.Empty());
1045  BOOST_REQUIRE(bqff.QueryHasMultipleFrames());
1046  const set<CSeqLocInfo::ETranslationFrame>& frames = bqff.ListFrames();
1047  const int kExpectedNumFrames = 2;
1048  int frame_ctr = 0;
// Count the listed frames while checking each has a non-NULL entry.
1049  ITERATE(set<CSeqLocInfo::ETranslationFrame>, fr, frames) {
1050  BOOST_REQUIRE(bqff[*fr] != NULL);
1051  frame_ctr++;
1052  }
1053  BOOST_REQUIRE_EQUAL(kExpectedNumFrames, bqff.GetNumFrames());
1054  BOOST_REQUIRE_EQUAL(1, frame_ctr); // NOTE!!
1055  BOOST_REQUIRE_EQUAL(1, frames.size()); // NOTE!!
1056 }
1057 
1058 // Inspired by SB-285
// Same set-up as the single-strand variant, but two entries are pushed into
// `mqr`; here both GetNumFrames() and the listed frames are expected to be 2.
1059 BOOST_AUTO_TEST_CASE(BlastnLowerCaseMask_BothStrands) {
1060  TSeqRange mask(TSeqRange(0, 75));
1061 
1062  TMaskedQueryRegions mqr;
1063  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, 1945388));
1064  CRef<CSeq_interval> intv(new CSeq_interval(*id,
1065  mask.GetFrom(),
1066  mask.GetTo()));
1067  CRef<CSeqLocInfo> sli(new CSeqLocInfo(intv,
// NOTE(review): listing line 1068 (the frame argument of this constructor
// call) is absent.
1069  mqr.push_back(sli);
// NOTE(review): listing line 1070 is absent; presumably `sli` is re-assigned
// to an entry for the other strand before the second push_back -- confirm.
1071  mqr.push_back(sli);
1072 
// NOTE(review): listing line 1073 is absent; `bqff` is presumably constructed
// from `mqr` there -- confirm.
1074  BOOST_REQUIRE(!bqff.Empty());
1075  BOOST_REQUIRE(bqff.QueryHasMultipleFrames());
1076  const set<CSeqLocInfo::ETranslationFrame>& frames = bqff.ListFrames();
1077  const int kExpectedNumFrames = 2;
1078  int frame_ctr = 0;
// Count the listed frames while checking each has a non-NULL entry.
1079  ITERATE(set<CSeqLocInfo::ETranslationFrame>, fr, frames) {
1080  BOOST_REQUIRE(bqff[*fr] != NULL);
1081  frame_ctr++;
1082  }
1083  BOOST_REQUIRE_EQUAL(kExpectedNumFrames, bqff.GetNumFrames());
1084  BOOST_REQUIRE_EQUAL(kExpectedNumFrames, frame_ctr); // NOTE!!
1085  BOOST_REQUIRE_EQUAL(kExpectedNumFrames, frames.size()); // NOTE!!
1086 }
1087 
// Lower-case mask test for the plus strand (per its name), with the strand in
// the mask ignored (ignore_strand_in_mask = true).
// NOTE(review): listing line 1090 -- the start of the call that receives
// `ignore_strand_in_mask` -- is absent from this excerpt; confirm the helper
// and its first argument against the repository.
1088 BOOST_AUTO_TEST_CASE(LowerCaseMask_PlusStrand) {
1089  const bool ignore_strand_in_mask = true;
1091  ignore_strand_in_mask);
1092 }
1093 
// Lower-case mask test for the minus strand (per its name), with the strand
// in the mask ignored (ignore_strand_in_mask = true).
// NOTE(review): listing line 1096 -- the start of the call that receives
// `ignore_strand_in_mask` -- is absent from this excerpt; confirm against the
// repository.
1094 BOOST_AUTO_TEST_CASE(LowerCaseMask_MinusStrand) {
1095  const bool ignore_strand_in_mask = true;
1097  ignore_strand_in_mask);
1098 }
1099 
// Lower-case mask test for both strands (per its name), with the strand in
// the mask ignored (ignore_strand_in_mask = true).
// NOTE(review): listing line 1102 -- the start of the call that receives
// `ignore_strand_in_mask` -- is absent from this excerpt; confirm against the
// repository.
1100 BOOST_AUTO_TEST_CASE(LowerCaseMask_BothStrands) {
1101  const bool ignore_strand_in_mask = true;
1103  ignore_strand_in_mask);
1104 }
1105 
// "Explicit" variant of the plus-strand lower-case mask test: the strand in
// the mask is NOT ignored (ignore_strand_in_mask = false).
// NOTE(review): listing line 1108 -- the start of the call that receives
// `ignore_strand_in_mask` -- is absent from this excerpt; confirm against the
// repository.
1106 BOOST_AUTO_TEST_CASE(LowerCaseMask_PlusStrand_Explicit) {
1107  const bool ignore_strand_in_mask = false;
1109  ignore_strand_in_mask);
1110 }
1111 
// "Explicit" variant of the minus-strand lower-case mask test: the strand in
// the mask is NOT ignored (ignore_strand_in_mask = false).
// NOTE(review): listing line 1114 -- the start of the call that receives
// `ignore_strand_in_mask` -- is absent from this excerpt; confirm against the
// repository.
1112 BOOST_AUTO_TEST_CASE(LowerCaseMask_MinusStrand_Explicit) {
1113  const bool ignore_strand_in_mask = false;
1115  ignore_strand_in_mask);
1116 }
1117 
// "Explicit" variant of the both-strands lower-case mask test: the strand in
// the mask is NOT ignored (ignore_strand_in_mask = false).
// NOTE(review): listing line 1120 -- the start of the call that receives
// `ignore_strand_in_mask` -- is absent from this excerpt; confirm against the
// repository.
1118 BOOST_AUTO_TEST_CASE(LowerCaseMask_BothStrands_Explicit) {
1119  const bool ignore_strand_in_mask = false;
1121  ignore_strand_in_mask);
1122 }
1123 
// Attaches an 11-interval lower-case mask to the query gi|1945388, runs
// Blast_FindRepeatFilterLoc() with repeat filtering enabled, and checks that
// the query's mask afterwards consists of exactly the six intervals in
// kStarts/kEnds, in order.
1124 BOOST_AUTO_TEST_CASE(CombineRepeatAndLowerCaseMask) {
1125  const int kNumLcaseLocs = 11;
1126  const int kLcaseStarts[kNumLcaseLocs] =
1127  { 0, 78, 217, 380, 694, 1018, 1128, 2817, 3084, 3428, 3782 };
1128  const int kLcaseEnds[kNumLcaseLocs] =
1129  { 75, 208, 316, 685, 1004, 1122, 1298, 2952, 3409, 3733, 3916 };
1130 
// Expected mask after the lower-case mask and the repeat locations have been
// combined.
1131  const int kNumLocs = 6;
1132  const int kStarts[kNumLocs] = { 0, 217, 380, 2817, 3084, 3782 };
1133  const int kEnds[kNumLocs] = { 212, 316, 1298, 2953, 3764, 3916 };
1134  CSeq_id id("gi|1945388");
1135  unique_ptr<SSeqLoc> qsl(
1136  CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_both));
1137 
1138  // Fill the lower case mask into the SSeqLoc
// NOTE(review): `seqloc` is a raw pointer until mask.Reset() below; a failing
// BOOST_REQUIRE inside the loop would skip that hand-over.
1139  CSeq_loc* seqloc = new CSeq_loc();
1140  for (int index = 0; index < kNumLcaseLocs; ++index) {
1141  seqloc->SetPacked_int().AddInterval(id, kLcaseStarts[index],
1142  kLcaseEnds[index]);
// Each interval is added without a strand, and the test insists on that.
1143  BOOST_REQUIRE(!seqloc->GetPacked_int().Get().back()->CanGetStrand());
1144  }
1145  qsl->mask.Reset(seqloc);
1146 
1147  TSeqLocVector query_v;
1148  query_v.push_back(*qsl);
1149  CBlastNucleotideOptionsHandle nucl_handle;
1150  nucl_handle.SetRepeatFiltering(true);
1151  Blast_FindRepeatFilterLoc(query_v, &nucl_handle);
1152 
1153  BOOST_REQUIRE(query_v[0].mask->IsPacked_int());
1154 
1155  int loc_index = 0;
1156 
1157  BOOST_REQUIRE(query_v[0].mask.NotEmpty());
// NOTE(review): listing line 1158 is absent; it presumably opens the loop over
// the packed intervals (declaring `itr`) whose range expression follows.
// NOTE(review): kStarts/kEnds are indexed without checking loc_index against
// kNumLocs; the count is only verified after the loop.
1159  query_v[0].mask->GetPacked_int().Get()) {
1160  // cerr << (*itr)->GetFrom() << " " << (*itr)->GetTo() << endl;
1161  BOOST_REQUIRE_EQUAL(kStarts[loc_index], (int)(*itr)->GetFrom());
1162  BOOST_REQUIRE_EQUAL(kEnds[loc_index], (int)(*itr)->GetTo());
1163  ++loc_index;
1164  }
1165 
1166  BOOST_REQUIRE_EQUAL(kNumLocs, loc_index);
1167 }
1168 
// Runs dust filtering followed by repeat filtering on the query gi|1945388
// (both strands) and checks that the resulting mask consists of exactly the
// 13 intervals in kStarts/kEnds, in order.
1169 BOOST_AUTO_TEST_CASE(CombineRepeatAndDustFilter) {
1170  const int kNumLocs = 13;
1171  const int kStarts[kNumLocs] =
1172  { 0, 298, 380, 1449, 2851, 3113, 4704, 6364, 6512, 7600,
1173  7766, 8873, 9109};
1174  const int kEnds[kNumLocs] =
1175  { 212, 305, 1297, 1479, 2953, 3764, 4710, 6373, 6573, 7672,
1176  7772, 8880, 9179};
1177  CSeq_id id("gi|1945388");
1178  unique_ptr<SSeqLoc> qsl(
1179  CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_both));
1180  TSeqLocVector query_v;
1181  query_v.push_back(*qsl);
1182 
// Both filters are enabled on the same handle; dust locations are computed
// first, then repeat locations.
1183  CBlastNucleotideOptionsHandle nucl_handle;
1184  nucl_handle.SetRepeatFiltering(true);
1185  nucl_handle.SetDustFiltering(true);
1186  Blast_FindDustFilterLoc(query_v, &nucl_handle);
1187  Blast_FindRepeatFilterLoc(query_v, &nucl_handle);
1188 
1189  int loc_index = 0;
1190 
1191  BOOST_REQUIRE(query_v[0].mask.NotEmpty());
// NOTE(review): listing line 1192 is absent; it presumably opens the loop over
// the packed intervals (declaring `itr`) whose range expression follows.
// NOTE(review): kStarts/kEnds are indexed without checking loc_index against
// kNumLocs; the count is only verified after the loop.
1193  query_v[0].mask->GetPacked_int().Get()) {
1194  // cerr << (*itr)->GetFrom() << " " << (*itr)->GetTo() << endl;
1195  BOOST_REQUIRE_EQUAL(kStarts[loc_index], (int)(*itr)->GetFrom());
1196  BOOST_REQUIRE_EQUAL(kEnds[loc_index], (int)(*itr)->GetTo());
1197  ++loc_index;
1198  }
1199  BOOST_REQUIRE_EQUAL(kNumLocs, loc_index);
1200 }
1201 
1202 BOOST_AUTO_TEST_CASE(FilterLocNuclBoth) {
1203  const int kNumLocs = 15;
1204  const int kDustStarts[kNumLocs] =
1205  { 298, 875, 1018, 1449, 3113, 3282, 3428, 3598, 4704, 6364,
1206  6512, 7600, 7766, 8873, 9109};
1207  const int kDustEnds[kNumLocs] =
1208  { 305, 882, 1115, 1479, 3133, 3298, 3441, 3606, 4710, 6373,
1209  6573, 7672, 7772, 8880 , 9179};
1210 
1211  CSeq_id id("gi|1945388");
1212  unique_ptr<SSeqLoc> qsl(
1213  CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_both));
1214  TSeqLocVector query_v;
1215  query_v.push_back(*qsl);
1216 
1217  CBlastNucleotideOptionsHandle nucl_handle;
1218  nucl_handle.SetDustFiltering(true);
1219  Blast_FindDustFilterLoc(query_v, &nucl_handle);
1220 
1221  int loc_index=0;
1222  ITERATE(list< CRef<CSeq_interval> >, itr,
1223  query_v[0].mask->GetPacked_int().Get()) {
1224  BOOST_REQUIRE_EQUAL(kDustStarts[loc_index], (int)(*itr)->GetFrom());
1225  BOOST_REQUIRE_EQUAL(kDustEnds[loc_index], (int)(*itr)->GetTo());
1226  ++loc_index;
1227  }
1228 
1229  BOOST_REQUIRE_EQUAL(loc_index, kNumLocs);
1230 }
1231 
1232 BOOST_AUTO_TEST_CASE(FilterLocNuclPlus) {
1233  const int kNumLocs = 15;
1234  const int kDustStarts[kNumLocs] =
1235  { 298, 875, 1018, 1449, 3113, 3282, 3428, 3598, 4704, 6364,
1236  6512, 7600, 7766, 8873, 9109};
1237  const int kDustEnds[kNumLocs] =
1238  { 305, 882, 1115, 1479, 3133, 3298, 3441, 3606, 4710, 6373,
1239  6573, 7672, 7772, 8880 , 9179};
1240 
1241  CSeq_id id("gi|1945388");
1242  unique_ptr<SSeqLoc> qsl(
1243  CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_plus));
1244  TSeqLocVector query_v;
1245  query_v.push_back(*qsl);
1246 
1247  CBlastNucleotideOptionsHandle nucl_handle;
1248  nucl_handle.SetDustFiltering(true);
1249  Blast_FindDustFilterLoc(query_v, &nucl_handle);
1250 
1251  int loc_index=0;
1252  ITERATE(list< CRef<CSeq_interval> >, itr,
1253  query_v[0].mask->GetPacked_int().Get()) {
1254  BOOST_REQUIRE_EQUAL(kDustStarts[loc_index], (int)(*itr)->GetFrom());
1255  BOOST_REQUIRE_EQUAL(kDustEnds[loc_index], (int)(*itr)->GetTo());
1256  ++loc_index;
1257  }
1258 
1259  BOOST_REQUIRE_EQUAL(loc_index, kNumLocs);
1260 }
1261 
1262 BOOST_AUTO_TEST_CASE(FilterLocNuclMinus) {
1263  const int kNumLocs = 15;
1264  const int kDustStarts[kNumLocs] =
1265  { 298, 875, 1018, 1449, 3113, 3282, 3428, 3598, 4704, 6364,
1266  6512, 7600, 7766, 8873, 9109};
1267  const int kDustEnds[kNumLocs] =
1268  { 305, 882, 1115, 1479, 3133, 3298, 3441, 3606, 4710, 6373,
1269  6573, 7672, 7772, 8880 , 9179};
1270 
1271  CSeq_id id("gi|1945388");
1272  unique_ptr<SSeqLoc> qsl(
1273  CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_minus));
1274  TSeqLocVector query_v;
1275  query_v.push_back(*qsl);
1276 
1277  CBlastNucleotideOptionsHandle nucl_handle;
1278  nucl_handle.SetDustFiltering(true);
1279  Blast_FindDustFilterLoc(query_v, &nucl_handle);
1280  // CSeq_loc_mapper sorts intervals in reverse order if on minus strand.
1281  bool reverse = IsReverse(query_v[0].mask->GetStrand());
1282  int loc_index = reverse ? kNumLocs - 1 : 0;
1283  ITERATE(list< CRef<CSeq_interval> >, itr,
1284  query_v[0].mask->GetPacked_int().Get()) {
1285  BOOST_REQUIRE_EQUAL(kDustStarts[loc_index], (int)(*itr)->GetFrom());
1286  BOOST_REQUIRE_EQUAL(kDustEnds[loc_index], (int)(*itr)->GetTo());
1287  reverse ? --loc_index : ++loc_index;
1288  }
1289 
1290  // Check that we finished loop on reverse strand is that loc_index is -1.
1291  if ( !reverse ) {
1292  BOOST_REQUIRE_EQUAL(loc_index, kNumLocs);
1293  }
1294  else {
1295  BOOST_REQUIRE_EQUAL(loc_index, -1);
1296  }
1297 }
1298 
1299 
// Runs BlastSetUp_GetFilteringLocations() with SEG filter options on the
// protein query gi|3091 and checks that the first entry of the returned mask
// array holds exactly the three ranges in kSegStarts/kSegEnds, in order.
1300 BOOST_AUTO_TEST_CASE(FilterLocProtein) {
1301  const int kNumLocs = 3;
1302  const int kSegStarts[kNumLocs] = { 15, 55, 495 };
1303  const int kSegEnds[kNumLocs] = { 27, 68, 513 };
1304  CSeq_id id("gi|3091");
1305  unique_ptr<SSeqLoc> qsl(CTestObjMgr::Instance().CreateSSeqLoc(id));
1306  TSeqLocVector query_v;
1307  query_v.push_back(*qsl);
1308  CBlastQueryInfo query_info;
1309  CBLAST_SequenceBlk query_blk;
// NOTE(review): listing line 1310 is absent; `opts` (used just below) is
// presumably created there -- confirm.
1311 
1312  setupQueryStructures(query_v, opts->GetOptions(),
1313  &query_blk, &query_info);
1314 
1315  BlastMaskLoc* filter_out = NULL;
1316  Blast_Message *blast_message=NULL;
1317  SBlastFilterOptions* filter_options;
1318  SBlastFilterOptionsNew(&filter_options, eSeg);
1319 
1320  Int2 status =
1321  BlastSetUp_GetFilteringLocations(query_blk, query_info,
1322  eBlastTypeBlastp, filter_options,
1323  &filter_out, &blast_message);
// The options are released right after the call and before the status check.
// NOTE(review): `blast_message` is neither inspected nor released in the
// visible code.
1324  filter_options = SBlastFilterOptionsFree(filter_options);
1325  BOOST_REQUIRE(filter_options == NULL);
1326  BOOST_REQUIRE(status == 0);
1327 
1328  BlastSeqLoc *filter_slp = filter_out->seqloc_array[0];
1329  Int4 loc_index;
1330  SSeqRange* di;
1331  BlastSeqLoc *loc = NULL;
// Walk the linked list of masks alongside the expectation tables.
// NOTE(review): kSegStarts/kSegEnds are indexed without checking loc_index
// against kNumLocs; the count is only verified after the loop.
1332  for (loc_index=0, loc = filter_slp; loc; loc = loc->next, ++loc_index) {
1333  di = loc->ssr;
1334  BOOST_REQUIRE_EQUAL(kSegStarts[loc_index], di->left);
1335  BOOST_REQUIRE_EQUAL(kSegEnds[loc_index], di->right);
1336  }
1337 
1338  BOOST_REQUIRE_EQUAL(kNumLocs, loc_index);
1339 
1340  filter_out = BlastMaskLocFree(filter_out);
1341  BOOST_REQUIRE(filter_out == NULL);
1342 }
1343 
// Builds a mask list from kSegStarts/kSegEnds by hand, applies it to the
// protein query gi|3091 via BlastSetUp_MaskQuery(), and verifies the masked
// sequence buffer through a checksum over its bytes.
1344 BOOST_AUTO_TEST_CASE(MaskProteinSequence) {
1345  const int kNumLocs = 3;
1346  const int kSegStarts[kNumLocs] = { 15, 55, 495 };
1347  const int kSegEnds[kNumLocs] = { 27, 68, 513 };
1348  CSeq_id id("gi|3091");
1349  unique_ptr<SSeqLoc> qsl(CTestObjMgr::Instance().CreateSSeqLoc(id));
1350  TSeqLocVector query_v;
1351  query_v.push_back(*qsl);
// NOTE(review): listing line 1352 is absent; `opts` (used below) is presumably
// created there -- confirm.
1353 
1354  CBlastQueryInfo query_info;
1355  CBLAST_SequenceBlk query_blk;
1356  setupQueryStructures(query_v, opts->GetOptions(),
1357  &query_blk, &query_info);
1358 
// The first node is created through `head`; later nodes are created through
// `last`, the node returned by the previous call.
1359  BlastSeqLoc *head = NULL;
1360  BlastSeqLoc *last = NULL;
1361  for (Int4 loc_index=0; loc_index<kNumLocs; ++loc_index) {
1362  if (head == NULL)
1363  last = BlastSeqLocNew(&head, kSegStarts[loc_index],
1364  kSegEnds[loc_index]);
1365  else
1366  last = BlastSeqLocNew(&last, kSegStarts[loc_index],
1367  kSegEnds[loc_index]);
1368  }
1369 
// A single-entry mask array holds the list.
1370  BlastMaskLoc* filter_maskloc = BlastMaskLocNew(1);
1371  filter_maskloc->seqloc_array[0] = head;
1372 
1373  BlastSetUp_MaskQuery(query_blk, query_info, filter_maskloc,
// NOTE(review): listing line 1374 (the remaining argument(s) of this call) is
// absent.
1375  filter_maskloc = BlastMaskLocFree(filter_maskloc);
1376  BOOST_REQUIRE(filter_maskloc == NULL);
1377 
// Fold every byte of the first context's sequence into a 32-bit checksum
// (multiply by 1103515245, add the byte plus 12345; unsigned arithmetic
// wraps) and compare it with the recorded value.
1378  Uint1* buffer = &query_blk->sequence[0];
1379  Int4 query_length = query_info->contexts[0].query_length;
1380  Uint4 hash = 0;
1381  for (int index=0; index<query_length; index++)
1382  {
1383  hash *= 1103515245;
1384  hash += (Uint4)buffer[index] + 12345;
1385  }
1386  BOOST_REQUIRE_EQUAL(-241853716, (int) hash);
1387 }
1388 
// Builds a mask list from kDustStarts/kDustEnds by hand, applies it to the
// nucleotide query gi|1945388 (both strands) via BlastSetUp_MaskQuery(), and
// verifies the masked sequence buffer through a checksum over its bytes.
1389 BOOST_AUTO_TEST_CASE(MaskNucleotideBothStrands) {
1390  const int kNumLocs = 15;
1391  const int kDustStarts[kNumLocs] =
1392  { 298, 875, 1018, 1064, 1448, 3113, 3282, 3428, 3598, 4704, 6364,
1393  6511, 7766, 8873, 9108 };
1394  const int kDustEnds[kNumLocs] =
1395  { 305, 882, 1045, 1115, 1479, 3133, 3298, 3441, 3606, 4710, 6373,
1396  6573, 7772, 8880, 9179 };
1397 
1398  CSeq_id id("gi|1945388");
1399  unique_ptr<SSeqLoc> qsl(CTestObjMgr::Instance().CreateSSeqLoc(id,
1400  eNa_strand_both));
1401  TSeqLocVector query_v;
1402  query_v.push_back(*qsl);
// NOTE(review): listing line 1403 is absent; `opts` (used below) is presumably
// created there -- confirm.
1404 
1405  CBlastQueryInfo query_info;
1406  CBLAST_SequenceBlk query_blk;
1407  setupQueryStructures(query_v, opts->GetOptions(),
1408  &query_blk, &query_info);
1409 
// The first node is created through `head`; later nodes are created through
// `last`, the node returned by the previous call.
1410  BlastSeqLoc *head = NULL;
1411  BlastSeqLoc *last = NULL;
1412  for (Int4 loc_index=0; loc_index<kNumLocs; ++loc_index) {
1413  if (head == NULL)
1414  last = BlastSeqLocNew(&head, kDustStarts[loc_index],
1415  kDustEnds[loc_index]);
1416  else
1417  last = BlastSeqLocNew(&last, kDustStarts[loc_index],
1418  kDustEnds[loc_index]);
1419  }
1420 
// One mask slot per context; only slot 0 is populated here.
1421  BlastMaskLoc* filter_maskloc =
1422  BlastMaskLocNew(query_info->last_context+1);
1423  filter_maskloc->seqloc_array[0] = head;
1424 
1425  BlastSetUp_MaskQuery(query_blk, query_info, filter_maskloc,
// NOTE(review): listing line 1426 (the remaining argument(s) of this call) is
// absent.
1427  filter_maskloc = BlastMaskLocFree(filter_maskloc);
1428  BOOST_REQUIRE(filter_maskloc == NULL);
1429 
// Fold every byte of the first context's sequence into a 32-bit checksum
// (multiply by 1103515245, add the byte plus 12345; unsigned arithmetic
// wraps) and compare it with the recorded value.
1430  Uint1* buffer = &query_blk->sequence[0];
1431  Int4 query_length = query_info->contexts[0].query_length;
1432  Uint4 hash = 0;
1433  for (int index=0; index<query_length; index++)
1434  {
1435  hash *= 1103515245;
1436  hash += (Uint4)buffer[index] + 12345;
1437  }
1438  BOOST_REQUIRE_EQUAL(-1261879517, (int) hash);
1439 }
1440 
// Runs dust filtering on three nucleotide queries at once and compares the
// mask of each query, interval by interval, with a per-query table of
// expected starts and ends.
// NOTE(review): despite the "Plus" in the name, all three queries are created
// with eNa_strand_both.
1441 BOOST_AUTO_TEST_CASE(FilterMultipleQueriesLocNuclPlus) {
1442  const int kNumLocs0 = 15;
1443  const int kNumLocs1 = 80;
1444  const int kNumLocs2 = 1;
1445 
1446  int dust_starts0[kNumLocs0] =
1447  { 298, 875, 1018, 1449, 3113, 3282, 3428, 3598, 4704, 6364,
1448  6512, 7600, 7766, 8873, 9109};
1449  int dust_ends0[kNumLocs0] =
1450  { 305, 882, 1115, 1479, 3133, 3298, 3441, 3606, 4710, 6373,
1451  6573, 7672, 7772, 8880 , 9179};
1452  int dust_starts1[kNumLocs1] =
1453  { 189, 862, 1717, 1880, 2301, 2850, 3074, 3301, 4865, 5231, 5397,
1454  5825, 5887, 6560, 6806, 7178, 7709, 8000, 8275, 8441, 9449, 9779,
1455  10297, 10457, 11033, 11242, 12271, 12410, 12727, 13803, 14743, 15052,
1456  15153, 15262, 16201, 16968, 17318, 18470, 20179, 21513, 21569,
1457  22034, 22207, 22657, 22890, 23326, 27984, 28305, 28581, 28960, 29678,
1458  30553, 31195, 32347, 33641, 33785, 34138, 34861, 34872, 35028,
1459  35676, 35727, 36105, 36312, 36841, 38459, 38610, 38997, 39217, 39428,
1460  39629, 42243, 42584, 43157, 43346, 43619, 44040, 44617, 46791, 47213};
1461  int dust_ends1[kNumLocs1] =
1462  { 230, 876, 1741, 1898, 2315, 2868, 3117, 3308, 4886, 5255, 5433, 5860,
1463  5943, 6566, 6857, 7245, 7737, 8014, 8286, 8479, 9496, 9830, 10306,
1464  10581, 11082, 11255, 12277, 12432, 12748, 13809, 14750, 15121, 15171,
1465  15345, 16237, 16992, 17332, 18482, 20185, 21524, 21688, 22072, 22220,
1466  22672, 22898, 23348, 27996, 28311, 28626, 28998, 29690, 30596, 31220,
1467  32359, 33683, 33815, 34203, 34870, 34894, 35039, 35725, 35797, 36114,
1468  36318, 36869, 38497, 38632, 39035, 39223, 39477, 39635, 42249, 42591,
1469  43175, 43410, 43648, 44049, 44630, 46811, 47219};
1470  int dust_starts2[kNumLocs2] = {156};
1471  int dust_ends2[kNumLocs2] = {172};
1472 
// Pair each query's start table with its end table; the vector order must
// match the order in which the queries are pushed into query_v below.
1473  typedef pair<int*, int*> TStartEndPair;
1474  TStartEndPair pair0(dust_starts0, dust_ends0);
1475  TStartEndPair pair1(dust_starts1, dust_ends1);
1476  TStartEndPair pair2(dust_starts2, dust_ends2);
1477 
1478  vector< TStartEndPair > start_end_v;
1479  start_end_v.push_back(pair0);
1480  start_end_v.push_back(pair1);
1481  start_end_v.push_back(pair2);
1482 
1483  CSeq_id qid1("gi|1945388");
1484  unique_ptr<SSeqLoc> qsl1(
1485  CTestObjMgr::Instance().CreateSSeqLoc(qid1, eNa_strand_both));
1486  CSeq_id qid2("gi|2655203");
1487  unique_ptr<SSeqLoc> qsl2(
1488  CTestObjMgr::Instance().CreateSSeqLoc(qid2, eNa_strand_both));
1489  CSeq_id qid3("gi|557");
1490  unique_ptr<SSeqLoc> qsl3(
1491  CTestObjMgr::Instance().CreateSSeqLoc(qid3, eNa_strand_both));
1492 
1493  TSeqLocVector query_v;
1494 
1495  query_v.push_back(*qsl1);
1496  query_v.push_back(*qsl2);
1497  query_v.push_back(*qsl3);
1498 
1499 
1500  CBlastNucleotideOptionsHandle nucl_handle;
1501  nucl_handle.SetDustFiltering(true);
1502  Blast_FindDustFilterLoc(query_v, &nucl_handle);
// NOTE(review): listing line 1503 is absent from this excerpt; its content is
// unknown.
1504 
// NOTE(review): the loops below never compare loc_index with the expected
// number of intervals (kNumLocs0/1/2), so a mask with fewer intervals than
// expected would pass, and one with more would index past the tables.
1505  int query_number=0;
1506  ITERATE(vector< TStartEndPair >, vec_iter, start_end_v)
1507  {
1508  TStartEndPair local_pair = *vec_iter;
1509  int* start = local_pair.first;
1510  int* stop = local_pair.second;
1511  int loc_index=0;
1512  ITERATE(list< CRef<CSeq_interval> >, itr,
1513  query_v[query_number].mask->GetPacked_int().Get()) {
1514  BOOST_REQUIRE_EQUAL(start[loc_index], (int)(*itr)->GetFrom());
1515  BOOST_REQUIRE_EQUAL(stop[loc_index], (int)(*itr)->GetTo());
1516  ++loc_index;
1517  }
1518  ++query_number;
1519  }
1520 }
1521 
1522 BOOST_AUTO_TEST_CASE(MaskRestrictToInterval)
1523 {
1524  const int kNumLocs = 4;
1525  const int kMaskStarts[kNumLocs] = { 10, 20, 30, 40 };
1526  const int kMaskEnds[kNumLocs] = { 15, 25, 35, 45 };
1527  const int kRange[2] = { 12, 22 };
1528  BlastSeqLoc* mask_loc = NULL, *loc_var;
1529  int index;
1530 
1531  for (index = 0; index < kNumLocs; ++index) {
1532  BlastSeqLocNew(&mask_loc, kMaskStarts[index], kMaskEnds[index]);
1533  }
1534 
1535  // Test that restricting to a full sequence does not change anything;
1536  // this also checks that negative ending offset indicates full
1537  // sequence.
1538  BlastSeqLoc_RestrictToInterval(&mask_loc, 0, -2);
1539  for (index = 0, loc_var = mask_loc; loc_var;
1540  ++index, loc_var = loc_var->next) {
1541  BOOST_REQUIRE_EQUAL(kMaskStarts[index], (int)loc_var->ssr->left);
1542  BOOST_REQUIRE_EQUAL(kMaskEnds[index], (int)loc_var->ssr->right);
1543  }
1544  BOOST_REQUIRE_EQUAL(kNumLocs, index);
1545 
1546  BlastSeqLoc_RestrictToInterval(&mask_loc, kRange[0], kRange[1]);
1547  for (index = 0, loc_var = mask_loc; loc_var;
1548  ++index, loc_var = loc_var->next);
1549  BOOST_REQUIRE_EQUAL(2, index);
1550  BOOST_REQUIRE_EQUAL(kMaskEnds[0]-kRange[0], (int)mask_loc->ssr->right);
1551  BOOST_REQUIRE_EQUAL(kMaskStarts[1]-kRange[0],
1552  (int)mask_loc->next->ssr->left);
1553  BOOST_REQUIRE_EQUAL(kRange[1]-kRange[0],
1554  (int)mask_loc->next->ssr->right);
1555 
1556  BlastSeqLoc_RestrictToInterval(&mask_loc, kRange[0], kRange[1]);
1557 
1558  BOOST_REQUIRE(mask_loc == NULL);
1559 }
1560 
// Body of a helper whose signature line (listing line 1561) is absent from
// this excerpt. It builds three whole-sequence query locations from fixed GIs,
// fills `query_info` through SetupQueryInfo(), and checks each query's length
// against kQueryLengths.
// NOTE(review): `query_info` is not declared in the visible body; it is
// presumably a parameter of the missing signature -- confirm.
1562 {
1563  const int kNumQueries = 3;
1564  const TGi kQueryGis[kNumQueries] = { GI_CONST(215041), GI_CONST(441158), GI_CONST(214981) };
1565  const int kQueryLengths[kNumQueries] = { 1639, 1151, 1164 };
1566 
1567  TSeqLocVector query_v;
1568 
// One whole-sequence Seq-loc per GI, each paired with its own scope.
// NOTE(review): `scope` is allocated with plain new; presumably SSeqLoc takes
// shared ownership of it -- confirm.
1569  for (int index = 0; index < kNumQueries; ++index) {
1570  CRef<CSeq_loc> loc(new CSeq_loc());
1571  loc->SetWhole().SetGi(kQueryGis[index]);
1572  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1573  scope->AddDefaults();
1574  query_v.push_back(SSeqLoc(loc, scope));
1575  }
1576 
// NOTE(review): listing line 1577 is absent; `opts` (used below) is presumably
// created there -- confirm.
1578 
1579  const CBlastOptions& kOpts = opts->GetOptions();
// NOTE(review): listing line 1580 is absent; `prog` (passed to SetupQueryInfo)
// is presumably obtained from kOpts there -- confirm.
1581  ENa_strand strand_opt = kOpts.GetStrandOption();
1582 
1583  SetupQueryInfo(query_v, prog, strand_opt, &query_info);
// Lengths are queried with the blastx program type.
1584  for (int i = 0; i < kNumQueries; i++) {
1585  int len = BlastQueryInfoGetQueryLength(query_info,
1586  eBlastTypeBlastx, i);
1587  BOOST_REQUIRE_EQUAL(kQueryLengths[i], len);
1588  }
1589 }
1590 
// Round-trips mask offsets between nucleotide and protein coordinates:
// one nucleotide mask per query is placed in the first of that query's
// NUM_FRAMES contexts, BlastMaskLocDNAToProtein() is applied and every context
// is checked against kProtStarts/kProtEnds, then BlastMaskLocProteinToDNA() is
// applied and every context is checked against kNuclStarts/kNuclEnds.
1591 BOOST_AUTO_TEST_CASE(ConvertTranslatedFilterOffsets)
1592 {
1593  const int kNumQueries = 3;
1594  CBlastQueryInfo query_info;
1595  const int kNumContexts = kNumQueries*NUM_FRAMES;
1596 
// NOTE(review): listing line 1597 is absent; `query_info` is presumably
// populated there (the next assertion already expects kNumContexts contexts).
1598  BOOST_REQUIRE_EQUAL(kNumContexts, query_info->last_context + 1);
1599 
1600  const SSeqRange kMasks[kNumQueries] =
1601  { { 660, 686 }, { 92, 119 }, { 1156, 1163 } };
1602 
1603  CBlastMaskLoc mask_loc(BlastMaskLocNew(kNumContexts));
1604  BOOST_REQUIRE_EQUAL(kNumContexts, mask_loc->total_size);
1605 
// Hand-build a single-node list for each query's first context.
// NOTE(review): the calloc/malloc results are not checked; the nodes are
// presumably released together with `mask_loc` -- confirm.
1606  for (int index = 0; index < kNumQueries; index++) {
1607  BlastSeqLoc* seqloc = mask_loc->seqloc_array[index*NUM_FRAMES] =
1608  (BlastSeqLoc*) calloc(1, sizeof(BlastSeqLoc));
1609  seqloc->ssr = (SSeqRange*) malloc(sizeof(SSeqRange));
1610  seqloc->ssr->left = kMasks[index].left;
1611  seqloc->ssr->right = kMasks[index].right;
1612  }
1613 
1614  BlastMaskLocDNAToProtein(mask_loc, query_info);
1615 
1616  BOOST_REQUIRE_EQUAL(kNumContexts, mask_loc->total_size);
1617 
// Expected protein-coordinate masks, one per context.
1618  const int kProtStarts[kNumContexts] =
1619  { 220, 219, 219, 317, 317, 316, 30, 30, 30, 343, 343, 343, 385, 385,
1620  384, 0, 0, 0 };
1621  const int kProtEnds[kNumContexts] =
1622  { 228, 228, 228, 326, 325, 325, 39, 39, 39, 352, 352, 352, 387, 386,
1623  386, 2, 2, 1 };
1624 
1625  for (int index = 0; index < kNumContexts; ++index) {
// The inner scope limits the lifetime of the "no mask" message stream; the
// mask pointer is checked before it is dereferenced below.
1626  {{
1627  CNcbiOstrstream os;
1628  os << "Context " << index << " has no mask!";
1629  BOOST_REQUIRE_MESSAGE(mask_loc->seqloc_array[index],
1630  (string)CNcbiOstrstreamToString(os));
1631  }}
1632  const SSeqRange* range = mask_loc->seqloc_array[index]->ssr;
1633  CNcbiOstrstream os;
1634  os << "Context " << index;
1635  BOOST_REQUIRE_MESSAGE(kProtStarts[index] == range->left,
1636  (string)CNcbiOstrstreamToString(os));
1637  BOOST_REQUIRE_MESSAGE(kProtEnds[index] == range->right,
1638  (string)CNcbiOstrstreamToString(os));
1639  }
1640 
1641  BlastMaskLocProteinToDNA(mask_loc, query_info);
1642 
1643  BOOST_REQUIRE_EQUAL(kNumContexts, mask_loc->total_size);
// Expected nucleotide-coordinate masks after converting back, one per context.
1644  const int kNuclStarts[kNumContexts] =
1645  { 660, 658, 659, 661, 663, 662, 90, 91, 92, 95, 94, 93, 1155, 1156,
1646  1154, 1158, 1157, 1159 };
1647  const int kNuclEnds[kNumContexts] =
1648  { 684, 685, 686, 687, 686, 688, 117, 118, 119, 121, 120, 119, 1161,
1649  1159, 1160, 1163, 1162, 1161 };
1650 
1651  for (int index = 0; index < kNumContexts; ++index) {
1652  {{
1653  CNcbiOstrstream os;
1654  os << "Context " << index << " has no mask!";
1655  BOOST_REQUIRE_MESSAGE(mask_loc->seqloc_array[index],
1656  (string)CNcbiOstrstreamToString(os));
1657  }}
1658  const SSeqRange* range = mask_loc->seqloc_array[index]->ssr;
1659  CNcbiOstrstream os;
1660  os << "Context " << index;
1661  BOOST_REQUIRE_MESSAGE(kNuclStarts[index] == range->left,
1662  (string)CNcbiOstrstreamToString(os));
1663  BOOST_REQUIRE_MESSAGE(kNuclEnds[index] == range->right,
1664  (string)CNcbiOstrstreamToString(os));
1665  }
1666 
1667 }
1668 
// Expects the string form of "no filtering options" to be "F".
// NOTE(review): listing line 1671 is absent; `retval` is presumably obtained
// there by converting a NULL options pointer to a string (per the test name)
// -- confirm.
1669 BOOST_AUTO_TEST_CASE(FilterOptionsToStringFromNULL)
1670 {
1672  BOOST_REQUIRE(strcmp(retval.get(), "F") == 0);
1673 }
1674 
1675 BOOST_AUTO_TEST_CASE(FilterOptionsToStringFromMaskAtHashOnly)
1676 {
1677  SBlastFilterOptions filtering_options = { '\0' };
1678  filtering_options.mask_at_hash = true;
1679  TAutoCharPtr retval = BlastFilteringOptionsToString(&filtering_options);
1680  BOOST_REQUIRE(strcmp(retval.get(), "m;") == 0);
1681 }
1682 
// Serializes filter options that carry a very long (4096-character) string
// and checks only the beginning of the result: dust options with a doubled
// window, followed by the start of the repeat-filter database name.
1683 BOOST_AUTO_TEST_CASE(FilterOptionsToStringLargeData)
1684 {
1685  SBlastFilterOptions filtering_options = { '\0' };
1686  SDustOptionsNew(&filtering_options.dustOptions);
// Double the default dust window; the expected prefix below shows 128.
1687  filtering_options.dustOptions->window *= 2;
// NOTE(review): listing line 1688 is absent; it presumably starts the call
// that stores the 4096-'X' string as the repeat-filter database (the
// repeatFilterOptions member is freed below) -- confirm.
1689  string(4096, 'X').c_str());
1690 
1691  TAutoCharPtr retval = BlastFilteringOptionsToString(&filtering_options);
// The structure lives on the stack, so its members are freed one by one.
1692  SDustOptionsFree(filtering_options.dustOptions);
1693  SRepeatFilterOptionsFree(filtering_options.repeatFilterOptions);
1694  //cerr << "FilterStr ='" << retval.get() << "'" << endl;
1695  BOOST_REQUIRE(NStr::StartsWith(string(retval.get()),
1696  "D 20 128 1;R -d XXXXXXXXXXXXXXXXXXXX"));
1697 }
1698 
1699 BOOST_AUTO_TEST_CASE(FilterOptionsFromNULLString)
1700 {
1701  const EBlastProgramType kProgram = eBlastTypeBlastn;
1702  SBlastFilterOptions* filtering_options;
1703  Int2 status = BlastFilteringOptionsFromString(kProgram, NULL,
1704  &filtering_options, NULL);
1705  BOOST_REQUIRE(status == 0);
1706  BOOST_REQUIRE(filtering_options != NULL);
1707  BOOST_REQUIRE_EQUAL(false, !!filtering_options->mask_at_hash);
1708  BOOST_REQUIRE(filtering_options->segOptions == NULL);
1709  BOOST_REQUIRE(filtering_options->dustOptions == NULL);
1710  filtering_options = SBlastFilterOptionsFree(filtering_options);
1711  BOOST_REQUIRE(filtering_options == NULL);
1712 }
1713 
1714 BOOST_AUTO_TEST_CASE(FilterOptionsFromStringDustMaskAtHash)
1715 {
1716  const EBlastProgramType kProgram = eBlastTypeBlastn;
1717  SBlastFilterOptions* filtering_options;
1718  Int2 status = BlastFilteringOptionsFromString(kProgram, (char*) "m D",
1719  &filtering_options, NULL);
1720  BOOST_REQUIRE(status == 0);
1721  BOOST_REQUIRE_EQUAL(true, !!filtering_options->mask_at_hash);
1722  BOOST_REQUIRE(filtering_options->dustOptions);
1723  BOOST_REQUIRE(filtering_options->segOptions == NULL);
1724 
1725  TAutoCharPtr retval = BlastFilteringOptionsToString(filtering_options);
1726  BOOST_REQUIRE_EQUAL(string("L;m;"), string(retval.get()));
1727 
1728  filtering_options = SBlastFilterOptionsFree(filtering_options);
1729  BOOST_REQUIRE(filtering_options == NULL);
1730 }
1731 
1732 BOOST_AUTO_TEST_CASE(FilterOptionsFromStringDust)
1733 {
1734  const EBlastProgramType kProgram = eBlastTypeBlastn;
1735  SBlastFilterOptions* filtering_options;
1736  Int2 status = BlastFilteringOptionsFromString(kProgram, (char*) "D",
1737  &filtering_options, NULL);
1738  BOOST_REQUIRE(status == 0);
1739  BOOST_REQUIRE_EQUAL(false, !!filtering_options->mask_at_hash);
1740  BOOST_REQUIRE(filtering_options->dustOptions);
1741  BOOST_REQUIRE(filtering_options->segOptions == NULL);
1742 
1743  TAutoCharPtr retval = BlastFilteringOptionsToString(filtering_options);
1744  BOOST_REQUIRE(strcmp(retval.get(), "L;") == 0);
1745 
1746  filtering_options = SBlastFilterOptionsFree(filtering_options);
1747  BOOST_REQUIRE(filtering_options == NULL);
1748 }
1749 
1750 BOOST_AUTO_TEST_CASE(FilterOptionsFromStringSEGWithParams)
1751 {
1752  const EBlastProgramType kProgram = eBlastTypeBlastp;
1753  SBlastFilterOptions* filtering_options;
1754  Int2 status = BlastFilteringOptionsFromString(kProgram, (char*) "S 10 1.0 1.5", &filtering_options, NULL);
1755  BOOST_REQUIRE(status == 0);
1756  BOOST_REQUIRE_EQUAL(false, !!filtering_options->mask_at_hash);
1757  BOOST_REQUIRE(filtering_options->dustOptions == NULL);
1758  BOOST_REQUIRE(filtering_options->segOptions);
1759  BOOST_REQUIRE_EQUAL(10, filtering_options->segOptions->window);
1760  BOOST_REQUIRE_CLOSE(1.0, filtering_options->segOptions->locut, 0.01);
1761  BOOST_REQUIRE_CLOSE(1.5, filtering_options->segOptions->hicut, 0.01);
1762 
1763  TAutoCharPtr retval = BlastFilteringOptionsToString(filtering_options);
1764  BOOST_REQUIRE(strcmp(retval.get(), "S 10 1.0 1.5;") == 0);
1765 
1766  filtering_options = SBlastFilterOptionsFree(filtering_options);
1767  BOOST_REQUIRE(filtering_options == NULL);
1768 }
1769 
1770 BOOST_AUTO_TEST_CASE(FilterOptionsFromBadStringSEGWithParams)
1771 {
1772  const EBlastProgramType kProgram = eBlastTypeBlastp;
1773  SBlastFilterOptions* filtering_options;
1774  // Only three numbers are allowed.
1775  Int2 status = BlastFilteringOptionsFromString(kProgram, (char*) "S 10 1.0 1.5 1.0", &filtering_options, NULL);
1776  BOOST_REQUIRE_EQUAL(1, (int) status);
1777  BOOST_REQUIRE(filtering_options == NULL);
1778 }
1779 
// Parses the filter string "L" for blastn and checks that it produces dust
// options only (no SEG options), leaves mask_at_hash off, and converts back
// to the string "L;".
1780 BOOST_AUTO_TEST_CASE(FilterOptionsFromStringBlastnL)
1781 {
1782  const EBlastProgramType kProgram = eBlastTypeBlastn;
// Start from NULL so the pointer is never read while indeterminate.
1783  SBlastFilterOptions* filtering_options = NULL;
// The parser takes a const char*, so the literal is passed without a cast.
1784  Int2 status = BlastFilteringOptionsFromString(kProgram, "L", &filtering_options, NULL);
1785  BOOST_REQUIRE(status == 0);
1786  BOOST_REQUIRE_EQUAL(false, !!filtering_options->mask_at_hash);
1787  BOOST_REQUIRE(filtering_options->dustOptions);
1788  BOOST_REQUIRE(filtering_options->segOptions == NULL);
1789 
1790  TAutoCharPtr retval = BlastFilteringOptionsToString(filtering_options);
1791  BOOST_REQUIRE(strcmp(retval.get(), "L;") == 0);
1792 
// The free function returns the value stored back into the pointer; the test
// requires that value to be NULL.
1793  filtering_options = SBlastFilterOptionsFree(filtering_options);
1794  BOOST_REQUIRE(filtering_options == NULL);
1795 }
// Parses the filter string "L" for blastp and checks that it produces SEG
// options only (no dust options), leaves mask_at_hash off, and converts back
// to the string "L;".
1796 BOOST_AUTO_TEST_CASE(FilterOptionsFromStringBlastpL)
1797 {
1798  const EBlastProgramType kProgram = eBlastTypeBlastp;
// Start from NULL so the pointer is never read while indeterminate.
1799  SBlastFilterOptions* filtering_options = NULL;
// The parser takes a const char*, so the literal is passed without a cast.
1800  Int2 status = BlastFilteringOptionsFromString(kProgram, "L", &filtering_options, NULL);
1801  BOOST_REQUIRE(status == 0);
1802  BOOST_REQUIRE_EQUAL(false, !!filtering_options->mask_at_hash);
1803  BOOST_REQUIRE(filtering_options->dustOptions == NULL);
1804  BOOST_REQUIRE(filtering_options->segOptions);
1805 
1806  TAutoCharPtr retval = BlastFilteringOptionsToString(filtering_options);
1807  BOOST_REQUIRE(strcmp(retval.get(), "L;") == 0);
1808 
// The free function returns the value stored back into the pointer; the test
// requires that value to be NULL.
1809  filtering_options = SBlastFilterOptionsFree(filtering_options);
1810  BOOST_REQUIRE(filtering_options == NULL);
1811 }
// Parses the window-masker filter string "W -t 9606" for blastn and checks
// that only window-masker options are created (no dust, SEG or repeat
// options), that mask_at_hash is off, and that the options convert back to
// the string "W -t 9606;".
1812 BOOST_AUTO_TEST_CASE(FilterOptionsFromStringBlastnW)
1813 {
1814  const EBlastProgramType kProgram = eBlastTypeBlastn;
1815  SBlastFilterOptions* filtering_options = NULL;
// The parser takes a const char*, so the literal is passed without a cast.
1816  Int2 status = BlastFilteringOptionsFromString(kProgram, "W -t 9606", &filtering_options, NULL);
1817  BOOST_REQUIRE(status == 0);
1818  BOOST_REQUIRE(! filtering_options->mask_at_hash);
1819  BOOST_REQUIRE(! filtering_options->dustOptions);
1820  BOOST_REQUIRE(! filtering_options->segOptions);
1821  BOOST_REQUIRE(! filtering_options->repeatFilterOptions);
1822  BOOST_REQUIRE(filtering_options->windowMaskerOptions);
1823 
1824  TAutoCharPtr retval = BlastFilteringOptionsToString(filtering_options);
1825  BOOST_REQUIRE(strcmp(retval.get(), "W -t 9606;") == 0);
1826 
// The free function returns the value stored back into the pointer; the test
// requires that value to be NULL.
1827  filtering_options = SBlastFilterOptionsFree(filtering_options);
1828  BOOST_REQUIRE(filtering_options == NULL);
1829 }
1830 
// Body of a test that exercises SBlastFilterOptionsMerge in three ways:
// (opt1, opt2), (opt1, NULL) and (NULL, opt2).
// NOTE(review): the listing numbers jump over 1831, 1842, 1845, 1854, 1862,
// 1870 and 1873-1874, so the test-case header, the allocation of opt2, the
// declaration of `result` and the free calls are not visible in this extract.
1832 {
1833  const int kNewLevel = 21;
1834  const int kNewWindow = 68;
1835 
// opt1: dust options with non-default level and window, so the merged result
// can be recognised by these two values.
1836  SBlastFilterOptions* opt1 = NULL;
1837  SBlastFilterOptionsNew(&opt1, eDust);
1838  opt1->dustOptions->level = kNewLevel;
1839  opt1->dustOptions->window = kNewWindow;
1840 
// opt2: presumably allocated with repeat filtering on the missing listing
// line 1842 (the assertions below expect repeatFilterOptions) -- TODO confirm.
1841  SBlastFilterOptions* opt2 = NULL;
1843  opt2->mask_at_hash = true;
1844 
1846 
// Case 1: both inputs present -- the result must carry opt1's dust settings
// together with opt2's mask_at_hash flag and repeat options.
1847  Int2 status = SBlastFilterOptionsMerge(&result, opt1, opt2);
1848  BOOST_REQUIRE_EQUAL(0, (int) status);
1849  BOOST_REQUIRE(result);
1850  BOOST_REQUIRE_EQUAL(true, !!result->mask_at_hash);
1851  BOOST_REQUIRE_EQUAL(kNewLevel, result->dustOptions->level);
1852  BOOST_REQUIRE_EQUAL(kNewWindow, result->dustOptions->window);
1853  BOOST_REQUIRE(result->repeatFilterOptions);
1855  BOOST_REQUIRE(result == NULL);
1856 
// Case 2: second input NULL -- the result must still carry opt1's dust settings.
1857  status = SBlastFilterOptionsMerge(&result, opt1, NULL);
1858  BOOST_REQUIRE_EQUAL(0, (int) status);
1859  BOOST_REQUIRE(result);
1860  BOOST_REQUIRE_EQUAL(kNewLevel, result->dustOptions->level);
1861  BOOST_REQUIRE_EQUAL(kNewWindow, result->dustOptions->window);
1863  BOOST_REQUIRE(result == NULL);
1864 
// Case 3: first input NULL -- the result must still carry opt2's flag and
// repeat options.
1865  status = SBlastFilterOptionsMerge(&result, NULL, opt2);
1866  BOOST_REQUIRE_EQUAL(0, (int) status);
1867  BOOST_REQUIRE(result);
1868  BOOST_REQUIRE_EQUAL(true, !!result->mask_at_hash);
1869  BOOST_REQUIRE(result->repeatFilterOptions);
1871  BOOST_REQUIRE(result == NULL);
1872 
1875 }
1876 
// Sets the filter string "F" on a nucleotide options handle and checks that
// every filter queried here is off: mask-at-hash and dust filtering are
// false, the window-masker tax id is 0 and no window-masker database is set.
1877 BOOST_AUTO_TEST_CASE(FilterStringFalse)
1878 {
1879  CBlastNucleotideOptionsHandle nucl_handle;
1880  nucl_handle.SetFilterString("F");/* NCBI_FAKE_WARNING */
1881  BOOST_REQUIRE_EQUAL(false, nucl_handle.GetMaskAtHash());
1882  BOOST_REQUIRE_EQUAL(false, nucl_handle.GetDustFiltering());
1883  BOOST_REQUIRE_EQUAL(0, nucl_handle.GetWindowMaskerTaxId());
1884  BOOST_REQUIRE(nucl_handle.GetWindowMaskerDatabase() == NULL);
1885 }
1886 
// Applies a repeat-filter string and then switches on mask-at-hash and dust
// filtering through the dedicated setters; checks that both getters report
// true afterwards.
1887 BOOST_AUTO_TEST_CASE(MergeOptionHandle) {
1888 
1889  CBlastNucleotideOptionsHandle nucl_handle;
1890  nucl_handle.SetFilterString("R -d repeat/repeat_9606");/* NCBI_FAKE_WARNING */
1891  nucl_handle.SetMaskAtHash(true);
1892  nucl_handle.SetDustFiltering(true);
1893  BOOST_REQUIRE_EQUAL(true, nucl_handle.GetMaskAtHash());
1894  BOOST_REQUIRE_EQUAL(true, nucl_handle.GetDustFiltering());
1895 }
1896 
// Applies a repeat-filter string with the second SetFilterString argument
// set to false and checks that dust filtering and repeat filtering are both
// reported as enabled.
// NOTE(review): dust filtering is presumably on by default for this handle
// and kept because of the `false` argument -- confirm against SetFilterString.
1897 BOOST_AUTO_TEST_CASE(OptionsHandleNotClear) {
1898  CBlastNucleotideOptionsHandle nucl_handle;
1899  nucl_handle.SetFilterString("R -d repeat/repeat_9606", false);/* NCBI_FAKE_WARNING */
1900  BOOST_REQUIRE_EQUAL(true, nucl_handle.GetDustFiltering());
1901  BOOST_REQUIRE_EQUAL(true, nucl_handle.GetRepeatFiltering());
1902 }
1903 
// Applies a repeat-filter string using the one-argument SetFilterString call
// and checks that only repeat filtering is reported as enabled: dust
// filtering is false, the window-masker tax id is 0 and no window-masker
// database is set.
1904 BOOST_AUTO_TEST_CASE(OptionsHandleClear) {
1905  CBlastNucleotideOptionsHandle nucl_handle;
1906  nucl_handle.SetFilterString("R -d repeat/repeat_9606");/* NCBI_FAKE_WARNING */
1907  BOOST_REQUIRE_EQUAL(false, nucl_handle.GetDustFiltering());
1908  BOOST_REQUIRE_EQUAL(true, nucl_handle.GetRepeatFiltering());
1909  BOOST_REQUIRE_EQUAL(0, nucl_handle.GetWindowMaskerTaxId());
1910  BOOST_REQUIRE(nucl_handle.GetWindowMaskerDatabase() == NULL);
1911 }
1912 
// Checks that Blast_GetSeqLocInfoVector throws CBlastException when it is
// given a default-constructed CPacked_seqint as the query list.
// NOTE(review): listing line 1914, which presumably declares `mask`, is
// missing from this extract.
1913 BOOST_AUTO_TEST_CASE(GetSeqLocInfoVector_EmptyQueryIdVector) {
1915  CPacked_seqint empty_seqids;
1916  TSeqLocInfoVector mask_v;
1917  BOOST_REQUIRE_THROW(
1918  Blast_GetSeqLocInfoVector(eBlastTypeBlastp, empty_seqids, mask, mask_v),
1919  CBlastException);
1920 }
1921 
1922 // Check that the conversion function will now create a vector of empty
1923 // mask lists.
// The test passes kNumSeqs query intervals and requires kNumSeqs entries in
// the output vector, each of size 0.
1924 BOOST_AUTO_TEST_CASE(GetSeqLocInfoVector_EmptyMasks) {
1925  const EBlastProgramType kProgram = eBlastTypeBlastn;
1926  const size_t kNumSeqs = 10;
// NOTE(review): listing line 1927 (the start of the `mask` declaration) is
// missing from this extract; the line below is its continuation, sizing the
// mask structure as kNumSeqs times the number of contexts for the program.
1928  (BlastMaskLocNew(kNumSeqs*GetNumberOfContexts(kProgram)));
1929 
1930  // since the masks won't have any data in them, we don't care about the
1931  // Seq-id's passed in
1932  const CPacked_seqint::TRanges ranges(kNumSeqs, TSeqRange(0, 100000));
1933  CSeq_id seqid(CSeq_id::e_Gi, 555);
1934  CPacked_seqint seqintervals(seqid, ranges);
1935 
1936  TSeqLocInfoVector mask_v;
1937 
1938  Blast_GetSeqLocInfoVector(kProgram, seqintervals, mask, mask_v);
1939 
1940  BOOST_REQUIRE_EQUAL((size_t)kNumSeqs, (size_t)mask_v.size());
1941  ITERATE(TSeqLocInfoVector, query_masks_list, mask_v) {
1942  BOOST_REQUIRE_EQUAL((size_t)0U, query_masks_list->size());
1943  }
1944 }
1945 
// Builds a BlastSeqLoc list from seven intervals, runs BlastSeqLocCombine on
// it with link value 0, and checks that the result is the expected four
// intervals: the first four inputs collapse into 281312..281878 and the
// remaining three are unchanged.
1946 BOOST_AUTO_TEST_CASE(BlastSeqLocCombineTest) {
1947  const int kNumberLocIn = 7;
1948  const int kLocStartIn[kNumberLocIn] =
1949  { 281312, 281356, 281416, 281454, 281895, 282435, 282999};
1950  const int kLocEndIn[kNumberLocIn] =
1951  { 281736, 281406, 281446, 281878, 282423, 282968, 283191};
1952 
1953  const int kNumberLocOut = 4;
1954  const int kLocStartOut[kNumberLocOut] =
1955  { 281312, 281895, 282435, 282999};
1956  const int kLocEndOut[kNumberLocOut] =
1957  { 281878, 282423, 282968, 283191};
1958 
1959  BlastSeqLoc *head = NULL;
1960  for (int index=0; index<kNumberLocIn; index++)
1961  {
1962  BlastSeqLocNew(&head, kLocStartIn[index],
1963  kLocEndIn[index]);
1964  }
1965 
// From here on the combined list is referred to as `result`; `head` is reset.
1966  BlastSeqLocCombine(&head, 0);
1967  BlastSeqLoc* result = head;
1968  head = NULL;
1969 
// First pass: count the nodes. The count is required to equal kNumberLocOut
// before the second pass indexes the expected arrays by position.
1970  int count = 0;
1971  BlastSeqLoc* var = result;
1972  while (var)
1973  {
1974  var = var->next;
1975  count++;
1976  }
1977  BOOST_REQUIRE_EQUAL(count, kNumberLocOut);
1978 
// Second pass: compare each node's endpoints with the expected values.
1979  var = result;
1980  count = 0;
1981  while (var)
1982  {
1983  SSeqRange* ssr = var->ssr;
1984  BOOST_REQUIRE_EQUAL(ssr->left, kLocStartOut[count]);
1985  BOOST_REQUIRE_EQUAL(ssr->right, kLocEndOut[count]);
1986  var = var->next;
1987  count++;
1988  }
1989 
// NOTE(review): listing line 1990 (presumably the call that frees `result`)
// is missing from this extract.
1991  BOOST_REQUIRE(result == NULL);
1992 }
1993 
// Runs x_TestGetSeqLocInfoVector for every program in `programs`, each with
// three randomly chosen sequence counts obtained from GetRand(1,10).
// NOTE(review): listing line 1996 (the initialiser of `programs`) is missing
// from this extract.
1994 BOOST_AUTO_TEST_CASE(GetSeqLocInfoVector_AllPrograms) {
1995  vector<EBlastProgramType> programs =
1997 
1998  // Generate the different number of sequences to pass to test function
// The generator is seeded from time(0), so the counts differ between runs.
1999  CRandom random_gen((CRandom::TValue)time(0));
2000  vector<int> num_seqs_array;
2001  num_seqs_array.reserve(3);
2002  num_seqs_array.push_back(random_gen.GetRand(1,10));
2003  num_seqs_array.push_back(random_gen.GetRand(1,10));
2004  num_seqs_array.push_back(random_gen.GetRand(1,10));
2005 
2006  ITERATE(vector<EBlastProgramType>, program, programs) {
2007  ITERATE(vector<int>, num_seqs, num_seqs_array) {
2008  x_TestGetSeqLocInfoVector(*program, *num_seqs);
2009  }
2010  }
2011 
2012 }
2013 
// Compiled only when SEQLOC_MIX_QUERY_OK is non-zero.
2014 #if SEQLOC_MIX_QUERY_OK
2015  /// Test the dust filtering API on a mixed Seqloc input.
// Builds a Seq-loc mix of kNumInts intervals on gi|3417288, runs
// Blast_FindDustFilterLoc on it with dust filtering enabled, and compares
// the resulting packed-int mask with the expected start/stop tables.
2016  BOOST_AUTO_TEST_CASE(DustSeqlocMix) {
2017  const int kNumInts = 20;
2018  const int kStarts[kNumInts] =
2019  { 838, 1838, 6542, 7459, 9246, 10431, 14807, 16336, 19563,
2020  20606, 21232, 22615, 23822, 27941, 29597, 30136, 31287,
2021  31786, 33315, 35402 };
2022  const int kEnds[kNumInts] =
2023  { 961, 2010, 6740, 7573, 9408, 10609, 15043, 16511, 19783,
2024  20748, 21365, 22817, 24049, 28171, 29839, 30348, 31362,
2025  31911, 33485, 37952 };
// Two alternative expectation tables; the `#if 0` branch is disabled, so the
// eight locations in the `#else` branch are the ones actually compared.
2026 #if 0 // These are the locations produced directly by CSymDustMasker
2027  const int kNumMaskLocs = 7;
2028  const int kMaskStarts[kNumMaskLocs] =
2029  { 2607, 3000, 3739, 4238, 5211, 5602, 5716 };
2030  const int kMaskStops[kNumMaskLocs] =
2031  { 2769, 3006, 3809, 4244, 5218, 5608, 5722 };
2032 #else // These are locations that have been mapped to the full sequence scale
2033  const int kNumMaskLocs = 8;
2034  const int kMaskStarts[kNumMaskLocs] =
2035  { 29678, 30136, 31305, 35786, 36285, 37258, 37649, 37763 };
2036  const int kMaskStops[kNumMaskLocs] =
2037  { 29839, 30136, 31311, 35856, 36291, 37265, 37655, 37769 };
2038 #endif
2039 
2040  int index;
2041 
// Assemble the query: one Seq-interval per (kStarts, kEnds) pair, all on the
// same Seq-id, appended to a Seq-loc mix.
2042  CSeq_id qid("gi|3417288");
2043  CRef<CSeq_loc> qloc(new CSeq_loc());
2044  for (index = 0; index < kNumInts; ++index) {
2045  CRef<CSeq_loc> next_loc(new CSeq_loc());
2046  next_loc->SetInt().SetFrom(kStarts[index]);
2047  next_loc->SetInt().SetTo(kEnds[index]);
2048  next_loc->SetInt().SetId(qid);
2049  qloc->SetMix().Set().push_back(next_loc);
2050  }
2051 
2052  CRef<CScope> scope(new CScope(CTestObjMgr::Instance().GetObjMgr()));
2053  scope->AddDefaults();
2054 
// The SSeqLoc is copied into the vector; the heap object is only a temporary.
2055  unique_ptr<SSeqLoc> query(new SSeqLoc(qloc, scope));
2056  TSeqLocVector query_v;
2057  query_v.push_back(*query);
2058 
2059  CBlastNucleotideOptionsHandle nucl_handle;
2060  nucl_handle.SetDustFiltering(true);
2061  Blast_FindDustFilterLoc(query_v, &nucl_handle);
2062 
// NOTE(review): kMaskStarts/kMaskStops are indexed before loc_index is
// compared with kNumMaskLocs, so a result with more intervals than expected
// would read past the end of the tables -- consider bounding loc_index
// inside the loop.
2063  int loc_index = 0;
2064  ITERATE(list< CRef<CSeq_interval> >, itr,
2065  query_v[0].mask->GetPacked_int().Get()) {
2066  BOOST_REQUIRE_EQUAL(kMaskStarts[loc_index],
2067  (int) (*itr)->GetFrom());
2068  BOOST_REQUIRE_EQUAL(kMaskStops[loc_index],
2069  (int) (*itr)->GetTo());
2070  ++loc_index;
2071  }
2072  BOOST_REQUIRE_EQUAL(kNumMaskLocs, loc_index);
2073  }
2074 #endif
2075 
// Five overlapping ranges (one of them duplicated) are combined with link
// value 0; the expected result is the single range 0..100.
// NOTE(review): listing lines 2085 and 2099 are missing from this extract;
// they presumably build `mask` from `rv` and free it -- TODO confirm.
2076 BOOST_AUTO_TEST_CASE(TestBlastSeqLocCombine_MergeElems)
2077 {
2078  TRangeVector rv;
2079  rv.push_back(TRangeVector::value_type(10, 77));
2080  rv.push_back(TRangeVector::value_type(0, 100));
2081  rv.push_back(TRangeVector::value_type(20, 45));
2082  rv.push_back(TRangeVector::value_type(3, 50));
2083  rv.push_back(TRangeVector::value_type(10, 77));
2084 
2086  BlastSeqLocCombine(&mask, 0);
2087  TRangeVector merged_rv;
2088  merged_rv.push_back(TRangeVector::value_type(0, 100));
2089 
// Walk the combined list in step with the expected ranges; the list must end
// exactly when the expected ranges are exhausted.
2090  BlastSeqLoc* mask_itr = mask;
2091  ITERATE(TRangeVector, itr, merged_rv) {
2092  BOOST_REQUIRE(mask_itr != NULL);
2093  BOOST_REQUIRE_EQUAL((int)itr->GetFrom(), (int)mask_itr->ssr->left);
2094  BOOST_REQUIRE_EQUAL((int)itr->GetTo(), (int)mask_itr->ssr->right);
2095  mask_itr = mask_itr->next;
2096  }
2097  BOOST_REQUIRE(mask_itr == NULL);
2098 
2100  BOOST_REQUIRE(mask == NULL);
2101 }
2102 
// Six ranges, including an exact duplicate (380..684) and two ranges sharing
// a start (78..207 and 78..212), are combined with link value 0. The expected
// result is four ranges in ascending order, with the duplicate removed and
// the shared-start pair merged into 78..212.
// NOTE(review): listing lines 2113 and 2130 are missing from this extract;
// they presumably build `mask` from `rv` and free it -- TODO confirm.
2103 BOOST_AUTO_TEST_CASE(TestBlastSeqLocCombine_MergeIdenticals)
2104 {
2105  TRangeVector rv;
2106  rv.push_back(TRangeVector::value_type(380, 684));
2107  rv.push_back(TRangeVector::value_type(0, 74));
2108  rv.push_back(TRangeVector::value_type(78, 207));
2109  rv.push_back(TRangeVector::value_type(695, 776));
2110  rv.push_back(TRangeVector::value_type(380, 684));
2111  rv.push_back(TRangeVector::value_type(78, 212));
2112 
2114  BlastSeqLocCombine(&mask, 0);
2115  TRangeVector merged_rv;
2116  merged_rv.push_back(TRangeVector::value_type(0, 74));
2117  merged_rv.push_back(TRangeVector::value_type(78, 212));
2118  merged_rv.push_back(TRangeVector::value_type(380, 684));
2119  merged_rv.push_back(TRangeVector::value_type(695, 776));
2120 
// Walk the combined list in step with the expected ranges; the list must end
// exactly when the expected ranges are exhausted.
2121  BlastSeqLoc* mask_itr = mask;
2122  ITERATE(TRangeVector, itr, merged_rv) {
2123  BOOST_REQUIRE(mask_itr != NULL);
2124  BOOST_REQUIRE_EQUAL((int)itr->GetFrom(), (int)mask_itr->ssr->left);
2125  BOOST_REQUIRE_EQUAL((int)itr->GetTo(), (int)mask_itr->ssr->right);
2126  mask_itr = mask_itr->next;
2127  }
2128  BOOST_REQUIRE(mask_itr == NULL);
2129 
2131  BOOST_REQUIRE(mask == NULL);
2132 }
2133 
// Three non-overlapping ranges given out of order are combined with link
// value 0; the expected result is the same three ranges, unmerged, in
// ascending order.
// NOTE(review): listing lines 2141 and 2157 are missing from this extract;
// they presumably build `mask` from `rv` and free it -- TODO confirm.
2134 BOOST_AUTO_TEST_CASE(TestBlastSeqLocCombine_NoMerging)
2135 {
2136  TRangeVector rv;
2137  rv.push_back(TRangeVector::value_type(10, 77));
2138  rv.push_back(TRangeVector::value_type(250, 3400));
2139  rv.push_back(TRangeVector::value_type(3, 8));
2140 
2142  BlastSeqLocCombine(&mask, 0);
2143  TRangeVector merged_rv;
2144  merged_rv.push_back(TRangeVector::value_type(3, 8));
2145  merged_rv.push_back(TRangeVector::value_type(10, 77));
2146  merged_rv.push_back(TRangeVector::value_type(250, 3400));
2147 
// Walk the combined list in step with the expected ranges; the list must end
// exactly when the expected ranges are exhausted.
2148  BlastSeqLoc* mask_itr = mask;
2149  ITERATE(TRangeVector, itr, merged_rv) {
2150  BOOST_REQUIRE(mask_itr != NULL);
2151  BOOST_REQUIRE_EQUAL((int)itr->GetFrom(), (int)mask_itr->ssr->left);
2152  BOOST_REQUIRE_EQUAL((int)itr->GetTo(), (int)mask_itr->ssr->right);
2153  mask_itr = mask_itr->next;
2154  }
2155  BOOST_REQUIRE(mask_itr == NULL);
2156 
2158  BOOST_REQUIRE(mask == NULL);
2159 }
2160 
// Local declaration of a C-linkage function so the test below can call it.
// NOTE(review): presumably declared here because no included header exposes
// it -- confirm.
2161 extern "C" void BlastSeqLocListReverse(BlastSeqLoc** head);
2162 
// Checks list reversal: the expected order is obtained by reversing the
// input range vector with std::reverse, and the BlastSeqLoc list is then
// compared with it element by element.
// NOTE(review): listing lines 2170-2171 and 2183 are missing from this
// extract; they presumably build `mask` from `rv`, call
// BlastSeqLocListReverse on it, and free it -- TODO confirm.
2163 BOOST_AUTO_TEST_CASE(TestBlastSeqLocListReverse)
2164 {
2165  TRangeVector rv;
2166  rv.push_back(TRangeVector::value_type(10, 77));
2167  rv.push_back(TRangeVector::value_type(0, 100));
2168  rv.push_back(TRangeVector::value_type(3, 50));
2169 
2172  reverse(rv.begin(), rv.end());
2173 
// Walk the list in step with the reversed vector; the list must end exactly
// when the vector is exhausted.
2174  BlastSeqLoc* mask_itr = mask;
2175  ITERATE(TRangeVector, itr, rv) {
2176  BOOST_REQUIRE(mask_itr != NULL);
2177  BOOST_REQUIRE_EQUAL((int)itr->GetFrom(), (int)mask_itr->ssr->left);
2178  BOOST_REQUIRE_EQUAL((int)itr->GetTo(), (int)mask_itr->ssr->right);
2179  mask_itr = mask_itr->next;
2180  }
2181  BOOST_REQUIRE(mask_itr == NULL);
2182 
2184  BOOST_REQUIRE(mask == NULL);
2185 }
2186 
// Checks that the set of tax ids is non-empty and contains 9606.
// NOTE(review): listing line 2190, which presumably fills `taxids` via
// GetTaxIdWithWindowMaskerSupport, is missing from this extract.
2187 BOOST_AUTO_TEST_CASE(TestGetTaxIdWithWindowMaskerSupport)
2188 {
2189  set<int> taxids;
2191  BOOST_REQUIRE(taxids.empty() == false);
2192  BOOST_REQUIRE(taxids.find(9606) != taxids.end());
2193 }
2194 
#define static
bool IsReverse(ENa_strand s)
Definition: Na_strand.hpp:75
Declares the CBl2Seq (BLAST 2 Sequences) class.
Contains C++ wrapper classes to structures in algo/blast/core as well as some auxiliary functions to ...
#define NUM_FRAMES
Number of frames to which we translate in translating searches.
Definition: blast_def.h:88
void BlastSetUp_MaskQuery(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, const BlastMaskLoc *filter_maskloc, EBlastProgramType program_number)
Masks the sequence given a BlastMaskLoc.
BlastMaskLoc * BlastMaskLocFree(BlastMaskLoc *mask_loc)
Deallocate memory for a BlastMaskLoc structure as well as the BlastSeqLoc's pointed to.
Definition: blast_filter.c:789
Int2 BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, EBlastProgramType program_number, const SBlastFilterOptions *filter_options, BlastMaskLoc **filter_out, Blast_Message **blast_message)
Does preparation for filtering and then calls BlastSetUp_Filter.
Int2 BlastFilteringOptionsFromString(EBlastProgramType program_number, const char *instructions, SBlastFilterOptions **filtering_options, Blast_Message **blast_message)
Produces SBlastFilterOptions from a string that has been traditionally supported in blast.
Definition: blast_filter.c:436
const Uint1 kNuclMask
BLASTNA element used to mask bases in BLAST.
Definition: blast_filter.c:38
Int2 BlastSetUp_Filter(EBlastProgramType program_number, Uint1 *sequence, Int4 length, Int4 offset, const SBlastFilterOptions *filter_options, BlastSeqLoc **seqloc_retval, Blast_Message **blast_message)
Runs seg filtering functions, according to the filtering options, returns BlastSeqLoc*.
Int2 BlastMaskLocDNAToProtein(BlastMaskLoc *mask_loc, const BlastQueryInfo *query_info)
Given a BlastMaskLoc with an array of lists of DNA mask locations, substitutes that array by a new ar...
Definition: blast_filter.c:806
void BlastSeqLocCombine(BlastSeqLoc **mask_loc, Int4 link_value)
Go through all mask locations in one sequence and combine any that overlap, deallocating the unneeded...
Definition: blast_filter.c:972
BlastMaskLoc * BlastMaskLocNew(Int4 total)
Allocate memory for a BlastMaskLoc.
Definition: blast_filter.c:760
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
Definition: blast_filter.c:737
BlastSeqLoc * BlastSeqLocNew(BlastSeqLoc **head, Int4 from, Int4 to)
Create and initialize a new sequence interval.
Definition: blast_filter.c:608
Int2 BlastMaskLocProteinToDNA(BlastMaskLoc *mask_loc, const BlastQueryInfo *query_info)
Given a BlastMaskLoc with an array of lists of mask locations per protein frame, recalculates all mas...
Definition: blast_filter.c:892
char * BlastFilteringOptionsToString(const SBlastFilterOptions *filtering_options)
Convert the filtering options structure to a string.
Definition: blast_filter.c:321
Declares the CBlastNucleotideOptionsHandle class.
Definitions which are dependent on the NCBI C++ Object Manager.
SDustOptions * SDustOptionsFree(SDustOptions *dust_options)
Frees SDustOptions.
Definition: blast_options.c:50
Int2 SRepeatFilterOptionsResetDB(SRepeatFilterOptions **repeat_options, const char *dbname)
Resets name of db for repeat filtering.
SRepeatFilterOptions * SRepeatFilterOptionsFree(SRepeatFilterOptions *repeat_options)
Frees SRepeatFilterOptions.
SBlastFilterOptions * SBlastFilterOptionsFree(SBlastFilterOptions *filter_options)
Frees SBlastFilterOptions and all subservient structures.
Int2 SBlastFilterOptionsMerge(SBlastFilterOptions **combined, const SBlastFilterOptions *opt1, const SBlastFilterOptions *opt2)
Merges two sets of options together, taking the non-default one as preferred.
Int2 SDustOptionsNew(SDustOptions **dust_options)
Allocates memory for SDustOptions, fills in defaults.
Definition: blast_options.c:57
@ eRepeats
Repeat filtering for nucleotides.
@ eDust
low-complexity for nucleotides.
@ eSeg
low-complexity for proteins.
Int2 SBlastFilterOptionsNew(SBlastFilterOptions **filter_options, EFilterOptions type)
Allocates memory for SBlastFilterOptions and.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypeBlastn
Definition: blast_program.h:74
@ eBlastTypeBlastx
Definition: blast_program.h:75
@ eBlastTypeBlastp
Definition: blast_program.h:73
Int4 BlastQueryInfoGetQueryLength(const BlastQueryInfo *qinfo, EBlastProgramType program, Int4 query_index)
Obtains the sequence length for a given query in the query, without taking into consideration any app...
Utilities initialize/setup BLAST.
void BlastSeqLoc_RestrictToInterval(BlastSeqLoc **mask, Int4 from, Int4 to)
Adjusts the mask locations coordinates to a sequence interval.
Definition: blast_setup.c:1030
@ eBlastn
Nucl-Nucl (traditional blastn)
Definition: blast_types.hpp:58
@ eBlastp
Protein-Protein.
Definition: blast_types.hpp:59
@ eBlastx
Translated nucl-Protein.
Definition: blast_types.hpp:60
Int1 BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number)
This function translates the context number of a context into the frame of the sequence.
Definition: blast_util.c:839
BOOST_AUTO_TEST_CASE(TSeqLocVector2Packed_seqint_TestIntervals)
vector< TSeqRange > TRangeVector
void setupQueryStructures(TSeqLocVector &query_vector, const CBlastOptions &kOpts, BLAST_SequenceBlk **query_blk, BlastQueryInfo **qinfo)
void setupQueryInfoForOffsetTranslation(CBlastQueryInfo &query_info)
static BlastSeqLoc * s_RangeVector2BlastSeqLoc(const TRangeVector &rv)
static void x_TestGetFilteredQueryRegions(ENa_strand strand)
static bool x_AreAllBasesMasked(const Uint1 *sequence, int start, int stop)
void BlastSeqLocListReverse(BlastSeqLoc **head)
Reverse elements in the list.
Definition: blast_filter.c:705
static void x_TestGetSeqLocInfoVector(EBlastProgramType program, size_t num_seqs)
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
ncbi::TMaskedQueryRegions mask
AutoPtr –.
Definition: ncbimisc.hpp:401
Wrapper class for BLAST_SequenceBlk .
Definition: blast_aux.hpp:309
Defines BLAST error codes (user errors included)
static void x_TestLowerCaseMaskWith(ENa_strand strand, bool ignore_strand_in_mask)
Wrapper class for BlastMaskLoc .
Definition: blast_aux.hpp:354
Handle to the nucleotide-nucleotide options to the BLAST algorithm.
Encapsulates ALL the BLAST algorithm's options.
Collection of BlastSeqLoc lists for filtering processing.
Wrapper class for BlastQueryInfo .
Definition: blast_aux.hpp:311
Query Vector.
Definition: sseqloc.hpp:276
void AddQuery(CRef< CBlastSearchQuery > q)
Add a query to the set.
Definition: sseqloc.hpp:293
TMaskedQueryRegions GetMaskedRegions(size_type i) const
Get the masked regions for a query by number.
Definition: sseqloc.hpp:331
CRef< objects::CSeq_loc > GetMasks(size_type i) const
Convenience method to get a CSeq_loc representing the masking locations.
Definition: sseqloc.hpp:341
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
vector< CRange< TSeqPos > > TRanges
void AddInterval(const CSeq_interval &ival)
for convenience
CRandom::
Definition: random_gen.hpp:66
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
CSeqDBException.
Definition: seqdbcommon.hpp:73
structure for seqloc info
Definition: seqlocinfo.hpp:48
static CRef< CScope > NewScope(bool with_defaults=true)
Return a new scope, possibly (by default) with default loaders, which will include the Genbank loader...
Definition: simple_om.cpp:202
CRef< blast::CBlastSearchQuery > CreateBlastSearchQuery(objects::CSeq_id &id, objects::ENa_strand s=objects::eNa_strand_unknown)
static CTestObjMgr & Instance()
Definition: test_objmgr.cpp:69
Collection of masked regions for a single query sequence.
Definition: seqlocinfo.hpp:113
TMaskedQueryRegions RestrictToSeqInt(const objects::CSeq_interval &location) const
Return a new instance of this object that is restricted to the location specified.
Definition: seqlocinfo.cpp:98
typedef for the messages for an entire BLAST search, which could be comprised of multiple query seque...
bool empty() const
Definition: set.hpp:133
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
#define head
Definition: ct_nlmzip_i.h:138
Calls sym dust lib in algo/dustmask and returns CSeq_locs for use by BLAST.
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static void query_test(int prepare, SQLRETURN expected, const char *expected_status)
Definition: array.c:14
CRef< objects::CPacked_seqint > TSeqLocVector2Packed_seqint(const TSeqLocVector &sequences)
Converts a TSeqLocVector into a CPacked_seqint.
int GetDustFilteringLinker() const
Get linker parameter for dust.
void Blast_GetSeqLocInfoVector(EBlastProgramType program, const objects::CPacked_seqint &queries, const BlastMaskLoc *mask, TSeqLocInfoVector &mask_v)
Converts a BlastMaskLoc internal structure into an object returned by the C++ API.
Definition: blast_aux.cpp:904
BlastQueryInfo * Release()
Definition: blast_aux.hpp:311
bool GetDustFiltering() const
Is dust filtering enabled?
void SetWindowMaskerTaxId(int taxid)
Enable window masker and select a taxid (or 0 to disable).
void SetupQueries(TSeqLocVector &queries, BlastQueryInfo *qinfo, BLAST_SequenceBlk **seqblk, EBlastProgramType prog, objects::ENa_strand strand_opt, TSearchMessages &messages)
Populates BLAST_SequenceBlk with sequence data for use in CORE BLAST.
objects::ENa_strand GetStrandOption() const
bool Empty()
Returns true if this object contains any masking information.
static CBlastOptionsHandle * Create(EProgram program, EAPILocality locality=CBlastOptions::eLocal)
Creates an options handle object configured with default options for the requested program,...
int GetDustFilteringLevel() const
Get level parameter for dust.
const char * GetWindowMaskerDatabase() const
Get the window masker database name (or NULL if not set).
EBlastProgramType GetProgramType() const
Returns the CORE BLAST notion of program type.
void GetTaxIdWithWindowMaskerSupport(set< int > &supported_taxids)
This function returns a list of NCBI taxonomy IDs for which there exists windowmasker masking data to...
void SetRepeatFilteringDB(const char *db)
Enable repeat filtering.
bool QueryHasMultipleFrames() const
Check whether the query is multiframe for this type of search.
void Blast_FindWindowMaskerLoc(CBlastQueryVector &query, const CBlastOptions *opts)
Find Window Masker filtered locations using a BlastOptions.
size_t GetNumFrames() const
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
bool GetMaskAtHash() const
Returns whether masking should only be done for lookup table creation.
void SetRepeatFiltering(bool val)
Enable repeat filtering.
bool GetRepeatFiltering() const
Is repeat filtering enabled?
void Blast_FindRepeatFilterLoc(TSeqLocVector &query_loc, const CBlastOptionsHandle *opts_handle)
Finds repeats locations for a given set of sequences.
unsigned int GetNumberOfContexts(EBlastProgramType p)
Returns the number of contexts for a given BLAST program.
const set< ETranslationFrame > & ListFrames()
Returns the list of frame values for which this object contains masking information.
int GetDustFilteringWindow() const
Get window parameter for dust.
void SetupQueryInfo(TSeqLocVector &queries, EBlastProgramType prog, objects::ENa_strand strand_opt, BlastQueryInfo **qinfo)
Allocates the query information structure and fills the context offsets, in case of multiple queries,...
void UseProteinCoords(TSeqPos dna_length)
Adjusts all stored masks from nucleotide to protein offsets.
int GetWindowMaskerTaxId() const
Get the window masker taxid (or 0 if not set).
void SetMaskAtHash(bool m=true)
Sets MaskAtHash.
void SetFilterString(const char *f, bool clear=true)
Sets FilterString.
void Blast_FindDustFilterLoc(TSeqLocVector &queries, const CBlastNucleotideOptionsHandle *nucl_handle)
Finds dust locations for a given set of sequences by calling the symmetric dust lib.
Definition: dust_filter.cpp:60
string Blast_ProgramNameFromType(EBlastProgramType program)
Returns a string program name, given a blast::EBlastProgramType enumeration.
Definition: blast_aux.cpp:813
void SetDustFiltering(bool val)
Enable dust filtering.
@ eNotSupported
Feature not supported.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
element_type * get(void) const
Get pointer.
Definition: ncbimisc.hpp:469
#define GI_CONST(gi)
Definition: ncbimisc.hpp:1087
#define NULL
Definition: ncbistd.hpp:225
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1065
E_SIC Compare(const CSeq_id &sid2) const
Compare() - more general.
Definition: Seq_id.cpp:411
@ e_YES
SeqIds compared, but are different.
Definition: Seq_id.hpp:583
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eSame
CSeq_locs contain each other.
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
Uint4 TValue
Type of the generated integer value and/or the seed value.
Definition: random_gen.hpp:69
TValue GetRand(void)
Get the next random number in the interval [0..GetMax()] (inclusive)
Definition: random_gen.hpp:238
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
#define NPOS
Definition: ncbistr.hpp:133
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetTo(TTo value)
Assign a value to To data member.
list< CRef< CSeq_interval > > Tdata
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
bool CanGetStrand(void) const
Check if it is safe to call GetStrand method.
TFrom GetFrom(void) const
Get the From member data.
void SetFrom(TFrom value)
Assign a value to From data member.
TTo GetTo(void) const
Get the To member data.
void SetStrand(TStrand value)
Assign a value to Strand data member.
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
static int input()
int i
int len
static char * prog
Definition: mdb_load.c:33
objects::CSeq_id * GenerateRandomSeqid_Gi()
vector< EBlastProgramType > GetAllBlastProgramTypes()
range(_Ty, _Ty) -> range< _Ty >
constexpr bool empty(list< Ts... >) noexcept
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
Magic spell ;-) needed for some weird compilers... very empiric.
int strcmp(const char *str1, const char *str2)
Definition: odbc_utils.hpp:160
unsigned int a
Definition: ncbi_localip.c:102
Defines: CTimeFormat - storage class for time format.
void abort()
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
The Object manager core.
Utilities to develop and debug unit tests for BLAST.
static pcre_uint8 * buffer
Definition: pcretest.c:1051
BOOST_AUTO_TEST_SUITE(psiblast_iteration)
C++ implementation of repeats filtering for C++ BLAST.
vector< TMaskedQueryRegions > TSeqLocInfoVector
Collection of masked regions for all queries in a BLAST search.
Definition: seqlocinfo.hpp:139
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
Structure to hold a sequence.
Definition: blast_def.h:242
Uint1 * sequence
Sequence used for search (could be translation).
Definition: blast_def.h:243
Int4 query_length
Length of this query, strand or frame.
Structure for keeping the query masking information.
Definition: blast_def.h:210
Int4 total_size
Total size of the BlastSeqLoc array below.
Definition: blast_def.h:218
BlastSeqLoc ** seqloc_array
Array of masked locations.
Definition: blast_def.h:231
The query related information.
BlastContextInfo * contexts
Information per context.
Int4 last_context
Index of the last element of the context array.
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
SSeqRange * ssr
location data on the sequence.
Definition: blast_def.h:206
struct BlastSeqLoc * next
next in linked list
Definition: blast_def.h:205
Structure to hold a message from the core of the BLAST engine.
Definition: blast_message.h:70
All filtering options.
SRepeatFilterOptions * repeatFilterOptions
for organism specific repeat filtering.
SSegOptions * segOptions
low-complexity filtering for protein sequences (includes translated nucleotides).
Boolean mask_at_hash
mask query only for lookup table creation
SWindowMaskerOptions * windowMaskerOptions
organism specific filtering with window masker.
SDustOptions * dustOptions
low-complexity filtering for nucleotides.
int window
initial window to trigger further work.
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
A structure containing two integers, used e.g. as the left and right endpoints of a range.
Definition: blast_def.h:155
Int4 left
left endpoint of range (zero based)
Definition: blast_def.h:156
Int4 right
right endpoint of range (zero based)
Definition: blast_def.h:157
static string query
Definition: _hash_fun.h:40
Utilities for more convenient use of the Boost.Test library.
else result
Definition: token2.c:20
static CS_CONTEXT * context
Definition: will_convert.c:21
Interface to retrieve the list of available windowmasker filters.
Blast wrappers for WindowMasker filtering.
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Wed Apr 17 13:08:11 2024 by modify_doxy.py rev. 669887