NCBI C++ ToolKit
unit_test_gap_analysis.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: unit_test_gap_analysis.cpp 66507 2015-03-10 13:22:21Z kornbluh $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Michael Kornbluh
27  *
28  * File Description:
29  *
30  * ===========================================================================
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbiargs.hpp>
37 #include <corelib/ncbienv.hpp>
38 #include <corelib/test_boost.hpp>
39 
41 #include <objmgr/util/sequence.hpp>
42 #include <objmgr/scope.hpp>
43 
45 
47 
50 
51 namespace {
52 
// For convenience: a short alias so the helpers and tests below do not
// have to spell out CGapAnalysis for every nested type and enumerator.
typedef CGapAnalysis GA;
55 
56  void s_PrintSummary(
57  const string & summary_name,
58  GA::EGapType eGapType,
59  const GA::TVectorGapLengthSummary & summary )
60  {
61  cerr << "BEGIN " << GA::s_GapTypeToStr(eGapType)
62  << " SUMMARY for " << summary_name << endl;
63  ITERATE( GA::TVectorGapLengthSummary, summary_iter,
64  summary )
65  {
66  const GA::SOneGapLengthSummary & one_summary =
67  **summary_iter;
68  cerr << "\tlen: " << one_summary.gap_length
69  << "\tnum seqs: " << one_summary.num_seqs
70  << "\tnum gaps: " << one_summary.num_gaps << endl;
71  }
72  cerr << "END GAP SUMMARY" << endl;
73  }
74 
75  void s_PrintSummaryForAllGapTypes(
76  const string & summary_name,
77  const CGapAnalysis & gap_analysis )
78  {
79  cout << endl;
80  cout << "GAP SUMMARY FOR ALL GAP_TYPES" << endl;
81 
82  GA::EGapType gap_types[] = {
83  GA::eGapType_SeqGap,
84  GA::eGapType_UnknownBases
85  };
86  ITERATE_0_IDX(gap_idx, ArraySize(gap_types)) {
87  GA::EGapType eGapType = gap_types[gap_idx];
88  s_PrintSummary(
89  summary_name,
90  eGapType,
91  *gap_analysis.GetGapLengthSummary(eGapType));
92  }
93  cout << endl;
94  }
95 
96  CRef<CScope> CreateScope()
97  {
98  DEFINE_STATIC_FAST_MUTEX(scope_mtx);
99  CFastMutexGuard guard(scope_mtx);
100 
101  static CRef<CObjectManager> om;
102  if( ! om ) {
105  }
106 
107  CRef<CScope> s_scope( new CScope(*om));
108  s_scope->AddDefaults();
109  return s_scope;
110  }
111 
112  void LoadSeqEntryIntoGapAnalysis(
113  CGapAnalysis & gap_analysis,
114  CRef<CSeq_entry> pSeqEntry,
115  GA::TAddFlag add_flags,
116  GA::TFlag fFlags = 0,
117  CScope *p_scope = NULL,
118  bool do_rev_comp = false)
119  {
120  CRef<CScope> scope(
121  p_scope
122  ? p_scope
123  : CreateScope().GetPointer() );
124 
125  if( do_rev_comp ) {
126  for( CTypeIterator<CSeq_entry> entry = Begin(*pSeqEntry);
127  entry; ++entry)
128  {
129  if( entry->IsSeq() ) {
131  entry->SetSeq().SetInst(), scope.GetPointer());
132  }
133  }
134  }
135 
136  CSeq_entry_Handle entry_h =
137  scope->AddTopLevelSeqEntry(*pSeqEntry);
138  BOOST_REQUIRE( entry_h );
139 
140  gap_analysis.AddSeqEntryGaps(
141  entry_h,
144  add_flags,
145  fFlags);
146  }
147 
148  AutoPtr<GA::TVectorGapLengthSummary> AnalyzeSeqEntryTextAsn(
149  CNcbiIstream & in_text_asn,
150  GA::TAddFlag add_flags = GA::fAddFlag_All,
151  GA::EGapType gap_type = GA::eGapType_All,
152  GA::TFlag fFlags = 0)
153  {
154  in_text_asn.seekg(0);
155 
156  CGapAnalysis gap_analysis;
157  CRef<CSeq_entry> pSeqEntry(new CSeq_entry);
158  in_text_asn >> MSerial_AsnText >> *pSeqEntry;
159 
160  LoadSeqEntryIntoGapAnalysis(
161  gap_analysis, pSeqEntry, add_flags, fFlags);
162 
163  return gap_analysis.GetGapLengthSummary(gap_type);
164  }
165 
// One expected row of a gap-length summary. A row whose three fields
// are all zero serves as the end-of-list marker.
struct SExpectedResult {
    size_t gap_length;
    size_t num_seqs;
    size_t num_gaps;

    // true only when every field is zero
    bool empty(void) const
    {
        return gap_length == 0 && num_seqs == 0 && num_gaps == 0;
    }
};

// the end-of-list marker row
static const SExpectedResult s_ExpectedResultEnd = {0, 0, 0};
177 
178  ostream& operator<<(
179  ostream& s, const SExpectedResult & expected_result)
180  {
181  s << "SExpectedResult ("
182  << "gap_length: " << expected_result.gap_length
183  << ", num_seqs: " << expected_result.num_seqs
184  << ", num_gaps: " << expected_result.num_gaps << ")";
185  return s;
186  }
187 
188  ostream& operator<<(
189  ostream& s, const SExpectedResult expected_results[] )
190  {
191  s << "The expected results: (" << endl;
192 
193  for( size_t idx = 0; ! expected_results[idx].empty(); ++idx ) {
194  const SExpectedResult & one_expected_result =
195  expected_results[idx];
196  s << one_expected_result << endl;
197  }
198 
199  s << ")" << endl;
200  return s;
201  }
202 
203  void CheckExpectedResults(
204  const GA::TVectorGapLengthSummary & basic_gap_summary,
205  // last expected result is {0, 0, 0} to indicate we're at
206  // the end.
207  const SExpectedResult expected_results[])
208  {
209  cout << "CheckExpectedResults basic_gap_summary: "
210  << basic_gap_summary << endl;
211 
212  size_t idx = 0;
213  for( ; idx < basic_gap_summary.size(); ++idx) {
214  const GA::SOneGapLengthSummary & one_gap_summary =
215  *basic_gap_summary[idx];
216  const SExpectedResult & one_expected_result =
217  expected_results[idx];
218  BOOST_CHECK_EQUAL(
219  one_gap_summary.gap_length, one_expected_result.gap_length);
220  BOOST_CHECK_EQUAL(
221  one_gap_summary.num_seqs, one_expected_result.num_seqs);
222  BOOST_CHECK_EQUAL(
223  one_gap_summary.num_gaps, one_expected_result.num_gaps);
224  }
225  BOOST_CHECK( expected_results[idx].empty() );
226  }
227 
228  // expected results when getting summary for each type
229  struct SGapTypeExpectedResult {
230  const GA::EGapType gap_type;
231  // allow up to that many SExpectedResult's.
232  // Feel free to adjust this number as needed.
233  const SExpectedResult expected_result[5];
234  };
235 
236  void CheckGapTypeExpectedResult(
237  const CTempString & test_name,
238  const SGapTypeExpectedResult & gap_type_expected_result,
239  const GA::TVectorGapLengthSummary & basic_gap_summary)
240  {
241  cout << "In " << test_name << " running expected results for "
242  << GA::s_GapTypeToStr(gap_type_expected_result.gap_type) << ": "
243  << gap_type_expected_result.expected_result << endl;
244 
245  CheckExpectedResults(
246  basic_gap_summary,
247  gap_type_expected_result.expected_result);
248  }
249 }
250 
252 {
253  // Here we make descriptions of command line parameters that we are
254  // going to use.
255 
256  arg_desc->AddKey("basic-data", "InputFile",
257  "This is the basic input file used to run the test",
259  arg_desc->AddKey(
260  "in-letter-gap-data", "InputFile",
261  "This is the input file used to run the "
262  "'gaps as run of unknown bases' test",
264  arg_desc->AddKey(
265  "mixed-gap-type-data", "InputFile",
266  "This is the input file used to run the "
267  "'gaps as run of unknown bases and seq-gaps' test, distinguishing "
268  "between the two.",
270 
271 }
272 
274 {
275  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
276 
277  CNcbiIfstream basic_data_fstrm(args["basic-data"].AsString().c_str());
278  AutoPtr<GA::TVectorGapLengthSummary> basic_gap_summary =
279  AnalyzeSeqEntryTextAsn(
280  basic_data_fstrm, GA::fAddFlag_All, GA::eGapType_All,
281  GA::fFlag_IncludeEndGaps);
282  s_PrintSummary(
283  "TestBasic - basic-data", GA::eGapType_All, *basic_gap_summary);
284 
285  SExpectedResult expected_results[] = {
286  { 5, 1, 1 },
287  { 10, 1, 1 },
288  { 40, 1, 1 },
289  { 101, 2, 3 },
290  s_ExpectedResultEnd
291  };
292  CheckExpectedResults(*basic_gap_summary, expected_results);
293 }
294 
295 BOOST_AUTO_TEST_CASE(TestSeqIdComparison)
296 {
297  // make sure Seq-ids are compared by contents, not pointer address.
298 
299  CRef<CSeq_id> pSeqId1( new CSeq_id("lcl|Seq_no_gaps") );
300  CRef<CSeq_id> pSeqId2( new CSeq_id("lcl|Seq_no_gaps") );
301  CRef<CSeq_id> pSeqId3( new CSeq_id("lcl|Seq_misc_seq_gaps_and_raw_bases"));
302 
303  const GA::TGapLength kGapLength = 10;
304  const TSeqPos kBioseqlen = 100;
305 
306  CGapAnalysis gap_analysis;
307  gap_analysis.AddGap(
308  GA::eGapType_All, pSeqId1, kGapLength, kBioseqlen, 2, 12);
309  gap_analysis.AddGap(
310  GA::eGapType_All, pSeqId2, kGapLength, kBioseqlen, 20, 30);
311  gap_analysis.AddGap(
312  GA::eGapType_All, pSeqId3, kGapLength, kBioseqlen, 40, 50);
313 
314  BOOST_CHECK_EQUAL(
315  gap_analysis.GetGapLengthSeqIds(GA::eGapType_All).find(
316  kGapLength)->second.size(),
317  2u );
318 }
319 
320 BOOST_AUTO_TEST_CASE(TestGapsAsLetters)
321 {
322  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
323 
324  CNcbiIfstream gap_data_strm(args["in-letter-gap-data"].AsString().c_str());
325  AutoPtr<GA::TVectorGapLengthSummary> basic_gap_summary =
326  AnalyzeSeqEntryTextAsn(
327  gap_data_strm, GA::fAddFlag_IncludeUnknownBases,
328  GA::eGapType_All, GA::fFlag_IncludeEndGaps);
329  s_PrintSummary(
330  "TestGapsAsLetters - in-letter-gap-data",
331  GA::eGapType_All, *basic_gap_summary);
332 
333  SExpectedResult expected_results[] = {
334  { 1, 1, 2 },
335  { 2, 1, 1 },
336  { 10, 1, 1 },
337  s_ExpectedResultEnd
338  };
339  CheckExpectedResults(*basic_gap_summary, expected_results);
340 }
341 
343 {
344  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
345 
346  CNcbiIfstream gap_data_fstrm(
347  args["mixed-gap-type-data"].AsString().c_str());
348  CRef<CSeq_entry> pSeqEntry(new CSeq_entry);
349  gap_data_fstrm >> MSerial_AsnText >> *pSeqEntry;
350 
351  CGapAnalysis gap_analysis;
352  LoadSeqEntryIntoGapAnalysis(
353  gap_analysis, pSeqEntry, GA::fAddFlag_All,
354  GA::fFlag_IncludeEndGaps);
355  s_PrintSummaryForAllGapTypes(
356  "TestEndGaps - mixed-gap-type-data", gap_analysis);
357 
358  const SGapTypeExpectedResult gap_type_expected_results[] = {
359  {
360  GA::eGapType_All,
361  {
362  { 2, 1, 1},
363  { 3, 1, 1},
364  { 8, 1, 1 },
365  { 23, 1, 2 },
366  s_ExpectedResultEnd
367  } },
368  {
369  GA::eGapType_SeqGap,
370  {
371  { 23, 1, 1 },
372  s_ExpectedResultEnd
373  }
374  },
375  {
376  GA::eGapType_UnknownBases,
377  {
378  { 2, 1, 1},
379  { 3, 1, 1},
380  { 8, 1, 1 },
381  { 23, 1, 1 },
382  s_ExpectedResultEnd
383  }
384  }
385  };
386  ITERATE_0_IDX(gap_type_idx, ArraySize(gap_type_expected_results)) {
387 
388  CheckGapTypeExpectedResult(
389  "TestEndGaps",
390  gap_type_expected_results[gap_type_idx],
391  *gap_analysis.GetGapLengthSummary(
392  gap_type_expected_results[gap_type_idx].gap_type));
393  }
394 }
395 
396 BOOST_AUTO_TEST_CASE(TestAllGapTypes)
397 {
398  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
399 
400 
401 
402  // recall that end gaps will be ignored here
403  const SGapTypeExpectedResult gap_type_expected_results[] = {
404  {
405  GA::eGapType_All,
406  {
407  { 8, 1, 1 },
408  { 23, 1, 2 },
409  s_ExpectedResultEnd
410  } },
411  {
412  GA::eGapType_SeqGap,
413  {
414  { 23, 1, 1 },
415  s_ExpectedResultEnd
416  }
417  },
418  {
419  GA::eGapType_UnknownBases,
420  {
421  { 8, 1, 1 },
422  { 23, 1, 1 },
423  s_ExpectedResultEnd
424  }
425  }
426  };
427 
428  // should get same result regardless of strandedness
429  ITERATE_BOTH_BOOL_VALUES(is_minus_strand) {
430 
431  cout << "TestAllGapTypes "
432  << (is_minus_strand ? "minus" : "plus")
433  << " strand" << endl;
434 
435  CNcbiIfstream gap_data_fstrm(
436  args["mixed-gap-type-data"].AsString().c_str());
437  CRef<CSeq_entry> pSeqEntry(new CSeq_entry);
438  gap_data_fstrm >> MSerial_AsnText >> *pSeqEntry;
439 
440  CGapAnalysis gap_analysis;
441  LoadSeqEntryIntoGapAnalysis(
442  // it complements, too, but that doesn't matter
443  gap_analysis, pSeqEntry, GA::fAddFlag_All, 0, NULL, true);
444  s_PrintSummaryForAllGapTypes(
445  "TestAllGapTypes - mixed-gap-type-data", gap_analysis);
446 
447  ITERATE_0_IDX(gap_type_idx, ArraySize(gap_type_expected_results)) {
448 
449  const SGapTypeExpectedResult & gap_type_expected_result =
450  gap_type_expected_results[gap_type_idx];
451 
452  CheckGapTypeExpectedResult(
453  "TestAllGapTypes",
454  gap_type_expected_result,
455  *gap_analysis.GetGapLengthSummary(
456  gap_type_expected_result.gap_type));
457  }
458  }
459 }
460 
461 // so we can print CSerialObject's from a debugger
462 void PS(const CSerialObject * obj)
463 {
464  if( obj ) {
465  cout << MSerial_AsnText << *obj << endl;
466  } else {
467  cout << "(NULL)" << endl;
468  }
469 }
AutoPtr –.
Definition: ncbimisc.hpp:401
CArgs –.
Definition: ncbiargs.hpp:379
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:366
Give this gaps, or handles containing gaps and then you can get statistics on those gaps.
const TMapGapLengthToSeqIds & GetGapLengthSeqIds(EGapType eGapType) const
Returns a map of gap_length to the set of all seq-ids that contain at least one gap of that length.
void AddSeqEntryGaps(const CSeq_entry_Handle &entry_h, CSeq_inst::EMol filter=CSeq_inst::eMol_not_set, CBioseq_CI::EBioseqLevelFlag level=CBioseq_CI::eLevel_All, TAddFlag add_flags=fAddFlag_All, TFlag fFlags=0, size_t max_resolve_count=kMax_Int)
Calls AddGap for each gap anywhere under the given CSeq_entry.
void AddGap(EGapType eGapType, TSeqIdConstRef pSeqId, TGapLength iGapLength, TSeqPos iBioseqLength, TSeqPos iGapStartPos, TSeqPos iGapEndPosExclusive, TFlag fFlags=0)
AddSeqEntryGaps is more convenient, but if you want finer-grained control you can use this function t...
AutoPtr< TVectorGapLengthSummary > GetGapLengthSummary(EGapType eGapType, ESortGapLength eSortGapLength=eSortGapLength_Length, ESortDir eSortDir=eSortDir_Ascending) const
This gives summary information about every gap-length encountered so far.
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:264
CScope –.
Definition: scope.hpp:92
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
Base class for all serializable objects.
Definition: serialbase.hpp:150
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
const_iterator find(const key_type &key) const
Definition: map.hpp:153
CNcbiOstream & operator<<(CNcbiOstream &out, const CEquivRange &range)
Definition: equiv_range.cpp:96
static char test_name[128]
Definition: utf8_2.c:34
Analyzes gaps and produces various statistics.
void ReverseComplement(const BidirectionalIterator &first, const BidirectionalIterator &last)
#define ITERATE_0_IDX(idx, up_to)
idx loops from 0 (inclusive) to up_to (exclusive)
Definition: ncbimisc.hpp:865
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define ITERATE_BOTH_BOOL_VALUES(BoolVar)
The body of the loop will be run with Var equal to false and then true.
Definition: ncbimisc.hpp:861
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
#define NULL
Definition: ncbistd.hpp:225
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
@ eLevel_All
Any bioseq.
Definition: bioseq_ci.hpp:73
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
#define DEFINE_STATIC_FAST_MUTEX(id)
Define static fast mutex and initialize it.
Definition: ncbimtx.hpp:496
@ eMol_not_set
> cdna = rna
Definition: Seq_inst_.hpp:109
bm::gap_word_t gap_length(const bm::gap_word_t *buf) noexcept
Returs GAP block length.
Definition: bmfunc.h:1603
constexpr bool empty(list< Ts... >) noexcept
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
The Object manager core.
CRef< objects::CObjectManager > om
Utility stuff for more convenient using of Boost.Test library.
USING_SCOPE(objects)
BOOST_AUTO_TEST_CASE(TestBasic)
NCBITEST_INIT_CMDLINE(arg_desc)
void PS(const CSerialObject *obj)
Modified on Thu Apr 25 08:17:31 2024 by modify_doxy.py rev. 669887