NCBI C++ ToolKit
phytree_calc_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

00001 /*  $Id: phytree_calc_unit_test.cpp 59704 2013-09-12 14:59:01Z boratyng $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Author:  Greg Boratyn
00027  *
00028  * File Description:
00029  *   Unit tests for the Phylogenetic tree computation API
00030  *
00031  * ===========================================================================
00032  */
00033 
00034 #include <ncbi_pch.hpp>
00035 
00036 #include <objmgr/object_manager.hpp>
00037 #include <objmgr/scope.hpp>
00038 #include <objtools/data_loaders/genbank/gbloader.hpp>
00039 #include <objects/seqalign/Seq_align_set.hpp>
00040 
00041 #include <objtools/readers/fasta.hpp>
00042 #include <objtools/readers/reader_exception.hpp>
00043 
00044 #include <serial/iterator.hpp>
00045 #include <serial/serial.hpp>    
00046 #include <serial/objistr.hpp>
00047 #include <serial/objostr.hpp>
00048 
00049 #include <algo/phy_tree/phytree_calc.hpp>
00050 #include <math.h>
00051 
00052 #include <corelib/test_boost.hpp>
00053 
00054 #ifndef SKIP_DOXYGEN_PROCESSING
00055 
00056 #ifdef NCBI_COMPILER_MSVC
00057 #  define isfinite _finite
00058 #elif defined(NCBI_COMPILER_WORKSHOP)  &&  !defined(__builtin_isfinite)
00059 #  undef isfinite
00060 #  define isfinite finite
00061 #endif
00062 
00063 USING_NCBI_SCOPE;
00064 USING_SCOPE(objects);
00065 
00066 
00067 // Create scope with local sequences read from fasta file
00068 static CRef<CScope> s_CreateScope(const string& filename);
00069 
00070 // Traverse tree and check that each node has distance, number of leaves is
00071 // correct and leaves have correct labels.
00072 static void s_TraverseTree(const TPhyTreeNode* node, vector<bool>& leaves);
00073 
00074 // Check tree
00075 static bool s_TestTree(int num_sequences, const TPhyTreeNode* tree);
00076 
00077 // Traverse BioTreeDynamic
00078 static void s_TraverseDynTree(const CBioTreeDynamic::CBioNode* node,
00079                               vector<bool>& leaves);
00080 
00081 // Check BioTreeContainer
00082 static bool s_TestTreeContainer(int num_sequences,
00083                                 const CBioTreeContainer& tree);
00084 
00085 // Check tree computation from Seq-align
00086 static bool s_TestCalc(const CSeq_align& seq_align,
00087                        const CPhyTreeCalc& calc);
00088 
00089 // Check tree computation from Seq-align-set
00090 static bool s_TestCalc(const CSeq_align_set& seq_align_set,
00091                        const CPhyTreeCalc& calc);
00092 
00093 // Generate tree in Newick-like format
00094 static string s_GetNewickLike(const TPhyTreeNode* tree);
00095 
00096 /// Test class for accessing CPhyTreeCalc private methods and attributes
00097 class CTestPhyTreeCalc
00098 {
00099 public:
00100     static bool CalcDivergenceMatrix(CPhyTreeCalc& calc, vector<int>& included)
00101     {
00102         return calc.x_CalcDivergenceMatrix(included);
00103     }
00104 };
00105 
00106 
00107 BOOST_AUTO_TEST_SUITE(guide_tree_calc)
00108 
00109 // Test tree computation with protein Seq-align as input
00110 BOOST_AUTO_TEST_CASE(TestProteinSeqAlign)
00111 {
00112     CRef<CScope> scope = s_CreateScope("data/protein.fa");
00113     CNcbiIfstream istr("data/seqalign_protein_local.asn");
00114     BOOST_REQUIRE(istr);
00115     CSeq_align seq_align;
00116     istr >> MSerial_AsnText >> seq_align;
00117 
00118     const int kNumSeqs = 12;
00119 
00120     CPhyTreeCalc calc(seq_align, scope);
00121     vector<string> labels = calc.SetLabels();
00122     for (int i=1;i <= kNumSeqs;i++) {
00123         labels.push_back(NStr::IntToString(i));
00124     }
00125     BOOST_REQUIRE(calc.CalcBioTree());
00126 
00127     s_TestCalc(seq_align, calc);
00128 
00129     // test for specific result
00130     BOOST_REQUIRE_EQUAL(s_GetNewickLike(calc.GetTree()),
00131                         "((4:0.474, (1:0.814, ((2:0.092, 3:0.060):0.586, "
00132                         "(((9:0.044, ((7:0.012, 8:0.020):0.045, 6:0.044)"
00133                         ":0.018):0.070, 10:0.000):0.390, (0:0.000, 5:0.034)"
00134                         ":0.393):0.078):0.072):0.474):x)");
00135 }
00136 
00137 
00138 // Test tree computation with protein Seq-align-set as input
00139 BOOST_AUTO_TEST_CASE(TestProteinSeqAlignSet)
00140 {
00141     CRef<CScope> scope = s_CreateScope("data/protein.fa");
00142     CNcbiIfstream istr("data/seqannot_protein_local.asn");
00143     BOOST_REQUIRE(istr);
00144     CSeq_annot seq_annot;
00145     istr >> MSerial_AsnText >> seq_annot;
00146 
00147     CSeq_align_set seq_align_set;
00148     seq_align_set.Set() = seq_annot.GetData().GetAlign();
00149 
00150     const int kNumSeqs = 12;
00151     
00152     CPhyTreeCalc calc(seq_align_set, scope);
00153     vector<string> labels = calc.SetLabels();
00154     for (int i=1;i <= kNumSeqs;i++) {
00155         labels.push_back(NStr::IntToString(i));
00156     }
00157     BOOST_REQUIRE(calc.CalcBioTree());
00158 
00159     s_TestCalc(seq_align_set, calc);
00160 
00161     // test for specific result
00162     BOOST_REQUIRE_EQUAL(s_GetNewickLike(calc.GetTree()),
00163                         "((4:0.449, ((1:0.699, (2:0.061, 3:0.062):0.440)"
00164                         ":0.070, (((9:0.038, ((7:0.005, 8:0.017):0.044, "
00165                         "6:0.037):0.017):0.050, 10:0.000):0.403, (0:0.000, "
00166                         "5:0.034):0.360):0.104):0.449):x)");
00167 }
00168 
00169 
00170 // Test tree computation with nucleotide Seq-align as input
00171 BOOST_AUTO_TEST_CASE(TestNucleotideSeqAlign)
00172 {
00173     CRef<CScope> scope = s_CreateScope("data/nucleotide.fa");
00174     CNcbiIfstream istr("data/seqalign_nucleotide_local.asn");
00175     BOOST_REQUIRE(istr);
00176     CSeq_align seq_align;
00177     istr >> MSerial_AsnText >> seq_align;
00178 
00179     CPhyTreeCalc calc(seq_align, scope);
00180     calc.SetDistMethod(CPhyTreeCalc::eJukesCantor);
00181     BOOST_REQUIRE(calc.CalcBioTree());
00182 
00183     s_TestCalc(seq_align, calc);
00184 
00185     // test for specific result
00186     BOOST_REQUIRE_EQUAL(s_GetNewickLike(calc.GetTree()),
00187                         "(((5:0.000, 9:0.000):0.002, (7:0.000, (3:0.000, "
00188                         "(4:0.001, (1:0.000, (0:0.000, ((6:0.000, 8:0.000)"
00189                         ":0.000, 2:0.000):0.000):0.000):0.001):0.000):0.000)"
00190                         ":0.002):x)");
00191 }
00192 
00193 // Test tree computation with nucleotide Seq-align-set as input
00194 BOOST_AUTO_TEST_CASE(TestNucleotideSeqAlignSet)
00195 {
00196     CRef<CScope> scope = s_CreateScope("data/nucleotide.fa");
00197     CNcbiIfstream istr("data/seqannot_nucleotide_local.asn");
00198     BOOST_REQUIRE(istr);
00199     CSeq_annot seq_annot;    
00200     istr >> MSerial_AsnText >> seq_annot;
00201 
00202     CSeq_align_set seq_align_set;
00203     seq_align_set.Set() = seq_annot.GetData().GetAlign();
00204 
00205     CPhyTreeCalc calc(seq_align_set, scope);
00206     calc.SetDistMethod(CPhyTreeCalc::eJukesCantor);
00207     BOOST_REQUIRE(calc.CalcBioTree());
00208 
00209     s_TestCalc(seq_align_set, calc);
00210 
00211     // test for specific result
00212     BOOST_REQUIRE_EQUAL(s_GetNewickLike(calc.GetTree()),
00213                         "(((5:0.000, 9:0.000):0.002, (7:0.000, (3:0.000, "
00214                         "(4:0.001, (1:0.000, (0:0.000, ((6:0.000, 8:0.000)"
00215                         ":0.000, 2:0.000):0.000):0.000):0.001):0.000):0.000)"
00216                         ":0.002):x)");
00217 }
00218 
00219 
00220 // Test that exceptions are thrown for invalid input
00221 BOOST_AUTO_TEST_CASE(TestInvalidInput)
00222 {
00223     CRef<CScope> scope = s_CreateScope("data/protein.fa");
00224     CNcbiIfstream istr("data/seqalign_protein_local.asn");
00225     CSeq_align seq_align;
00226     istr >> MSerial_AsnText >> seq_align;
00227 
00228     CPhyTreeCalc calc(seq_align, scope);
00229 
00230     // If set, number of labels must be the same as number of input sequences
00231     vector<string>& labels = calc.SetLabels();
00232     labels.push_back("label1");
00233     labels.push_back("label2");
00234     BOOST_CHECK_THROW(calc.CalcBioTree(), CPhyTreeCalcException);
00235     calc.SetLabels().clear();
00236 
00237     // Max divergence must be a positive number
00238     calc.SetMaxDivergence(-0.1);
00239     BOOST_CHECK_THROW(calc.CalcBioTree(), CPhyTreeCalcException);
00240 
00241     // If Kimura distance is selected, then max divergence must be smaller
00242     // than 0.85
00243     calc.SetDistMethod(CPhyTreeCalc::eKimura);
00244     calc.SetMaxDivergence(0.851);
00245     BOOST_CHECK_THROW(calc.CalcBioTree(), CPhyTreeCalcException);
00246 
00247     // If no subset of input sequences satisfies max divergence condition
00248     calc.SetMaxDivergence(0.001);
00249     BOOST_CHECK(!calc.CalcBioTree());
00250     BOOST_CHECK(calc.GetMessages().size() > 0);
00251 }
00252 
00253 
00254 // Test tree computation using Grishin distance
00255 BOOST_AUTO_TEST_CASE(TestGrishinDistance)
00256 {
00257     CRef<CScope> scope = s_CreateScope("data/protein.fa");
00258     CNcbiIfstream istr("data/seqalign_protein_local.asn");
00259     CSeq_align seq_align;
00260     BOOST_REQUIRE(istr);
00261     istr >> MSerial_AsnText >> seq_align;
00262 
00263     CPhyTreeCalc calc(seq_align, scope);
00264     calc.SetDistMethod(CPhyTreeCalc::eGrishin);
00265     BOOST_REQUIRE(calc.CalcBioTree());
00266 
00267     s_TestTree(calc.GetSeqIds().size(), calc.GetTree());
00268 }
00269 
00270 
00271 // Test tree computation using Kimura distance
00272 BOOST_AUTO_TEST_CASE(TestKimuraDistance)
00273 {
00274     CRef<CScope> scope = s_CreateScope("data/protein.fa");
00275     CNcbiIfstream istr("data/seqannot_protein_local.asn");
00276     BOOST_REQUIRE(istr);
00277     CSeq_annot seq_annot;
00278     istr >> MSerial_AsnText >> seq_annot;
00279 
00280     CSeq_align_set seq_align_set;
00281     seq_align_set.Set() = seq_annot.GetData().GetAlign();
00282     CPhyTreeCalc calc(seq_align_set, scope);
00283 
00284     calc.SetDistMethod(CPhyTreeCalc::eKimura);
00285     BOOST_REQUIRE(calc.CalcBioTree());
00286 
00287     s_TestTree(calc.GetSeqIds().size(), calc.GetTree());
00288 }
00289 
00290 
00291 // Test tree computation using Grishin General distance
00292 BOOST_AUTO_TEST_CASE(TestGrishinGeneralDistance)
00293 {
00294     CRef<CScope> scope = s_CreateScope("data/protein.fa");
00295     CNcbiIfstream istr("data/seqalign_protein_local.asn");
00296     BOOST_REQUIRE(istr);
00297     CSeq_align seq_align;
00298     istr >> MSerial_AsnText >> seq_align;
00299 
00300     CPhyTreeCalc calc(seq_align, scope);
00301     calc.SetDistMethod(CPhyTreeCalc::eGrishinGeneral);
00302     BOOST_REQUIRE(calc.CalcBioTree());
00303 
00304     s_TestTree(calc.GetSeqIds().size(), calc.GetTree());
00305 }
00306 
00307 
00308 // Test tree computation using Jukes-Cantor distance
00309 BOOST_AUTO_TEST_CASE(TestJukesCantorDistance)
00310 {
00311     CRef<CScope> scope = s_CreateScope("data/nucleotide.fa");
00312     CNcbiIfstream istr("data/seqalign_nucleotide_local.asn");
00313     CSeq_align seq_align;
00314     istr >> MSerial_AsnText >> seq_align;
00315 
00316     CPhyTreeCalc calc(seq_align, scope);
00317     calc.SetDistMethod(CPhyTreeCalc::eJukesCantor);
00318     BOOST_REQUIRE(calc.CalcBioTree());
00319 
00320     s_TestTree(calc.GetSeqIds().size(), calc.GetTree());
00321 }
00322 
00323 
00324 // Test tree computation using Poisson distance
00325 BOOST_AUTO_TEST_CASE(TestPoissonDistance)
00326 {
00327     CRef<CScope> scope = s_CreateScope("data/nucleotide.fa");
00328     CNcbiIfstream istr("data/seqalign_nucleotide_local.asn");
00329     CSeq_align seq_align;
00330     istr >> MSerial_AsnText >> seq_align;
00331 
00332     CPhyTreeCalc calc(seq_align, scope);
00333     calc.SetDistMethod(CPhyTreeCalc::ePoisson);
00334     BOOST_REQUIRE(calc.CalcBioTree());
00335 
00336     s_TestTree(calc.GetSeqIds().size(), calc.GetTree());
00337 }
00338 
00339 
00340 // Test tree computation using Neighbor-Joining tree
00341 BOOST_AUTO_TEST_CASE(TestNJTree)
00342 {
00343     CRef<CScope> scope = s_CreateScope("data/protein.fa");
00344     CNcbiIfstream istr("data/seqalign_protein_local.asn");
00345     BOOST_REQUIRE(istr);
00346     CSeq_align seq_align;
00347     istr >> MSerial_AsnText >> seq_align;
00348 
00349     CPhyTreeCalc calc(seq_align, scope);
00350     calc.SetTreeMethod(CPhyTreeCalc::eNJ);
00351     BOOST_REQUIRE(calc.CalcBioTree());
00352 
00353     s_TestTree(calc.GetSeqIds().size(), calc.GetTree());
00354 }
00355 
00356 
00357 // Test tree computation using Fast Minimum Evolution tree
00358 BOOST_AUTO_TEST_CASE(TestFastMETree)
00359 {
00360     CRef<CScope> scope = s_CreateScope("data/protein.fa");
00361     CNcbiIfstream istr("data/seqalign_protein_local.asn");
00362     BOOST_REQUIRE(istr);
00363     CSeq_align seq_align;
00364     istr >> MSerial_AsnText >> seq_align;
00365 
00366     CPhyTreeCalc calc(seq_align, scope);
00367     calc.SetTreeMethod(CPhyTreeCalc::eFastME);
00368     BOOST_REQUIRE(calc.CalcBioTree());
00369 
00370     s_TestTree(calc.GetSeqIds().size(), calc.GetTree());
00371 }
00372 
00373 // Verify that CDistMethods::Divergence() does not return a finite number for
00374 // a pair of sequences with a gap in each position. Make sure that the function
00375 // isfinite() works.
00376 // Checking for Nan and Inf does not always work for optimized 32-bit
00377 // builds.
00378 #if NCBI_PLATFORM_BITS == 64
00379 BOOST_AUTO_TEST_CASE(TestNanDivergence)
00380 {
00381     const string kSeq1 = "A--";
00382     const string kSeq2 = "-A-";
00383 
00384     // sequence lengths must be the same
00385     BOOST_REQUIRE_EQUAL(kSeq1.length(), kSeq2.length());
00386     // there must not be a column with a residue in both sequences
00387     for (size_t i=0;i < kSeq1.length();i++) {
00388         BOOST_REQUIRE(kSeq1[i] == '-' || kSeq2[i] == '-');
00389     }
00390 
00391     // for a pair of sequences with a gap in each column divergence must not
00392     // be finite
00393     BOOST_REQUIRE(!isfinite(CDistMethods::Divergence(kSeq1, kSeq2)));
00394 }
00395 #endif
00396 
00397 // Verify that tree computing functions throw when non-finite number is present
00398 // in the distance matrix
00399 BOOST_AUTO_TEST_CASE(TestRejectIncorrectDistance)
00400 {
00401     // create a distance matrix
00402     CDistMethods::TMatrix dmat(2, 2, 0.0);
00403 
00404     // test for a negative number
00405     dmat(0, 1) = dmat(1, 0) = -1.0;
00406     BOOST_REQUIRE_THROW(CDistMethods::NjTree(dmat), invalid_argument);
00407     BOOST_REQUIRE_THROW(CDistMethods::FastMeTree(dmat), invalid_argument);
00408 
00409 
00410 #if NCBI_PLATFORM_BITS == 64
00411     // checking for Nan and Inf does not always work for optimized 32-bit
00412     // builds
00413 
00414     // test for infinity
00415     dmat(0, 1) = dmat(1, 0) = numeric_limits<double>::infinity();
00416     BOOST_REQUIRE_THROW(CDistMethods::NjTree(dmat), invalid_argument);
00417     BOOST_REQUIRE_THROW(CDistMethods::FastMeTree(dmat), invalid_argument);
00418 
00419     // test for quiet NaN
00420     dmat(0, 1) = dmat(1, 0) = numeric_limits<double>::quiet_NaN();
00421     BOOST_REQUIRE_THROW(CDistMethods::NjTree(dmat), invalid_argument);
00422     BOOST_REQUIRE_THROW(CDistMethods::FastMeTree(dmat), invalid_argument);
00423 
00424     // test for signaling NaN
00425     dmat(0, 1) = dmat(1, 0) = numeric_limits<double>::signaling_NaN();
00426     BOOST_REQUIRE_THROW(CDistMethods::NjTree(dmat), invalid_argument);
00427     BOOST_REQUIRE_THROW(CDistMethods::FastMeTree(dmat), invalid_argument);
00428 #endif
00429 }
00430 
00431 // Verify that alignment with a gap in each column is rejected during
00432 // divergence computation
00433 BOOST_AUTO_TEST_CASE(TestRejectPairWithGapInEachPosition)
00434 {
00435     CRef<CScope> scope = s_CreateScope("data/protein.fa");
00436     const int num_seqs = 2;
00437 
00438     // create alignment with a gap in each position
00439     CSeq_align align;
00440     align.SetDim(num_seqs);
00441     CRef<CSeq_id> id(new CSeq_id("lcl|1"));
00442     CDense_seg& denseg = align.SetSegs().SetDenseg();
00443     denseg.SetDim(num_seqs);
00444     denseg.SetNumseg(2);
00445     denseg.SetIds().push_back(id);
00446     denseg.SetIds().push_back(id);
00447     // starts = {0, -1, -1, 0}
00448     denseg.SetStarts().push_back(0);
00449     denseg.SetStarts().push_back(-1);
00450     denseg.SetStarts().push_back(-1);
00451     denseg.SetStarts().push_back(0);
00452     // lens = {10, 10}
00453     denseg.SetLens().push_back(10);
00454     denseg.SetLens().push_back(10);
00455     denseg.Validate();
00456         
00457     // make sure that there is a gap in each position for a pair of sequences
00458     // in the alignment
00459     const vector<int>& starts = denseg.GetStarts();
00460     for (size_t i=0;i < starts.size();i+=num_seqs) {
00461         BOOST_REQUIRE(starts[i] < 0 || starts[i+1] < 0);
00462     }
00463 
00464     CPhyTreeCalc calc(align, scope);
00465     vector<int> included_inds;
00466     bool valid = CTestPhyTreeCalc::CalcDivergenceMatrix(calc, included_inds);
00467 
00468     // for alignment with 2 sequences, one was rejected, and the function must
00469     // return false
00470     BOOST_REQUIRE_EQUAL(valid, false);
00471 
00472     // the first sequence is always included, and the second myst be rejected
00473     // because the divergence in infinite
00474     BOOST_REQUIRE_EQUAL(included_inds.size(), 1u);
00475 
00476     // the first sequence is always included
00477     BOOST_REQUIRE_EQUAL(included_inds[0], 0);
00478 }
00479 
00480 
00481 BOOST_AUTO_TEST_SUITE_END()
00482 
00483 
00484 static CRef<CScope> s_CreateScope(const string& filename)
00485 {
00486     CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
00487     CRef<CScope> scope(new CScope(*objmgr));
00488     scope->AddDefaults();
00489 
00490     CNcbiIfstream instream(filename.c_str());
00491     BOOST_REQUIRE(instream);
00492 
00493     CStreamLineReader line_reader(instream);
00494     CFastaReader fasta_reader(line_reader, 
00495                               CFastaReader::fAssumeProt |
00496                               CFastaReader::fForceType |
00497                               CFastaReader::fNoParseID);
00498 
00499     while (!line_reader.AtEOF()) {
00500 
00501         CRef<CSeq_entry> entry = fasta_reader.ReadOneSeq();
00502 
00503         if (entry == 0) {
00504             NCBI_THROW(CObjReaderException, eInvalid, 
00505                         "Could not retrieve seq entry");
00506         }
00507         scope->AddTopLevelSeqEntry(*entry);
00508         CTypeConstIterator<CBioseq> itr(ConstBegin(*entry));
00509     }
00510 
00511     return scope;
00512 }
00513 
00514 
00515 // Traverse tree and check that each node has distance, number of leaves is
00516 // correct and leaves have correct labels.
00517 static void s_TraverseTree(const TPhyTreeNode* node, vector<bool>& leaves)
00518 {
00519     // each node except for root must have distance set
00520     BOOST_REQUIRE(!node->GetParent() || node->GetValue().IsSetDist());
00521 
00522     if (node->IsLeaf()) {
00523 
00524         string label = node->GetValue().GetLabel();
00525 
00526         // each leaf node must have a label
00527         BOOST_REQUIRE(!label.empty());
00528 
00529         // label must be a number between zero and number of leaves - 1
00530         int id = NStr::StringToInt(label);
00531         BOOST_REQUIRE(id >= 0 && id < (int)leaves.size());
00532 
00533         // each label must not appear more than once
00534         BOOST_REQUIRE(!leaves[id]);
00535         leaves[id] = true;
00536 
00537         return;
00538     }
00539 
00540     TPhyTreeNode::TNodeList_CI child(node->SubNodeBegin());
00541     while (child != node->SubNodeEnd()) {
00542         s_TraverseTree(*child, leaves);
00543         child++;
00544     }
00545 }
00546 
00547 // Check tree
00548 static bool s_TestTree(int num_sequences, const TPhyTreeNode* tree)
00549 {
00550     vector<bool> leaves(num_sequences, false);
00551 
00552     // check that all num_sequences leaves are present
00553     s_TraverseTree(tree, leaves);
00554     ITERATE (vector<bool>, it, leaves) {
00555         BOOST_REQUIRE(*it);
00556     }
00557 
00558     return true;
00559 }
00560 
00561 // This is not really Newick format, because it prints root's edge length
00562 // and doean not put ';' at the end.
00563 static void s_GetNewick(const TPhyTreeNode* node, string& result)
00564 {
00565     if (!node->IsLeaf()) {
00566         result += "(";
00567 
00568         TPhyTreeNode::TNodeList_CI child(node->SubNodeBegin());
00569         while (child != node->SubNodeEnd()) {
00570             if (child != node->SubNodeBegin()) {
00571                 result += ", ";
00572             }
00573             s_GetNewick(*child, result);
00574             child++;
00575         }
00576         result += ")";
00577     }
00578 
00579     result += node->GetValue().GetLabel();
00580     result += ":";    
00581     result += node->GetValue().IsSetDist()
00582         ? NStr::DoubleToString(node->GetValue().GetDist(), 3) : "x";    
00583 }
00584 
00585 static string s_GetNewickLike(const TPhyTreeNode* tree)
00586 {
00587     string result;
00588     result += "(";
00589     s_GetNewick(tree, result);
00590     result += ")";
00591 
00592     return result;
00593 }
00594 
00595 
00596 // Traverse BioTreeDynamic
00597 static void s_TraverseDynTree(const CBioTreeDynamic::CBioNode* node,
00598                               vector<bool>& leaves)
00599 {
00600     if (node->IsLeaf()) {
00601 
00602         string label = node->GetFeature("label");
00603 
00604         // each leaf node must have a label
00605         BOOST_REQUIRE(!label.empty());
00606 
00607         // label must be a number between zero and number of leaves - 1
00608         int id = NStr::StringToInt(label);
00609         BOOST_REQUIRE(id >= 0 && id < (int)leaves.size());
00610 
00611         // each label must not appear more than once
00612         BOOST_REQUIRE(!leaves[id]);
00613         leaves[id] = true;
00614 
00615         return;
00616     }
00617 
00618     CBioTreeDynamic::CBioNode::TParent::TNodeList_CI child
00619         = node->SubNodeBegin();
00620 
00621     while (child != node->SubNodeEnd()) {
00622         s_TraverseDynTree((CBioTreeDynamic::CBioNode*)*child, leaves);
00623         child++;
00624     }
00625 }
00626 
00627 
00628 // Check BioTreeContainer
00629 static bool s_TestTreeContainer(int num_sequences, const CBioTreeContainer& tree)
00630 {
00631     // Make container 2 dynamic and traverse the tree
00632     CBioTreeDynamic dyntree;
00633     BioTreeConvertContainer2Dynamic(dyntree, tree);
00634 
00635     vector<bool> leaves(num_sequences, false);
00636 
00637     // check that all num_sequences leaves are present
00638     s_TraverseDynTree(dyntree.GetTreeNode(), leaves);
00639     ITERATE (vector<bool>, it, leaves) {
00640         BOOST_REQUIRE(*it);
00641     }
00642 
00643     return true;
00644 }
00645 
00646 
00647 // Check tree computation from Seq_align
00648 static bool s_TestCalc(const CSeq_align& seq_align,
00649                        const CPhyTreeCalc& calc)
00650 {
00651     const int kNumInputSequences = seq_align.GetDim();
00652     const int kNumUsedSequences = calc.GetSeqIds().size();
00653     
00654     // divergence matrix must have the same number of rows/columns as number
00655     // sequences included in tree computation
00656     const CPhyTreeCalc::CDistMatrix& diverg = calc.GetDivergenceMatrix();
00657     BOOST_REQUIRE_EQUAL(kNumUsedSequences, diverg.GetNumElements());
00658     for (int i=0; i < diverg.GetNumElements();i++) {
00659         for (int j=0;j < diverg.GetNumElements();j++) {
00660             BOOST_REQUIRE(diverg(i, j) >= 0.0 && diverg(i, j) <= 1.0);
00661         }
00662     }
00663 
00664     // distance matrix must have the same number of rows/columns as number
00665     // sequences included in tree computation
00666     const CPhyTreeCalc::CDistMatrix& dist = calc.GetDistanceMatrix();
00667     BOOST_REQUIRE_EQUAL(kNumUsedSequences, dist.GetNumElements());
00668     BOOST_REQUIRE_EQUAL(kNumUsedSequences, diverg.GetNumElements());
00669     for (int i=0; i < diverg.GetNumElements();i++) {
00670         for (int j=0;j < diverg.GetNumElements();j++) {
00671             BOOST_REQUIRE(dist(i, j) >= 0.0);
00672         }
00673     }
00674    
00675     // GetSeqAlign() function must return alignment of sequences used in tree
00676     // computation
00677     BOOST_REQUIRE_EQUAL(kNumUsedSequences, (int)calc.GetSeqAlign()->GetDim());
00678 
00679 
00680     BOOST_REQUIRE_EQUAL(kNumInputSequences,
00681                         kNumUsedSequences + (int)calc.GetRemovedSeqIds().size());
00682 
00683     // Make sure that each Seq-id in calc's Seq-align is appears in input
00684     // Seq-align
00685     const vector< CRef<CSeq_id> > input_seq_ids
00686         = seq_align.GetSegs().GetDenseg().GetIds();
00687 
00688     const vector< CRef<CSeq_id> > used_seq_ids
00689         = calc.GetSeqAlign()->GetSegs().GetDenseg().GetIds();
00690 
00691     vector<bool> found(used_seq_ids.size(), false);
00692     for (size_t i=0;i < used_seq_ids.size();i++) {
00693         for (size_t j=0;j < input_seq_ids.size();j++) {
00694             if (used_seq_ids[i]->Match(*input_seq_ids[j])) {
00695                 found[i] = true;
00696                 break;
00697             }
00698         }
00699     }
00700     ITERATE (vector<bool>, it, found) {
00701         BOOST_REQUIRE(*it);
00702     }
00703 
00704 
00705     s_TestTree(kNumUsedSequences, calc.GetTree());
00706     s_TestTreeContainer(kNumUsedSequences, *calc.GetSerialTree());
00707 
00708     return true;
00709 }
00710 
00711 
00712 // Check tree computation from Seq_align_set
00713 static bool s_TestCalc(const CSeq_align_set& seq_align_set,
00714                        const CPhyTreeCalc& calc)
00715 {
00716     vector< CRef<CSeq_id> > input_seq_ids;
00717     input_seq_ids.reserve(seq_align_set.Get().size());
00718     input_seq_ids.push_back(
00719             (*seq_align_set.Get().begin())->GetSegs().GetDenseg().GetIds()[0]);
00720 
00721     ITERATE (CSeq_align_set::Tdata, it, seq_align_set.Get()) {
00722         BOOST_REQUIRE_EQUAL((*it)->GetSegs().GetDenseg().GetDim(), 2);
00723         int i = (int)input_seq_ids.size() - 1;
00724         while (i >= 0 && !(*it)->GetSegs().GetDenseg().GetIds()[1]->Match(
00725                                                         *input_seq_ids[i])) {
00726                 i--;
00727         }
00728         if (i < 0) {
00729             input_seq_ids.push_back((*it)->GetSegs().GetDenseg().GetIds()[1]);
00730         }
00731     }
00732 
00733     const int kNumInputSequences = (int)input_seq_ids.size();
00734     const int kNumUsedSequences = calc.GetSeqIds().size();
00735     
00736     const CPhyTreeCalc::CDistMatrix& diverg = calc.GetDivergenceMatrix();
00737     BOOST_REQUIRE_EQUAL(kNumUsedSequences, diverg.GetNumElements());
00738     for (int i=0; i < diverg.GetNumElements();i++) {
00739         for (int j=0;j < diverg.GetNumElements();j++) {
00740             BOOST_REQUIRE(diverg(i, j) >= 0.0 && diverg(i, j) <= 1.0);
00741         }
00742     }
00743 
00744     const CPhyTreeCalc::CDistMatrix& dist = calc.GetDistanceMatrix();
00745     BOOST_REQUIRE_EQUAL(kNumUsedSequences, dist.GetNumElements());
00746     for (int i=0; i < dist.GetNumElements();i++) {
00747         for (int j=0;j < dist.GetNumElements();j++) {
00748             BOOST_REQUIRE(dist(i, j) >= 0.0);
00749         }
00750     }
00751    
00752     BOOST_REQUIRE_EQUAL(kNumUsedSequences, (int)calc.GetSeqAlign()->GetDim());
00753 
00754 
00755     BOOST_REQUIRE_EQUAL(kNumInputSequences,
00756                         kNumUsedSequences + (int)calc.GetRemovedSeqIds().size());
00757 
00758     // Make sure that each Seq-id in calc's Seq-align is appears in input
00759     // Seq-align
00760     const vector< CRef<CSeq_id> > used_seq_ids
00761         = calc.GetSeqAlign()->GetSegs().GetDenseg().GetIds();
00762 
00763     vector<bool> found(used_seq_ids.size(), false);
00764     for (size_t i=0;i < used_seq_ids.size();i++) {
00765         for (size_t j=0;j < input_seq_ids.size();j++) {
00766             if (used_seq_ids[i]->Match(*input_seq_ids[j])) {
00767                 found[i] = true;
00768                 break;
00769             }
00770         }
00771     }
00772     ITERATE (vector<bool>, it, found) {
00773         BOOST_REQUIRE(*it);
00774     }
00775 
00776 
00777     s_TestTree(kNumUsedSequences, calc.GetTree());
00778     s_TestTreeContainer(kNumUsedSequences, *calc.GetSerialTree());
00779 
00780     return true;
00781 }
00782 
00783 
00784 #endif /* SKIP_DOXYGEN_PROCESSING */
00785 
Modified on Tue Sep 23 17:20:32 2014 by modify_doxy.py rev. 426318