NCBI C++ ToolKit
defline_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

00001 /*  $Id: defline_unit_test.cpp 62981 2014-05-21 14:04:06Z madden $
00002 * ===========================================================================
00003 *
00004 *                            PUBLIC DOMAIN NOTICE
00005 *               National Center for Biotechnology Information
00006 *
00007 *  This software/database is a "United States Government Work" under the
00008 *  terms of the United States Copyright Act.  It was written as part of
00009 *  the author's official duties as a United States Government employee and
00010 *  thus cannot be copyrighted.  This software/database is freely available
00011 *  to the public for use. The National Library of Medicine and the U.S.
00012 *  Government have not placed any restriction on its use or reproduction.
00013 *
00014 *  Although all reasonable efforts have been taken to ensure the accuracy
00015 *  and reliability of the software and data, the NLM and the U.S.
00016 *  Government do not and cannot warrant the performance or results that
00017 *  may be obtained by using this software or data. The NLM and the U.S.
00018 *  Government disclaim all warranties, express or implied, including
00019 *  warranties of performance, merchantability or fitness for any particular
00020 *  purpose.
00021 *
00022 *  Please cite the author in any work or product based on this material.
00023 *
00024 * ===========================================================================
00025 *
00026 * Author:  Tom Madden, NCBI
00027 *
00028 * File Description:
00029 *   Unit test for sorting of Seq-ids in BLAST deflines
00030 *
00031 *
00032 * ===========================================================================
00033 */
00034 
00035 #include <ncbi_pch.hpp>
00036 
00037 #include <corelib/ncbi_system.hpp>
00038 #include <corelib/ncbiapp.hpp>
00039 
00040 #include <objects/seqloc/Seq_id.hpp>
00041 #include <objects/blastdb/defline_extra.hpp>
00042 #include <objects/blastdb/Blast_def_line.hpp>
00043 #include <objects/blastdb/Blast_def_line_set.hpp>
00044 
00045 // generated includes
00046 #include <objects/blastdb/Blast_def_line_set.hpp>
00047 
00048 #include <util/random_gen.hpp>
00049 
00050 
00051 
00052 // This header must be included before all Boost.Test headers if there are any
00053 #include <corelib/test_boost.hpp>
00054 
00055 
00056 
00057 
00058 USING_NCBI_SCOPE;
00059 USING_SCOPE(objects);
00060 
00061 
00062 NCBITEST_AUTO_INIT()
00063 {
00064     // Your application initialization code here (optional)
00065 }
00066 
00067 
00068 NCBITEST_INIT_CMDLINE(arg_desc)
00069 {
00070     // Describe command line parameters that we are going to use
00071     arg_desc->AddFlag
00072         ("enable_TestTimeout",
00073          "Run TestTimeout test, which is artificially disabled by default in"
00074          "order to avoid unwanted failure during the daily automated builds.");
00075 }
00076 
00077 
00078 
00079 NCBITEST_AUTO_FINI()
00080 {
00081     // Your application finalization code here (optional)
00082 }
00083 
00084 // Make one defline.
00085 static CRef<CBlast_def_line> s_MakeDefline(const string& id, const string& title)
00086 {
00087     CRef<CBlast_def_line> defline(new CBlast_def_line());
00088     list<CRef<CSeq_id> >& seqid_list = defline->SetSeqid();
00089     CSeq_id::ParseFastaIds(seqid_list, id);
00090     defline->SetTitle(title);
00091 
00092     return defline;
00093 }
00094 
00095 // Make a series of deflines in random order.
00096 CRef<CBlast_def_line_set>
00097 s_MakeRandomDeflineSet(const char* const theIds[], size_t array_size)
00098 {
00099     vector<CRef<CBlast_def_line> > defline_v;
00100     for (size_t i=0; i<array_size; ++i)
00101     {
00102         defline_v.push_back(s_MakeDefline(theIds[i], "the title"));
00103     }
00104 
00105     CRandom rnd(1);
00106     for (size_t i=0; i<array_size; ++i)
00107     {
00108         swap(defline_v[i], defline_v[rnd.GetRand(0, defline_v.size()-1)]);
00109     }
00110 
00111     CRef<CBlast_def_line_set> defline_set(new CBlast_def_line_set());
00112     for (size_t index=0; index<array_size; index++)
00113     {
00114         const string fasta_str = defline_v[index]->GetSeqid().back()->AsFastaString();
00115         // cerr << fasta_str << "\n";
00116         defline_set->Set().push_back(defline_v[index]);
00117     }
00118     return defline_set;
00119 }
00120 
00121 
00122 
00123 // WP before NP before YP
00124 BOOST_AUTO_TEST_CASE(SortRefSeqProteinSet1)
00125 {
00126     // theIds has the canonical order
00127     const char* const theIds[] = {
00128         "gi|289223532|ref|WP_003131952.1|",
00129         "gi|15674171|ref|NP_268346.1|",
00130         "gi|116513137|ref|YP_812044.1|",
00131         "gi|125625229|ref|YP_001033712.1|",
00132         "gi|281492845|ref|YP_003354825.1|",
00133         "gi|385831755|ref|YP_005869568.1|",
00134         "gi|13878750|sp|Q9CDN0.1|RS18_LACLA",
00135         "gi|122939895|sp|Q02VU1.1|RS18_LACLS",
00136         "gi|166220956|sp|A2RNZ2.1|RS18_LACLM",
00137         "gi|12725253|gb|AAK06287.1|AE006448_5",
00138         "gi|116108791|gb|ABJ73931.1|",
00139         "gi|124494037|emb|CAL99037.1|",
00140         "gi|281376497|gb|ADA65983.1|",
00141         "gi|300072039|gb|ADJ61439.1|",
00142         "gi|326407763|gb|ADZ64834.1|"
00143     };
00144 
00145     CRef<CBlast_def_line_set> defline_set = s_MakeRandomDeflineSet(theIds, ArraySize(theIds));
00146 
00147     defline_set->SortBySeqIdRank(true);
00148 
00149     int index=0;
00150     ITERATE(CBlast_def_line_set::Tdata, itr, defline_set->Get())
00151     {
00152         const string fasta_str = (*itr)->GetSeqid().back()->AsFastaString();
00153         // cerr << fasta_str << "\n";
00154         string startId(theIds[index]);
00155         BOOST_CHECK_MESSAGE(startId.find(fasta_str) != string::npos, "Error for " << fasta_str);
00156         index++;
00157     }
00158 }
00159 
00160 // WP before YP
00161 BOOST_AUTO_TEST_CASE(SortRefSeqProteinSet2)
00162 {
00163     const char* const theIds[] = {
00164         "gi|446057344|ref|WP_000135199.1|",
00165         "gi|443615715|ref|YP_007379571.1|",
00166         "gi|444353545|ref|YP_007389689.1|",
00167         "gi|448240163|ref|YP_007404216.1|",
00168         "gi|449306713|ref|YP_007439069.1|",
00169         "gi|67472372|sp|P0A7T7.2|RS18_ECOLI",
00170         "gi|67472373|sp|P0A7T8.2|RS18_ECOL6",
00171         "gi|67472374|sp|P0A7T9.2|RS18_ECO57",
00172     };
00173 
00174     CRef<CBlast_def_line_set> defline_set = s_MakeRandomDeflineSet(theIds, ArraySize(theIds));
00175 
00176     defline_set->SortBySeqIdRank(true);
00177 
00178     int index=0;
00179     ITERATE(CBlast_def_line_set::Tdata, itr, defline_set->Get())
00180     {
00181         const string fasta_str = (*itr)->GetSeqid().back()->AsFastaString();
00182         // cerr << fasta_str << "\n";
00183         string startId(theIds[index]);
00184         // cerr << startId << "\n";
00185         BOOST_CHECK_MESSAGE(startId.find(fasta_str) != string::npos, "Error for " << fasta_str);
00186         index++;
00187     }
00188 }
00189 
00190 // NP before XP
00191 BOOST_AUTO_TEST_CASE(SortRefSeqProteinSet3)
00192 {
00193     // theIds has the canonical order
00194     const char* const theIds[] = {
00195         "gi|4757812|ref|NP_004880.1|",
00196         "gi|114614837|ref|XP_001139040.1|",
00197         "gi|426357086|ref|XP_004045879.1|",
00198         "gi|7404340|sp|P56134.3|ATPK_HUMAN",
00199         "gi|3335128|gb|AAC39887.1|",
00200         "gi|3552030|gb|AAC34895.1|",
00201         "gi|48145899|emb|CAG33172.1|",
00202         "gi|49457306|emb|CAG46952.1|",
00203         "gi|51094625|gb|EAL23877.1|",
00204         "gi|119597067|gb|EAW76661.1|"
00205     };
00206 
00207     CRef<CBlast_def_line_set> defline_set = s_MakeRandomDeflineSet(theIds, ArraySize(theIds));
00208 
00209     defline_set->SortBySeqIdRank(true);
00210 
00211     int index=0;
00212     ITERATE(CBlast_def_line_set::Tdata, itr, defline_set->Get())
00213     {
00214         const string fasta_str = (*itr)->GetSeqid().back()->AsFastaString();
00215         // cerr << fasta_str << "\n";
00216         string startId(theIds[index]);
00217         BOOST_CHECK_MESSAGE(startId.find(fasta_str) != string::npos, "Error for " << fasta_str);
00218         index++;
00219     }
00220 }
00221 
00222 // AP after NP
00223 BOOST_AUTO_TEST_CASE(SortRefSeqProteinSet4)
00224 {
00225     // theIds has the canonical order
00226     const char* const theIds[] = {
00227         "gi|9626178|ref|NP_040526.1|",
00228         "gi|56160501|ref|AP_000176.1|",
00229         "gi|56160855|ref|AP_000513.1|"
00230         "gi|139364|sp|P03252.1|PRO_ADE02",
00231         "gi|34810217|pdb|1NLN|A",
00232         "gi|33330456|gb|AAQ10554.1|"
00233     };
00234 
00235     CRef<CBlast_def_line_set> defline_set = s_MakeRandomDeflineSet(theIds, ArraySize(theIds));
00236 
00237     defline_set->SortBySeqIdRank(true);
00238 
00239     int index=0;
00240     ITERATE(CBlast_def_line_set::Tdata, itr, defline_set->Get())
00241     {
00242         const string fasta_str = (*itr)->GetSeqid().back()->AsFastaString();
00243         // cerr << fasta_str << "\n";
00244         string startId(theIds[index]);
00245         BOOST_CHECK_MESSAGE(startId.find(fasta_str) != string::npos, "Error for " << fasta_str);
00246         index++;
00247     }
00248 }
00249 
00250 // NM before XM
00251 BOOST_AUTO_TEST_CASE(SortRefSeqNucleotideSet1)
00252 {
00253     // theIds has the canonical order
00254     const char* const theIds[] = {
00255         "gi|133922597|ref|NM_001083308.1|",
00256         "gi|55621735|ref|XM_526424.1|",
00257         "gi|397465444|ref|XM_003804458.1|",
00258         "gi|61741314|gb|AY858112.1|"
00259     };
00260 
00261     CRef<CBlast_def_line_set> defline_set = s_MakeRandomDeflineSet(theIds, ArraySize(theIds));
00262 
00263     defline_set->SortBySeqIdRank(false);
00264 
00265     int index=0;
00266     ITERATE(CBlast_def_line_set::Tdata, itr, defline_set->Get())
00267     {
00268         const string fasta_str = (*itr)->GetSeqid().back()->AsFastaString();
00269         // cerr << fasta_str << "\n";
00270         string startId(theIds[index]);
00271         BOOST_CHECK_MESSAGE(startId.find(fasta_str) != string::npos, "Error for " << fasta_str);
00272         index++;
00273     }
00274 }
00275 
00276 
00277 BOOST_AUTO_TEST_CASE(Test_GetLeafTaxIds)
00278 {
00279     CBlast_def_line def_line;   // initially empty
00280 
00281     // Test GetLeafTaxIds with nothing in 'links'.
00282     CBlast_def_line::TTaxIds taxids = def_line.GetLeafTaxIds();
00283     BOOST_CHECK_EQUAL(taxids.size(), 0);
00284 
00285     list<int> taxid_list;       // initially empty
00286 
00287     // Test GetLeafTaxIds with values in 'links'.
00288     taxid_list.push_back(200003);
00289     taxid_list.push_back(200002);
00290     taxid_list.push_back(200001);
00291     def_line.ResetTaxid();
00292     def_line.SetLinks() = taxid_list;
00293     taxids = def_line.GetLeafTaxIds();
00294     BOOST_CHECK_EQUAL(taxids.size(), 3);
00295 }
00296 
00297 
00298 BOOST_AUTO_TEST_CASE(Test_SetLeafTaxIds)
00299 {
00300     CBlast_def_line          def_line;      // initially empty
00301     CBlast_def_line::TTaxIds taxid_set;     // initially empty
00302 
00303     // Test SetLeafTaxIds with no taxids.
00304     def_line.SetLeafTaxIds(taxid_set);
00305     CBlast_def_line::TTaxIds taxids = def_line.GetLeafTaxIds();
00306     BOOST_CHECK_EQUAL(taxids.size(), 0);
00307     BOOST_CHECK(!def_line.IsSetLinks());
00308 
00309     // Test SetLeafTaxIds with multiple taxids.
00310     def_line.ResetLinks();
00311     taxid_set.clear();
00312     taxid_set.insert(100002);
00313     taxid_set.insert(100003);
00314     taxid_set.insert(100004);
00315     taxid_set.insert(100005);
00316     def_line.SetLeafTaxIds(taxid_set);
00317     taxids = def_line.GetLeafTaxIds();
00318     BOOST_CHECK_EQUAL(taxid_set.size(), taxids.size());
00319     BOOST_CHECK(def_line.IsSetLinks());
00320 }
Modified on Thu Mar 05 12:50:54 2015 by modify_doxy.py rev. 426318