NCBI C++ ToolKit
seqdb.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

00001 /*  $Id: seqdb.cpp 65709 2014-12-22 21:17:30Z rackerst $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Author:  Kevin Bealer
00027  *
00028  */
00029 
00030 /// @file seqdb.cpp
00031 /// Implementation for the CSeqDB class, the top level class for SeqDB.
00032 
#ifndef SKIP_DOXYGEN_PROCESSING
// Version-control identification string for this source file.  The
// surrounding guard keeps it out of Doxygen processing.
static char const rcsid[] = "$Id: seqdb.cpp 65709 2014-12-22 21:17:30Z rackerst $";
#endif /* SKIP_DOXYGEN_PROCESSING */
00036 
00037 #include <ncbi_pch.hpp>
00038 #include <objtools/blast/seqdb_reader/seqdb.hpp>
00039 #include <util/sequtil/sequtil_convert.hpp>
00040 #include "seqdbimpl.hpp"
00041 #include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp>
00042 #include <map>
00043 #include <string>
00044 
00045 #include <serial/objistr.hpp>
00046 #include <serial/objostr.hpp>
00047 #include <serial/serial.hpp>
00048 #include <serial/objostrasnb.hpp>
00049 #include <serial/objistrasnb.hpp>
00050 
00051 #include <objects/general/Object_id.hpp>
00052 #include <objects/general/User_object.hpp>
00053 #include <objects/general/User_field.hpp>
00054 #include <objects/general/Dbtag.hpp>
00055 
00056 BEGIN_NCBI_SCOPE
00057 
00058 const string CSeqDB::kOidNotFound("OID not found");
00059 
00060 /// Helper function to translate enumerated type to character.
00061 ///
00062 /// @param seqtype
00063 ///   The sequence type (eProtein, eNucleotide, or eUnknown).
00064 /// @return
00065 ///   The sequence type as a char ('p', 'n', or '-').
00066 
00067 static char s_GetSeqTypeChar(CSeqDB::ESeqType seqtype)
00068 {
00069     switch(seqtype) {
00070     case CSeqDB::eProtein:
00071         return 'p';
00072     case CSeqDB::eNucleotide:
00073         return 'n';
00074     case CSeqDB::eUnknown:
00075         return '-';
00076     }
00077 
00078     NCBI_THROW(CSeqDBException,
00079                eArgErr,
00080                "Invalid sequence type specified.");
00081 }
00082 
00083 /// Helper function to build private implementation object.
00084 ///
00085 /// This method builds and returns the object which implements the
00086 /// functionality for the CSeqDB API.  If this method is called with
00087 /// '-' for the sequence data type, protein will be tried first, then
00088 /// nucleotide.  The created object will be returned.  Either
00089 /// kSeqTypeProt for a protein database, kSeqTypeNucl for nucleotide,
00090 /// or kSeqTypeUnkn to less this function try one then the other.
00091 ///
00092 /// @param dbname
00093 ///   A list of database or alias names, seperated by spaces.
00094 /// @param prot_nucl
00095 ///   Specify whether to use protein, nucleotide, or either.
00096 /// @param oid_begin
00097 ///   Iterator will skip OIDs less than this value.  Only OIDs
00098 ///   found in the OID lists (if any) will be returned.
00099 /// @param oid_end
00100 ///   Iterator will return up to (but not including) this OID.
00101 /// @param use_mmap
00102 ///   If kSeqDBMMap is specified (the default), memory mapping is
00103 ///   attempted.  If kSeqDBNoMMap is specified, or memory mapping
00104 ///   fails, this platform does not support it, the less efficient
00105 ///   read and write calls are used instead.
00106 /// @param gi_list
00107 ///   This ID list specifies OIDs and deflines to include.
00108 /// @param neg_list
00109 ///   This negative ID list specifies deflines and OIDs to exclude.
00110 /// @param idset
00111 ///   If set, this specifies IDs to either include or exclude.
00112 /// @return
00113 ///   The CSeqDBImpl object that was created.
00114 
00115 static CSeqDBImpl *
00116 s_SeqDBInit(const string       & dbname,
00117             char                 prot_nucl,
00118             int                  oid_begin,
00119             int                  oid_end,
00120             bool                 use_mmap,
00121             CSeqDBGiList       * gi_list = NULL,
00122             CSeqDBNegativeList * neg_list = NULL,
00123             CSeqDBIdSet          idset = CSeqDBIdSet())
00124 {
00125     CSeqDBImpl * impl = 0;
00126 
00127     if (prot_nucl == '-') {
00128         try {
00129             prot_nucl = 'p';
00130             impl = new CSeqDBImpl(dbname,
00131                                   prot_nucl,
00132                                   oid_begin,
00133                                   oid_end,
00134                                   use_mmap,
00135                                   gi_list,
00136                                   neg_list,
00137                                   idset);
00138         }
00139         catch(CSeqDBException &) {
00140             prot_nucl = 'n';
00141         }
00142     }
00143 
00144     if (! impl) {
00145         impl = new CSeqDBImpl(dbname,
00146                               prot_nucl,
00147                               oid_begin,
00148                               oid_end,
00149                               use_mmap,
00150                               gi_list,
00151                               neg_list,
00152                               idset);
00153     }
00154 
00155     _ASSERT(impl);
00156 
00157     return impl;
00158 }
00159 
00160 CSeqDB::CSeqDB(const string & dbname,
00161                ESeqType       seqtype,
00162                CSeqDBGiList * gi_list)
00163 {
00164     if (dbname.size() == 0) {
00165         NCBI_THROW(CSeqDBException,
00166                    eArgErr,
00167                    "Database name is required.");
00168     }
00169 
00170     char seq_type = s_GetSeqTypeChar(seqtype);
00171 
00172     m_Impl = s_SeqDBInit(dbname,
00173                          seq_type,
00174                          0,
00175                          0,
00176                          true,
00177                          gi_list);
00178 
00179     m_Impl->Verify();
00180 }
00181 
00182 CSeqDB::CSeqDB(const string       & dbname,
00183                ESeqType             seqtype,
00184                CSeqDBNegativeList * nlist)
00185 {
00186     if (dbname.size() == 0) {
00187         NCBI_THROW(CSeqDBException,
00188                    eArgErr,
00189                    "Database name is required.");
00190     }
00191 
00192     m_Impl = s_SeqDBInit(dbname,
00193                          s_GetSeqTypeChar(seqtype),
00194                          0,
00195                          0,
00196                          true,
00197                          NULL,
00198                          nlist);
00199 
00200     m_Impl->Verify();
00201 }
00202 
00203 // This could become the primary constructor for SeqDB, and those
00204 // taking positive and negative lists could be deprecated.  This
00205 // implies refactoring of code using SeqDB, addition of the third
00206 // (string/Seq-id) type IDs to the IdSet, and changes to client code.
00207 // Some non-SeqDB code uses FindOID and other methods of the GI list,
00208 // comparable functionality would need to be added to IdSet().
00209 //
00210 // Before any of that is done, all the SeqDB classes should be made to
00211 // use CSeqDBIdSet instead of using positive and negative lists.  This
00212 // implies widespread changes to CSeqDBIdSet and SeqDB internal code.
00213 //
00214 // I'll leave those changes for another time -- for now I'll just add
00215 // the pieces of framework that seem useful and are implied by the
00216 // current design.
00217 
00218 CSeqDB::CSeqDB(const string & dbname, ESeqType seqtype, CSeqDBIdSet ids)
00219 {
00220     if (dbname.size() == 0) {
00221         NCBI_THROW(CSeqDBException,
00222                    eArgErr,
00223                    "Database name is required.");
00224     }
00225 
00226     CRef<CSeqDBNegativeList> neg;
00227     CRef<CSeqDBGiList> pos;
00228 
00229     if (! ids.Blank()) {
00230         if (ids.IsPositive()) {
00231             pos = ids.GetPositiveList();
00232         } else {
00233             neg = ids.GetNegativeList();
00234         }
00235     }
00236 
00237     m_Impl = s_SeqDBInit(dbname,
00238                          s_GetSeqTypeChar(seqtype),
00239                          0,
00240                          0,
00241                          true,
00242                          pos.GetPointerOrNull(),
00243                          neg.GetPointerOrNull(),
00244                          ids);
00245 
00246     m_Impl->Verify();
00247 }
00248 
00249 CSeqDB::CSeqDB(const vector<string> & dbs,
00250                ESeqType               seqtype,
00251                CSeqDBGiList         * gi_list)
00252 {
00253     string dbname;
00254     SeqDB_CombineAndQuote(dbs, dbname);
00255 
00256     if (dbname.size() == 0) {
00257         NCBI_THROW(CSeqDBException,
00258                    eArgErr,
00259                    "Database name is required.");
00260     }
00261 
00262     m_Impl = s_SeqDBInit(dbname,
00263                          s_GetSeqTypeChar(seqtype),
00264                          0,
00265                          0,
00266                          true,
00267                          gi_list);
00268 
00269     m_Impl->Verify();
00270 }
00271 
00272 CSeqDB::CSeqDB(const string & dbname,
00273                ESeqType       seqtype,
00274                int            oid_begin,
00275                int            oid_end,
00276                bool           use_mmap,
00277                CSeqDBGiList * gi_list)
00278 {
00279     if (dbname.size() == 0) {
00280         NCBI_THROW(CSeqDBException,
00281                    eArgErr,
00282                    "Database name is required.");
00283     }
00284 
00285     m_Impl = s_SeqDBInit(dbname,
00286                          s_GetSeqTypeChar(seqtype),
00287                          oid_begin,
00288                          oid_end,
00289                          use_mmap,
00290                          gi_list);
00291 
00292     m_Impl->Verify();
00293 }
00294 
00295 CSeqDB::CSeqDB(const vector<string> & dbs,
00296                ESeqType               seqtype,
00297                int                    oid_begin,
00298                int                    oid_end,
00299                bool                   use_mmap,
00300                CSeqDBGiList         * gi_list)
00301 {
00302     string dbname;
00303     SeqDB_CombineAndQuote(dbs, dbname);
00304 
00305     if (dbname.size() == 0) {
00306         NCBI_THROW(CSeqDBException,
00307                    eArgErr,
00308                    "Database name is required.");
00309     }
00310 
00311     m_Impl = s_SeqDBInit(dbname,
00312                          s_GetSeqTypeChar(seqtype),
00313                          oid_begin,
00314                          oid_end,
00315                          use_mmap,
00316                          gi_list);
00317 
00318     m_Impl->Verify();
00319 }
00320 
/// Default constructor: creates a default-constructed CSeqDBImpl and
/// runs Verify() on it.
CSeqDB::CSeqDB()
{
    m_Impl = new CSeqDBImpl;
    m_Impl->Verify();
}
00326 
00327 int CSeqDB::GetSeqLength(int oid) const
00328 {
00329     m_Impl->Verify();
00330     int length = m_Impl->GetSeqLength(oid);
00331     m_Impl->Verify();
00332 
00333     return length;
00334 }
00335 
00336 int CSeqDB::GetSeqLengthApprox(int oid) const
00337 {
00338     m_Impl->Verify();
00339     int length = m_Impl->GetSeqLengthApprox(oid);
00340     m_Impl->Verify();
00341 
00342     return length;
00343 }
00344 
00345 CRef<CBlast_def_line_set> CSeqDB::GetHdr(int oid) const
00346 {
00347     m_Impl->Verify();
00348     CRef<CBlast_def_line_set> rv = m_Impl->GetHdr(oid);
00349     m_Impl->Verify();
00350 
00351     return rv;
00352 }
00353 
00354 CSeqDB::ESeqType CSeqDB::GetSequenceType() const
00355 {
00356     switch(m_Impl->GetSeqType()) {
00357     case 'p':
00358         return eProtein;
00359     case 'n':
00360         return eNucleotide;
00361     }
00362 
00363     NCBI_THROW(CSeqDBException,
00364                eArgErr,
00365                "Internal sequence type is not valid.");
00366 }
00367 
/// Forward to CSeqDBImpl::GetTaxIDs (GI-to-taxid map overload) for
/// @a oid, verifying the implementation before and after the call.
/// The meaning of @a persist is defined by the implementation.
void CSeqDB::GetTaxIDs(int             oid,
                       map<TGi, int> & gi_to_taxid,
                       bool            persist) const
{
    m_Impl->Verify();
    m_Impl->GetTaxIDs(oid, gi_to_taxid, persist);
    m_Impl->Verify();
}
00376 
/// Forward to CSeqDBImpl::GetTaxIDs (taxid vector overload) for
/// @a oid, verifying the implementation before and after the call.
/// The meaning of @a persist is defined by the implementation.
void CSeqDB::GetTaxIDs(int           oid,
                       vector<int> & taxids,
                       bool          persist) const
{
    m_Impl->Verify();
    m_Impl->GetTaxIDs(oid, taxids, persist);
    m_Impl->Verify();
}
00385 
/// Forward to CSeqDBImpl::GetLeafTaxIDs (GI-to-taxid-set map overload)
/// for @a oid, verifying the implementation before and after the call.
/// The meaning of @a persist is defined by the implementation.
void CSeqDB::GetLeafTaxIDs(
        int                  oid,
        map<TGi, set<int> >& gi_to_taxid_set,
        bool                 persist
) const
{
    m_Impl->Verify();
    m_Impl->GetLeafTaxIDs(oid, gi_to_taxid_set, persist);
    m_Impl->Verify();
}
00396 
/// Forward to CSeqDBImpl::GetLeafTaxIDs (taxid vector overload) for
/// @a oid, verifying the implementation before and after the call.
/// The meaning of @a persist is defined by the implementation.
void CSeqDB::GetLeafTaxIDs(
        int          oid,
        vector<int>& taxids,
        bool         persist
) const
{
    m_Impl->Verify();
    m_Impl->GetLeafTaxIDs(oid, taxids, persist);
    m_Impl->Verify();
}
00407 
00408 CRef<CBioseq>
00409 CSeqDB::GetBioseq(int oid, int target_gi, const CSeq_id * target_id) const
00410 {
00411     m_Impl->Verify();
00412     CRef<CBioseq> rv = m_Impl->GetBioseq(oid, target_gi, target_id, true);
00413     m_Impl->Verify();
00414 
00415     return rv;
00416 }
00417 
00418 CRef<CBioseq>
00419 CSeqDB::GetBioseqNoData(int oid, int target_gi, const CSeq_id * target_id) const
00420 {
00421     m_Impl->Verify();
00422     CRef<CBioseq> rv = m_Impl->GetBioseq(oid, target_gi, target_id, false);
00423     m_Impl->Verify();
00424 
00425     return rv;
00426 }
00427 
/// Hand @a buffer back to the implementation via
/// CSeqDBImpl::RetSequence, verifying the implementation before and
/// after the call.
void CSeqDB::RetSequence(const char ** buffer) const
{
    m_Impl->Verify();
    m_Impl->RetSequence(buffer);
    m_Impl->Verify();
}
00434 
00435 int CSeqDB::GetSequence(int oid, const char ** buffer) const
00436 {
00437     m_Impl->Verify();
00438     int rv = m_Impl->GetSequence(oid, buffer);
00439     m_Impl->Verify();
00440 
00441     return rv;
00442 }
00443 
00444 CRef<CSeq_data> CSeqDB::GetSeqData(int     oid,
00445                                    TSeqPos begin,
00446                                    TSeqPos end) const
00447 {
00448     m_Impl->Verify();
00449     CRef<CSeq_data> rv = m_Impl->GetSeqData(oid, begin, end);
00450     m_Impl->Verify();
00451 
00452     return rv;
00453 }
00454 
00455 int CSeqDB::GetAmbigSeq(int oid, const char ** buffer, int nucl_code) const
00456 {
00457     m_Impl->Verify();
00458     int rv = m_Impl->GetAmbigSeq(oid,
00459                                  (char **)buffer,
00460                                  nucl_code,
00461                                  0,
00462                                  (ESeqDBAllocType) 0);
00463     m_Impl->Verify();
00464 
00465     return rv;
00466 }
00467 
/// Hand @a buffer back to the implementation via
/// CSeqDBImpl::RetAmbigSeq, verifying the implementation before and
/// after the call.
void CSeqDB::RetAmbigSeq(const char ** buffer) const
{
    m_Impl->Verify();
    m_Impl->RetAmbigSeq(buffer);
    m_Impl->Verify();
}
00474 
00475 int CSeqDB::GetAmbigSeq(int           oid,
00476                         const char ** buffer,
00477                         int           nucl_code,
00478                         int           begin_offset,
00479                         int           end_offset) const
00480 {
00481     m_Impl->Verify();
00482 
00483     SSeqDBSlice region(begin_offset, end_offset);
00484 
00485     int rv = m_Impl->GetAmbigSeq(oid,
00486                                  (char **)buffer,
00487                                  nucl_code,
00488                                  & region,
00489                                  (ESeqDBAllocType) 0);
00490 
00491     m_Impl->Verify();
00492 
00493     return rv;
00494 }
00495 
00496 int CSeqDB::GetAmbigSeqAlloc(int             oid,
00497                              char         ** buffer,
00498                              int             nucl_code,
00499                              ESeqDBAllocType strategy,
00500                              TSequenceRanges *masks) const
00501 {
00502     m_Impl->Verify();
00503 
00504     if ((strategy != eMalloc) && (strategy != eNew)) {
00505         NCBI_THROW(CSeqDBException,
00506                    eArgErr,
00507                    "Invalid allocation strategy specified.");
00508     }
00509 
00510     int rv = m_Impl->GetAmbigSeq(oid, buffer, nucl_code, 0, strategy, masks);
00511 
00512     m_Impl->Verify();
00513 
00514     return rv;
00515 }
00516 
00517 string CSeqDB::GetTitle() const
00518 {
00519     return m_Impl->GetTitle();
00520 }
00521 
00522 string CSeqDB::GetDate() const
00523 {
00524     return m_Impl->GetDate();
00525 }
00526 
00527 CTime
00528 CSeqDB::GetDate(const string   & dbname,
00529                 ESeqType         seqtype)
00530 {
00531     vector<string> vols;
00532     CSeqDB::FindVolumePaths(dbname, seqtype, vols);
00533     string fmt = "b d, Y  H:m P";
00534     CTime retv;
00535     char date[128];
00536     ITERATE(vector<string>, vol, vols) {
00537         string fn = *vol + ((seqtype == CSeqDB::eProtein)? ".pin" : ".nin");
00538         ifstream f(fn.c_str(), ios::in|ios::binary);
00539         char s[4];   // size of next chunk
00540         if (f.is_open()) {
00541             f.seekg(8, ios::beg);
00542             f.read(s, 4);
00543             Uint4 offset = SeqDB_GetStdOrd((Uint4 *) s);
00544             f.seekg(offset, ios::cur);
00545             f.read(s, 4);
00546             offset = SeqDB_GetStdOrd((Uint4 *) s);
00547             f.read(date, offset);
00548             CTime d(string(date), fmt);
00549             if (retv.IsEmpty() || d > retv) {
00550                 retv = d;
00551             }
00552         }
00553     }
00554     return retv;
00555 }
00556 
00557 int CSeqDB::GetNumSeqs() const
00558 {
00559     return m_Impl->GetNumSeqs();
00560 }
00561 
00562 int CSeqDB::GetNumSeqsStats() const
00563 {
00564     return m_Impl->GetNumSeqsStats();
00565 }
00566 
00567 int CSeqDB::GetNumOIDs() const
00568 {
00569     return m_Impl->GetNumOIDs();
00570 }
00571 
00572 Uint8 CSeqDB::GetTotalLength() const
00573 {
00574     return m_Impl->GetTotalLength();
00575 }
00576 
00577 Uint8 CSeqDB::GetExactTotalLength()
00578 {
00579     return m_Impl->GetExactTotalLength();
00580 }
00581 
00582 Uint8 CSeqDB::GetTotalLengthStats() const
00583 {
00584     return m_Impl->GetTotalLengthStats();
00585 }
00586 
00587 Uint8 CSeqDB::GetVolumeLength() const
00588 {
00589     return m_Impl->GetVolumeLength();
00590 }
00591 
00592 int CSeqDB::GetMaxLength() const
00593 {
00594     return m_Impl->GetMaxLength();
00595 }
00596 
00597 int CSeqDB::GetMinLength() const
00598 {
00599     return m_Impl->GetMinLength();
00600 }
00601 
00602 CSeqDB::~CSeqDB()
00603 {
00604     m_Impl->Verify();
00605 
00606     if (m_Impl)
00607         delete m_Impl;
00608 }
00609 
/// Return an iterator over this database constructed with starting
/// OID 0.  (The CSeqDBIter constructor itself advances to an OID
/// accepted by CheckOrFindOID.)
CSeqDBIter CSeqDB::Begin() const
{
    return CSeqDBIter(this, 0);
}
00614 
00615 bool CSeqDB::CheckOrFindOID(int & oid) const
00616 {
00617     m_Impl->Verify();
00618     bool rv = m_Impl->CheckOrFindOID(oid);
00619     m_Impl->Verify();
00620 
00621     return rv;
00622 }
00623 
00624 
00625 CSeqDB::EOidListType
00626 CSeqDB::GetNextOIDChunk(int         & begin,
00627                         int         & end,
00628                         int         size,
00629                         vector<int> & lst,
00630                         int         * state)
00631 {
00632     m_Impl->Verify();
00633 
00634     CSeqDB::EOidListType rv =
00635         m_Impl->GetNextOIDChunk(begin, end, size, lst, state);
00636 
00637     m_Impl->Verify();
00638 
00639     return rv;
00640 }
00641 
/// Forward to CSeqDBImpl::ResetInternalChunkBookmark.
void CSeqDB::ResetInternalChunkBookmark()
{
    m_Impl->ResetInternalChunkBookmark();
}
00646 
00647 const string & CSeqDB::GetDBNameList() const
00648 {
00649     return m_Impl->GetDBNameList();
00650 }
00651 
00652 list< CRef<CSeq_id> > CSeqDB::GetSeqIDs(int oid) const
00653 {
00654     m_Impl->Verify();
00655 
00656     list< CRef<CSeq_id> > rv = m_Impl->GetSeqIDs(oid);
00657 
00658     m_Impl->Verify();
00659 
00660     return rv;
00661 }
00662 
00663 int CSeqDB::GetSeqGI(int oid) const
00664 {
00665     return m_Impl->GetSeqGI(oid);
00666 }
00667 
00668 bool CSeqDB::PigToOid(int pig, int & oid) const
00669 {
00670     m_Impl->Verify();
00671     bool rv = m_Impl->PigToOid(pig, oid);
00672     m_Impl->Verify();
00673 
00674     return rv;
00675 }
00676 
00677 bool CSeqDB::OidToPig(int oid, int & pig) const
00678 {
00679     m_Impl->Verify();
00680     bool rv = m_Impl->OidToPig(oid, pig);
00681     m_Impl->Verify();
00682 
00683     return rv;
00684 }
00685 
00686 bool CSeqDB::TiToOid(Int8 ti, int & oid) const
00687 {
00688     m_Impl->Verify();
00689     bool rv = m_Impl->TiToOid(ti, oid);
00690     m_Impl->Verify();
00691 
00692     return rv;
00693 }
00694 
00695 bool CSeqDB::GiToOid(int gi, int & oid) const
00696 {
00697     m_Impl->Verify();
00698     bool rv = m_Impl->GiToOid(gi, oid);
00699     m_Impl->Verify();
00700 
00701     return rv;
00702 }
00703 
00704 bool CSeqDB::OidToGi(int oid, int & gi) const
00705 {
00706     m_Impl->Verify();
00707     bool rv = m_Impl->OidToGi(oid, gi);
00708     m_Impl->Verify();
00709 
00710     return rv;
00711 }
00712 
00713 bool CSeqDB::PigToGi(int pig, int & gi) const
00714 {
00715     m_Impl->Verify();
00716     bool rv = false;
00717 
00718     int oid(0);
00719 
00720     if (m_Impl->PigToOid(pig, oid)) {
00721         rv = m_Impl->OidToGi(oid, gi);
00722     }
00723     m_Impl->Verify();
00724 
00725     return rv;
00726 }
00727 
00728 bool CSeqDB::GiToPig(int gi, int & pig) const
00729 {
00730     m_Impl->Verify();
00731     bool rv = false;
00732 
00733     int oid(0);
00734 
00735     if (m_Impl->GiToOid(gi, oid)) {
00736         rv = m_Impl->OidToPig(oid, pig);
00737     }
00738 
00739     m_Impl->Verify();
00740 
00741     return rv;
00742 }
00743 
00744 void CSeqDB::AccessionToOids(const string & acc, vector<int> & oids) const
00745 {
00746     m_Impl->Verify();
00747     m_Impl->AccessionToOids(acc, oids);
00748 
00749     // If we have a numeric ID and the search failed, try to look it
00750     // up as a GI (but not as a PIG or TI).  Due to the presence of
00751     // PDB ids like "pdb|1914|a", the faster GitToOid is not done
00752     // first (unless the caller does so.)
00753 
00754     if (oids.empty()) {
00755         try {
00756             int gi = NStr::StringToInt(acc, NStr::fConvErr_NoThrow);
00757             int oid(-1);
00758 
00759             if (gi > 0 && GiToOid(gi, oid)) {
00760                 int oid0 = oid;
00761                 if (m_Impl->CheckOrFindOID(oid) && (oid==oid0)) {
00762                     oids.push_back(oid);
00763                 }
00764             }
00765         }
00766         catch(...) {
00767         }
00768     }
00769 
00770     m_Impl->Verify();
00771 }
00772 
/// Forward to CSeqDBImpl::SeqidToOids with true as the third argument
/// (SeqidToOid, by contrast, passes false).  The implementation is
/// verified before and after the call.
void CSeqDB::SeqidToOids(const CSeq_id & seqid, vector<int> & oids) const
{
    m_Impl->Verify();
    m_Impl->SeqidToOids(seqid, oids, true);
    m_Impl->Verify();
}
00779 
00780 bool CSeqDB::SeqidToOid(const CSeq_id & seqid, int & oid) const
00781 {
00782     m_Impl->Verify();
00783     bool rv = false;
00784 
00785     oid = -1;
00786 
00787     vector<int> oids;
00788     m_Impl->SeqidToOids(seqid, oids, false);
00789 
00790     if (! oids.empty()) {
00791         rv = true;
00792         oid = oids[0];
00793     }
00794 
00795     m_Impl->Verify();
00796 
00797     return rv;
00798 }
00799 
/// Forward @a membound to CSeqDBImpl::SetMemoryBound.
/// @a slice_size is accepted but not used by this function.
void CSeqDB::SetMemoryBound(Uint8 membound, Uint8 slice_size)
{
    m_Impl->SetMemoryBound(membound);
}
00804 
00805 int CSeqDB::GetOidAtOffset(int first_seq, Uint8 residue) const
00806 {
00807     m_Impl->Verify();
00808     int rv = m_Impl->GetOidAtOffset(first_seq, residue);
00809     m_Impl->Verify();
00810 
00811     return rv;
00812 }
00813 
00814 CSeqDBIter::CSeqDBIter(const CSeqDB * db, int oid)
00815     : m_DB    (db),
00816       m_OID   (oid),
00817       m_Data  (0),
00818       m_Length((int) -1)
00819 {
00820     if (m_DB->CheckOrFindOID(m_OID)) {
00821         x_GetSeq();
00822     }
00823 }
00824 
00825 CSeqDBIter::CSeqDBIter(const CSeqDBIter & other)
00826     : m_DB    (other.m_DB),
00827       m_OID   (other.m_OID),
00828       m_Data  (0),
00829       m_Length((int) -1)
00830 {
00831     if (m_DB->CheckOrFindOID(m_OID)) {
00832         x_GetSeq();
00833     }
00834 }
00835 
/// Copy one iterator to another.
///
/// The sequence currently held is released with x_RetSeq first.  The
/// database pointer and OID are then copied from @a other, the data
/// pointer and length are reset, and the sequence is fetched again
/// with x_GetSeq if CheckOrFindOID accepts the OID.
CSeqDBIter & CSeqDBIter::operator =(const CSeqDBIter & other)
{
    // Release before overwriting the members that describe what is held.
    x_RetSeq();

    m_DB = other.m_DB;
    m_OID = other.m_OID;
    m_Data = 0;
    m_Length = -1;

    if (m_DB->CheckOrFindOID(m_OID)) {
        x_GetSeq();
    }

    return *this;
}
00852 
00853 CSeqDBIter & CSeqDBIter::operator++()
00854 {
00855     x_RetSeq();
00856 
00857     ++m_OID;
00858 
00859     if (m_DB->CheckOrFindOID(m_OID)) {
00860         x_GetSeq();
00861     } else {
00862         m_Length = -1;
00863     }
00864 
00865     return *this;
00866 }
00867 
00868 CRef<CBioseq>
00869 CSeqDB::GiToBioseq(int gi) const
00870 {
00871     m_Impl->Verify();
00872 
00873     CRef<CBioseq> bs;
00874     int oid(0);
00875 
00876     if (m_Impl->GiToOid(gi, oid)) {
00877         bs = m_Impl->GetBioseq(oid, gi, NULL, true);
00878     }
00879 
00880     m_Impl->Verify();
00881 
00882     return bs;
00883 }
00884 
00885 CRef<CBioseq>
00886 CSeqDB::PigToBioseq(int pig) const
00887 {
00888     m_Impl->Verify();
00889 
00890     int oid(0);
00891     CRef<CBioseq> bs;
00892 
00893     if (m_Impl->PigToOid(pig, oid)) {
00894         bs = m_Impl->GetBioseq(oid, 0, NULL, true);
00895     }
00896 
00897     m_Impl->Verify();
00898 
00899     return bs;
00900 }
00901 
00902 CRef<CBioseq>
00903 CSeqDB::SeqidToBioseq(const CSeq_id & seqid) const
00904 {
00905     m_Impl->Verify();
00906 
00907     vector<int> oids;
00908     CRef<CBioseq> bs;
00909 
00910     m_Impl->SeqidToOids(seqid, oids, false);
00911 
00912     if (! oids.empty()) {
00913         bs = m_Impl->GetBioseq(oids[0], 0, &seqid, true);
00914     }
00915 
00916     m_Impl->Verify();
00917 
00918     return bs;
00919 }
00920 
00921 void
00922 CSeqDB::FindVolumePaths(const string   & dbname,
00923                         ESeqType         seqtype,
00924                         vector<string> & paths,
00925                         vector<string> * alias_paths,
00926                         bool             recursive,
00927                         bool             expand_links)
00928 {
00929     if (seqtype == CSeqDB::eProtein) {
00930         CSeqDBImpl::FindVolumePaths(dbname, 'p', paths, alias_paths, recursive, expand_links);
00931     } else if (seqtype == CSeqDB::eNucleotide) {
00932         CSeqDBImpl::FindVolumePaths(dbname, 'n', paths, alias_paths, recursive, expand_links);
00933     } else {
00934         try {
00935             CSeqDBImpl::FindVolumePaths(dbname, 'p', paths, alias_paths, recursive, expand_links);
00936         }
00937         catch(...) {
00938             CSeqDBImpl::FindVolumePaths(dbname, 'n', paths, alias_paths, recursive, expand_links);
00939         }
00940     }
00941 }
00942 
/// Forward to the implementation's FindVolumePaths(paths, recursive)
/// for this open database, verifying the implementation before and
/// after the call.
void
CSeqDB::FindVolumePaths(vector<string> & paths, bool recursive) const
{
    m_Impl->Verify();
    m_Impl->FindVolumePaths(paths, recursive);
    m_Impl->Verify();
}
00950 
00951 void
00952 CSeqDB::GetGis(int oid, vector<int> & gis, bool append) const
00953 {
00954     m_Impl->Verify();
00955 
00956     // This could be done a little faster at a lower level, but not
00957     // necessarily by too much.  If this operation is important to
00958     // performance, that decision can be revisited.
00959 
00960     list< CRef<CSeq_id> > seqids = GetSeqIDs(oid);
00961 
00962     if (! append) {
00963         gis.clear();
00964     }
00965 
00966     ITERATE(list< CRef<CSeq_id> >, seqid, seqids) {
00967         if ((**seqid).IsGi()) {
00968             gis.push_back(GI_TO(int, (**seqid).GetGi()));
00969         }
00970     }
00971 
00972     m_Impl->Verify();
00973 }
00974 
/// Forward @a oid_begin and @a oid_end to
/// CSeqDBImpl::SetIterationRange.
void CSeqDB::SetIterationRange(int oid_begin, int oid_end)
{
    m_Impl->SetIterationRange(oid_begin, oid_end);
}
00979 
/// Forward to CSeqDBImpl::GetAliasFileValues with @a afv as the
/// output argument, verifying the implementation before and after the
/// call.
void CSeqDB::GetAliasFileValues(TAliasFileValues & afv)
{
    m_Impl->Verify();
    m_Impl->GetAliasFileValues(afv);
    m_Impl->Verify();
}
00986 
/// Forward to the static CSeqDBImpl::GetTaxInfo for @a taxid, with
/// @a info as the output argument.  No CSeqDB instance state is used.
void CSeqDB::GetTaxInfo(int taxid, SSeqDBTaxInfo & info)
{
    CSeqDBImpl::GetTaxInfo(taxid, info);
}
00991 
/// Forward to CSeqDBImpl::GetTotals with all arguments unchanged,
/// verifying the implementation before and after the call.
/// @a oid_count and @a total_length are passed through as pointers for
/// the implementation to fill in.
void CSeqDB::GetTotals(ESummaryType   sumtype,
                       int          * oid_count,
                       Uint8        * total_length,
                       bool           use_approx) const
{
    m_Impl->Verify();
    m_Impl->GetTotals(sumtype, oid_count, total_length, use_approx);
    m_Impl->Verify();
}
01001 
01002 const CSeqDBGiList * CSeqDB::GetGiList() const
01003 {
01004     return m_Impl->GetGiList();
01005 }
01006 
01007 CSeqDBIdSet CSeqDB::GetIdSet() const
01008 {
01009     return m_Impl->GetIdSet();
01010 }
01011 
/// Forward @a bytes to the static CSeqDBImpl::SetDefaultMemoryBound.
/// No CSeqDB instance state is used.
void CSeqDB::SetDefaultMemoryBound(Uint8 bytes)
{
    CSeqDBImpl::SetDefaultMemoryBound(bytes);
}
01016 
01017 void CSeqDB::GetSequenceAsString(int      oid,
01018                                  string & output,
01019                                  TSeqRange range /* = TSeqRange() */) const
01020 {
01021     CSeqUtil::ECoding code_to = ((GetSequenceType() == CSeqDB::eProtein)
01022                                  ? CSeqUtil::e_Iupacaa
01023                                  : CSeqUtil::e_Iupacna);
01024 
01025     GetSequenceAsString(oid, code_to, output, range);
01026 }
01027 
01028 void CSeqDB::GetSequenceAsString(int                 oid,
01029                                  CSeqUtil::ECoding   coding,
01030                                  string            & output,
01031                                  TSeqRange range /* = TSeqRange() */) const
01032 {
01033     output.erase();
01034 
01035     string raw;
01036     const char * buffer = 0;
01037     int length = 0;
01038 
01039     // Protein dbs ignore encodings, always returning ncbistdaa.
01040     if (range.NotEmpty()) {
01041         length = GetAmbigSeq(oid, & buffer, kSeqDBNuclNcbiNA8,
01042                              range.GetFrom(), range.GetToOpen());
01043     } else {
01044         length = GetAmbigSeq(oid, & buffer, kSeqDBNuclNcbiNA8);
01045     }
01046 
01047     try {
01048         raw.assign(buffer, length);
01049     }
01050     catch(...) {
01051         RetAmbigSeq(& buffer);
01052         throw;
01053     }
01054     RetAmbigSeq(& buffer);
01055 
01056     CSeqUtil::ECoding code_from = ((GetSequenceType() == CSeqDB::eProtein)
01057                                    ? CSeqUtil::e_Ncbistdaa
01058                                    : CSeqUtil::e_Ncbi8na);
01059 
01060     string result;
01061 
01062     if (code_from == coding) {
01063         result.swap(raw);
01064     } else {
01065         CSeqConvert::Convert(raw,
01066                              code_from,
01067                              0,
01068                              length,
01069                              result,
01070                              coding);
01071     }
01072 
01073     output.swap(result);
01074 }
01075 
01076 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION  > 550)) && \
01077      (!defined(NCBI_COMPILER_MIPSPRO)) )
/// List column titles.
///
/// Forwards to the implementation object's ListColumns().
///
/// @param titles  Vector handed to the implementation to be filled in.
void CSeqDB::ListColumns(vector<string> & titles)
{
    m_Impl->ListColumns(titles);
}
01082 
/// Look up a column ID by title.
///
/// @param title  Column title, forwarded to the implementation.
/// @return The value returned by the implementation's GetColumnId().
int CSeqDB::GetColumnId(const string & title)
{
    return m_Impl->GetColumnId(title);
}
01087 
/// Get the meta data map for a column.
///
/// @param column_id  Column identifier, forwarded to the implementation.
/// @return Reference to the map returned by the implementation's
///         GetColumnMetaData(column_id).
const map<string,string> &
CSeqDB::GetColumnMetaData(int column_id)
{
    return m_Impl->GetColumnMetaData(column_id);
}
01093 
01094 const string & CSeqDB::GetColumnValue(int column_id, const string & key)
01095 {
01096     static string mt;
01097     return SeqDB_MapFind(GetColumnMetaData(column_id), key, mt);
01098 }
01099 
/// Get the meta data map for a column within a specific volume.
///
/// @param column_id  Column identifier, forwarded to the implementation.
/// @param volname    Volume name, forwarded to the implementation.
/// @return Reference to the map returned by the implementation's
///         GetColumnMetaData(column_id, volname).
const map<string,string> &
CSeqDB::GetColumnMetaData(int            column_id,
                          const string & volname)
{
    return m_Impl->GetColumnMetaData(column_id, volname);
}
01106 
/// Fetch the blob stored in a column for a given OID.
///
/// Forwards to the implementation's GetColumnBlob(), always passing `true`
/// as its third argument (the meaning of that flag is defined by the
/// implementation and is not visible here).
///
/// @param col_id  Column identifier.
/// @param oid     Ordinal ID of the sequence.
/// @param blob    Blob object handed to the implementation to be filled in.
void CSeqDB::GetColumnBlob(int            col_id,
                           int            oid,
                           CBlastDbBlob & blob)
{
    m_Impl->GetColumnBlob(col_id, oid, true, blob);
}
01113 
/// Get the available masking algorithm IDs.
///
/// @param algorithms  Vector handed to the implementation's
///                    GetAvailableMaskAlgorithms() to be filled in.
void CSeqDB::GetAvailableMaskAlgorithms(vector<int> & algorithms)
{
    m_Impl->GetAvailableMaskAlgorithms(algorithms);
}
01118 
/// Look up a masking algorithm ID by name.
///
/// @param algo_name  Algorithm name, forwarded to the implementation.
/// @return The value returned by the implementation's GetMaskAlgorithmId().
int CSeqDB::GetMaskAlgorithmId(const string &algo_name) const
{
    return m_Impl->GetMaskAlgorithmId(algo_name);
}
01123 
/// Get a textual description of the available masking algorithms.
///
/// @return The string returned by the implementation's
///         GetAvailableMaskAlgorithmDescriptions().
string CSeqDB::GetAvailableMaskAlgorithmDescriptions()
{
    return m_Impl->GetAvailableMaskAlgorithmDescriptions();
}
01128 
01129 vector<int> CSeqDB::ValidateMaskAlgorithms(const vector<int>& algorithm_ids)
01130 {
01131     vector<int> invalid_algo_ids, available_algo_ids;
01132     GetAvailableMaskAlgorithms(available_algo_ids);
01133     invalid_algo_ids.reserve(algorithm_ids.size());
01134     if (available_algo_ids.empty()) {
01135         copy(algorithm_ids.begin(), algorithm_ids.end(),
01136              back_inserter(invalid_algo_ids));
01137         return invalid_algo_ids;
01138     }
01139 
01140     ITERATE(vector<int>, itr, algorithm_ids) {
01141         vector<int>::const_iterator pos = find(available_algo_ids.begin(),
01142                                                available_algo_ids.end(), *itr);
01143         if (pos == available_algo_ids.end()) {
01144             invalid_algo_ids.push_back(*itr);
01145         }
01146     }
01147     return invalid_algo_ids;
01148 }
01149 
01150 void CSeqDB::GetMaskAlgorithmDetails(int                 algorithm_id,
01151                                      objects::EBlast_filter_program & program,
01152                                      string            & program_name,
01153                                      string            & algo_opts)
01154 {
01155     string sid;
01156     m_Impl->GetMaskAlgorithmDetails(algorithm_id, sid, program_name,
01157                                     algo_opts);
01158     Int4 id(0);
01159     NStr::StringToNumeric(sid, &id, NStr::fConvErr_NoThrow, 10);
01160     program = (objects::EBlast_filter_program)id;
01161 }
01162 
/// Get the details of a masking algorithm, with the program as a string.
///
/// Forwards all arguments to the implementation's GetMaskAlgorithmDetails().
///
/// @param algorithm_id  Algorithm to describe.
/// @param program       Filled in by the implementation.
/// @param program_name  Filled in by the implementation.
/// @param algo_opts     Filled in by the implementation.
void CSeqDB::GetMaskAlgorithmDetails(int                 algorithm_id,
                                     string            & program,
                                     string            & program_name,
                                     string            & algo_opts)
{
    m_Impl->GetMaskAlgorithmDetails(algorithm_id, program, program_name,
                                    algo_opts);
}
01171 
/// Get masking data for one sequence and one algorithm.
///
/// Forwards to the implementation's GetMaskData().
///
/// @param oid      Ordinal ID of the sequence.
/// @param algo_id  Masking algorithm identifier.
/// @param ranges   Range container handed to the implementation.
void CSeqDB::GetMaskData(int                 oid,
                         int                 algo_id,
                         TSequenceRanges   & ranges)
{
    m_Impl->GetMaskData(oid, algo_id, ranges);
}
01178 
01179 #endif
01180 
01181 
/// Forward a garbage collection request to the implementation object.
void CSeqDB::GarbageCollect(void)
{
    m_Impl->GarbageCollect();
}
01186 
/// Set the offset ranges for a sequence.
///
/// Calls Verify() on the implementation object, forwards all arguments to
/// its SetOffsetRanges(), and calls Verify() again afterwards.
///
/// @param oid            Ordinal ID of the sequence.
/// @param offset_ranges  Ranges to apply, forwarded unchanged.
/// @param append_ranges  Flag forwarded unchanged to the implementation.
/// @param cache_data     Flag forwarded unchanged to the implementation.
void CSeqDB::SetOffsetRanges(int                        oid,
                             const CSeqDB::TRangeList & offset_ranges,
                             bool                       append_ranges,
                             bool                       cache_data)
{
    m_Impl->Verify();

    m_Impl->SetOffsetRanges(oid,
                            offset_ranges,
                            append_ranges,
                            cache_data);

    m_Impl->Verify();
}
01201 
01202 void CSeqDB::RemoveOffsetRanges(int oid)
01203 {
01204     static TRangeList empty;
01205     SetOffsetRanges(oid, empty, false, false);
01206 }
01207 
/// Forward a request to flush the offset range cache to the implementation
/// object.
void CSeqDB::FlushOffsetRangeCache()
{
    m_Impl->FlushOffsetRangeCache();
}
01212 
/// Set the number of threads.
///
/// Calls Verify() on the implementation object and then forwards both
/// arguments to its SetNumberOfThreads().  Unlike GetTotals() and
/// SetOffsetRanges() in this file, Verify() is not called again afterwards.
///
/// @param num_threads  Thread count, forwarded unchanged.
/// @param force_mt     Flag forwarded unchanged to the implementation.
void CSeqDB::SetNumberOfThreads(int num_threads, bool force_mt)
{
    m_Impl->Verify();

    m_Impl->SetNumberOfThreads(num_threads, force_mt);
}
01219 
01220 string CSeqDB::ESeqType2String(ESeqType type)
01221 {
01222     string retval("Unknown");
01223     switch (type) {
01224     case eProtein: retval.assign("Protein"); break;
01225     case eNucleotide: retval.assign("Nucleotide"); break;
01226     case eUnknown:
01227     default: break;
01228     }
01229     return retval;
01230 }
01231 
/// Return the database search path.
///
/// @return The string produced by the static
///         CSeqDBAtlas::GenerateSearchPath().
string CSeqDB::GenerateSearchPath()
{
    return CSeqDBAtlas::GenerateSearchPath();
}
01236 
01237 /// Functor class for FindFilesInDir
01238 class CBlastDbFinder {
01239 public:
01240     void operator() (CDirEntry& de) {
01241         const string& extn = de.GetPath().substr(de.GetPath().length() - 3, 1);
01242         SSeqDBInitInfo value;
01243         // rm extension
01244         value.m_BlastDbName = de.GetPath().substr(0, de.GetPath().length() - 4);
01245         CNcbiOstrstream oss;
01246         // Needed for escaping spaces
01247         oss << "\"" << value.m_BlastDbName << "\"";
01248         value.m_BlastDbName = CNcbiOstrstreamToString(oss);
01249         value.m_MoleculeType =
01250             (extn == "n" ? CSeqDB::eNucleotide : CSeqDB::eProtein);
01251         m_DBs.push_back(value);
01252     }
01253 
01254     vector<SSeqDBInitInfo> m_DBs;
01255 
01256     /// Auxiliary function to get the original file name found by this object
01257     string GetFileName(size_t idx) {
01258         SSeqDBInitInfo& info = m_DBs[idx];
01259         string retval = NStr::Replace(info.m_BlastDbName, "\"", kEmptyStr);
01260         if (info.m_MoleculeType == CSeqDB::eNucleotide) {
01261             string alias = retval + ".nal", index = retval + ".nin";
01262             retval = (CFile(alias).Exists() ? alias : index);
01263         } else {
01264             string alias = retval + ".pal", index = retval + ".pin";
01265             retval = (CFile(alias).Exists() ? alias : index);
01266         }
01267         return retval;
01268     }
01269 };
01270 
01271 /** Functor object for s_RemoveAliasComponents where the path name is matched
01272  * in SSeqDBInitInfo */
01273 class PathFinder {
01274 public:
01275     PathFinder(const string& p) : m_Path(p) {}
01276     bool operator() (const SSeqDBInitInfo& value) const {
01277         return (NStr::Find(value.m_BlastDbName, m_Path) != NPOS);
01278     }
01279 
01280 private:
01281     string m_Path;
01282 };
01283 
01284 static void s_RemoveAliasComponents(CBlastDbFinder& finder)
01285 {
01286     set<string> dbs2remove;
01287     for (size_t i = 0; i < finder.m_DBs.size(); i++) {
01288         string path = finder.GetFileName(i);
01289         if (path[path.size()-1] != 'l') { // not an alias file
01290             continue;
01291         }
01292         CNcbiIfstream in(path.c_str());
01293         if (!in) {
01294             continue;
01295         }
01296         string line;
01297         while (getline(in, line)) {
01298             if (NStr::StartsWith(line, "DBLIST")) {
01299                 vector<string> tokens;
01300                 NStr::Tokenize(line, " ", tokens, NStr::eMergeDelims);
01301                 for (size_t j = 1; j < tokens.size(); j++) {
01302                     dbs2remove.insert(tokens[j]);
01303                 }
01304             }
01305         }
01306     }
01307 
01308     ITERATE(set<string>, i, dbs2remove) {
01309         finder.m_DBs.erase(remove_if(finder.m_DBs.begin(), finder.m_DBs.end(),
01310                                      PathFinder(*i)),
01311                            finder.m_DBs.end());
01312     }
01313 }
01314 
01315 vector<SSeqDBInitInfo>
01316 FindBlastDBs(const string& path, const string& dbtype, bool recurse,
01317              bool include_alias_files /* = false */,
01318              bool remove_redundant_dbs /* = false */)
01319 {
01320     // 1. Find every database volume (but not alias files etc).
01321     vector<string> fmasks, dmasks;
01322 
01323     // If the type is 'guess' we do both types of databases.
01324 
01325     if (dbtype != "nucl") {
01326         fmasks.push_back("*.pin");
01327         if (include_alias_files) {
01328             fmasks.push_back("*.pal");
01329         }
01330     }
01331     if (dbtype != "prot") {
01332         fmasks.push_back("*.nin");
01333         if (include_alias_files) {
01334             fmasks.push_back("*.nal");
01335         }
01336     }
01337     dmasks.push_back("*");
01338 
01339     EFindFiles flags = (EFindFiles)
01340         (fFF_File | (recurse ? fFF_Recursive : 0));
01341 
01342     CBlastDbFinder dbfinder;
01343     FindFilesInDir(CDir(path), fmasks, dmasks, dbfinder, flags);
01344     if (remove_redundant_dbs) {
01345         s_RemoveAliasComponents(dbfinder);
01346     }
01347     sort(dbfinder.m_DBs.begin(), dbfinder.m_DBs.end());
01348     return dbfinder.m_DBs;
01349 }
01350 
/// Return the slice size reported by the implementation object.
///
/// Verify() is called on the implementation object first.
///
/// @return The value returned by the implementation's GetSliceSize().
Int8 CSeqDB::GetSliceSize() const
{
    m_Impl->Verify();

    return m_Impl->GetSliceSize();
}
01357 
01358 Int8 CSeqDB::GetDiskUsage() const
01359 {
01360     vector<string> paths;
01361     FindVolumePaths(paths);
01362     _ASSERT( !paths.empty() );
01363 
01364     Int8 retval = 0;
01365 
01366     vector<string> extn;
01367     const bool is_protein(GetSequenceType() == CSeqDB::eProtein);
01368     SeqDB_GetFileExtensions(is_protein, extn);
01369 
01370     ITERATE(vector<string>, path, paths) {
01371         ITERATE(vector<string>, ext, extn) {
01372             CFile file(*path + "." + *ext);
01373             if (file.Exists()) {
01374                 Int8 length = file.GetLength();
01375                 if (length != -1) {
01376                     retval += length;
01377                 } else {
01378                     ERR_POST(Error << "Error retrieving file size for "
01379                                    << file.GetPath());
01380                 }
01381             }
01382         }
01383     }
01384     return retval;
01385 }
01386 
01387 CSeqDB::ESeqType
01388 ParseMoleculeTypeString(const string& s)
01389 {
01390     CSeqDB::ESeqType retval = CSeqDB::eUnknown;
01391     if (NStr::StartsWith(s, "prot", NStr::eNocase)) {
01392         retval = CSeqDB::eProtein;
01393     } else if (NStr::StartsWith(s, "nucl", NStr::eNocase)) {
01394         retval = CSeqDB::eNucleotide;
01395     } else if (NStr::StartsWith(s, "guess", NStr::eNocase)) {
01396         retval = CSeqDB::eUnknown;
01397     } else {
01398         _ASSERT("Unknown molecule for BLAST DB" != 0);
01399     }
01400     return retval;
01401 }
01402 
01403 bool DeleteBlastDb(const string& dbpath, CSeqDB::ESeqType seq_type)
01404 {
01405     int num_files_removed = 0;
01406     vector<string> db_files, alias_files;
01407 
01408     vector<string> extn;
01409     SeqDB_GetFileExtensions((seq_type == CSeqDB::eProtein), extn);
01410 
01411     try { CSeqDB::FindVolumePaths(dbpath, seq_type, db_files, &alias_files); }
01412     catch (...) {}    // ignore any errors from the invocation above
01413     ITERATE(vector<string>, f, db_files) {
01414         ITERATE(vector<string>, e, extn) {
01415             CNcbiOstrstream oss;
01416             oss << *f << "." << *e;
01417             const string fname = CNcbiOstrstreamToString(oss);
01418             if (CFile(fname).Remove()) {
01419                 LOG_POST(Info << "Deleted " << fname);
01420                 num_files_removed++;
01421             }
01422         }
01423     }
01424     ITERATE(vector<string>, f, alias_files) {
01425         if (CFile(*f).Remove()) {
01426             LOG_POST(Info << "Deleted " << *f);
01427             num_files_removed++;
01428         }
01429     }
01430     return static_cast<bool>(num_files_removed != 0);
01431 }
01432 
/// Definition of the class-level date format string.
/// NOTE(review): presumably a CTime-style format pattern used for BLAST
/// database dates -- confirm against the declaration in seqdb.hpp.
const char* CSeqDB::kBlastDbDateFormat = "b d, Y  H:m P";
01434 
01435 set<string>
01436 CWgsDbTrimmer::x_ExtractOriginalWgsDbs()
01437 {
01438     vector<string> orig_wgs_dbs;
01439     NStr::Tokenize(m_OrigWgsList, " ", orig_wgs_dbs);
01440     set<string> retval;
01441     copy(orig_wgs_dbs.begin(), orig_wgs_dbs.end(), inserter(retval,
01442                                                             retval.begin()));
01443     return retval;
01444 }
01445 
01446 CWgsDbTrimmer::CWgsDbTrimmer(const string& wgs_db_list)
01447     : m_OrigWgsList(wgs_db_list)
01448 {
01449     CMutexGuard guard(CNcbiApplication::GetInstanceMutex());
01450     CNcbiApplication* app = CNcbiApplication::Instance();
01451     if (app) {
01452         m_Path = app->GetEnvironment().Get("WGS_GILIST_DIR");
01453     }
01454 }
01455 
01456 CWgsDbTrimmer::TGiLists
01457 CWgsDbTrimmer::x_ReadGiListsForDbs()
01458 {
01459     TGiLists retval;
01460     if (m_Path.empty()) {
01461         return retval;
01462     }
01463 
01464     set<string> orig_wgs_dbs = x_ExtractOriginalWgsDbs();
01465     if ( !orig_wgs_dbs.empty() ) {
01466         const string kExtn = ".gil";
01467         ITERATE(set<string>, wgs_db_name, orig_wgs_dbs) {
01468             CNcbiOstrstream oss;
01469             oss << m_Path << "/" << CDirEntry(*wgs_db_name).GetName() << kExtn;
01470             string fname = CNcbiOstrstreamToString(oss);
01471             vector<int> gis;
01472             try {
01473                 bool in_order = false;
01474                 SeqDB_ReadGiList(fname, gis, &in_order);
01475                 if ( !in_order ) {
01476                     sort(gis.begin(), gis.end());
01477                 }
01478             } catch (...) {} // if there's no GI list, save it
01479             retval[*wgs_db_name] = gis;
01480             _TRACE("Read " << gis.size() << " from " << fname);
01481         }
01482     }
01483     return retval;
01484 }
01485 
01486 string CWgsDbTrimmer::GetDbList()
01487 {
01488     TGiLists wgs_gi_lists = x_ReadGiListsForDbs();
01489     if (wgs_gi_lists.empty()) {
01490         // no GI lists were found in WGS_GILIST_DIR, we can't do anything
01491         return m_OrigWgsList;
01492     }
01493     set<string> trimmed_wgs_dbs;
01494     ITERATE(set<TGi>, gi, m_Gis) {
01495         if (wgs_gi_lists.empty()) {
01496             break;
01497         }
01498         NON_CONST_ITERATE(TGiLists, gis4wgs_db, wgs_gi_lists) {
01499             const string& wgs_db_name = gis4wgs_db->first;
01500             const vector<int>& wgs_gis = gis4wgs_db->second;
01501             if (find(wgs_gis.begin(), wgs_gis.end(), GI_TO(int, *gi)) != wgs_gis.end()) {
01502                 trimmed_wgs_dbs.insert(wgs_db_name);
01503                 wgs_gi_lists.erase(wgs_db_name);
01504                 break;
01505             }
01506         }
01507     }
01508     ITERATE(TGiLists, gis4wgs_db, wgs_gi_lists) {
01509         const string& wgs_db_name = gis4wgs_db->first;
01510         const vector<int>& wgs_gis = gis4wgs_db->second;
01511         if (wgs_gis.empty()) {
01512             trimmed_wgs_dbs.insert(wgs_db_name);
01513         }
01514     }
01515 
01516     CNcbiOstrstream oss;
01517     ITERATE(set<string>, wgs_db, trimmed_wgs_dbs) {
01518         oss << *wgs_db << " ";
01519     }
01520     return NStr::TruncateSpaces(CNcbiOstrstreamToString(oss));
01521 }
01522 
01523 END_NCBI_SCOPE
01524 
Modified on Sat Dec 27 10:28:30 2014 by modify_doxy.py rev. 426318