NCBI C++ ToolKit
build_db.cpp
Go to the documentation of this file.
00001 /*  $Id: build_db.cpp 52637 2012-01-13 14:28:18Z fongah2 $
00002 * ===========================================================================
00003 *
00004 *                            PUBLIC DOMAIN NOTICE
00005 *               National Center for Biotechnology Information
00006 *
00007 *  This software/database is a "United States Government Work" under the
00008 *  terms of the United States Copyright Act.  It was written as part of
00009 *  the author's official duties as a United States Government employee and
00010 *  thus cannot be copyrighted.  This software/database is freely available
00011 *  to the public for use. The National Library of Medicine and the U.S.
00012 *  Government have not placed any restriction on its use or reproduction.
00013 *
00014 *  Although all reasonable efforts have been taken to ensure the accuracy
00015 *  and reliability of the software and data, the NLM and the U.S.
00016 *  Government do not and cannot warrant the performance or results that
00017 *  may be obtained by using this software or data. The NLM and the U.S.
00018 *  Government disclaim all warranties, express or implied, including
00019 *  warranties of performance, merchantability or fitness for any particular
00020 *  purpose.
00021 *
00022 *  Please cite the author in any work or product based on this material.
00023 *
00024 * ===========================================================================
00025 *
00026 * Author:  Kevin Bealer
00027 *
00028 */
00029 
00030 /** @file build_db.cpp 
00031   Code to build a database given various sources of sequence data.
00032 */
00033 #ifndef SKIP_DOXYGEN_PROCESSING
00034 static char const rcsid[] = "$Id: build_db.cpp 52637 2012-01-13 14:28:18Z fongah2 $";
00035 #endif /* SKIP_DOXYGEN_PROCESSING */
00036 
00037 #include <ncbi_pch.hpp>
00038 
00039 // Blast databases
00040 
00041 #include <objtools/blast/seqdb_reader/seqdbexpert.hpp>
00042 #include <objtools/blast/seqdb_writer/writedb.hpp>
00043 #include <objtools/readers/fasta.hpp>
00044 
00045 // Object Manager
00046 
00047 #include <objmgr/object_manager.hpp>
00048 #include <objmgr/scope.hpp>
00049 #include <objmgr/seq_vector.hpp>
00050 #include <objtools/readers/reader_exception.hpp> // for CObjReaderParseException
00051 
00052 // Other utilities
00053 
00054 #include <util/sequtil/sequtil_convert.hpp>
00055 
00056 // Local
00057 
00058 #include <objtools/blast/seqdb_writer/build_db.hpp>
00059 #include <objtools/blast/seqdb_writer/multisource_util.hpp>
00060 
00061 #ifndef SKIP_DOXYGEN_PROCESSING
00062 BEGIN_NCBI_SCOPE
00063 USING_SCOPE(objects);
00064 #endif
00065 
// Debug verbosity level for this module.  Code in this file writes extra
// diagnostics to the log when the value is greater than 5; the default
// of 0 keeps that output off.
int debug_mode = 0;
00067 
00068 void CBuildDatabase::x_ResolveRemoteId(CRef<objects::CSeq_id> & seqid, int & gi)
00069 {
00070     CScope::TIds ids = x_GetScope().GetIds(*seqid);
00071     
00072     bool have_seqid = false;
00073     bool have_gi = false;
00074     
00075     gi = 0;
00076     
00077     ITERATE(CScope::TIds, iter, ids) {
00078         CConstRef<CSeq_id> id = iter->GetSeqId();
00079         if (debug_mode > 5)
00080             m_LogFile << "Seq-id " << seqid->AsFastaString()
00081                       << " contains id " << id->AsFastaString() << endl;
00082         
00083         if (id->IsGi()) {
00084             if (gi > 0) {
00085                 if (debug_mode > 5)
00086                     m_LogFile << "WARNING: multiple GIs discovered; gi[0] = "
00087                               << gi << endl;
00088             } else {
00089                 if (debug_mode > 5)
00090                     m_LogFile << "Seq-id " << seqid->AsFastaString()
00091                               << " resolved to "
00092                               << id->GetGi() << endl;
00093                 gi = id->GetGi();
00094                 have_gi = true;
00095             }
00096         } else if ((! have_seqid) && (id->Which() == seqid->Which())) {
00097             m_LogFile << "Remote: Resolving <" << seqid->AsFastaString()
00098                       << "> to <" << id->AsFastaString() << ">" << endl;
00099             
00100             if (id->GetTextseq_Id() == NULL ||
00101                 id->GetTextseq_Id()->IsSetVersion() == false) {
00102                 
00103                 m_LogFile
00104                     << "Warning: Resolution still does not provide version."
00105                     << endl;
00106             } else {
00107                 seqid.Reset(const_cast<CSeq_id*>(id.GetPointer()));
00108                 have_seqid = true;
00109             }
00110         }
00111         
00112         if (have_gi)
00113             break;
00114     }
00115 }
00116 
00117 // Resolve all ids to GIs, storing them in a GI list.
00118 
00119 CRef<CInputGiList> CBuildDatabase::x_ResolveGis(const vector<string> & ids)
00120 {
00121     CRef<CInputGiList> gi_list(new CInputGiList);
00122     
00123     ITERATE(vector<string>, id, ids) {
00124         // There are three possibilities:
00125         //
00126         // 1. Numbers are added to the list as GIs.
00127         // 2. Remote services may be called to determine the most
00128         //    recent version.
00129         // 3. Non-numerical types are added to the list as Seq-ids.
00130         //
00131         // For #2, the remote service call is only made if:
00132         //
00133         // A. Remote services are enabled.
00134         // B. The Seq-id can have a version (only CTextseq_id types.)
00135         // C. The version is not present.
00136         
00137         int gi(0);
00138         bool specific = false;
00139         CRef<CSeq_id> seqid;
00140         
00141         bool worked = CheckAccession(*id, gi, seqid, specific);
00142         
00143         // If a source database is specified, try that as a backup
00144         // resolution mechanism.
00145         
00146         if (! worked) {
00147             if (m_SourceDb.NotEmpty()) {
00148                 worked = x_ResolveFromSource(*id, seqid);
00149             }
00150         }
00151         
00152         if (! worked) {
00153             m_LogFile << "Did not recognize id: \"" << *id << "\"" << endl;
00154             continue;
00155         }
00156         
00157         // 1. Numeric GI
00158         
00159         if (gi != 0) {
00160             if (debug_mode > 5)
00161                 m_LogFile << "Found numerical GI:" << gi << endl;
00162             
00163             gi_list->AppendGi(gi);
00164             continue;
00165         }
00166         
00167         // 2. Possible remote resolution.  We look for a GI and if
00168         // that is not found, try to find a Seq-id of the same type
00169         // (but with a version).
00170         
00171         if (m_UseRemote && (! specific)) {
00172             x_ResolveRemoteId(seqid, gi);
00173             
00174             if (gi != 0) {
00175                 gi_list->AppendGi(gi);
00176                 continue;
00177             }
00178         }
00179         
00180         // 3. Just add the Seq-id as a Seq-id.
00181         
00182         gi_list->AppendSi(*id);
00183     }
00184     
00185     return gi_list;
00186 }
00187 
00188 bool CBuildDatabase::x_ResolveFromSource(const string  & acc,
00189                                          CRef<objects::CSeq_id> & id)
00190 {
00191     if (m_SourceDb.Empty()) {
00192         return false;
00193     }
00194     
00195     vector<int> oids;
00196     m_SourceDb->AccessionToOids(acc, oids);
00197     
00198     bool found(false), done(false);
00199     
00200     ITERATE(vector<int>, oid, oids) {
00201         list< CRef<CSeq_id> > ids = m_SourceDb->GetSeqIDs(*oid);
00202         
00203         ITERATE(list< CRef<CSeq_id> >, seqid, ids) {
00204             CRef<CSeq_id> s = *seqid;
00205             
00206             string S = s->AsFastaString();
00207             size_t pos = S.find(acc);
00208             
00209             if (pos != string::npos) {
00210                 size_t endpos = pos + acc.size();
00211                 
00212                 bool start_okay = (pos == 0 || S[pos-1] == '|');
00213                 bool end_okay = ((endpos == S.size()) ||
00214                                  (S[endpos] == '.' ||
00215                                   S[endpos] == '|'));
00216                 
00217                 if (start_okay && end_okay) {
00218                     done = true;
00219                 }
00220                 
00221                 if (done || (! found)) {
00222                     found = true;
00223                     id = s;
00224                 }
00225             }
00226             
00227             if (done)
00228                 break;
00229         }
00230         
00231         if (done)
00232             break;
00233     }
00234     
00235     return found;
00236 }
00237 
00238 void CBuildDatabase::x_DupLocal()
00239 {
00240     TIdToBits bitset;
00241     
00242     // Get sequence, deflines, ambiguities, and sometimes pigs.  The
00243     // simplest route (for WriteDB) is raw data + asn deflines, so we
00244     // use that when possible.
00245     
00246     CStopWatch sw(CStopWatch::eStart);
00247     int count = 0;
00248     
00249     for(int oid = 0; m_SourceDb->CheckOrFindOID(oid); oid++) {
00250         // Raw data.
00251         
00252         const char * buffer (0);
00253         int          slength(0);
00254         int          alength(0);
00255         
00256         m_SourceDb->GetRawSeqAndAmbig(oid, & buffer, & slength, & alength);
00257         
00258         CSequenceReturn seqret(*m_SourceDb, buffer);
00259         
00260         CTempString sequence(buffer, slength);
00261         CTempString ambig(buffer + slength, alength);
00262         
00263         // Deflines
00264         
00265         CRef<CBlast_def_line_set> headers = m_SourceDb->GetHdr(oid);
00266         m_DeflineCount += headers->Get().size();
00267         m_OIDCount ++;
00268         
00269         x_SetLinkAndMbit(headers);
00270         
00271         // Always include the taxid; although OPTIONAL, some programs
00272         // expect it, since the C ASN.1 loaders always emit integers.
00273         
00274         m_Taxids->FixTaxId(headers);
00275         
00276         // Now, add the sequence to the WriteDB database.
00277         
00278         m_OutputDb->AddSequence(sequence, ambig);
00279         m_OutputDb->SetDeflines(*headers);
00280         count ++;
00281     }
00282     
00283     if (count) {
00284         double t = sw.Elapsed();
00285         
00286         m_LogFile << "Duplication from source DB; duplicated "
00287                   << count << " sequences in " << t << " seconds." << endl;
00288     }
00289 }
00290 
00291 // This could be moved to writedb once it is tested and working.
00292 
00293 static CConstRef<CBioseq> s_FixBioseqDeltas(CConstRef<objects::CBioseq> bs)
00294 {
00295     if ((! bs->CanGetInst()) || bs->GetInst().CanGetSeq_data()) {
00296         return bs;
00297     }
00298     
00299     if (bs->CanGetInst() &&
00300         bs->GetInst().CanGetExt() &&
00301         bs->GetInst().GetExt().IsDelta() &&
00302         bs->GetInst().CanGetMol() &&
00303         !CSeq_inst::IsNa(bs->GetInst().GetMol())) {
00304         
00305         NCBI_THROW(CMultisourceException, eArg,
00306                    "Protein delta sequences are not supported.");
00307     }
00308     
00309     try {
00310         const CDelta_ext & dext = bs->GetInst().GetExt().GetDelta();
00311         
00312         if(dext.Get().front()->Which() != CDelta_seq::e_Literal)
00313             return bs;
00314 
00315         typedef list< CRef< CDelta_seq > > TItems;
00316         
00317         // Don't really want to use na4, because a half byte at the
00318         // end of a string would require that string to be manually
00319         // adjusted before appending.
00320         
00321         string seq8na;
00322         if (bs->GetInst().CanGetLength()) {
00323             seq8na.reserve(bs->GetInst().GetLength());
00324         }
00325         
00326         string na8;
00327         
00328         ITERATE(TItems, item, dext.Get()) {
00329             const CSeq_literal & L = (**item).GetLiteral();
00330 
00331             if (!L.CanGetSeq_data()) {
00332                 if (L.CanGetLength()){
00333                    seq8na.append(L.GetLength(), 0x0f);
00334                    continue;
00335                 } else {
00336                    NCBI_THROW(CMultisourceException, eArg,
00337                       "Part of the delta sequence, including its length, is un-available.");
00338                 }
00339             }
00340             
00341             if (L.GetSeq_data().IsNcbi2na()) {
00342                 CSeqConvert::Convert(L.GetSeq_data().GetNcbi2na(),
00343                                      CSeqUtil::e_Ncbi2na,
00344                                      0,
00345                                      L.GetLength(),
00346                                      na8,
00347                                      CSeqUtil::e_Ncbi8na);
00348             } else if (L.GetSeq_data().IsNcbi4na()) {
00349                 CSeqConvert::Convert(L.GetSeq_data().GetNcbi4na(),
00350                                      CSeqUtil::e_Ncbi4na,
00351                                      0,
00352                                      L.GetLength(),
00353                                      na8,
00354                                      CSeqUtil::e_Ncbi8na);
00355             } else {
00356                 NCBI_THROW(CMultisourceException, eArg,
00357                            "Unhandled type of sequence data encountered.");
00358             }
00359             
00360             seq8na += na8;
00361             na8.resize(0);
00362         }
00363         
00364         // Now convert back to 4na, since WriteDB does not yet handle
00365         // 8na sequences.
00366         
00367         int length = seq8na.size();
00368         vector<char> seq4na;
00369         CSeqConvert::Convert(seq8na,
00370                              CSeqUtil::e_Ncbi8na,
00371                              0,
00372                              length,
00373                              seq4na,
00374                              CSeqUtil::e_Ncbi4na);
00375         
00376         // Copy the needed fields of the CBioseq (but remove the delta
00377         // sequence) and add a Seq-data.
00378         
00379         CRef<CBioseq> bs2(new CBioseq);
00380         
00381         if (bs->IsSetId()) {
00382             bs2->SetId() = bs->GetId();
00383         }
00384         
00385         if (bs->IsSetDescr()) {
00386             bs2->SetDescr(const_cast<CSeq_descr&>(bs->GetDescr()));
00387         }
00388         
00389         CRef<CSeq_inst> inst(new CSeq_inst);
00390         
00391         inst->SetSeq_data().SetNcbi4na().Set().swap(seq4na);
00392         inst->SetMol(CSeq_inst::eMol_na);
00393         inst->SetLength(length);
00394         inst->SetRepr(CSeq_inst::eRepr_raw);
00395         
00396         bs2->SetInst(*inst);
00397         
00398         if (bs->IsSetAnnot()) {
00399             bs2->SetAnnot() = bs->GetAnnot();
00400         }
00401         
00402         bs = bs2;
00403     }
00404     catch(CInvalidChoiceSelection &) {
00405         NCBI_THROW(CMultisourceException, eArg,
00406                    "Bioseq must have Seq-data or "
00407                    "Delta containing only literals.");
00408     }
00409     
00410     return bs;
00411 }
00412 
00413 
00414 void CBuildDatabase::x_AddPig(CRef<objects::CBlast_def_line_set> headers)
00415 {
00416     int pig = 0;
00417     const CBlast_def_line  &  defline = *(headers->Get().front());
00418     if (defline.IsSetOther_info())
00419         pig = defline.GetOther_info().front();
00420 
00421     m_OutputDb->SetPig(pig);
00422 }
00423 
// Apply the header edits made before a defline set is written to the
// output database: the taxid fix-up first, then the linkout and
// membership-bit update.
void CBuildDatabase::x_EditHeaders(CRef<objects::CBlast_def_line_set> headers)
{
    // Always include the taxid; although OPTIONAL, some programs
    // expect it, since the C ASN.1 loaders always emit integers.
    
    m_Taxids->FixTaxId(headers);
    
    // Edit the linkouts
    
    x_SetLinkAndMbit(headers);
}
00435 
00436 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION  > 550)) && \
00437      (!defined(NCBI_COMPILER_MIPSPRO)) )
00438 void
00439 CBuildDatabase::x_AddMasksForSeqId(const list< CRef<CSeq_id> >& ids) 
00440 {
00441     if (m_MaskData.Empty()) {
00442         return;
00443     }
00444     
00445     const CMaskedRangesVector& rng = m_MaskData->GetRanges(ids);
00446     if (rng.empty()) {
00447         return;
00448     }
00449 
00450     vector <int> gis;
00451     ITERATE(list< CRef<CSeq_id> >, id, ids) {
00452         if ((*id)->IsGi()) {
00453             gis.push_back((*id)->GetGi());
00454         }
00455     }
00456     m_OutputDb->SetMaskData(rng, gis);
00457     m_FoundMatchingMasks = true;
00458 }
00459 #endif
00460 
00461 bool CBuildDatabase::x_EditAndAddBioseq(CConstRef<objects::CBioseq>   bs,
00462                                         objects::CSeqVector         * sv,
00463                                         bool                          add_pig)
00464 {
00465     CRef<CBlast_def_line_set> headers =
00466         CWriteDB::ExtractBioseqDeflines(*bs, m_ParseIDs);
00467     
00468     x_EditHeaders(headers);
00469     
00470     // Add the sequence
00471     if (sv) {
00472         m_OutputDb->AddSequence(*bs, *sv);
00473     } else {
00474         bs = s_FixBioseqDeltas(bs);
00475         if(bs->GetInst().CanGetSeq_data())
00476             m_OutputDb->AddSequence(*bs);
00477         else
00478             return false;
00479     }
00480 
00481     m_DeflineCount += headers->Get().size();
00482     m_OIDCount ++;
00483 
00484     if(add_pig) {
00485         x_AddPig(headers);
00486     }
00487     
00488     m_OutputDb->SetDeflines(*headers);
00489     
00490 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION  > 550)) && \
00491      (!defined(NCBI_COMPILER_MIPSPRO)) )
00492     const list< CRef<CSeq_id> > & ids = bs->GetId();
00493     x_AddMasksForSeqId(ids);
00494 #endif
00495     return true;
00496 }
00497 
00498 void CBuildDatabase::x_AddOneRemoteSequence(const objects::CSeq_id & seqid,
00499                                             bool          & found_all,
00500                                             bool          & error)
00501 {
00502     // Get handle and bioseq
00503     
00504     CConstRef<CBioseq> bs;
00505     CBioseq_Handle bsh;
00506     
00507     try {
00508         bsh = x_GetScope().GetBioseqHandle(seqid);
00509         bs = bsh.GetCompleteBioseq();
00510         
00511         if (debug_mode > 5) m_LogFile << MSerial_AsnText << *bs << endl;
00512     }
00513     catch (const CException & e) {
00514         m_LogFile << "Caught exception for query: "
00515                   << seqid.AsFastaString() << endl
00516                   << e.what() << endl;
00517         found_all = false;
00518         error = true;
00519     }
00520     
00521     if (bsh.GetState() & CBioseq_Handle::fState_not_found) {
00522         error = true;
00523     }
00524     
00525     
00526 
00527     CSeqVector sv(bsh);
00528     
00529     if(!x_EditAndAddBioseq(bs, & sv))
00530         error = true;
00531 
00532     if (error) {
00533             if (debug_mode > 5)
00534                 m_LogFile << "Could not find entry for: "
00535                           << seqid.AsFastaString() << endl;
00536 
00537             found_all = false;
00538             return;
00539     }
00540     
00541     if (debug_mode > 5)
00542         m_LogFile << "-- REMOTE: Found sequence "
00543                   << seqid.AsFastaString() << endl;
00544 }
00545 
00546 bool CBuildDatabase::x_AddRemoteSequences(CInputGiList & gi_list)
00547 {
00548     CStopWatch sw(CStopWatch::eStart);
00549     int count = 0;
00550     
00551     bool found_all = true;
00552     
00553     int num_gis = gi_list.GetNumGis();
00554     int i = 0;
00555     
00556     for(i = 0; i < num_gis; i++) {
00557         if (m_Verbose)
00558             m_LogFile << "GI " << gi_list.GetKey<int>(i);
00559         
00560         // We only need to fetch here for those cases where the SeqDB
00561         // attempt could not translate the GI.
00562         
00563         if (gi_list.GetGiOid(i).oid == -1) {
00564             if (m_Verbose)
00565                 m_LogFile << " not found locally; adding remotely." << endl;
00566             
00567             CRef<CSeq_id> id(new CSeq_id);
00568             id->SetGi(gi_list.GetKey<int>(i));
00569             
00570             bool error = false;
00571             
00572             x_AddOneRemoteSequence(*id, found_all, error);
00573             count++;
00574         } else {
00575             if (m_Verbose)
00576                 m_LogFile << " found locally; not adding remotely." << endl;
00577         }
00578     }
00579     
00580     int num_seqids = gi_list.GetNumSis();
00581     
00582     for(i = 0; i < num_seqids; i++) {
00583         if (m_Verbose)
00584             m_LogFile << "Seq-id "
00585                       << gi_list.GetKey<string>(i);
00586         
00587         // We only need to fetch here for those cases where the SeqDB
00588         // attempt could not translate the GI.
00589         
00590         if (gi_list.GetSiOid(i).oid == -1) {
00591             if (m_Verbose)
00592                 m_LogFile << " not found locally; adding remotely." << endl;
00593             
00594             bool error = false;
00595             
00596             string acc = gi_list.GetKey<string>(i);
00597             CRef<CSeq_id> id(new CSeq_id(acc));
00598             x_AddOneRemoteSequence(*id, found_all, error);
00599             count++;
00600         } else {
00601             if (m_Verbose)
00602                 m_LogFile << " found locally; not adding remotely." << endl;
00603         }
00604     }
00605     
00606     if (count) {
00607         double t = sw.Elapsed();
00608         
00609         m_LogFile << "Adding sequences from remote source; added "
00610                   << count << " sequences in " << t << " seconds." << endl;
00611     }
00612     
00613     return found_all;
00614 }
00615 
00616 bool
00617 CBuildDatabase::x_ReportUnresolvedIds(const CInputGiList & gi_list) const
00618 {
00619     bool success = true;
00620     
00621     int num_gis = gi_list.GetNumGis();
00622     
00623     int unresolved = 0;
00624     
00625     int i;
00626     for(i = 0; i < num_gis; i++) {
00627         // We only need to fetch here for those cases where the SeqDB
00628         // attempt could not translate the GI.
00629         
00630         if (gi_list.GetGiOid(i).oid == -1) {
00631             if (m_Verbose)
00632                 m_LogFile << "GI " << gi_list.GetKey<int>(i)
00633                           << " was not resolvable." << endl;
00634             
00635             success = false;
00636             unresolved ++;
00637         } else {
00638             if (m_Verbose)
00639                 m_LogFile << "GI " << gi_list.GetKey<int>(i)
00640                           << " found locally." << endl;
00641         }
00642     }
00643     
00644     int num_seqids = gi_list.GetNumSis();
00645     
00646     for(i = 0; i < num_seqids; i++) {
00647         // We only need to fetch here for those cases where the SeqDB
00648         // attempt could not translate the GI.
00649         
00650         if (gi_list.GetSiOid(i).oid == -1) {
00651             if (m_Verbose)
00652                 m_LogFile << "Seq-id "
00653                           << gi_list.GetKey<string>(i)
00654                           << " was not resolvable." << endl;
00655             
00656             unresolved ++;
00657             success = false;
00658         } else {
00659             if (m_Verbose)
00660                 m_LogFile << "Seq-id "
00661                           << gi_list.GetKey<string>(i)
00662                           << " found locally." << endl;
00663         }
00664     }
00665     
00666     if (unresolved) {
00667         m_LogFile << "Could not resolve " << unresolved << " IDs." << endl;
00668     }
00669     
00670     success = false;
00671     unresolved ++;
00672     
00673     return success;
00674 }
00675 
00676 class CFastaBioseqSource : public IBioseqSource {
00677 public:
00678     CFastaBioseqSource(CNcbiIstream & fasta_file,
00679                        bool is_protein,
00680                        bool parse_ids);
00681     
00682     ~CFastaBioseqSource();
00683     
00684     virtual CConstRef<CBioseq> GetNext();
00685     
00686 private:
00687     CRef<ILineReader> m_LineReader;
00688     CFastaReader* m_FastaReader;
00689 };
00690 
00691 CFastaBioseqSource::CFastaBioseqSource(CNcbiIstream & fasta_file,
00692                                        bool is_protein,
00693                                        bool parse_ids)
00694     : m_FastaReader(NULL)
00695 {
00696     m_LineReader.Reset(new CBufferedLineReader(fasta_file));
00697     
00698     typedef CFastaReader::EFlags TFlags;
00699     
00700     int iflags = CFastaReader::fAllSeqIds | CFastaReader::fForceType;
00701     
00702     if (is_protein) {
00703         iflags |= CFastaReader::fAssumeProt;
00704     } else {
00705         iflags |= CFastaReader::fAssumeNuc;
00706     }
00707     
00708     if (parse_ids) {
00709         iflags |= CFastaReader::fAllSeqIds;
00710     } else {
00711         iflags |= CFastaReader::fNoParseID;
00712     }
00713     
00714     TFlags flags = (TFlags) iflags;
00715     
00716     m_FastaReader = new CFastaReader(*m_LineReader, flags);
00717 }
00718 
// Destroy the FASTA reader held through the raw m_FastaReader pointer.
CFastaBioseqSource::~CFastaBioseqSource()
{
    delete m_FastaReader;
}
00723 
00724 CConstRef<CBioseq> CFastaBioseqSource::GetNext()
00725 {
00726     CConstRef<CBioseq> rv;
00727     
00728     if (m_LineReader.NotEmpty() && ! m_LineReader->AtEOF()) {
00729         CRef<CSeq_entry> entry;
00730         try { entry = m_FastaReader->ReadOneSeq(); }
00731         catch (const CObjReaderParseException& e) { 
00732             static const string kKeyword("m_Pos = ");
00733             SIZE_TYPE start = NStr::Find(e.what(), kKeyword);
00734             SIZE_TYPE end = NStr::Find(e.what(), ")", start);
00735             string pos("unknown");
00736             if (start != NPOS && end != NPOS) {
00737                 start += kKeyword.size();
00738                 pos = string(e.what()).substr(start, end-start);
00739             }
00740             ERR_POST(Error << "Error while reading input at position " << pos);
00741             ERR_POST(Error << "Aborting processing prematurely.");
00742             // additional handling needed
00743             throw(e);
00744         }
00745         
00746         if (entry.NotEmpty()) {
00747             _ASSERT(entry->IsSeq());
00748             rv.Reset(& entry->GetSeq());
00749         }
00750     }
00751     
00752     // Any failure to read a Bioseq is considered an EOF.
00753     
00754     if (rv.Empty()) {
00755         m_LineReader.Reset();
00756     }
00757     
00758     return rv;
00759 }
00760 
00761 bool CBuildDatabase::AddSequences(IBioseqSource & src, bool add_pig)
00762 {
00763     bool found = false;
00764     
00765     CStopWatch sw(CStopWatch::eStart);
00766     int count = 0;
00767     
00768     CConstRef<CBioseq> bs = src.GetNext();
00769     
00770     while(bs.NotEmpty()) {
00771         string bioseq_id("Unknown");
00772 
00773         if (bs->CanGetId()) {
00774             const list< CRef<CSeq_id> > & ids = bs->GetId();
00775             if (! ids.empty() && ids.front().NotEmpty()) {
00776                 bioseq_id.assign(ids.front()->AsFastaString());
00777             }
00778         }
00779 
00780         if(bs->IsAa() != m_IsProtein ){
00781                     bs = src.GetNext();
00782                     continue;
00783        }
00784 
00785         if ((bs->GetLength() == 0) || (!x_EditAndAddBioseq(bs, NULL, add_pig))){
00786             m_LogFile << "Ignoring sequence '" << bioseq_id
00787                       << "' as it has no sequence data" << endl;
00788             bs = src.GetNext();
00789             continue;
00790         }
00791 
00792         if (m_Verbose) {
00793             m_LogFile << "Adding bioseq from fasta; first id is: '" << bioseq_id
00794                 << "'" << endl;
00795         }
00796         
00797         // No linkouts or memberships here (yet).
00798 
00799         found = true;
00800         
00801         count++;
00802         
00803         if (debug_mode > 5) m_LogFile << "-- FASTA: Found sequence." << endl;
00804         
00805         bs = src.GetNext();
00806     }
00807     
00808     if (count) {
00809         double t = sw.Elapsed();
00810         
00811         m_LogFile << "Adding sequences from FASTA; added "
00812                   << count << " sequences in " << t << " seconds." << endl;
00813     }
00814     
00815     return found;
00816 }
00817 
00818 bool CBuildDatabase::AddSequences(IRawSequenceSource & src)
00819 {
00820     CStopWatch sw(CStopWatch::eStart);
00821     
00822     bool done = false;
00823     bool rv = false;
00824     
00825 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION  > 550)) && \
00826      (!defined(NCBI_COMPILER_MIPSPRO)) )
00827     // Get all column names.
00828     
00829     vector<string> all_names;
00830     map<int, int> in2out;
00831     int mask_id = -1;
00832     
00833     src.GetColumnNames(all_names);
00834     
00835     for(int i = 0; i < (int) all_names.size(); i++) {
00836         string name = all_names[i];
00837         int in_id = src.GetColumnId(name);
00838 
00839         // skip masking data column
00840         if (name == "BlastDb/MaskData") {
00841             mask_id = in_id;
00842             continue;
00843         }
00844         int out_id = m_OutputDb->FindColumn(name);
00845         
00846         if (out_id < 0) {
00847             out_id = m_OutputDb->CreateUserColumn(name);
00848         }
00849         
00850         typedef map<string,string> StringPairMap;
00851         const StringPairMap & meta = src.GetColumnMetaData(in_id);
00852         
00853         ITERATE(StringPairMap, iter, meta) {
00854             m_OutputDb->AddColumnMetaData(out_id, iter->first, iter->second);
00855         }
00856         
00857         in2out[in_id] = out_id;
00858     }
00859 #endif    
00860     // Copy all data.
00861     
00862     vector<CTempString> column_blobs;
00863     vector<int> column_ids;
00864     
00865     int count = 0;
00866     
00867     while(! done) {
00868         CTempString sequence, ambiguities;
00869         CRef<CBlast_def_line_set> deflines;
00870         CMaskedRangesVector  mask_data;
00871         
00872         if (src.GetNext(sequence,
00873                         ambiguities,
00874                         deflines,
00875                         mask_data,
00876                         column_ids,
00877                         column_blobs)) {
00878             
00879             // Copy data
00880             
00881             _ASSERT(column_blobs.size() == column_ids.size());
00882             
00883             if (sequence.empty()) {
00884                 NCBI_THROW(CMultisourceException, eArg,
00885                            "Error in raw data: no sequence");
00886             }
00887             
00888             if ((! ambiguities.empty()) && m_IsProtein) {
00889                 NCBI_THROW(CMultisourceException, eArg,
00890                            "Error in raw data: "
00891                            "protein db cannot with ambiguities");
00892             }
00893             
00894             if (deflines.Empty()) {
00895                 NCBI_THROW(CMultisourceException, eArg,
00896                            "Error in raw data: no headers provided");
00897             }
00898             
00899             x_EditHeaders(deflines);
00900             
00901             m_OutputDb->AddSequence(sequence, ambiguities);
00902             x_AddPig(deflines);
00903             m_OutputDb->SetDeflines(*deflines);
00904             
00905 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION  > 550)) && \
00906      (!defined(NCBI_COMPILER_MIPSPRO)) )
00907             for(int i = 0; i < (int)column_ids.size(); i++) {
00908                 int in_id = column_ids[i];
00909                 if (in_id == mask_id) continue;
00910                 
00911                 if (column_blobs[i].size() == 0)
00912                     continue;
00913                 
00914                 _ASSERT(in2out.find(in_id) != in2out.end());
00915                 
00916                 int out_id = in2out[in_id];
00917                 
00918                 CTempString blob_in = column_blobs[i];
00919                 CBlastDbBlob & blob_out = m_OutputDb->SetBlobData(out_id);
00920                 
00921                 blob_out.Clear();
00922                 blob_out.WriteRaw(& blob_in.data()[0], blob_in.size());
00923             }
00924             // Don't forget about the IMaskDataSource!
00925             vector <int> gis;  // GIs associated with this sequence
00926             if (!mask_data.empty() || !m_MaskData.Empty()) {
00927                 ITERATE(CBlast_def_line_set::Tdata, defline, deflines->Get()) {
00928                     const list< CRef<CSeq_id> > & ids = (*defline)->GetSeqid();
00929                     ITERATE(list< CRef<CSeq_id> >, id, ids) {
00930                         if ((*id)->IsGi()) {
00931                             gis.push_back((*id)->GetGi());
00932                         }
00933                     }
00934                     if (!m_MaskData.Empty()) {
00935                         const CMaskedRangesVector rng = m_MaskData->GetRanges(ids);
00936                         if (!rng.empty()) {
00937                             mask_data.insert(mask_data.end(), rng.begin(), rng.end());  
00938                             m_FoundMatchingMasks = true;
00939                         }
00940                     }
00941                 }
00942             }
00943             if (!mask_data.empty()) {
00944                 m_OutputDb->SetMaskData(mask_data, gis);
00945             }
00946 #endif
00947             
00948             rv = true;
00949             count ++;
00950         } else {
00951             done = true;
00952         }
00953     }
00954     
00955     if (count) {
00956         double t = sw.Elapsed();
00957         
00958         m_LogFile << "Adding sequences from raw db source; added "
00959                   << count << " sequences in " << t << " seconds." << endl;
00960     }
00961     
00962     return rv;
00963 }
00964 
00965 static void s_CreateDirectories(const string& dbname)
00966 {
00967     CDirEntry dir_entry(dbname);
00968     string dir_name = dir_entry.GetDir(CDirEntry::eIfEmptyPath_Empty);
00969     if (dir_name.empty()) {
00970         return;
00971     }
00972 
00973     CDir d(dir_name);
00974     if ( !d.Exists() ) {
00975         if ( !d.CreatePath() ) {
00976             string msg("Failed to create directory '" + d.GetName() + "'");
00977             NCBI_THROW(CMultisourceException, eOutputFileError, msg);
00978         }
00979     }
00980     if (!d.CheckAccess(CDirEntry::fWrite)) {
00981         string msg("You do not have write permissions on '" + 
00982                    d.GetName() + "'");
00983         NCBI_THROW(CMultisourceException, eOutputFileError, msg);
00984     }
00985 }
00986 
00987 CBuildDatabase::CBuildDatabase(const string         & dbname,
00988                                const string         & title,
00989                                bool                   is_protein,
00990                                CWriteDB::TIndexType   indexing,
00991                                bool                   use_gi_mask,
00992                                ostream              * logfile)
00993     : m_IsProtein    (is_protein),
00994       m_KeepLinks    (false),
00995       m_KeepMbits    (false),
00996       m_Taxids       (new CTaxIdSet()),
00997       m_LogFile      (*logfile),
00998       m_UseRemote    (true),
00999       m_DeflineCount (0),
01000       m_OIDCount     (0),
01001       m_Verbose      (false),
01002       m_ParseIDs     (((indexing & CWriteDB::eFullIndex) != 0 ? true : false)),
01003       m_FoundMatchingMasks(false)
01004 {
01005     s_CreateDirectories(dbname);
01006     m_LogFile << "\n\nBuilding a new DB, current time: "
01007               << CTime(CTime::eCurrent).AsString() << endl;
01008     
01009     m_LogFile << "New DB name:   " << dbname << endl;
01010     m_LogFile << "New DB title:  " << title << endl;
01011     m_LogFile << "Sequence type: "
01012               << (is_protein ? "Protein" : "Nucleotide") << endl;
01013     
01014     CWriteDB::ESeqType seqtype =
01015         (is_protein ? CWriteDB::eProtein : CWriteDB::eNucleotide);
01016     
01017     m_OutputDb.Reset(new CWriteDB(dbname,
01018                                   seqtype,
01019                                   title,
01020                                   indexing,
01021                                   m_ParseIDs,
01022                                   use_gi_mask));
01023     
01024     // Standard 1 GB limit
01025     
01026     m_OutputDb->SetMaxFileSize(1000*1000*1000);
01027 }
01028 
01029 CBuildDatabase::CBuildDatabase(const string & dbname,
01030                                const string & title,
01031                                bool           is_protein,
01032                                bool           sparse,
01033                                bool           parse_seqids,
01034                                bool           use_gi_mask,
01035                                ostream      * logfile)
01036     : m_IsProtein    (is_protein),
01037       m_KeepLinks    (false),
01038       m_KeepMbits    (false),
01039       m_Taxids       (new CTaxIdSet()),
01040       m_LogFile      (*logfile),
01041       m_UseRemote    (true),
01042       m_DeflineCount (0),
01043       m_OIDCount     (0),
01044       m_Verbose      (false),
01045       m_ParseIDs     (parse_seqids),
01046       m_FoundMatchingMasks(false)
01047 {
01048     s_CreateDirectories(dbname);
01049     m_LogFile << "\n\nBuilding a new DB, current time: "
01050               << CTime(CTime::eCurrent).AsString() << endl;
01051     
01052     m_LogFile << "New DB name:   " << dbname << endl;
01053     m_LogFile << "New DB title:  " << title << endl;
01054     m_LogFile << "Sequence type: "
01055               << (is_protein ? "Protein" : "Nucleotide") << endl;
01056     
01057     CWriteDB::ESeqType seqtype =
01058         (is_protein ? CWriteDB::eProtein : CWriteDB::eNucleotide);
01059     
01060     CWriteDB::EIndexType ix = (sparse
01061                                ? CWriteDB::eSparseIndex
01062                                : CWriteDB::eDefault);
01063     
01064     m_OutputDb.Reset(new CWriteDB(dbname,
01065                                   seqtype,
01066                                   title,
01067                                   ix,
01068                                   m_ParseIDs,
01069                                   use_gi_mask));
01070 
01071     // Standard 1 GB limit
01072     
01073     m_OutputDb->SetMaxFileSize(1000*1000*1000);
01074 }
01075 
01076 CBuildDatabase::~CBuildDatabase()
01077 {
01078     if (m_MaskData.NotEmpty() && !m_FoundMatchingMasks) {
01079         ERR_POST(Error << "No sequences matched any of the masks provided.\n"
01080                        << "Please ensure that the -parse_seqids option is used "
01081                        << "in the\nfiltering program as well as makeblastdb.");
01082     }
01083     if (!m_Taxids->HasEverFixedId()) {
01084         ERR_POST(Error << "No sequences matched any of the taxids provided.");
01085     }
01086 }
01087 
01088 void CBuildDatabase::SetTaxids(CTaxIdSet & taxids)
01089 {
01090     m_Taxids.Reset(& taxids);
01091 }
01092 
01093 void CBuildDatabase::SetMaskLetters(const string & letters)
01094 {
01095     m_OutputDb->SetMaskedLetters(letters);
01096 }
01097 
01098 CScope & CBuildDatabase::x_GetScope()
01099 {
01100     if (m_Scope.Empty()) {
01101         if (m_ObjMgr.Empty()) {
01102             m_ObjMgr.Reset(CObjectManager::GetInstance());
01103         }
01104         
01105         m_Scope.Reset(new CScope(*m_ObjMgr));
01106         
01107         // Add default loaders (GB loader in this demo) to the scope.
01108         m_Scope->AddDefaults();
01109     }
01110     
01111     return *m_Scope;
01112 }
01113 
01114 void CBuildDatabase::SetSourceDb(CRef<CSeqDBExpert> seqdb)
01115 {
01116     m_LogFile << "Configured source DB: " << seqdb->GetDBNameList() << endl;
01117     m_LogFile << "Source DB has title:  " << seqdb->GetTitle() << endl;
01118     m_LogFile << "Source DB time stamp: " << seqdb->GetDate() << endl;
01119     m_SourceDb = seqdb;
01120 }
01121 
01122 void CBuildDatabase::SetSourceDb(const string & src_db_name)
01123 {
01124     _ASSERT(src_db_name.size());
01125     CRef<CSeqDBExpert> src_db(new CSeqDBExpert(src_db_name,
01126                                                m_IsProtein
01127                                                ? CSeqDB::eProtein
01128                                                : CSeqDB::eNucleotide));
01129     
01130     SetSourceDb(src_db);
01131 }
01132 
01133 void CBuildDatabase::SetLinkouts(const TLinkoutMap & linkouts,
01134                                  bool                keep_links)
01135 {
01136     m_LogFile << "Keep Linkouts: " << (keep_links ? "T" : "F") << endl;
01137     MapToLMBits(linkouts, m_Id2Links);
01138     m_KeepLinks = keep_links;
01139 }
01140 
01141 void CBuildDatabase::SetMembBits(const TLinkoutMap & membbits,
01142                                  bool                keep_mbits)
01143 {
01144     m_LogFile << "Keep MBits: " << (keep_mbits ? "T" : "F") << endl;
01145     MapToLMBits(membbits, m_Id2Mbits);
01146     m_KeepMbits = keep_mbits;
01147 }
01148 
01149 bool
01150 CBuildDatabase::Build(const vector<string> & ids,
01151                       CNcbiIstream         * fasta_file)
01152 {
01153     CStopWatch sw(CStopWatch::eStart);
01154     
01155     StartBuild();
01156     
01157     bool success = AddIds(ids);
01158     
01159     if (success) {
01160         success = AddFasta(*fasta_file);
01161     }
01162     
01163     bool success2 = EndBuild();
01164     
01165     success = success || success2;
01166     
01167     double t = sw.Elapsed();
01168     
01169     m_LogFile << "Total sequences stored: " << m_OIDCount << endl;
01170     m_LogFile << "Total deflines stored: " << m_DeflineCount << endl;
01171     
01172     m_LogFile << "Total time to build database: "
01173               << t << " seconds.\n" << endl;
01174     
01175     return success;
01176 }
01177 
01178 void CBuildDatabase::StartBuild()
01179 {
01180 }
01181 
01182 bool CBuildDatabase::AddIds(const vector<string> & ids)
01183 {
01184     
01185     bool success = true;
01186     
01187     // Resolve all ids to GIs, storing them in a GI list.
01188     
01189     CRef<CInputGiList> gi_list;
01190     
01191     if (m_SourceDb.NotEmpty() && ! ids.empty()) {
01192         gi_list = x_ResolveGis(ids);
01193     }
01194     
01195     // Translate the GI list.
01196     
01197     if (gi_list.NotEmpty() &&
01198         (gi_list->GetNumGis() || gi_list->GetNumSis())) {
01199         
01200         // The process of constructing a SeqDB object with a user GI
01201         // list causes translation of the User GI list, and is the
01202         // fastest way of performing such a translation in bulk.  It
01203         // is possible to iterate the list afterwards to determine
01204         // what subset of it that has been translated; non-translated
01205         // GIs will need to be fetched using a data loader.
01206         //
01207         // It is not necessary, however, to iterate the GI list to
01208         // find OIDs that correspond to the filtered DB; these can be
01209         // found using OID iteration over SeqDB, which produces a
01210         // better ordering inasmuch as the reads from the source
01211         // sequence data will be sequential on disk.
01212         
01213         _ASSERT(m_SourceDb.NotEmpty());
01214         
01215         CRef<CSeqDBExpert> filtered
01216             (new CSeqDBExpert(m_SourceDb->GetDBNameList(),
01217                               m_SourceDb->GetSequenceType(),
01218                               &* gi_list));
01219         
01220         m_SourceDb = filtered;
01221         
01222         // Add all local database sequences to the output DB.
01223         
01224         x_DupLocal();
01225         
01226         if (m_Verbose) {
01227             // Map oid to gi.
01228             map<int,int> seen_it;
01229             
01230             for(int i = 0; i < gi_list->GetNumGis(); i++) {
01231                 int this_oid = gi_list->GetGiOid(i).oid;
01232                 int this_gi = gi_list->GetGiOid(i).gi;
01233             
01234                 if (this_oid != -1) {
01235                     if (seen_it.find(this_oid) == seen_it.end()) {
01236                         seen_it[this_oid] = this_gi;
01237                     } else {
01238                         m_LogFile << "GI " << this_gi
01239                                   << " is duplicate of GI "
01240                                   << seen_it[this_oid]
01241                                   << endl;
01242                     }
01243                 }
01244             }
01245         }
01246     }
01247     
01248     if (gi_list.NotEmpty()) {
01249         if (m_UseRemote) {
01250             success = x_AddRemoteSequences(*gi_list);
01251         } else {
01252             success = x_ReportUnresolvedIds(*gi_list);
01253         }
01254     }
01255     
01256     return success;
01257 }
01258 
01259 bool CBuildDatabase::AddFasta(CNcbiIstream & fasta_file)
01260 {
01261     // Add any fasta sequences as well.
01262     bool success = true;
01263     
01264     if (fasta_file) {
01265         CFastaBioseqSource fbs(fasta_file,
01266                                m_IsProtein,
01267                                m_ParseIDs);
01268         
01269         try {
01270             success = AddSequences(fbs);
01271         }
01272         catch (...) {
01273             EndBuild(true);
01274             throw;
01275         }
01276     }
01277     return success;
01278 }
01279 
01280 bool CBuildDatabase::EndBuild(bool erase)
01281 {
01282     bool success = false;
01283     bool can_not_close = false;
01284     
01285     try {
01286         m_OutputDb->Close();
01287     } catch (...) {
01288         if (!erase) {
01289             erase = true;    
01290             can_not_close = true;
01291         }
01292     }
01293 
01294     vector<string> vols;
01295     vector<string> files;
01296     
01297     m_OutputDb->ListVolumes(vols);
01298     m_OutputDb->ListFiles(files);
01299     
01300     m_LogFile << endl;
01301     
01302     _ASSERT(vols.empty() == files.empty());
01303     
01304     if (vols.empty()) {
01305         m_LogFile << "No volumes were created because no sequences were found."
01306                   << endl;
01307         
01308         success = false;
01309     } else {
01310         ITERATE(vector<string>, iterv, vols) {
01311             m_LogFile << "volume: " << *iterv << endl;
01312         }
01313         
01314         m_LogFile << endl;
01315         ITERATE(vector<string>, iterf, files) {
01316             m_LogFile << "file: " << *iterf << endl;
01317             if (erase) {
01318                 CFile(*iterf).Remove();
01319             }
01320         }
01321     }
01322     
01323     m_LogFile << endl;
01324 
01325     if (can_not_close) {
01326         NCBI_THROW(CWriteDBException, eArgErr,
01327                    "Can not close files.");
01328     }
01329     
01330     return success;
01331 }
01332 
01333 
01334 static void
01335 s_SetDeflineBits(objects::CBlast_def_line & defline,
01336                  TIdToBits       & bitmap,
01337                  bool              keep_old,
01338                  bool              is_memb,
01339                  vector<string>  & keys)
01340 {
01341     bool found = false;
01342     int value = 0;
01343     
01344     ITERATE(vector<string>, key, keys) {
01345         if (! key->size())
01346             continue;
01347         
01348         TIdToBits::iterator item = bitmap.find(*key);
01349         
01350         if (item != bitmap.end()) {
01351             found = true;
01352             value |= item->second;
01353         }
01354     }
01355     
01356     if (found) {
01357         list<int> & linkv = (is_memb
01358                              ? defline.SetMemberships()
01359                              : defline.SetLinks());
01360             
01361         if (! keep_old) {
01362             linkv.clear();
01363         }
01364             
01365         if (linkv.empty()) {
01366             linkv.push_back(value);
01367         } else {
01368             linkv.front() |= value;
01369         }
01370     } else {
01371         if (! keep_old) {
01372             if (is_memb) {
01373                 defline.ResetMemberships();
01374             } else {
01375                 defline.ResetLinks();
01376             }
01377         }
01378     }
01379 }
01380 
01381 void 
01382 CBuildDatabase::x_SetLinkAndMbit(CRef<objects::CBlast_def_line_set> headers)
01383 {
01384     vector<string> keys;
01385     
01386     NON_CONST_ITERATE(CBlast_def_line_set::Tdata, iter, headers->Set()) {
01387         CBlast_def_line & defline = **iter;
01388         GetDeflineKeys(defline, keys);
01389         
01390         s_SetDeflineBits(defline, m_Id2Links, m_KeepLinks, false, keys);
01391         s_SetDeflineBits(defline, m_Id2Mbits, m_KeepMbits, true, keys);
01392     }
01393 }
01394 
01395 void CBuildDatabase::SetMaxFileSize(Uint8 max_file_size)
01396 {
01397     m_OutputDb->SetMaxFileSize(max_file_size);
01398 }
01399 
01400 int
01401 CBuildDatabase::RegisterMaskingAlgorithm(EBlast_filter_program program, 
01402                                          const string        & options,
01403                                          const string        & name)
01404 {
01405 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION  > 550)) && \
01406      (!defined(NCBI_COMPILER_MIPSPRO)) )
01407     return m_OutputDb->RegisterMaskAlgorithm(program, options, name);
01408 #else
01409     return 0;
01410 #endif
01411 }
01412 
01413 void CBuildDatabase::SetMaskDataSource(IMaskDataSource & ranges)
01414 {
01415     m_MaskData.Reset(& ranges);
01416 }
01417 
01418 END_NCBI_SCOPE
Modified on Wed May 23 13:10:40 2012 by modify_doxy.py rev. 337098