|
NCBI C++ ToolKit
|
00001 /* $Id: build_db.cpp 52637 2012-01-13 14:28:18Z fongah2 $ 00002 * =========================================================================== 00003 * 00004 * PUBLIC DOMAIN NOTICE 00005 * National Center for Biotechnology Information 00006 * 00007 * This software/database is a "United States Government Work" under the 00008 * terms of the United States Copyright Act. It was written as part of 00009 * the author's official duties as a United States Government employee and 00010 * thus cannot be copyrighted. This software/database is freely available 00011 * to the public for use. The National Library of Medicine and the U.S. 00012 * Government have not placed any restriction on its use or reproduction. 00013 * 00014 * Although all reasonable efforts have been taken to ensure the accuracy 00015 * and reliability of the software and data, the NLM and the U.S. 00016 * Government do not and cannot warrant the performance or results that 00017 * may be obtained by using this software or data. The NLM and the U.S. 00018 * Government disclaim all warranties, express or implied, including 00019 * warranties of performance, merchantability or fitness for any particular 00020 * purpose. 00021 * 00022 * Please cite the author in any work or product based on this material. 00023 * 00024 * =========================================================================== 00025 * 00026 * Author: Kevin Bealer 00027 * 00028 */ 00029 00030 /** @file build_db.cpp 00031 Code to build a database given various sources of sequence data. 00032 */ 00033 #ifndef SKIP_DOXYGEN_PROCESSING 00034 static char const rcsid[] = "$Id: build_db.cpp 52637 2012-01-13 14:28:18Z fongah2 $"; 00035 #endif /* SKIP_DOXYGEN_PROCESSING */ 00036 00037 #include <ncbi_pch.hpp> 00038 00039 // Blast databases 00040 00041 #include <objtools/blast/seqdb_reader/seqdbexpert.hpp> 00042 #include <objtools/blast/seqdb_writer/writedb.hpp> 00043 #include <objtools/readers/fasta.hpp> 00044 00045 // Object Manager 00046 00047 #include <objmgr/object_manager.hpp> 00048 #include <objmgr/scope.hpp> 00049 #include <objmgr/seq_vector.hpp> 00050 #include <objtools/readers/reader_exception.hpp> // for CObjReaderParseException 00051 00052 // Other utilities 00053 00054 #include <util/sequtil/sequtil_convert.hpp> 00055 00056 // Local 00057 00058 #include <objtools/blast/seqdb_writer/build_db.hpp> 00059 #include <objtools/blast/seqdb_writer/multisource_util.hpp> 00060 00061 #ifndef SKIP_DOXYGEN_PROCESSING 00062 BEGIN_NCBI_SCOPE 00063 USING_SCOPE(objects); 00064 #endif 00065 00066 int debug_mode = 0; 00067 00068 void CBuildDatabase::x_ResolveRemoteId(CRef<objects::CSeq_id> & seqid, int & gi) 00069 { 00070 CScope::TIds ids = x_GetScope().GetIds(*seqid); 00071 00072 bool have_seqid = false; 00073 bool have_gi = false; 00074 00075 gi = 0; 00076 00077 ITERATE(CScope::TIds, iter, ids) { 00078 CConstRef<CSeq_id> id = iter->GetSeqId(); 00079 if (debug_mode > 5) 00080 m_LogFile << "Seq-id " << seqid->AsFastaString() 00081 << " contains id " << id->AsFastaString() << endl; 00082 00083 if (id->IsGi()) { 00084 if (gi > 0) { 00085 if (debug_mode > 5) 00086 m_LogFile << "WARNING: multiple GIs discovered; gi[0] = " 00087 << gi << endl; 00088 } else { 00089 if (debug_mode > 5) 00090 m_LogFile << "Seq-id " << seqid->AsFastaString() 00091 << " resolved to " 00092 << id->GetGi() << endl; 00093 gi = id->GetGi(); 00094 have_gi = true; 00095 } 00096 } else if ((! have_seqid) && (id->Which() == seqid->Which())) { 00097 m_LogFile << "Remote: Resolving <" << seqid->AsFastaString() 00098 << "> to <" << id->AsFastaString() << ">" << endl; 00099 00100 if (id->GetTextseq_Id() == NULL || 00101 id->GetTextseq_Id()->IsSetVersion() == false) { 00102 00103 m_LogFile 00104 << "Warning: Resolution still does not provide version." 00105 << endl; 00106 } else { 00107 seqid.Reset(const_cast<CSeq_id*>(id.GetPointer())); 00108 have_seqid = true; 00109 } 00110 } 00111 00112 if (have_gi) 00113 break; 00114 } 00115 } 00116 00117 // Resolve all ids to GIs, storing them in a GI list. 00118 00119 CRef<CInputGiList> CBuildDatabase::x_ResolveGis(const vector<string> & ids) 00120 { 00121 CRef<CInputGiList> gi_list(new CInputGiList); 00122 00123 ITERATE(vector<string>, id, ids) { 00124 // There are three possibilities: 00125 // 00126 // 1. Numbers are added to the list as GIs. 00127 // 2. Remote services may be called to determine the most 00128 // recent version. 00129 // 3. Non-numerical types are added to the list as Seq-ids. 00130 // 00131 // For #2, the remote service call is only made if: 00132 // 00133 // A. Remote services are enabled. 00134 // B. The Seq-id can have a version (only CTextseq_id types.) 00135 // C. The version is not present. 00136 00137 int gi(0); 00138 bool specific = false; 00139 CRef<CSeq_id> seqid; 00140 00141 bool worked = CheckAccession(*id, gi, seqid, specific); 00142 00143 // If a source database is specified, try that as a backup 00144 // resolution mechanism. 00145 00146 if (! worked) { 00147 if (m_SourceDb.NotEmpty()) { 00148 worked = x_ResolveFromSource(*id, seqid); 00149 } 00150 } 00151 00152 if (! worked) { 00153 m_LogFile << "Did not recognize id: \"" << *id << "\"" << endl; 00154 continue; 00155 } 00156 00157 // 1. Numeric GI 00158 00159 if (gi != 0) { 00160 if (debug_mode > 5) 00161 m_LogFile << "Found numerical GI:" << gi << endl; 00162 00163 gi_list->AppendGi(gi); 00164 continue; 00165 } 00166 00167 // 2. Possible remote resolution. We look for a GI and if 00168 // that is not found, try to find a Seq-id of the same type 00169 // (but with a version). 00170 00171 if (m_UseRemote && (! specific)) { 00172 x_ResolveRemoteId(seqid, gi); 00173 00174 if (gi != 0) { 00175 gi_list->AppendGi(gi); 00176 continue; 00177 } 00178 } 00179 00180 // 3. Just add the Seq-id as a Seq-id. 00181 00182 gi_list->AppendSi(*id); 00183 } 00184 00185 return gi_list; 00186 } 00187 00188 bool CBuildDatabase::x_ResolveFromSource(const string & acc, 00189 CRef<objects::CSeq_id> & id) 00190 { 00191 if (m_SourceDb.Empty()) { 00192 return false; 00193 } 00194 00195 vector<int> oids; 00196 m_SourceDb->AccessionToOids(acc, oids); 00197 00198 bool found(false), done(false); 00199 00200 ITERATE(vector<int>, oid, oids) { 00201 list< CRef<CSeq_id> > ids = m_SourceDb->GetSeqIDs(*oid); 00202 00203 ITERATE(list< CRef<CSeq_id> >, seqid, ids) { 00204 CRef<CSeq_id> s = *seqid; 00205 00206 string S = s->AsFastaString(); 00207 size_t pos = S.find(acc); 00208 00209 if (pos != string::npos) { 00210 size_t endpos = pos + acc.size(); 00211 00212 bool start_okay = (pos == 0 || S[pos-1] == '|'); 00213 bool end_okay = ((endpos == S.size()) || 00214 (S[endpos] == '.' || 00215 S[endpos] == '|')); 00216 00217 if (start_okay && end_okay) { 00218 done = true; 00219 } 00220 00221 if (done || (! found)) { 00222 found = true; 00223 id = s; 00224 } 00225 } 00226 00227 if (done) 00228 break; 00229 } 00230 00231 if (done) 00232 break; 00233 } 00234 00235 return found; 00236 } 00237 00238 void CBuildDatabase::x_DupLocal() 00239 { 00240 TIdToBits bitset; 00241 00242 // Get sequence, deflines, ambiguities, and sometimes pigs. The 00243 // simplest route (for WriteDB) is raw data + asn deflines, so we 00244 // use that when possible. 00245 00246 CStopWatch sw(CStopWatch::eStart); 00247 int count = 0; 00248 00249 for(int oid = 0; m_SourceDb->CheckOrFindOID(oid); oid++) { 00250 // Raw data. 00251 00252 const char * buffer (0); 00253 int slength(0); 00254 int alength(0); 00255 00256 m_SourceDb->GetRawSeqAndAmbig(oid, & buffer, & slength, & alength); 00257 00258 CSequenceReturn seqret(*m_SourceDb, buffer); 00259 00260 CTempString sequence(buffer, slength); 00261 CTempString ambig(buffer + slength, alength); 00262 00263 // Deflines 00264 00265 CRef<CBlast_def_line_set> headers = m_SourceDb->GetHdr(oid); 00266 m_DeflineCount += headers->Get().size(); 00267 m_OIDCount ++; 00268 00269 x_SetLinkAndMbit(headers); 00270 00271 // Always include the taxid; although OPTIONAL, some programs 00272 // expect it, since the C ASN.1 loaders always emit integers. 00273 00274 m_Taxids->FixTaxId(headers); 00275 00276 // Now, add the sequence to the WriteDB database. 00277 00278 m_OutputDb->AddSequence(sequence, ambig); 00279 m_OutputDb->SetDeflines(*headers); 00280 count ++; 00281 } 00282 00283 if (count) { 00284 double t = sw.Elapsed(); 00285 00286 m_LogFile << "Duplication from source DB; duplicated " 00287 << count << " sequences in " << t << " seconds." << endl; 00288 } 00289 } 00290 00291 // This could be moved to writedb once it is tested and working. 00292 00293 static CConstRef<CBioseq> s_FixBioseqDeltas(CConstRef<objects::CBioseq> bs) 00294 { 00295 if ((! bs->CanGetInst()) || bs->GetInst().CanGetSeq_data()) { 00296 return bs; 00297 } 00298 00299 if (bs->CanGetInst() && 00300 bs->GetInst().CanGetExt() && 00301 bs->GetInst().GetExt().IsDelta() && 00302 bs->GetInst().CanGetMol() && 00303 !CSeq_inst::IsNa(bs->GetInst().GetMol())) { 00304 00305 NCBI_THROW(CMultisourceException, eArg, 00306 "Protein delta sequences are not supported."); 00307 } 00308 00309 try { 00310 const CDelta_ext & dext = bs->GetInst().GetExt().GetDelta(); 00311 00312 if(dext.Get().front()->Which() != CDelta_seq::e_Literal) 00313 return bs; 00314 00315 typedef list< CRef< CDelta_seq > > TItems; 00316 00317 // Don't really want to use na4, because a half byte at the 00318 // end of a string would require that string to be manually 00319 // adjusted before appending. 00320 00321 string seq8na; 00322 if (bs->GetInst().CanGetLength()) { 00323 seq8na.reserve(bs->GetInst().GetLength()); 00324 } 00325 00326 string na8; 00327 00328 ITERATE(TItems, item, dext.Get()) { 00329 const CSeq_literal & L = (**item).GetLiteral(); 00330 00331 if (!L.CanGetSeq_data()) { 00332 if (L.CanGetLength()){ 00333 seq8na.append(L.GetLength(), 0x0f); 00334 continue; 00335 } else { 00336 NCBI_THROW(CMultisourceException, eArg, 00337 "Part of the delta sequence, including its length, is un-available."); 00338 } 00339 } 00340 00341 if (L.GetSeq_data().IsNcbi2na()) { 00342 CSeqConvert::Convert(L.GetSeq_data().GetNcbi2na(), 00343 CSeqUtil::e_Ncbi2na, 00344 0, 00345 L.GetLength(), 00346 na8, 00347 CSeqUtil::e_Ncbi8na); 00348 } else if (L.GetSeq_data().IsNcbi4na()) { 00349 CSeqConvert::Convert(L.GetSeq_data().GetNcbi4na(), 00350 CSeqUtil::e_Ncbi4na, 00351 0, 00352 L.GetLength(), 00353 na8, 00354 CSeqUtil::e_Ncbi8na); 00355 } else { 00356 NCBI_THROW(CMultisourceException, eArg, 00357 "Unhandled type of sequence data encountered."); 00358 } 00359 00360 seq8na += na8; 00361 na8.resize(0); 00362 } 00363 00364 // Now convert back to 4na, since WriteDB does not yet handle 00365 // 8na sequences. 00366 00367 int length = seq8na.size(); 00368 vector<char> seq4na; 00369 CSeqConvert::Convert(seq8na, 00370 CSeqUtil::e_Ncbi8na, 00371 0, 00372 length, 00373 seq4na, 00374 CSeqUtil::e_Ncbi4na); 00375 00376 // Copy the needed fields of the CBioseq (but remove the delta 00377 // sequence) and add a Seq-data. 00378 00379 CRef<CBioseq> bs2(new CBioseq); 00380 00381 if (bs->IsSetId()) { 00382 bs2->SetId() = bs->GetId(); 00383 } 00384 00385 if (bs->IsSetDescr()) { 00386 bs2->SetDescr(const_cast<CSeq_descr&>(bs->GetDescr())); 00387 } 00388 00389 CRef<CSeq_inst> inst(new CSeq_inst); 00390 00391 inst->SetSeq_data().SetNcbi4na().Set().swap(seq4na); 00392 inst->SetMol(CSeq_inst::eMol_na); 00393 inst->SetLength(length); 00394 inst->SetRepr(CSeq_inst::eRepr_raw); 00395 00396 bs2->SetInst(*inst); 00397 00398 if (bs->IsSetAnnot()) { 00399 bs2->SetAnnot() = bs->GetAnnot(); 00400 } 00401 00402 bs = bs2; 00403 } 00404 catch(CInvalidChoiceSelection &) { 00405 NCBI_THROW(CMultisourceException, eArg, 00406 "Bioseq must have Seq-data or " 00407 "Delta containing only literals."); 00408 } 00409 00410 return bs; 00411 } 00412 00413 00414 void CBuildDatabase::x_AddPig(CRef<objects::CBlast_def_line_set> headers) 00415 { 00416 int pig = 0; 00417 const CBlast_def_line & defline = *(headers->Get().front()); 00418 if (defline.IsSetOther_info()) 00419 pig = defline.GetOther_info().front(); 00420 00421 m_OutputDb->SetPig(pig); 00422 } 00423 00424 void CBuildDatabase::x_EditHeaders(CRef<objects::CBlast_def_line_set> headers) 00425 { 00426 // Always include the taxid; although OPTIONAL, some programs 00427 // expect it, since the C ASN.1 loaders always emit integers. 00428 00429 m_Taxids->FixTaxId(headers); 00430 00431 // Edit the linkouts 00432 00433 x_SetLinkAndMbit(headers); 00434 } 00435 00436 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \ 00437 (!defined(NCBI_COMPILER_MIPSPRO)) ) 00438 void 00439 CBuildDatabase::x_AddMasksForSeqId(const list< CRef<CSeq_id> >& ids) 00440 { 00441 if (m_MaskData.Empty()) { 00442 return; 00443 } 00444 00445 const CMaskedRangesVector& rng = m_MaskData->GetRanges(ids); 00446 if (rng.empty()) { 00447 return; 00448 } 00449 00450 vector <int> gis; 00451 ITERATE(list< CRef<CSeq_id> >, id, ids) { 00452 if ((*id)->IsGi()) { 00453 gis.push_back((*id)->GetGi()); 00454 } 00455 } 00456 m_OutputDb->SetMaskData(rng, gis); 00457 m_FoundMatchingMasks = true; 00458 } 00459 #endif 00460 00461 bool CBuildDatabase::x_EditAndAddBioseq(CConstRef<objects::CBioseq> bs, 00462 objects::CSeqVector * sv, 00463 bool add_pig) 00464 { 00465 CRef<CBlast_def_line_set> headers = 00466 CWriteDB::ExtractBioseqDeflines(*bs, m_ParseIDs); 00467 00468 x_EditHeaders(headers); 00469 00470 // Add the sequence 00471 if (sv) { 00472 m_OutputDb->AddSequence(*bs, *sv); 00473 } else { 00474 bs = s_FixBioseqDeltas(bs); 00475 if(bs->GetInst().CanGetSeq_data()) 00476 m_OutputDb->AddSequence(*bs); 00477 else 00478 return false; 00479 } 00480 00481 m_DeflineCount += headers->Get().size(); 00482 m_OIDCount ++; 00483 00484 if(add_pig) { 00485 x_AddPig(headers); 00486 } 00487 00488 m_OutputDb->SetDeflines(*headers); 00489 00490 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \ 00491 (!defined(NCBI_COMPILER_MIPSPRO)) ) 00492 const list< CRef<CSeq_id> > & ids = bs->GetId(); 00493 x_AddMasksForSeqId(ids); 00494 #endif 00495 return true; 00496 } 00497 00498 void CBuildDatabase::x_AddOneRemoteSequence(const objects::CSeq_id & seqid, 00499 bool & found_all, 00500 bool & error) 00501 { 00502 // Get handle and bioseq 00503 00504 CConstRef<CBioseq> bs; 00505 CBioseq_Handle bsh; 00506 00507 try { 00508 bsh = x_GetScope().GetBioseqHandle(seqid); 00509 bs = bsh.GetCompleteBioseq(); 00510 00511 if (debug_mode > 5) m_LogFile << MSerial_AsnText << *bs << endl; 00512 } 00513 catch (const CException & e) { 00514 m_LogFile << "Caught exception for query: " 00515 << seqid.AsFastaString() << endl 00516 << e.what() << endl; 00517 found_all = false; 00518 error = true; 00519 } 00520 00521 if (bsh.GetState() & CBioseq_Handle::fState_not_found) { 00522 error = true; 00523 } 00524 00525 00526 00527 CSeqVector sv(bsh); 00528 00529 if(!x_EditAndAddBioseq(bs, & sv)) 00530 error = true; 00531 00532 if (error) { 00533 if (debug_mode > 5) 00534 m_LogFile << "Could not find entry for: " 00535 << seqid.AsFastaString() << endl; 00536 00537 found_all = false; 00538 return; 00539 } 00540 00541 if (debug_mode > 5) 00542 m_LogFile << "-- REMOTE: Found sequence " 00543 << seqid.AsFastaString() << endl; 00544 } 00545 00546 bool CBuildDatabase::x_AddRemoteSequences(CInputGiList & gi_list) 00547 { 00548 CStopWatch sw(CStopWatch::eStart); 00549 int count = 0; 00550 00551 bool found_all = true; 00552 00553 int num_gis = gi_list.GetNumGis(); 00554 int i = 0; 00555 00556 for(i = 0; i < num_gis; i++) { 00557 if (m_Verbose) 00558 m_LogFile << "GI " << gi_list.GetKey<int>(i); 00559 00560 // We only need to fetch here for those cases where the SeqDB 00561 // attempt could not translate the GI. 00562 00563 if (gi_list.GetGiOid(i).oid == -1) { 00564 if (m_Verbose) 00565 m_LogFile << " not found locally; adding remotely." << endl; 00566 00567 CRef<CSeq_id> id(new CSeq_id); 00568 id->SetGi(gi_list.GetKey<int>(i)); 00569 00570 bool error = false; 00571 00572 x_AddOneRemoteSequence(*id, found_all, error); 00573 count++; 00574 } else { 00575 if (m_Verbose) 00576 m_LogFile << " found locally; not adding remotely." << endl; 00577 } 00578 } 00579 00580 int num_seqids = gi_list.GetNumSis(); 00581 00582 for(i = 0; i < num_seqids; i++) { 00583 if (m_Verbose) 00584 m_LogFile << "Seq-id " 00585 << gi_list.GetKey<string>(i); 00586 00587 // We only need to fetch here for those cases where the SeqDB 00588 // attempt could not translate the GI. 00589 00590 if (gi_list.GetSiOid(i).oid == -1) { 00591 if (m_Verbose) 00592 m_LogFile << " not found locally; adding remotely." << endl; 00593 00594 bool error = false; 00595 00596 string acc = gi_list.GetKey<string>(i); 00597 CRef<CSeq_id> id(new CSeq_id(acc)); 00598 x_AddOneRemoteSequence(*id, found_all, error); 00599 count++; 00600 } else { 00601 if (m_Verbose) 00602 m_LogFile << " found locally; not adding remotely." << endl; 00603 } 00604 } 00605 00606 if (count) { 00607 double t = sw.Elapsed(); 00608 00609 m_LogFile << "Adding sequences from remote source; added " 00610 << count << " sequences in " << t << " seconds." << endl; 00611 } 00612 00613 return found_all; 00614 } 00615 00616 bool 00617 CBuildDatabase::x_ReportUnresolvedIds(const CInputGiList & gi_list) const 00618 { 00619 bool success = true; 00620 00621 int num_gis = gi_list.GetNumGis(); 00622 00623 int unresolved = 0; 00624 00625 int i; 00626 for(i = 0; i < num_gis; i++) { 00627 // We only need to fetch here for those cases where the SeqDB 00628 // attempt could not translate the GI. 00629 00630 if (gi_list.GetGiOid(i).oid == -1) { 00631 if (m_Verbose) 00632 m_LogFile << "GI " << gi_list.GetKey<int>(i) 00633 << " was not resolvable." << endl; 00634 00635 success = false; 00636 unresolved ++; 00637 } else { 00638 if (m_Verbose) 00639 m_LogFile << "GI " << gi_list.GetKey<int>(i) 00640 << " found locally." << endl; 00641 } 00642 } 00643 00644 int num_seqids = gi_list.GetNumSis(); 00645 00646 for(i = 0; i < num_seqids; i++) { 00647 // We only need to fetch here for those cases where the SeqDB 00648 // attempt could not translate the GI. 00649 00650 if (gi_list.GetSiOid(i).oid == -1) { 00651 if (m_Verbose) 00652 m_LogFile << "Seq-id " 00653 << gi_list.GetKey<string>(i) 00654 << " was not resolvable." << endl; 00655 00656 unresolved ++; 00657 success = false; 00658 } else { 00659 if (m_Verbose) 00660 m_LogFile << "Seq-id " 00661 << gi_list.GetKey<string>(i) 00662 << " found locally." << endl; 00663 } 00664 } 00665 00666 if (unresolved) { 00667 m_LogFile << "Could not resolve " << unresolved << " IDs." << endl; 00668 } 00669 00670 success = false; 00671 unresolved ++; 00672 00673 return success; 00674 } 00675 00676 class CFastaBioseqSource : public IBioseqSource { 00677 public: 00678 CFastaBioseqSource(CNcbiIstream & fasta_file, 00679 bool is_protein, 00680 bool parse_ids); 00681 00682 ~CFastaBioseqSource(); 00683 00684 virtual CConstRef<CBioseq> GetNext(); 00685 00686 private: 00687 CRef<ILineReader> m_LineReader; 00688 CFastaReader* m_FastaReader; 00689 }; 00690 00691 CFastaBioseqSource::CFastaBioseqSource(CNcbiIstream & fasta_file, 00692 bool is_protein, 00693 bool parse_ids) 00694 : m_FastaReader(NULL) 00695 { 00696 m_LineReader.Reset(new CBufferedLineReader(fasta_file)); 00697 00698 typedef CFastaReader::EFlags TFlags; 00699 00700 int iflags = CFastaReader::fAllSeqIds | CFastaReader::fForceType; 00701 00702 if (is_protein) { 00703 iflags |= CFastaReader::fAssumeProt; 00704 } else { 00705 iflags |= CFastaReader::fAssumeNuc; 00706 } 00707 00708 if (parse_ids) { 00709 iflags |= CFastaReader::fAllSeqIds; 00710 } else { 00711 iflags |= CFastaReader::fNoParseID; 00712 } 00713 00714 TFlags flags = (TFlags) iflags; 00715 00716 m_FastaReader = new CFastaReader(*m_LineReader, flags); 00717 } 00718 00719 CFastaBioseqSource::~CFastaBioseqSource() 00720 { 00721 delete m_FastaReader; 00722 } 00723 00724 CConstRef<CBioseq> CFastaBioseqSource::GetNext() 00725 { 00726 CConstRef<CBioseq> rv; 00727 00728 if (m_LineReader.NotEmpty() && ! m_LineReader->AtEOF()) { 00729 CRef<CSeq_entry> entry; 00730 try { entry = m_FastaReader->ReadOneSeq(); } 00731 catch (const CObjReaderParseException& e) { 00732 static const string kKeyword("m_Pos = "); 00733 SIZE_TYPE start = NStr::Find(e.what(), kKeyword); 00734 SIZE_TYPE end = NStr::Find(e.what(), ")", start); 00735 string pos("unknown"); 00736 if (start != NPOS && end != NPOS) { 00737 start += kKeyword.size(); 00738 pos = string(e.what()).substr(start, end-start); 00739 } 00740 ERR_POST(Error << "Error while reading input at position " << pos); 00741 ERR_POST(Error << "Aborting processing prematurely."); 00742 // additional handling needed 00743 throw(e); 00744 } 00745 00746 if (entry.NotEmpty()) { 00747 _ASSERT(entry->IsSeq()); 00748 rv.Reset(& entry->GetSeq()); 00749 } 00750 } 00751 00752 // Any failure to read a Bioseq is considered an EOF. 00753 00754 if (rv.Empty()) { 00755 m_LineReader.Reset(); 00756 } 00757 00758 return rv; 00759 } 00760 00761 bool CBuildDatabase::AddSequences(IBioseqSource & src, bool add_pig) 00762 { 00763 bool found = false; 00764 00765 CStopWatch sw(CStopWatch::eStart); 00766 int count = 0; 00767 00768 CConstRef<CBioseq> bs = src.GetNext(); 00769 00770 while(bs.NotEmpty()) { 00771 string bioseq_id("Unknown"); 00772 00773 if (bs->CanGetId()) { 00774 const list< CRef<CSeq_id> > & ids = bs->GetId(); 00775 if (! ids.empty() && ids.front().NotEmpty()) { 00776 bioseq_id.assign(ids.front()->AsFastaString()); 00777 } 00778 } 00779 00780 if(bs->IsAa() != m_IsProtein ){ 00781 bs = src.GetNext(); 00782 continue; 00783 } 00784 00785 if ((bs->GetLength() == 0) || (!x_EditAndAddBioseq(bs, NULL, add_pig))){ 00786 m_LogFile << "Ignoring sequence '" << bioseq_id 00787 << "' as it has no sequence data" << endl; 00788 bs = src.GetNext(); 00789 continue; 00790 } 00791 00792 if (m_Verbose) { 00793 m_LogFile << "Adding bioseq from fasta; first id is: '" << bioseq_id 00794 << "'" << endl; 00795 } 00796 00797 // No linkouts or memberships here (yet). 00798 00799 found = true; 00800 00801 count++; 00802 00803 if (debug_mode > 5) m_LogFile << "-- FASTA: Found sequence." << endl; 00804 00805 bs = src.GetNext(); 00806 } 00807 00808 if (count) { 00809 double t = sw.Elapsed(); 00810 00811 m_LogFile << "Adding sequences from FASTA; added " 00812 << count << " sequences in " << t << " seconds." << endl; 00813 } 00814 00815 return found; 00816 } 00817 00818 bool CBuildDatabase::AddSequences(IRawSequenceSource & src) 00819 { 00820 CStopWatch sw(CStopWatch::eStart); 00821 00822 bool done = false; 00823 bool rv = false; 00824 00825 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \ 00826 (!defined(NCBI_COMPILER_MIPSPRO)) ) 00827 // Get all column names. 00828 00829 vector<string> all_names; 00830 map<int, int> in2out; 00831 int mask_id = -1; 00832 00833 src.GetColumnNames(all_names); 00834 00835 for(int i = 0; i < (int) all_names.size(); i++) { 00836 string name = all_names[i]; 00837 int in_id = src.GetColumnId(name); 00838 00839 // skip masking data column 00840 if (name == "BlastDb/MaskData") { 00841 mask_id = in_id; 00842 continue; 00843 } 00844 int out_id = m_OutputDb->FindColumn(name); 00845 00846 if (out_id < 0) { 00847 out_id = m_OutputDb->CreateUserColumn(name); 00848 } 00849 00850 typedef map<string,string> StringPairMap; 00851 const StringPairMap & meta = src.GetColumnMetaData(in_id); 00852 00853 ITERATE(StringPairMap, iter, meta) { 00854 m_OutputDb->AddColumnMetaData(out_id, iter->first, iter->second); 00855 } 00856 00857 in2out[in_id] = out_id; 00858 } 00859 #endif 00860 // Copy all data. 00861 00862 vector<CTempString> column_blobs; 00863 vector<int> column_ids; 00864 00865 int count = 0; 00866 00867 while(! done) { 00868 CTempString sequence, ambiguities; 00869 CRef<CBlast_def_line_set> deflines; 00870 CMaskedRangesVector mask_data; 00871 00872 if (src.GetNext(sequence, 00873 ambiguities, 00874 deflines, 00875 mask_data, 00876 column_ids, 00877 column_blobs)) { 00878 00879 // Copy data 00880 00881 _ASSERT(column_blobs.size() == column_ids.size()); 00882 00883 if (sequence.empty()) { 00884 NCBI_THROW(CMultisourceException, eArg, 00885 "Error in raw data: no sequence"); 00886 } 00887 00888 if ((! ambiguities.empty()) && m_IsProtein) { 00889 NCBI_THROW(CMultisourceException, eArg, 00890 "Error in raw data: " 00891 "protein db cannot with ambiguities"); 00892 } 00893 00894 if (deflines.Empty()) { 00895 NCBI_THROW(CMultisourceException, eArg, 00896 "Error in raw data: no headers provided"); 00897 } 00898 00899 x_EditHeaders(deflines); 00900 00901 m_OutputDb->AddSequence(sequence, ambiguities); 00902 x_AddPig(deflines); 00903 m_OutputDb->SetDeflines(*deflines); 00904 00905 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \ 00906 (!defined(NCBI_COMPILER_MIPSPRO)) ) 00907 for(int i = 0; i < (int)column_ids.size(); i++) { 00908 int in_id = column_ids[i]; 00909 if (in_id == mask_id) continue; 00910 00911 if (column_blobs[i].size() == 0) 00912 continue; 00913 00914 _ASSERT(in2out.find(in_id) != in2out.end()); 00915 00916 int out_id = in2out[in_id]; 00917 00918 CTempString blob_in = column_blobs[i]; 00919 CBlastDbBlob & blob_out = m_OutputDb->SetBlobData(out_id); 00920 00921 blob_out.Clear(); 00922 blob_out.WriteRaw(& blob_in.data()[0], blob_in.size()); 00923 } 00924 // Don't forget about the IMaskDataSource! 00925 vector <int> gis; // GIs associated with this sequence 00926 if (!mask_data.empty() || !m_MaskData.Empty()) { 00927 ITERATE(CBlast_def_line_set::Tdata, defline, deflines->Get()) { 00928 const list< CRef<CSeq_id> > & ids = (*defline)->GetSeqid(); 00929 ITERATE(list< CRef<CSeq_id> >, id, ids) { 00930 if ((*id)->IsGi()) { 00931 gis.push_back((*id)->GetGi()); 00932 } 00933 } 00934 if (!m_MaskData.Empty()) { 00935 const CMaskedRangesVector rng = m_MaskData->GetRanges(ids); 00936 if (!rng.empty()) { 00937 mask_data.insert(mask_data.end(), rng.begin(), rng.end()); 00938 m_FoundMatchingMasks = true; 00939 } 00940 } 00941 } 00942 } 00943 if (!mask_data.empty()) { 00944 m_OutputDb->SetMaskData(mask_data, gis); 00945 } 00946 #endif 00947 00948 rv = true; 00949 count ++; 00950 } else { 00951 done = true; 00952 } 00953 } 00954 00955 if (count) { 00956 double t = sw.Elapsed(); 00957 00958 m_LogFile << "Adding sequences from raw db source; added " 00959 << count << " sequences in " << t << " seconds." << endl; 00960 } 00961 00962 return rv; 00963 } 00964 00965 static void s_CreateDirectories(const string& dbname) 00966 { 00967 CDirEntry dir_entry(dbname); 00968 string dir_name = dir_entry.GetDir(CDirEntry::eIfEmptyPath_Empty); 00969 if (dir_name.empty()) { 00970 return; 00971 } 00972 00973 CDir d(dir_name); 00974 if ( !d.Exists() ) { 00975 if ( !d.CreatePath() ) { 00976 string msg("Failed to create directory '" + d.GetName() + "'"); 00977 NCBI_THROW(CMultisourceException, eOutputFileError, msg); 00978 } 00979 } 00980 if (!d.CheckAccess(CDirEntry::fWrite)) { 00981 string msg("You do not have write permissions on '" + 00982 d.GetName() + "'"); 00983 NCBI_THROW(CMultisourceException, eOutputFileError, msg); 00984 } 00985 } 00986 00987 CBuildDatabase::CBuildDatabase(const string & dbname, 00988 const string & title, 00989 bool is_protein, 00990 CWriteDB::TIndexType indexing, 00991 bool use_gi_mask, 00992 ostream * logfile) 00993 : m_IsProtein (is_protein), 00994 m_KeepLinks (false), 00995 m_KeepMbits (false), 00996 m_Taxids (new CTaxIdSet()), 00997 m_LogFile (*logfile), 00998 m_UseRemote (true), 00999 m_DeflineCount (0), 01000 m_OIDCount (0), 01001 m_Verbose (false), 01002 m_ParseIDs (((indexing & CWriteDB::eFullIndex) != 0 ? true : false)), 01003 m_FoundMatchingMasks(false) 01004 { 01005 s_CreateDirectories(dbname); 01006 m_LogFile << "\n\nBuilding a new DB, current time: " 01007 << CTime(CTime::eCurrent).AsString() << endl; 01008 01009 m_LogFile << "New DB name: " << dbname << endl; 01010 m_LogFile << "New DB title: " << title << endl; 01011 m_LogFile << "Sequence type: " 01012 << (is_protein ? "Protein" : "Nucleotide") << endl; 01013 01014 CWriteDB::ESeqType seqtype = 01015 (is_protein ? CWriteDB::eProtein : CWriteDB::eNucleotide); 01016 01017 m_OutputDb.Reset(new CWriteDB(dbname, 01018 seqtype, 01019 title, 01020 indexing, 01021 m_ParseIDs, 01022 use_gi_mask)); 01023 01024 // Standard 1 GB limit 01025 01026 m_OutputDb->SetMaxFileSize(1000*1000*1000); 01027 } 01028 01029 CBuildDatabase::CBuildDatabase(const string & dbname, 01030 const string & title, 01031 bool is_protein, 01032 bool sparse, 01033 bool parse_seqids, 01034 bool use_gi_mask, 01035 ostream * logfile) 01036 : m_IsProtein (is_protein), 01037 m_KeepLinks (false), 01038 m_KeepMbits (false), 01039 m_Taxids (new CTaxIdSet()), 01040 m_LogFile (*logfile), 01041 m_UseRemote (true), 01042 m_DeflineCount (0), 01043 m_OIDCount (0), 01044 m_Verbose (false), 01045 m_ParseIDs (parse_seqids), 01046 m_FoundMatchingMasks(false) 01047 { 01048 s_CreateDirectories(dbname); 01049 m_LogFile << "\n\nBuilding a new DB, current time: " 01050 << CTime(CTime::eCurrent).AsString() << endl; 01051 01052 m_LogFile << "New DB name: " << dbname << endl; 01053 m_LogFile << "New DB title: " << title << endl; 01054 m_LogFile << "Sequence type: " 01055 << (is_protein ? "Protein" : "Nucleotide") << endl; 01056 01057 CWriteDB::ESeqType seqtype = 01058 (is_protein ? CWriteDB::eProtein : CWriteDB::eNucleotide); 01059 01060 CWriteDB::EIndexType ix = (sparse 01061 ? CWriteDB::eSparseIndex 01062 : CWriteDB::eDefault); 01063 01064 m_OutputDb.Reset(new CWriteDB(dbname, 01065 seqtype, 01066 title, 01067 ix, 01068 m_ParseIDs, 01069 use_gi_mask)); 01070 01071 // Standard 1 GB limit 01072 01073 m_OutputDb->SetMaxFileSize(1000*1000*1000); 01074 } 01075 01076 CBuildDatabase::~CBuildDatabase() 01077 { 01078 if (m_MaskData.NotEmpty() && !m_FoundMatchingMasks) { 01079 ERR_POST(Error << "No sequences matched any of the masks provided.\n" 01080 << "Please ensure that the -parse_seqids option is used " 01081 << "in the\nfiltering program as well as makeblastdb."); 01082 } 01083 if (!m_Taxids->HasEverFixedId()) { 01084 ERR_POST(Error << "No sequences matched any of the taxids provided."); 01085 } 01086 } 01087 01088 void CBuildDatabase::SetTaxids(CTaxIdSet & taxids) 01089 { 01090 m_Taxids.Reset(& taxids); 01091 } 01092 01093 void CBuildDatabase::SetMaskLetters(const string & letters) 01094 { 01095 m_OutputDb->SetMaskedLetters(letters); 01096 } 01097 01098 CScope & CBuildDatabase::x_GetScope() 01099 { 01100 if (m_Scope.Empty()) { 01101 if (m_ObjMgr.Empty()) { 01102 m_ObjMgr.Reset(CObjectManager::GetInstance()); 01103 } 01104 01105 m_Scope.Reset(new CScope(*m_ObjMgr)); 01106 01107 // Add default loaders (GB loader in this demo) to the scope. 01108 m_Scope->AddDefaults(); 01109 } 01110 01111 return *m_Scope; 01112 } 01113 01114 void CBuildDatabase::SetSourceDb(CRef<CSeqDBExpert> seqdb) 01115 { 01116 m_LogFile << "Configured source DB: " << seqdb->GetDBNameList() << endl; 01117 m_LogFile << "Source DB has title: " << seqdb->GetTitle() << endl; 01118 m_LogFile << "Source DB time stamp: " << seqdb->GetDate() << endl; 01119 m_SourceDb = seqdb; 01120 } 01121 01122 void CBuildDatabase::SetSourceDb(const string & src_db_name) 01123 { 01124 _ASSERT(src_db_name.size()); 01125 CRef<CSeqDBExpert> src_db(new CSeqDBExpert(src_db_name, 01126 m_IsProtein 01127 ? CSeqDB::eProtein 01128 : CSeqDB::eNucleotide)); 01129 01130 SetSourceDb(src_db); 01131 } 01132 01133 void CBuildDatabase::SetLinkouts(const TLinkoutMap & linkouts, 01134 bool keep_links) 01135 { 01136 m_LogFile << "Keep Linkouts: " << (keep_links ? "T" : "F") << endl; 01137 MapToLMBits(linkouts, m_Id2Links); 01138 m_KeepLinks = keep_links; 01139 } 01140 01141 void CBuildDatabase::SetMembBits(const TLinkoutMap & membbits, 01142 bool keep_mbits) 01143 { 01144 m_LogFile << "Keep MBits: " << (keep_mbits ? "T" : "F") << endl; 01145 MapToLMBits(membbits, m_Id2Mbits); 01146 m_KeepMbits = keep_mbits; 01147 } 01148 01149 bool 01150 CBuildDatabase::Build(const vector<string> & ids, 01151 CNcbiIstream * fasta_file) 01152 { 01153 CStopWatch sw(CStopWatch::eStart); 01154 01155 StartBuild(); 01156 01157 bool success = AddIds(ids); 01158 01159 if (success) { 01160 success = AddFasta(*fasta_file); 01161 } 01162 01163 bool success2 = EndBuild(); 01164 01165 success = success || success2; 01166 01167 double t = sw.Elapsed(); 01168 01169 m_LogFile << "Total sequences stored: " << m_OIDCount << endl; 01170 m_LogFile << "Total deflines stored: " << m_DeflineCount << endl; 01171 01172 m_LogFile << "Total time to build database: " 01173 << t << " seconds.\n" << endl; 01174 01175 return success; 01176 } 01177 01178 void CBuildDatabase::StartBuild() 01179 { 01180 } 01181 01182 bool CBuildDatabase::AddIds(const vector<string> & ids) 01183 { 01184 01185 bool success = true; 01186 01187 // Resolve all ids to GIs, storing them in a GI list. 01188 01189 CRef<CInputGiList> gi_list; 01190 01191 if (m_SourceDb.NotEmpty() && ! ids.empty()) { 01192 gi_list = x_ResolveGis(ids); 01193 } 01194 01195 // Translate the GI list. 01196 01197 if (gi_list.NotEmpty() && 01198 (gi_list->GetNumGis() || gi_list->GetNumSis())) { 01199 01200 // The process of constructing a SeqDB object with a user GI 01201 // list causes translation of the User GI list, and is the 01202 // fastest way of performing such a translation in bulk. It 01203 // is possible to iterate the list afterwards to determine 01204 // what subset of it that has been translated; non-translated 01205 // GIs will need to be fetched using a data loader. 01206 // 01207 // It is not necessary, however, to iterate the GI list to 01208 // find OIDs that correspond to the filtered DB; these can be 01209 // found using OID iteration over SeqDB, which produces a 01210 // better ordering inasmuch as the reads from the source 01211 // sequence data will be sequential on disk. 01212 01213 _ASSERT(m_SourceDb.NotEmpty()); 01214 01215 CRef<CSeqDBExpert> filtered 01216 (new CSeqDBExpert(m_SourceDb->GetDBNameList(), 01217 m_SourceDb->GetSequenceType(), 01218 &* gi_list)); 01219 01220 m_SourceDb = filtered; 01221 01222 // Add all local database sequences to the output DB. 01223 01224 x_DupLocal(); 01225 01226 if (m_Verbose) { 01227 // Map oid to gi. 01228 map<int,int> seen_it; 01229 01230 for(int i = 0; i < gi_list->GetNumGis(); i++) { 01231 int this_oid = gi_list->GetGiOid(i).oid; 01232 int this_gi = gi_list->GetGiOid(i).gi; 01233 01234 if (this_oid != -1) { 01235 if (seen_it.find(this_oid) == seen_it.end()) { 01236 seen_it[this_oid] = this_gi; 01237 } else { 01238 m_LogFile << "GI " << this_gi 01239 << " is duplicate of GI " 01240 << seen_it[this_oid] 01241 << endl; 01242 } 01243 } 01244 } 01245 } 01246 } 01247 01248 if (gi_list.NotEmpty()) { 01249 if (m_UseRemote) { 01250 success = x_AddRemoteSequences(*gi_list); 01251 } else { 01252 success = x_ReportUnresolvedIds(*gi_list); 01253 } 01254 } 01255 01256 return success; 01257 } 01258 01259 bool CBuildDatabase::AddFasta(CNcbiIstream & fasta_file) 01260 { 01261 // Add any fasta sequences as well. 01262 bool success = true; 01263 01264 if (fasta_file) { 01265 CFastaBioseqSource fbs(fasta_file, 01266 m_IsProtein, 01267 m_ParseIDs); 01268 01269 try { 01270 success = AddSequences(fbs); 01271 } 01272 catch (...) { 01273 EndBuild(true); 01274 throw; 01275 } 01276 } 01277 return success; 01278 } 01279 01280 bool CBuildDatabase::EndBuild(bool erase) 01281 { 01282 bool success = false; 01283 bool can_not_close = false; 01284 01285 try { 01286 m_OutputDb->Close(); 01287 } catch (...) { 01288 if (!erase) { 01289 erase = true; 01290 can_not_close = true; 01291 } 01292 } 01293 01294 vector<string> vols; 01295 vector<string> files; 01296 01297 m_OutputDb->ListVolumes(vols); 01298 m_OutputDb->ListFiles(files); 01299 01300 m_LogFile << endl; 01301 01302 _ASSERT(vols.empty() == files.empty()); 01303 01304 if (vols.empty()) { 01305 m_LogFile << "No volumes were created because no sequences were found." 01306 << endl; 01307 01308 success = false; 01309 } else { 01310 ITERATE(vector<string>, iterv, vols) { 01311 m_LogFile << "volume: " << *iterv << endl; 01312 } 01313 01314 m_LogFile << endl; 01315 ITERATE(vector<string>, iterf, files) { 01316 m_LogFile << "file: " << *iterf << endl; 01317 if (erase) { 01318 CFile(*iterf).Remove(); 01319 } 01320 } 01321 } 01322 01323 m_LogFile << endl; 01324 01325 if (can_not_close) { 01326 NCBI_THROW(CWriteDBException, eArgErr, 01327 "Can not close files."); 01328 } 01329 01330 return success; 01331 } 01332 01333 01334 static void 01335 s_SetDeflineBits(objects::CBlast_def_line & defline, 01336 TIdToBits & bitmap, 01337 bool keep_old, 01338 bool is_memb, 01339 vector<string> & keys) 01340 { 01341 bool found = false; 01342 int value = 0; 01343 01344 ITERATE(vector<string>, key, keys) { 01345 if (! key->size()) 01346 continue; 01347 01348 TIdToBits::iterator item = bitmap.find(*key); 01349 01350 if (item != bitmap.end()) { 01351 found = true; 01352 value |= item->second; 01353 } 01354 } 01355 01356 if (found) { 01357 list<int> & linkv = (is_memb 01358 ? defline.SetMemberships() 01359 : defline.SetLinks()); 01360 01361 if (! keep_old) { 01362 linkv.clear(); 01363 } 01364 01365 if (linkv.empty()) { 01366 linkv.push_back(value); 01367 } else { 01368 linkv.front() |= value; 01369 } 01370 } else { 01371 if (! keep_old) { 01372 if (is_memb) { 01373 defline.ResetMemberships(); 01374 } else { 01375 defline.ResetLinks(); 01376 } 01377 } 01378 } 01379 } 01380 01381 void 01382 CBuildDatabase::x_SetLinkAndMbit(CRef<objects::CBlast_def_line_set> headers) 01383 { 01384 vector<string> keys; 01385 01386 NON_CONST_ITERATE(CBlast_def_line_set::Tdata, iter, headers->Set()) { 01387 CBlast_def_line & defline = **iter; 01388 GetDeflineKeys(defline, keys); 01389 01390 s_SetDeflineBits(defline, m_Id2Links, m_KeepLinks, false, keys); 01391 s_SetDeflineBits(defline, m_Id2Mbits, m_KeepMbits, true, keys); 01392 } 01393 } 01394 01395 void CBuildDatabase::SetMaxFileSize(Uint8 max_file_size) 01396 { 01397 m_OutputDb->SetMaxFileSize(max_file_size); 01398 } 01399 01400 int 01401 CBuildDatabase::RegisterMaskingAlgorithm(EBlast_filter_program program, 01402 const string & options, 01403 const string & name) 01404 { 01405 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \ 01406 (!defined(NCBI_COMPILER_MIPSPRO)) ) 01407 return m_OutputDb->RegisterMaskAlgorithm(program, options, name); 01408 #else 01409 return 0; 01410 #endif 01411 } 01412 01413 void CBuildDatabase::SetMaskDataSource(IMaskDataSource & ranges) 01414 { 01415 m_MaskData.Reset(& ranges); 01416 } 01417 01418 END_NCBI_SCOPE
1.7.5.1
Modified on Wed May 23 13:10:40 2012 by modify_doxy.py rev. 337098