|
NCBI C++ ToolKit
|
00001 /* $Id: lds_object.cpp 45852 2010-05-24 16:43:37Z grichenk $ 00002 * =========================================================================== 00003 * 00004 * PUBLIC DOMAIN NOTICE 00005 * National Center for Biotechnology Information 00006 * 00007 * This software/database is a "United States Government Work" under the 00008 * terms of the United States Copyright Act. It was written as part of 00009 * the author's official duties as a United States Government employee and 00010 * thus cannot be copyrighted. This software/database is freely available 00011 * to the public for use. The National Library of Medicine and the U.S. 00012 * Government have not placed any restriction on its use or reproduction. 00013 * 00014 * Although all reasonable efforts have been taken to ensure the accuracy 00015 * and reliability of the software and data, the NLM and the U.S. 00016 * Government do not and cannot warrant the performance or results that 00017 * may be obtained by using this software or data. The NLM and the U.S. 00018 * Government disclaim all warranties, express or implied, including 00019 * warranties of performance, merchantability or fitness for any particular 00020 * purpose. 00021 * 00022 * Please cite the author in any work or product based on this material. 00023 * 00024 * =========================================================================== 00025 * 00026 * Author: Anatoliy Kuznetsov, Victor Joukov 00027 * 00028 * File Description: CLDS_Object implementation. 00029 * 00030 */ 00031 00032 00033 #include <ncbi_pch.hpp> 00034 #include <objects/seqset/seqset__.hpp> 00035 #include <objects/seq/seq__.hpp> 00036 #include <objects/seqalign/seqalign__.hpp> 00037 #include <objects/seqfeat/Seq_feat.hpp> 00038 #include <objects/seqres/Seq_graph.hpp> 00039 #include <objects/seqloc/seqloc__.hpp> 00040 #include <objects/biblio/Id_pat.hpp> 00041 #include <objects/general/Dbtag.hpp> 00042 #include <objects/general/Object_id.hpp> 00043 00044 #include <db/bdb/bdb_cursor.hpp> 00045 #include <db/bdb/bdb_util.hpp> 00046 00047 #include <objtools/readers/fasta.hpp> 00048 00049 #include <objtools/lds/lds_object.hpp> 00050 #include <objtools/lds/lds_set.hpp> 00051 #include <objtools/lds/lds_util.hpp> 00052 #include <objtools/lds/lds.hpp> 00053 #include <objtools/lds/lds_query.hpp> 00054 #include <objtools/error_codes.hpp> 00055 00056 #include <objmgr/object_manager.hpp> 00057 #include <objmgr/scope.hpp> 00058 #include <objmgr/util/sequence.hpp> 00059 00060 #include <serial/objhook.hpp> 00061 #include <serial/objistr.hpp> 00062 #include <serial/objectiter.hpp> 00063 #include <serial/objectio.hpp> 00064 #include <serial/iterator.hpp> 00065 00066 #define TRY_FAST_TITLE 1 00067 #define CREATE_SCOPES 1 00068 00069 #define NCBI_USE_ERRCODE_X Objtools_LDS_Object 00070 00071 00072 BEGIN_NCBI_SCOPE 00073 BEGIN_SCOPE(objects) 00074 00075 00076 /// @internal 00077 class CLDS_FastaScanner : public IFastaEntryScan 00078 { 00079 public: 00080 CLDS_FastaScanner(CLDS_Object& obj, 00081 int file_id, 00082 int type_id); 00083 00084 virtual void EntryFound(CRef<CSeq_entry> se, 00085 CNcbiStreampos stream_position); 00086 private: 00087 CLDS_Object& m_Obj; 00088 int m_FileId; 00089 int m_TypeId; 00090 }; 00091 00092 CLDS_FastaScanner::CLDS_FastaScanner(CLDS_Object& obj, 00093 int file_id, 00094 int type_id) 00095 : m_Obj(obj), 00096 m_FileId(file_id), 00097 m_TypeId(type_id) 00098 {} 00099 00100 void CLDS_FastaScanner::EntryFound(CRef<CSeq_entry> se, 00101 CNcbiStreampos stream_position) 00102 { 00103 if (!se->IsSeq()) 00104 return; 00105 00106 SFastaFileMap::SFastaEntry fasta_entry; 00107 fasta_entry.stream_offset = stream_position; 00108 00109 // extract sequence info 00110 00111 const CSeq_entry::TSeq& bioseq = se->GetSeq(); 00112 const CSeq_id* sid = bioseq.GetFirstId(); 00113 fasta_entry.seq_id = sid->AsFastaString(); 00114 00115 fasta_entry.all_seq_ids.resize(0); 00116 if (bioseq.CanGetId()) { 00117 const CBioseq::TId& seq_ids = bioseq.GetId(); 00118 string id_str; 00119 ITERATE(CBioseq::TId, it, seq_ids) { 00120 const CBioseq::TId::value_type& vt = *it; 00121 id_str = vt->AsFastaString(); 00122 fasta_entry.all_seq_ids.push_back(id_str); 00123 } 00124 } 00125 00126 if (bioseq.CanGetDescr()) { 00127 const CSeq_descr& d = bioseq.GetDescr(); 00128 if (d.CanGet()) { 00129 const CSeq_descr_Base::Tdata& data = d.Get(); 00130 if (!data.empty()) { 00131 CSeq_descr_Base::Tdata::const_iterator it = 00132 data.begin(); 00133 if (it != data.end()) { 00134 CRef<CSeqdesc> ref_desc = *it; 00135 ref_desc->GetLabel(&fasta_entry.description, 00136 CSeqdesc::eContent); 00137 } 00138 } 00139 } 00140 } 00141 00142 // store entry record 00143 00144 // concatenate all ids 00145 string seq_ids; 00146 ITERATE(SFastaFileMap::SFastaEntry::TFastaSeqIds, 00147 it_id, fasta_entry.all_seq_ids) { 00148 seq_ids.append(*it_id); 00149 seq_ids.append(" "); 00150 } 00151 00152 m_Obj.SaveObject(m_FileId, 00153 fasta_entry.seq_id, 00154 fasta_entry.description, 00155 seq_ids, 00156 fasta_entry.stream_offset, 00157 m_TypeId); 00158 00159 } 00160 00161 00162 void CLDS_Object::DeleteUpdateCascadeFiles(const CLDS_Set& files_deleted, 00163 const CLDS_Set& files_updated) 00164 { 00165 CLDS_Set objects_deleted; 00166 CLDS_Set annotations_deleted; 00167 DeleteCascadeFiles(files_deleted, &objects_deleted, &annotations_deleted); 00168 UpdateCascadeFiles(files_updated); 00169 if ( files_deleted.any() || files_updated.any() ) { 00170 // re-index 00171 BuildSeqIdIdx(); 00172 } 00173 } 00174 00175 00176 void CLDS_Object::DeleteCascadeFiles(const CLDS_Set& file_ids, 00177 CLDS_Set* objects_deleted, 00178 CLDS_Set* annotations_deleted) 00179 { 00180 if (file_ids.none()) 00181 return; 00182 00183 // 00184 // Delete records from "object" table 00185 // 00186 {{ 00187 CBDB_FileCursor cur(m_db.object_db); 00188 cur.SetCondition(CBDB_FileCursor::eFirst); 00189 while (cur.Fetch() == eBDB_Ok) { 00190 int fid = m_db.object_db.file_id; 00191 if (fid && LDS_SetTest(file_ids, fid)) { 00192 /* 00193 int object_attr_id = m_db.object_db.object_attr_id; 00194 00195 if (object_attr_id) { // delete dependent object attr 00196 m_db.object_attr_db.object_attr_id = object_attr_id; 00197 m_db.object_attr_db.Delete(); 00198 } 00199 */ 00200 int object_id = m_db.object_db.object_id; 00201 00202 objects_deleted->set(object_id); 00203 m_db.object_db.Delete(); 00204 } 00205 } 00206 00207 }} 00208 00209 // 00210 // Delete "annot2obj" 00211 // 00212 {{ 00213 CBDB_FileCursor cur(m_db.annot2obj_db); 00214 cur.SetCondition(CBDB_FileCursor::eFirst); 00215 while (cur.Fetch() == eBDB_Ok) { 00216 int object_id = m_db.annot2obj_db.object_id; 00217 if (object_id && LDS_SetTest(*objects_deleted, object_id)) { 00218 m_db.annot2obj_db.Delete(); 00219 } 00220 } 00221 00222 }} 00223 00224 // 00225 // Delete "annotation" 00226 // 00227 {{ 00228 CBDB_FileCursor cur(m_db.annot_db); 00229 cur.SetCondition(CBDB_FileCursor::eFirst); 00230 while (cur.Fetch() == eBDB_Ok) { 00231 if ( !m_db.object_db.file_id.IsNull() ) { 00232 int fid = m_db.object_db.file_id; 00233 if (fid && LDS_SetTest(file_ids, fid)) { 00234 int annot_id = m_db.annot_db.annot_id; 00235 annotations_deleted->set(annot_id); 00236 m_db.annot_db.Delete(); 00237 } 00238 } 00239 } 00240 00241 }} 00242 00243 // 00244 // Delete "seq_id_list" 00245 // 00246 {{ 00247 00248 {{ 00249 CLDS_Set::enumerator en = objects_deleted->first(); 00250 for ( ; en.valid(); ++en) { 00251 int id = *en; 00252 m_db.seq_id_list.object_id = id; 00253 m_db.seq_id_list.Delete(); 00254 } 00255 }} 00256 00257 CLDS_Set::enumerator en = annotations_deleted->first(); 00258 for ( ; en.valid(); ++en) { 00259 int id = *en; 00260 m_db.seq_id_list.object_id = id; 00261 m_db.seq_id_list.Delete(); 00262 } 00263 00264 }} 00265 00266 } 00267 00268 00269 void CLDS_Object::UpdateCascadeFiles(const CLDS_Set& file_ids) 00270 { 00271 if (file_ids.none()) { 00272 return; 00273 } 00274 00275 CLDS_Set objects_deleted; 00276 CLDS_Set annotations_deleted; 00277 DeleteCascadeFiles(file_ids, &objects_deleted, &annotations_deleted); 00278 00279 CLDS_Set::enumerator en(file_ids.first()); 00280 for ( ; en.valid(); ++en) { 00281 int fid = *en; 00282 m_db.file_db.file_id = fid; 00283 00284 if (m_db.file_db.Fetch() == eBDB_Ok) { 00285 string fname(m_db.file_db.file_name); 00286 CFormatGuess::EFormat format = 00287 (CFormatGuess::EFormat)(int)m_db.file_db.format; 00288 00289 LOG_POST_X(1, Info << "<< Updating file >>: " << fname); 00290 00291 UpdateFileObjects(fid, fname, format); 00292 } 00293 } // ITERATE 00294 } 00295 00296 00297 class CLDS_SkipObjectHook : public CReadObjectHook 00298 { 00299 public: 00300 virtual void ReadObject(CObjectIStream& in, 00301 const CObjectInfo& obj) { 00302 DefaultSkip(in, obj); 00303 } 00304 }; 00305 00306 00307 class CLDS_Seq_ids : public CObject 00308 { 00309 public: 00310 typedef vector<CRef<CSeq_id> > TIds; 00311 typedef vector<int> TGis; 00312 void clear() 00313 { 00314 m_Ids.clear(); 00315 m_Gis.clear(); 00316 } 00317 void AddSeq_id(const CSeq_id& id) 00318 { 00319 if ( id.IsGi() ) { 00320 AddGi(id.GetGi()); 00321 } 00322 else if ( m_Ids.empty() || !m_Ids.back()->Equals(id) ) { 00323 m_Ids.push_back(Ref(SerialClone(id))); 00324 } 00325 } 00326 void AddGi(int gi) 00327 { 00328 if ( m_Gis.empty() || m_Gis.back() != gi ) { 00329 m_Gis.push_back(gi); 00330 } 00331 } 00332 00333 TIds m_Ids; 00334 TGis m_Gis; 00335 }; 00336 00337 class CLDS_CollectSeq_idsReader : public CSkipObjectHook 00338 { 00339 public: 00340 CLDS_CollectSeq_idsReader(void) 00341 : m_Seq_id(new CSeq_id()), m_Collect(0) 00342 { 00343 } 00344 00345 virtual void SkipObject(CObjectIStream& in, 00346 const CObjectTypeInfo& type) { 00347 if ( m_Collect ) { 00348 DefaultRead(in, ObjectInfo(*m_Seq_id)); 00349 m_Collect->AddSeq_id(*m_Seq_id); 00350 } 00351 else { 00352 DefaultSkip(in, type); 00353 } 00354 } 00355 00356 void Collect(CLDS_Seq_ids* ids) { 00357 m_Collect = ids; 00358 } 00359 00360 class CGuard 00361 { 00362 public: 00363 CGuard(CLDS_CollectSeq_idsReader& reader, CLDS_Seq_ids& ids) 00364 : m_Reader(reader) 00365 { 00366 reader.Collect(&ids); 00367 } 00368 ~CGuard() 00369 { 00370 m_Reader.Collect(0); 00371 } 00372 private: 00373 CLDS_CollectSeq_idsReader& m_Reader; 00374 00375 CGuard(const CGuard&); 00376 void operator=(const CGuard&); 00377 }; 00378 00379 private: 00380 CRef<CSeq_id> m_Seq_id; 00381 CLDS_Seq_ids* m_Collect; 00382 }; 00383 00384 00385 class PLessObjectPtr 00386 { 00387 public: 00388 bool operator()(const CObjectInfo& a, const CObjectInfo& b) const { 00389 return a.GetObjectPtr() < b.GetObjectPtr(); 00390 } 00391 }; 00392 00393 00394 class CLDS_Seq_idsCollector : public CReadClassMemberHook 00395 { 00396 public: 00397 typedef map<CObjectInfo, CRef<CLDS_Seq_ids>, PLessObjectPtr> TIdsMap; 00398 00399 CLDS_Seq_idsCollector(CLDS_CollectSeq_idsReader* collector) 00400 : m_Collector(collector) 00401 { 00402 } 00403 00404 virtual void ReadClassMember(CObjectIStream& in, 00405 const CObjectInfoMI& member) { 00406 CRef<CLDS_Seq_ids>& ids = m_Ids[member.GetClassObject()]; 00407 ids = new CLDS_Seq_ids(); 00408 CLDS_CollectSeq_idsReader::CGuard guard(*m_Collector, *ids); 00409 DefaultSkip(in, member); 00410 } 00411 00412 CLDS_Seq_ids* GetIds(const CObjectInfo& obj) { 00413 TIdsMap::iterator iter = m_Ids.find(obj); 00414 return iter == m_Ids.end()? 0: iter->second.GetPointer(); 00415 } 00416 void ClearIds(void) { 00417 m_Ids.clear(); 00418 } 00419 00420 private: 00421 CRef<CLDS_CollectSeq_idsReader> m_Collector; 00422 TIdsMap m_Ids; 00423 }; 00424 00425 00426 class CLDS_GBReleaseReadHook : public CReadClassMemberHook 00427 { 00428 public: 00429 CLDS_GBReleaseReadHook(CLDS_Object& lobj, 00430 CLDS_CoreObjectsReader& objects); 00431 ~CLDS_GBReleaseReadHook(void); 00432 00433 virtual void ReadClassMember(CObjectIStream& in, 00434 const CObjectInfoMI& member); 00435 00436 void Remove(CObjectIStream& in) { 00437 if ( !m_Removed ) { 00438 m_Removed = true; 00439 CObjectTypeInfo type = CType<CBioseq_set>(); 00440 type.FindMember("seq-set").ResetLocalReadHook(in); 00441 } 00442 } 00443 bool Separate(void) const { 00444 return m_Separate; 00445 } 00446 00447 private: 00448 CLDS_Object& m_LObj; 00449 CLDS_CoreObjectsReader& m_Objects; 00450 bool m_Removed; 00451 bool m_Separate; 00452 }; 00453 00454 00455 CLDS_GBReleaseReadHook::CLDS_GBReleaseReadHook(CLDS_Object& lobj, 00456 CLDS_CoreObjectsReader& objects) 00457 : m_LObj(lobj), 00458 m_Objects(objects), 00459 m_Removed(false), 00460 m_Separate(false) 00461 { 00462 } 00463 00464 00465 CLDS_GBReleaseReadHook::~CLDS_GBReleaseReadHook(void) 00466 { 00467 } 00468 00469 00470 void CLDS_GBReleaseReadHook::ReadClassMember(CObjectIStream& in, 00471 const CObjectInfoMI& member) 00472 { 00473 Remove(in); 00474 CBioseq_set* seq_set = CType<CBioseq_set>::Get(member.GetClassObject()); 00475 _ASSERT(seq_set); 00476 if ( seq_set ) { 00477 switch ( m_LObj.GetGBReleasMode() ) { 00478 case CLDS_Object::eForceGBRelease: 00479 m_Separate = true; 00480 break; 00481 case CLDS_Object::eGuessGBRelease: 00482 if ( (!seq_set->IsSetClass() || 00483 seq_set->GetClass() == CBioseq_set::eClass_genbank) && 00484 //!seq_set->IsSetId() && 00485 //!seq_set->IsSetColl() && 00486 //!seq_set->IsSetLevel() && 00487 //!seq_set->IsSetRelease() && 00488 //!seq_set->IsSetDate() && 00489 !seq_set->IsSetDescr() ) { 00490 m_Separate = true; 00491 } 00492 break; 00493 default: 00494 break; 00495 } 00496 } 00497 if ( m_Separate ) { 00498 m_Objects.Reset(); 00499 LOG_POST_X(3, Info << CTime(CTime::eCurrent) << 00500 ": Scanning combined Bioseq-set found in: " << 00501 m_Objects.GetFileName()); 00502 int entry_count = 0, object_count = 0; 00503 // iterate over the sequence of entries 00504 CRef<CSeq_entry> se(new CSeq_entry); 00505 for ( CIStreamContainerIterator it(in, member); it; ++it ) { 00506 CNcbiStreampos pos = in.GetStreamPos(); 00507 it >> *se; 00508 ++entry_count; 00509 m_LObj.SaveObject(&m_Objects, &m_Objects.GetObjectsVector()[0]); 00510 object_count += m_LObj.SaveObjects(m_Objects, true); 00511 } 00512 LOG_POST_X(3, Info << CTime(CTime::eCurrent) << ": LDS: " 00513 << object_count 00514 << " object(s) found in " 00515 << entry_count << " Seq-entries in: " 00516 << m_Objects.GetFileName()); 00517 } 00518 else { 00519 DefaultRead(in, member); 00520 } 00521 } 00522 00523 00524 bool CLDS_Object::UpdateBinaryASNObject(CObjectIStream& in, 00525 CLDS_CoreObjectsReader& objects, 00526 CObjectTypeInfo type) 00527 { 00528 CNcbiStreampos start_pos = in.GetStreamPos(); 00529 objects.Reset(); 00530 LOG_POST_X(4, Info 00531 << "Trying ASN.1 binary top level object:" 00532 << type.GetName() ); 00533 CRef<CLDS_GBReleaseReadHook> hook; 00534 try { 00535 if ( m_GBReleaseMode != eNoGBRelease && 00536 type == CType<CBioseq_set>() ) { 00537 // try to avoid loading full GenBank release Bioseq-set 00538 hook = new CLDS_GBReleaseReadHook(*this, objects); 00539 type.FindMember("seq-set").SetLocalReadHook(in, hook); 00540 } 00541 CObjectInfo object_info(type); 00542 CStopWatch sw(CStopWatch::eStart); 00543 in.Read(object_info); 00544 if ( hook && hook->Separate() ) { 00545 LOG_POST_X(5, Info 00546 << "Binary ASN.1 combined object found: " 00547 << type.GetName() 00548 << " in " << sw.Elapsed()); 00549 } 00550 else { 00551 LOG_POST_X(5, Info 00552 << "Binary ASN.1 top level object found: " 00553 << type.GetName() 00554 << " in " << sw.Elapsed()); 00555 } 00556 if ( hook ) { 00557 hook->Remove(in); 00558 } 00559 return true; 00560 } 00561 catch (CEofException& ) { 00562 } 00563 catch (CException& _DEBUG_ARG(e)) { 00564 _TRACE(" failed to read: " << e.GetMsg()); 00565 } 00566 if ( hook ) { 00567 hook->Remove(in); 00568 } 00569 in.SetStreamPos(start_pos); 00570 return false; 00571 } 00572 00573 00574 int CLDS_Object::SaveObjects(CLDS_CoreObjectsReader& objects, 00575 bool internal) 00576 { 00577 int ret = 0; 00578 CLDS_CoreObjectsReader::TObjectVector& objs = objects.GetObjectsVector(); 00579 if ( !objs.empty() ) { 00580 size_t count = objs.size(); 00581 if ( !internal ) { 00582 LOG_POST_X(3, Info << CTime(CTime::eCurrent) << 00583 ": Saving " << count << 00584 " object(s) found in: " << objects.GetFileName()); 00585 } 00586 for (size_t i = 0; i < count; ++i) { 00587 CLDS_CoreObjectsReader::SObjectDetails& obj_info = objs[i]; 00588 // If object is not in the database yet. 00589 if (obj_info.ext_id == 0) { 00590 SaveObject(&objects, &obj_info); 00591 ++ret; 00592 } 00593 } 00594 if ( !internal ) { 00595 LOG_POST_X(3, Info << CTime(CTime::eCurrent) << ": LDS: " 00596 << count 00597 << " object(s) found in: "<<objects.GetFileName()); 00598 } 00599 objects.ClearObjectsVector(); 00600 } 00601 else { 00602 if ( !internal ) { 00603 if ( objects.GetTotalObjects() == 0 ) { 00604 LOG_POST_X(4, Info << 00605 "LDS: No objects found in:" << 00606 objects.GetFileName()); 00607 } 00608 else { 00609 LOG_POST_X(4, Info << 00610 "LDS: No more objects found in:" << 00611 objects.GetFileName()); 00612 } 00613 } 00614 } 00615 if ( m_Seq_idsCollector ) { 00616 m_Seq_idsCollector->ClearIds(); 00617 } 00618 return ret; 00619 } 00620 00621 00622 void CLDS_Object::UpdateBinaryASNObjects(int file_id, 00623 const string& file_name) 00624 { 00625 vector<CObjectTypeInfo> types; 00626 types.push_back(CType<CBioseq_set>()); 00627 types.push_back(CType<CSeq_entry>()); 00628 types.push_back(CType<CBioseq>()); 00629 types.push_back(CType<CSeq_annot>()); 00630 types.push_back(CType<CSeq_align>()); 00631 types.push_back(CType<CSeq_align_set>()); 00632 vector<CObjectTypeInfo> skip_types; 00633 skip_types.push_back(CType<CSeq_data>()); 00634 skip_types.push_back(CType<CSeq_ext>()); 00635 skip_types.push_back(CType<CSeq_hist>()); 00636 00637 LOG_POST_X(2, Info << CTime(CTime::eCurrent) << 00638 ": Scanning file: " << file_name); 00639 00640 CRef<CLDS_CollectSeq_idsReader> seq_id_hook(new CLDS_CollectSeq_idsReader); 00641 m_Seq_idsCollector = new CLDS_Seq_idsCollector(seq_id_hook); 00642 CRef<CLDS_CoreObjectsReader> objects 00643 (new CLDS_CoreObjectsReader(file_id, file_name)); 00644 auto_ptr<CObjectIStream> 00645 in(CObjectIStream::Open(file_name, eSerial_AsnBinary)); 00646 00647 {{ // setup hooks 00648 ITERATE ( vector<CObjectTypeInfo>, it, types ) { 00649 it->SetLocalReadHook(*in, objects); 00650 } 00651 CRef<CLDS_SkipObjectHook> skipper(new CLDS_SkipObjectHook); 00652 ITERATE ( vector<CObjectTypeInfo>, it, skip_types ) { 00653 it->SetLocalReadHook(*in, skipper); 00654 } 00655 CObjectTypeInfo seq_id_type = CType<CSeq_id>(); 00656 seq_id_type.SetLocalSkipHook(*in, seq_id_hook); 00657 CObjectTypeInfo annot_type = CType<CSeq_annot>(); 00658 annot_type.FindMember("data").SetLocalReadHook(*in, m_Seq_idsCollector); 00659 }} 00660 00661 size_t last_type = 0; 00662 while ( in->HaveMoreData() ) { 00663 // first try previous type 00664 bool found = UpdateBinaryASNObject(*in, *objects, types[last_type]); 00665 if ( !found ) { 00666 // then all remaining possible types 00667 for ( size_t i = 0; i < types.size(); ++i ) { 00668 if ( i != last_type ) { // already tried 00669 if ( UpdateBinaryASNObject(*in, *objects, types[i]) ) { 00670 found = true; 00671 last_type = i; 00672 break; 00673 } 00674 } 00675 } 00676 } 00677 if ( !found ) { 00678 break; 00679 } 00680 SaveObjects(*objects, false); 00681 } 00682 } 00683 00684 00685 void CLDS_Object::UpdateFileObjects(int file_id, 00686 const string& file_name, 00687 CFormatGuess::EFormat format) 00688 { 00689 FindMaxObjRecId(); 00690 00691 if (format == CFormatGuess::eBinaryASN ) { 00692 UpdateBinaryASNObjects(file_id, file_name); 00693 } 00694 else if (format == CFormatGuess::eTextASN || 00695 format == CFormatGuess::eXml) { 00696 00697 LOG_POST_X(2, Info << CTime(CTime::eCurrent) << 00698 ": Scanning file: " << file_name); 00699 00700 CLDS_CoreObjectsReader sniffer(file_id, file_name); 00701 ESerialDataFormat stream_format = FormatGuess2Serial(format); 00702 00703 CNcbiIfstream str_input(file_name.c_str(), IOS_BASE::binary); 00704 auto_ptr<CObjectIStream> input(CObjectIStream::Open(stream_format, 00705 str_input)); 00706 CRef<CLDS_CollectSeq_idsReader> seq_id_hook(new CLDS_CollectSeq_idsReader); 00707 m_Seq_idsCollector = new CLDS_Seq_idsCollector(seq_id_hook); 00708 CObjectTypeInfo seq_id_type = CType<CSeq_id>(); 00709 seq_id_type.SetLocalSkipHook(*input, seq_id_hook); 00710 CObjectTypeInfo annot_type = CType<CSeq_annot>(); 00711 annot_type.FindMember("data").SetLocalReadHook(*input, m_Seq_idsCollector); 00712 00713 sniffer.Probe(*input); 00714 00715 SaveObjects(sniffer, false); 00716 } else if ( format == CFormatGuess::eFasta ){ 00717 00718 int type_id; 00719 {{ 00720 map<string, int>::const_iterator it = m_ObjTypeMap.find("FastaEntry"); 00721 _ASSERT(it != m_ObjTypeMap.end()); 00722 type_id = it->second; 00723 }} 00724 00725 CNcbiIfstream input(file_name.c_str(), IOS_BASE::binary); 00726 00727 CLDS_FastaScanner fscan(*this, file_id, type_id); 00728 ScanFastaFile(&fscan, 00729 input, 00730 CFastaReader::fAssumeNuc | 00731 CFastaReader::fAllSeqIds | 00732 CFastaReader::fOneSeq | 00733 CFastaReader::fNoSeqData | 00734 CFastaReader::fParseRawID); 00735 } else { 00736 LOG_POST_X(5, Info << "Unsupported file format: " << file_name); 00737 } 00738 00739 00740 } 00741 00742 00743 int CLDS_Object::SaveObject(int file_id, 00744 const string& seq_id, 00745 const string& description, 00746 const string& seq_ids, 00747 CNcbiStreampos pos, 00748 int type_id) 00749 { 00750 ++m_MaxObjRecId; 00751 EBDB_ErrCode err; 00752 /* 00753 m_db.object_attr_db.object_attr_id = m_MaxObjRecId; 00754 m_db.object_attr_db.object_title = description; 00755 EBDB_ErrCode err = m_db.object_attr_db.Insert(); 00756 BDB_CHECK(err, "LDS::ObjectAttribute"); 00757 */ 00758 m_db.object_db.object_id = m_MaxObjRecId; 00759 m_db.object_db.file_id = file_id; 00760 m_db.object_db.seqlist_id = 0; 00761 m_db.object_db.object_type = type_id; 00762 Int8 ipos = NcbiStreamposToInt8(pos); 00763 m_db.object_db.file_pos = ipos; 00764 // m_db.object_db.object_attr_id = m_MaxObjRecId; 00765 m_db.object_db.TSE_object_id = 0; 00766 m_db.object_db.parent_object_id = 0; 00767 m_db.object_db.object_title.Set(description.c_str(), 00768 CBDB_FieldStringBase::eTruncateOnOverflowLogError); 00769 m_db.object_db.seq_ids = seq_ids; 00770 00771 string ups = seq_id; 00772 NStr::ToUpper(ups); 00773 m_db.object_db.primary_seqid = ups; 00774 00775 LOG_POST_X(6, Info << "Saving Fasta object: " << seq_id); 00776 00777 err = m_db.object_db.Insert(); 00778 BDB_CHECK(err, "LDS::Object"); 00779 00780 return m_MaxObjRecId; 00781 } 00782 00783 00784 int CLDS_Object::SaveObject(CLDS_CoreObjectsReader* objects, 00785 CLDS_CoreObjectsReader::SObjectDetails* obj_info, 00786 bool force_object) 00787 { 00788 int top_level_id, parent_id; 00789 00790 _ASSERT(obj_info->ext_id == 0); // Making sure the object is not in the DB yet 00791 00792 if (obj_info->is_top_level) { 00793 top_level_id = parent_id = 0; 00794 } 00795 else { 00796 // Find the direct parent 00797 {{ 00798 CLDS_CoreObjectsReader::SObjectDetails* parent_obj_info 00799 = objects->FindObjectInfo(obj_info->parent_offset); 00800 _ASSERT(parent_obj_info); 00801 parent_id = parent_obj_info->ext_id; 00802 if ( parent_id == 0 ) { // not yet in the database 00803 // Recursively save the parent 00804 parent_id = SaveObject(objects, parent_obj_info, true); 00805 } 00806 }} 00807 00808 // Find the top level grand parent 00809 {{ 00810 CLDS_CoreObjectsReader::SObjectDetails* top_obj_info 00811 = objects->FindObjectInfo(obj_info->top_level_offset); 00812 _ASSERT(top_obj_info); 00813 top_level_id = top_obj_info->ext_id; 00814 if ( top_level_id == 0 ) { // not yet in the database 00815 // Recursively save the parent 00816 top_level_id = SaveObject(objects, top_obj_info, true); 00817 } 00818 }} 00819 00820 } 00821 00822 const string& type_name = obj_info->info.GetName(); 00823 00824 map<string, int>::const_iterator it = m_ObjTypeMap.find(type_name); 00825 if (it == m_ObjTypeMap.end()) { 00826 LOG_POST_X(7, Info << "Unrecognized type: " << type_name); 00827 return 0; 00828 } 00829 int type_id = it->second; 00830 00831 00832 string id_str; 00833 string title; 00834 string all_ids; 00835 00836 ++m_MaxObjRecId; 00837 00838 if ( IsObject(*obj_info, &id_str, &title, &all_ids) || force_object ) { 00839 m_db.object_db.primary_seqid = NStr::ToUpper(id_str); 00840 00841 obj_info->ext_id = m_MaxObjRecId; // Keep external id for the next scan 00842 EBDB_ErrCode err; 00843 /* 00844 m_db.object_attr_db.object_attr_id = m_MaxObjRecId; 00845 m_db.object_attr_db.object_title = molecule_title; 00846 m_db.object_attr_db.seq_ids = NStr::ToUpper(all_seq_id); 00847 EBDB_ErrCode err = m_db.object_attr_db.Insert(); 00848 BDB_CHECK(err, "LDS::ObjectAttr"); 00849 */ 00850 m_db.object_db.object_id = m_MaxObjRecId; 00851 m_db.object_db.file_id = objects->GetFileId(); 00852 m_db.object_db.seqlist_id = 0; // TODO: 00853 m_db.object_db.object_type = type_id; 00854 Int8 i8 = NcbiStreamposToInt8(obj_info->offset); 00855 m_db.object_db.file_pos = i8; 00856 // m_db.object_db.object_attr_id = m_MaxObjRecId; 00857 m_db.object_db.TSE_object_id = top_level_id; 00858 m_db.object_db.parent_object_id = parent_id; 00859 m_db.object_db.object_title = title; 00860 m_db.object_db.seq_ids = NStr::ToUpper(all_ids); 00861 00862 00863 // LOG_POST_X(8, Info<<"Saving object: " << type_name << " " << id_str); 00864 00865 err = m_db.object_db.Insert(); 00866 BDB_CHECK(err, "LDS::Object"); 00867 00868 } 00869 else if ( CSeq_annot* annot = CType<CSeq_annot>().Get(obj_info->info)) { 00870 // Set of seq ids referenced in the annotation 00871 // 00872 set<string> ref_seq_ids; 00873 CLDS_Seq_ids *ids = 00874 m_Seq_idsCollector? m_Seq_idsCollector->GetIds(obj_info->info): 0; 00875 if ( ids ) { 00876 ITERATE ( CLDS_Seq_ids::TIds, it, ids->m_Ids ) { 00877 const CSeq_id& id = **it; 00878 ref_seq_ids.insert(id.AsFastaString()); 00879 } 00880 00881 CLDS_Seq_ids::TGis& gis = ids->m_Gis; 00882 sort(gis.begin(), gis.end()); 00883 gis.erase(unique(gis.begin(), gis.end()), gis.end()); 00884 CSeq_id id; 00885 ITERATE ( CLDS_Seq_ids::TGis, it, gis ) { 00886 id.SetGi(*it); 00887 ref_seq_ids.insert(id.AsFastaString()); 00888 } 00889 //LOG_POST_X(9, Info << 00890 // "Saving " << ref_seq_ids.size() << 00891 // " Seq-ids in Seq-annot"); 00892 } 00893 else if ( annot->CanGetData()) { 00894 // Check for alignment in annotation 00895 // 00896 const CSeq_annot_Base::C_Data& adata = annot->GetData(); 00897 if (adata.Which() == CSeq_annot_Base::C_Data::e_Align) { 00898 const CSeq_annot_Base::C_Data::TAlign& al_list = 00899 adata.GetAlign(); 00900 ITERATE (CSeq_annot_Base::C_Data::TAlign, it, al_list){ 00901 if (!(*it)->CanGetSegs()) 00902 continue; 00903 00904 const CSeq_align::TSegs& segs = (*it)->GetSegs(); 00905 switch (segs.Which()) 00906 { 00907 case CSeq_align::C_Segs::e_Std: 00908 { 00909 const CSeq_align_Base::C_Segs::TStd& std_list = 00910 segs.GetStd(); 00911 ITERATE (CSeq_align_Base::C_Segs::TStd, it2, std_list) { 00912 const CRef<CStd_seg>& seg = *it2; 00913 const CStd_seg::TIds& ids = seg->GetIds(); 00914 00915 ITERATE(CStd_seg::TIds, it3, ids) { 00916 ref_seq_ids.insert((*it3)->AsFastaString()); 00917 00918 } // ITERATE 00919 00920 } // ITERATE 00921 } 00922 break; 00923 case CSeq_align::C_Segs::e_Denseg: 00924 { 00925 const CSeq_align_Base::C_Segs::TDenseg& denseg = 00926 segs.GetDenseg(); 00927 const CDense_seg::TIds& ids = denseg.GetIds(); 00928 00929 ITERATE (CDense_seg::TIds, it3, ids) { 00930 ref_seq_ids.insert((*it3)->AsFastaString()); 00931 } // ITERATE 00932 00933 } 00934 break; 00935 //case CSeq_align::C_Segs::e_Packed: 00936 //case CSeq_align::C_Segs::e_Disc: 00937 default: 00938 break; 00939 } 00940 00941 } // ITERATE 00942 } 00943 } 00944 00945 // Save all seq ids referred by the alignment 00946 // 00947 ITERATE (set<string>, it, ref_seq_ids) { 00948 m_db.seq_id_list.object_id = m_MaxObjRecId; 00949 m_db.seq_id_list.seq_id = it->c_str(); 00950 00951 EBDB_ErrCode err = m_db.seq_id_list.Insert(); 00952 BDB_CHECK(err, "LDS::seq_id_list"); 00953 } 00954 00955 obj_info->ext_id = m_MaxObjRecId; // Keep external id for the next scan 00956 00957 m_db.annot_db.annot_id = m_MaxObjRecId; 00958 m_db.annot_db.file_id = objects->GetFileId(); 00959 m_db.annot_db.annot_type = type_id; 00960 Int8 i8 = NcbiStreamposToInt8(obj_info->offset); 00961 m_db.annot_db.file_pos = i8; 00962 m_db.annot_db.TSE_object_id = top_level_id; 00963 m_db.annot_db.parent_object_id = parent_id; 00964 /* 00965 LOG_POST_X(9, Info << "Saving annotation: " 00966 << type_name 00967 << " " 00968 << id_str 00969 << " " 00970 << (!top_level_id ? "Top Level. " : " ") 00971 << "offs=" 00972 << obj_info->offset 00973 ); 00974 */ 00975 00976 EBDB_ErrCode err = m_db.annot_db.Insert(); 00977 BDB_CHECK(err, "LDS::Annotation"); 00978 00979 m_db.annot2obj_db.object_id = parent_id; 00980 m_db.annot2obj_db.annot_id = m_MaxObjRecId; 00981 00982 err = m_db.annot2obj_db.Insert(); 00983 BDB_CHECK(err, "LDS::Annot2Obj"); 00984 00985 } 00986 00987 return obj_info->ext_id; 00988 } 00989 00990 00991 CScope* CLDS_Object::GetScope(void) 00992 { 00993 if ( !m_Scope && m_TSE ) { 00994 m_Scope = new CScope(*m_TSE_Manager); 00995 m_Scope->AddTopLevelSeqEntry(*m_TSE); 00996 } 00997 return m_Scope; 00998 } 00999 01000 01001 bool 01002 CLDS_Object::IsObject(const CLDS_CoreObjectsReader::SObjectDetails& parse_info, 01003 string* object_str_id, 01004 string* object_title, 01005 string* object_all_ids) 01006 { 01007 if ( CREATE_SCOPES && parse_info.is_top_level ) { 01008 m_TSE_Manager = CObjectManager::GetInstance(); 01009 m_Scope.Reset(); 01010 m_TSE_Info = parse_info.info; 01011 m_TSE.Reset(); 01012 if ( CSeq_entry* obj = CType<CSeq_entry>().Get(m_TSE_Info) ) { 01013 m_TSE = obj; 01014 m_TSE->Parentize(); 01015 return true; 01016 } 01017 else if ( CBioseq_set* obj = CType<CBioseq_set>().Get(m_TSE_Info) ) { 01018 m_TSE = new CSeq_entry; 01019 m_TSE->SetSet(*obj); 01020 m_TSE->Parentize(); 01021 return true; 01022 } 01023 else if ( CBioseq* obj = CType<CBioseq>().Get(m_TSE_Info) ) { 01024 m_TSE = new CSeq_entry; 01025 m_TSE->SetSeq(*obj); 01026 m_TSE->Parentize(); 01027 GetBioseqInfo(parse_info, *obj, 01028 object_str_id, object_title, object_all_ids); 01029 return true; 01030 } 01031 else if ( CSeq_annot* obj = CType<CSeq_annot>().Get(m_TSE_Info) ) { 01032 m_TSE = new CSeq_entry; 01033 m_TSE->SetSet().SetSeq_set(); 01034 m_TSE->SetSet().SetAnnot().push_back(Ref(obj)); 01035 m_TSE->Parentize(); 01036 GetAnnotInfo(parse_info, *obj, 01037 object_str_id, object_title, object_all_ids); 01038 return true; 01039 } 01040 else if ( CSeq_align* obj = CType<CSeq_align>().Get(m_TSE_Info) ) { 01041 CRef<CSeq_annot> annot(new CSeq_annot); 01042 CSeq_annot::TData::TAlign& arr = annot->SetData().SetAlign(); 01043 arr.push_back(Ref(obj)); 01044 m_TSE = new CSeq_entry; 01045 m_TSE->SetSet().SetSeq_set(); 01046 m_TSE->SetSet().SetAnnot().push_back(annot); 01047 m_TSE->Parentize(); 01048 GetAnnotInfo(parse_info, *annot, 01049 object_str_id, object_title, object_all_ids); 01050 return true; 01051 } 01052 else if (CSeq_align_set* obj=CType<CSeq_align_set>().Get(m_TSE_Info)) { 01053 CRef<CSeq_annot> annot(new CSeq_annot); 01054 CSeq_annot::TData::TAlign& arr = annot->SetData().SetAlign(); 01055 NON_CONST_ITERATE ( CSeq_align_set::Tdata, it, obj->Set() ) { 01056 arr.push_back(*it); 01057 } 01058 m_TSE = new CSeq_entry; 01059 m_TSE->SetSet().SetSeq_set(); 01060 m_TSE->SetSet().SetAnnot().push_back(annot); 01061 m_TSE->Parentize(); 01062 GetAnnotInfo(parse_info, *annot, 01063 object_str_id, object_title, object_all_ids); 01064 return true; 01065 } 01066 } 01067 01068 if ( CBioseq* obj = CType<CBioseq>().Get(parse_info.info) ) { 01069 GetBioseqInfo(parse_info, *obj, 01070 object_str_id, object_title, object_all_ids); 01071 return true; 01072 } 01073 else if ( CType<CSeq_annot>().Get(parse_info.info) || 01074 CType<CSeq_align>().Get(parse_info.info) || 01075 CType<CSeq_align_set>().Get(parse_info.info) ) { 01076 return false; 01077 } 01078 return true; 01079 } 01080 01081 01082 void CLDS_Object::GetBioseqInfo(const CLDS_CoreObjectsReader::SObjectDetails& /*obj_info*/, 01083 const CBioseq& bioseq, 01084 string* object_str_id, 01085 string* object_title, 01086 string* object_all_ids) 01087 { 01088 const CSeq_id* seq_id = bioseq.GetFirstId(); 01089 if ( seq_id ) { 01090 *object_str_id = seq_id->AsFastaString(); 01091 } 01092 01093 if ( TRY_FAST_TITLE && sequence::GetTitle(bioseq, object_title) ) { 01094 // Good, we've got title fast way. 01095 } 01096 else if (CScope* scope = GetScope()) { // we are under OM here 01097 CBioseq_Handle bio_handle = scope->GetBioseqHandle(bioseq); 01098 if ( bio_handle ) { 01099 *object_title = sequence::GetTitle(bio_handle); 01100 //LOG_POST_X(10, Info<<"object title: "<<*molecule_title); 01101 } 01102 else { 01103 // the last resort 01104 bioseq.GetLabel(object_title, CBioseq::eBoth); 01105 } 01106 01107 } 01108 else { // non-OM controlled object 01109 bioseq.GetLabel(object_title, CBioseq::eBoth); 01110 } 01111 01112 ITERATE ( CBioseq::TId, it, bioseq.GetId() ) { 01113 const CSeq_id* seq_id = *it; 01114 if ( seq_id ) { 01115 object_all_ids->append(seq_id->AsFastaString()); 01116 object_all_ids->append(" "); 01117 } 01118 } 01119 } 01120 01121 01122 void CLDS_Object::GetAnnotInfo(const CLDS_CoreObjectsReader::SObjectDetails& obj_info, 01123 const CSeq_annot& annot, 01124 string* object_str_id, 01125 string* object_title, 01126 string* object_all_ids) 01127 { 01128 set<string> ref_seq_ids; 01129 CLDS_Seq_ids *ids = 01130 m_Seq_idsCollector? m_Seq_idsCollector->GetIds(obj_info.info): 0; 01131 if ( ids ) { 01132 ITERATE ( CLDS_Seq_ids::TIds, it, ids->m_Ids ) { 01133 const CSeq_id& id = **it; 01134 string str_id = id.AsFastaString(); 01135 ref_seq_ids.insert(NStr::ToUpper(str_id)); 01136 } 01137 01138 CLDS_Seq_ids::TGis& gis = ids->m_Gis; 01139 sort(gis.begin(), gis.end()); 01140 gis.erase(unique(gis.begin(), gis.end()), gis.end()); 01141 CSeq_id id; 01142 ITERATE ( CLDS_Seq_ids::TGis, it, gis ) { 01143 id.SetGi(*it); 01144 string str_id = id.AsFastaString(); 01145 ref_seq_ids.insert(NStr::ToUpper(str_id)); 01146 } 01147 //LOG_POST_X(9, Info << 01148 // "Saving " << ref_seq_ids.size() << 01149 // " Seq-ids in Seq-annot"); 01150 } 01151 else if ( annot.CanGetData() ) { 01152 // Check for alignment in annotation 01153 // 01154 const CSeq_annot_Base::C_Data& adata = annot.GetData(); 01155 if ( adata.IsAlign() ) { 01156 const CSeq_annot_Base::C_Data::TAlign& al_list = adata.GetAlign(); 01157 ITERATE (CSeq_annot_Base::C_Data::TAlign, it, al_list){ 01158 if (!(*it)->CanGetSegs()) 01159 continue; 01160 01161 const CSeq_align::TSegs& segs = (*it)->GetSegs(); 01162 switch (segs.Which()) 01163 { 01164 case CSeq_align::C_Segs::e_Std: 01165 { 01166 const CSeq_align_Base::C_Segs::TStd& std_list = 01167 segs.GetStd(); 01168 ITERATE (CSeq_align_Base::C_Segs::TStd, it2, std_list) { 01169 const CRef<CStd_seg>& seg = *it2; 01170 const CStd_seg::TIds& ids = seg->GetIds(); 01171 01172 ITERATE(CStd_seg::TIds, it3, ids) { 01173 string str_id = (*it3)->AsFastaString(); 01174 ref_seq_ids.insert(NStr::ToUpper(str_id)); 01175 01176 } // ITERATE 01177 01178 } // ITERATE 01179 } 01180 break; 01181 case CSeq_align::C_Segs::e_Denseg: 01182 { 01183 const CSeq_align_Base::C_Segs::TDenseg& denseg = 01184 segs.GetDenseg(); 01185 const CDense_seg::TIds& ids = denseg.GetIds(); 01186 01187 ITERATE (CDense_seg::TIds, it3, ids) { 01188 string str_id = (*it3)->AsFastaString(); 01189 ref_seq_ids.insert(NStr::ToUpper(str_id)); 01190 } // ITERATE 01191 01192 } 01193 break; 01194 //case CSeq_align::C_Segs::e_Packed: 01195 //case CSeq_align::C_Segs::e_Disc: 01196 default: 01197 break; 01198 } 01199 01200 } // ITERATE 01201 } 01202 else { 01203 for (CTypeConstIterator<CSeq_id> it(ConstBegin(annot)); it; ++it) { 01204 const CSeq_id& id = *it; 01205 string str_id = id.AsFastaString(); 01206 ref_seq_ids.insert(NStr::ToUpper(str_id)); 01207 } 01208 } 01209 } 01210 01211 // Save all seq ids referred by the alignment 01212 // 01213 ITERATE (set<string>, it, ref_seq_ids) { 01214 object_all_ids->append(*it); 01215 object_all_ids->append(" "); 01216 } 01217 } 01218 01219 01220 int CLDS_Object::FindMaxObjRecId() 01221 { 01222 if (m_MaxObjRecId) { 01223 return m_MaxObjRecId; 01224 } 01225 01226 LDS_GETMAXID(m_MaxObjRecId, m_db.object_db, object_id); 01227 01228 int ann_rec_id = 0; 01229 LDS_GETMAXID(ann_rec_id, m_db.annot_db, annot_id); 01230 01231 if (ann_rec_id > m_MaxObjRecId) { 01232 m_MaxObjRecId = ann_rec_id; 01233 } 01234 01235 return m_MaxObjRecId; 01236 } 01237 01238 01239 static bool s_GetSequenceBase(const CObject_id& obj_id, 01240 SLDS_SeqIdBase* seqid_base) 01241 { 01242 switch (obj_id.Which()) { 01243 case CObject_id::e_Id: 01244 seqid_base->int_id = obj_id.GetId(); 01245 seqid_base->str_id.erase(); 01246 return true; 01247 case CObject_id::e_Str: 01248 seqid_base->int_id = 0; 01249 seqid_base->str_id = obj_id.GetStr(); 01250 return true; 01251 default: 01252 break; 01253 } 01254 return false; 01255 } 01256 01257 01258 static bool s_GetSequenceBase(const CPDB_seq_id& pdb_id, 01259 SLDS_SeqIdBase* seqid_base) 01260 { 01261 seqid_base->int_id = 0; 01262 seqid_base->str_id = pdb_id.GetMol().Get(); 01263 seqid_base->str_id += '|'; 01264 char chain = (char) pdb_id.GetChain(); 01265 if ( chain == '|' ) { 01266 seqid_base->str_id += "VB"; 01267 } 01268 else if ( chain == '\0' ) { 01269 seqid_base->str_id += ' '; 01270 } 01271 else if ( islower((unsigned char)chain) ) { 01272 seqid_base->str_id.append(2, chain); 01273 } 01274 else { 01275 seqid_base->str_id += chain; 01276 } 01277 return true; 01278 } 01279 01280 01281 void LDS_GetSequenceBase(const CSeq_id& seq_id, 01282 SLDS_SeqIdBase* seqid_base) 01283 { 01284 _ASSERT(seqid_base); 01285 01286 int obj_id_int = 0; 01287 const CTextseq_id* obj_id_txt = 0; 01288 01289 switch (seq_id.Which()) { 01290 case CSeq_id::e_Local: 01291 if ( s_GetSequenceBase(seq_id.GetLocal(), seqid_base) ) { 01292 return; 01293 } 01294 break; 01295 case CSeq_id::e_Gibbsq: 01296 obj_id_int = seq_id.GetGibbsq(); 01297 break; 01298 case CSeq_id::e_Gibbmt: 01299 obj_id_int = seq_id.GetGibbmt(); 01300 break; 01301 case CSeq_id::e_Giim: 01302 obj_id_int = seq_id.GetGiim().GetId(); 01303 break; 01304 case CSeq_id::e_Genbank: 01305 obj_id_txt = &seq_id.GetGenbank(); 01306 break; 01307 case CSeq_id::e_Embl: 01308 obj_id_txt = &seq_id.GetEmbl(); 01309 break; 01310 case CSeq_id::e_Pir: 01311 obj_id_txt = &seq_id.GetPir(); 01312 break; 01313 case CSeq_id::e_Swissprot: 01314 obj_id_txt = &seq_id.GetSwissprot(); 01315 break; 01316 case CSeq_id::e_Patent: 01317 {{ 01318 seqid_base->int_id = 0; 01319 seqid_base->str_id = ""; 01320 const CId_pat& pat = seq_id.GetPatent().GetCit(); 01321 pat.GetLabel(&seqid_base->str_id); 01322 }} 01323 return; 01324 case CSeq_id::e_Other: 01325 obj_id_txt = &seq_id.GetOther(); 01326 break; 01327 case CSeq_id::e_General: 01328 {{ 01329 seqid_base->int_id = 0; 01330 seqid_base->str_id = ""; 01331 seq_id.GetGeneral().GetLabel(&seqid_base->str_id); 01332 }} 01333 return; 01334 case CSeq_id::e_Gi: 01335 obj_id_int = seq_id.GetGi(); 01336 break; 01337 case CSeq_id::e_Ddbj: 01338 obj_id_txt = &seq_id.GetDdbj(); 01339 break; 01340 case CSeq_id::e_Prf: 01341 obj_id_txt = &seq_id.GetPrf(); 01342 break; 01343 case CSeq_id::e_Pdb: 01344 if ( s_GetSequenceBase(seq_id.GetPdb(), seqid_base) ) { 01345 return; 01346 } 01347 break; 01348 case CSeq_id::e_Tpg: 01349 obj_id_txt = &seq_id.GetTpg(); 01350 break; 01351 case CSeq_id::e_Tpe: 01352 obj_id_txt = &seq_id.GetTpe(); 01353 break; 01354 case CSeq_id::e_Tpd: 01355 obj_id_txt = &seq_id.GetTpd(); 01356 break; 01357 case CSeq_id::e_Gpipe: 01358 obj_id_txt = &seq_id.GetGpipe(); 01359 break; 01360 default: 01361 _ASSERT(0); 01362 break; 01363 } 01364 01365 const string* id_str = 0; 01366 01367 if (obj_id_int) { 01368 seqid_base->int_id = obj_id_int; 01369 seqid_base->str_id.erase(); 01370 return; 01371 } 01372 01373 if (obj_id_txt) { 01374 if (obj_id_txt->CanGetAccession()) { 01375 const CTextseq_id::TAccession& acc = 01376 obj_id_txt->GetAccession(); 01377 id_str = &acc; 01378 } else { 01379 if (obj_id_txt->CanGetName()) { 01380 const CTextseq_id::TName& name = 01381 obj_id_txt->GetName(); 01382 id_str = &name; 01383 } 01384 } 01385 } 01386 01387 if (id_str) { 01388 seqid_base->int_id = 0; 01389 seqid_base->str_id = *id_str; 01390 return; 01391 } 01392 01393 LOG_POST_X(11, Warning 01394 << "SeqId indexer: unsupported type " 01395 << seq_id.AsFastaString()); 01396 01397 seqid_base->Init(); 01398 01399 } 01400 01401 bool LDS_GetSequenceBase(const string& seq_id_str, 01402 SLDS_SeqIdBase* seqid_base, 01403 CSeq_id* conv_seq_id) 01404 { 01405 if ( seq_id_str.empty() ) { 01406 return false; 01407 } 01408 01409 _ASSERT(seqid_base); 01410 01411 CRef<CSeq_id> tmp_seq_id; 01412 01413 if (conv_seq_id == 0) { 01414 tmp_seq_id.Reset((conv_seq_id = new CSeq_id)); 01415 01416 } 01417 01418 bool can_convert = true; 01419 01420 try { 01421 conv_seq_id->Set(seq_id_str); 01422 } catch (CSeqIdException&) { 01423 try { 01424 conv_seq_id->Set(CSeq_id::e_Local, seq_id_str); 01425 } catch (CSeqIdException&) { 01426 can_convert = false; 01427 LOG_POST_X(12, Error << 01428 "Cannot convert seq id string: " << seq_id_str); 01429 seqid_base->Init(); 01430 } 01431 } 01432 01433 if (can_convert) { 01434 LDS_GetSequenceBase(*conv_seq_id, seqid_base); 01435 } 01436 01437 return can_convert; 01438 } 01439 01440 01441 /// Scanner functor to build id index 01442 /// 01443 /// @internal 01444 /// 01445 class CLDS_BuildIdIdx 01446 { 01447 public: 01448 CLDS_BuildIdIdx(CLDS_Database& db, bool control_dups) 01449 : m_db(db), 01450 m_coll(db.GetTables()), 01451 m_SeqId(new CSeq_id), 01452 m_ControlDups(control_dups), 01453 m_Query(new CLDS_Query(db)), 01454 m_ObjIds(bm::BM_GAP) 01455 { 01456 if (m_ControlDups) { 01457 m_SequenceFind.reset(new CLDS_Query::CSequenceFinder(*m_Query)); 01458 } 01459 } 01460 01461 void operator()(SLDS_ObjectDB& dbf) 01462 { 01463 int object_id = dbf.object_id; // PK 01464 01465 if (!dbf.primary_seqid.IsNull()) { 01466 dbf.primary_seqid.ToString(m_PriSeqId_Str); 01467 01468 x_AddToIdx(m_PriSeqId_Str, object_id); 01469 } 01470 01471 dbf.seq_ids.ToString(m_SeqId_Str); 01472 vector<string> seq_id_arr; 01473 NStr::Tokenize(m_SeqId_Str, " ", seq_id_arr, NStr::eMergeDelims); 01474 ITERATE (vector<string>, it, seq_id_arr) { 01475 const string& seq_id_str = *it; 01476 if (NStr::CompareNocase(seq_id_str,m_PriSeqId_Str)==0) { 01477 continue; 01478 } 01479 x_AddToIdx(seq_id_str, object_id); 01480 } 01481 } 01482 01483 private: 01484 void x_AddToIdx(const string& seq_id_str, int rec_id) 01485 { 01486 bool can_convert = 01487 LDS_GetSequenceBase(seq_id_str, &m_SBase, &*m_SeqId); 01488 if (can_convert) { 01489 if (m_ControlDups) { 01490 _ASSERT(m_SequenceFind.get()); 01491 CLDS_Set& cand = m_SequenceFind->GetCandidates(); 01492 cand.clear(); 01493 m_SequenceFind->Screen(m_SBase); 01494 if (cand.any()) { 01495 CLDS_Set dup_ids(bm::BM_GAP); 01496 m_SequenceFind->FindInCandidates(seq_id_str, &dup_ids); 01497 01498 if (dup_ids.any()) { 01499 unsigned id = dup_ids.get_first(); 01500 m_Query->ReportDuplicateObjectSeqId(seq_id_str, 01501 id, 01502 rec_id); 01503 } 01504 } 01505 } 01506 01507 x_AddToIdx(m_SBase, rec_id); 01508 } 01509 } 01510 01511 void x_AddToIdx(const SLDS_SeqIdBase& sbase, int rec_id) 01512 { 01513 if (sbase.int_id) { 01514 _TRACE("int id: "<<sbase.int_id<<" -> "<<rec_id); 01515 m_coll.obj_seqid_int_idx.id = sbase.int_id; 01516 m_coll.obj_seqid_int_idx.row_id = rec_id; 01517 m_coll.obj_seqid_int_idx.Insert(); 01518 } 01519 else if (!sbase.str_id.empty()) { 01520 _TRACE("str id: "<<sbase.str_id<<" -> "<<rec_id); 01521 m_coll.obj_seqid_txt_idx.id = sbase.str_id; 01522 m_coll.obj_seqid_txt_idx.row_id = rec_id; 01523 m_coll.obj_seqid_txt_idx.Insert(); 01524 } 01525 } 01526 01527 private: 01528 CLDS_BuildIdIdx(const CLDS_BuildIdIdx&); 01529 CLDS_BuildIdIdx& operator=(const CLDS_BuildIdIdx&); 01530 01531 private: 01532 CLDS_Database& m_db; 01533 SLDS_TablesCollection& m_coll; 01534 string m_PriSeqId_Str; 01535 string m_SeqId_Str; 01536 CRef<CSeq_id> m_SeqId; 01537 SLDS_SeqIdBase m_SBase; 01538 bool m_ControlDups; ///< Control id duplicates 01539 auto_ptr<CLDS_Query> m_Query; 01540 auto_ptr<CLDS_Query::CSequenceFinder> m_SequenceFind; 01541 CLDS_Set m_ObjIds; ///< id set for duplicate search 01542 }; 01543 01544 void CLDS_Object::BuildSeqIdIdx() 01545 { 01546 m_db.obj_seqid_int_idx.Truncate(); 01547 m_db.obj_seqid_txt_idx.Truncate(); 01548 01549 LOG_POST_X(13, Info << "Building sequence id index on objects..."); 01550 01551 CLDS_BuildIdIdx func(m_DataBase, m_ControlDupIds); 01552 BDB_iterate_file(m_db.object_db, func); 01553 } 01554 01555 01556 CLDS_Object::CLDS_Object(CLDS_Database& db, 01557 const map<string, int>& obj_map) 01558 : m_DataBase(db), 01559 m_db(db.GetTables()), 01560 m_ObjTypeMap(obj_map), 01561 m_MaxObjRecId(0), 01562 m_ControlDupIds(false), 01563 m_GBReleaseMode(eDefaultGBReleaseMode) 01564 { 01565 } 01566 01567 01568 CLDS_Object::~CLDS_Object() 01569 { 01570 } 01571 01572 01573 END_SCOPE(objects) 01574 END_NCBI_SCOPE
1.7.5.1
Modified on Wed May 23 13:03:22 2012 by modify_doxy.py rev. 337098