NCBI C++ ToolKit
lds_object.cpp
Go to the documentation of this file.
00001 /*  $Id: lds_object.cpp 45852 2010-05-24 16:43:37Z grichenk $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Author: Anatoliy Kuznetsov, Victor Joukov
00027  *
00028  * File Description:  CLDS_Object implementation.
00029  *
00030  */
00031 
00032 
00033 #include <ncbi_pch.hpp>
00034 #include <objects/seqset/seqset__.hpp>
00035 #include <objects/seq/seq__.hpp>
00036 #include <objects/seqalign/seqalign__.hpp>
00037 #include <objects/seqfeat/Seq_feat.hpp>
00038 #include <objects/seqres/Seq_graph.hpp>
00039 #include <objects/seqloc/seqloc__.hpp>
00040 #include <objects/biblio/Id_pat.hpp>
00041 #include <objects/general/Dbtag.hpp>
00042 #include <objects/general/Object_id.hpp>
00043 
00044 #include <db/bdb/bdb_cursor.hpp>
00045 #include <db/bdb/bdb_util.hpp>
00046 
00047 #include <objtools/readers/fasta.hpp>
00048 
00049 #include <objtools/lds/lds_object.hpp>
00050 #include <objtools/lds/lds_set.hpp>
00051 #include <objtools/lds/lds_util.hpp>
00052 #include <objtools/lds/lds.hpp>
00053 #include <objtools/lds/lds_query.hpp>
00054 #include <objtools/error_codes.hpp>
00055 
00056 #include <objmgr/object_manager.hpp>
00057 #include <objmgr/scope.hpp>
00058 #include <objmgr/util/sequence.hpp>
00059 
00060 #include <serial/objhook.hpp>
00061 #include <serial/objistr.hpp>
00062 #include <serial/objectiter.hpp>
00063 #include <serial/objectio.hpp>
00064 #include <serial/iterator.hpp>
00065 
00066 #define TRY_FAST_TITLE 1
00067 #define CREATE_SCOPES 1
00068 
00069 #define NCBI_USE_ERRCODE_X   Objtools_LDS_Object
00070 
00071 
00072 BEGIN_NCBI_SCOPE
00073 BEGIN_SCOPE(objects)
00074 
00075 
00076 /// @internal
00077 class CLDS_FastaScanner : public IFastaEntryScan
00078 {
00079 public:
00080     CLDS_FastaScanner(CLDS_Object& obj,
00081                       int          file_id,
00082                       int          type_id);
00083 
00084     virtual void EntryFound(CRef<CSeq_entry> se,
00085                             CNcbiStreampos   stream_position);
00086 private:
00087     CLDS_Object& m_Obj;
00088     int          m_FileId;
00089     int          m_TypeId;
00090 };
00091 
00092 CLDS_FastaScanner::CLDS_FastaScanner(CLDS_Object& obj,
00093                                      int          file_id,
00094                                      int          type_id)
00095  : m_Obj(obj),
00096    m_FileId(file_id),
00097    m_TypeId(type_id)
00098 {}
00099 
00100 void CLDS_FastaScanner::EntryFound(CRef<CSeq_entry> se,
00101                                    CNcbiStreampos   stream_position)
00102 {
00103     if (!se->IsSeq())
00104         return;
00105 
00106     SFastaFileMap::SFastaEntry  fasta_entry;
00107     fasta_entry.stream_offset = stream_position;
00108 
00109     // extract sequence info
00110 
00111     const CSeq_entry::TSeq& bioseq = se->GetSeq();
00112     const CSeq_id* sid = bioseq.GetFirstId();
00113     fasta_entry.seq_id = sid->AsFastaString();
00114 
00115     fasta_entry.all_seq_ids.resize(0);
00116     if (bioseq.CanGetId()) {
00117         const CBioseq::TId& seq_ids = bioseq.GetId();
00118         string id_str;
00119         ITERATE(CBioseq::TId, it, seq_ids) {
00120             const CBioseq::TId::value_type& vt = *it;
00121             id_str = vt->AsFastaString();
00122             fasta_entry.all_seq_ids.push_back(id_str);
00123         }
00124     }
00125 
00126     if (bioseq.CanGetDescr()) {
00127         const CSeq_descr& d = bioseq.GetDescr();
00128         if (d.CanGet()) {
00129             const CSeq_descr_Base::Tdata& data = d.Get();
00130             if (!data.empty()) {
00131                 CSeq_descr_Base::Tdata::const_iterator it =
00132                                                     data.begin();
00133                 if (it != data.end()) {
00134                     CRef<CSeqdesc> ref_desc = *it;
00135                     ref_desc->GetLabel(&fasta_entry.description,
00136                                         CSeqdesc::eContent);
00137                 }
00138             }
00139         }
00140     }
00141 
00142     // store entry record
00143 
00144     // concatenate all ids
00145     string seq_ids;
00146     ITERATE(SFastaFileMap::SFastaEntry::TFastaSeqIds,
00147             it_id, fasta_entry.all_seq_ids) {
00148         seq_ids.append(*it_id);
00149         seq_ids.append(" ");
00150     }
00151 
00152     m_Obj.SaveObject(m_FileId,
00153                      fasta_entry.seq_id,
00154                      fasta_entry.description,
00155                      seq_ids,
00156                      fasta_entry.stream_offset,
00157                      m_TypeId);
00158 
00159 }
00160 
00161 
00162 void CLDS_Object::DeleteUpdateCascadeFiles(const CLDS_Set& files_deleted,
00163                                            const CLDS_Set& files_updated)
00164 {
00165     CLDS_Set objects_deleted;
00166     CLDS_Set annotations_deleted;
00167     DeleteCascadeFiles(files_deleted, &objects_deleted, &annotations_deleted);
00168     UpdateCascadeFiles(files_updated);
00169     if ( files_deleted.any() || files_updated.any() ) {
00170         // re-index
00171         BuildSeqIdIdx();
00172     }
00173 }
00174 
00175 
00176 void CLDS_Object::DeleteCascadeFiles(const CLDS_Set& file_ids,
00177                                      CLDS_Set* objects_deleted,
00178                                      CLDS_Set* annotations_deleted)
00179 {
00180     if (file_ids.none())
00181         return;
00182 
00183     //
00184     //  Delete records from "object" table
00185     //
00186     {{
00187     CBDB_FileCursor cur(m_db.object_db);
00188     cur.SetCondition(CBDB_FileCursor::eFirst);
00189     while (cur.Fetch() == eBDB_Ok) {
00190         int fid = m_db.object_db.file_id;
00191         if (fid && LDS_SetTest(file_ids, fid)) {
00192 /*
00193             int object_attr_id = m_db.object_db.object_attr_id;
00194 
00195             if (object_attr_id) {  // delete dependent object attr
00196                 m_db.object_attr_db.object_attr_id = object_attr_id;
00197                 m_db.object_attr_db.Delete();
00198             }
00199 */
00200             int object_id = m_db.object_db.object_id;
00201 
00202             objects_deleted->set(object_id);
00203             m_db.object_db.Delete();
00204         }
00205     }
00206 
00207     }}
00208 
00209     //
00210     // Delete "annot2obj"
00211     //
00212     {{
00213     CBDB_FileCursor cur(m_db.annot2obj_db);
00214     cur.SetCondition(CBDB_FileCursor::eFirst);
00215     while (cur.Fetch() == eBDB_Ok) {
00216         int object_id = m_db.annot2obj_db.object_id;
00217         if (object_id && LDS_SetTest(*objects_deleted, object_id)) {
00218             m_db.annot2obj_db.Delete();
00219         }
00220     }
00221 
00222     }}
00223 
00224     //
00225     // Delete "annotation"
00226     //
00227     {{
00228     CBDB_FileCursor cur(m_db.annot_db);
00229     cur.SetCondition(CBDB_FileCursor::eFirst);
00230     while (cur.Fetch() == eBDB_Ok) {
00231         if ( !m_db.object_db.file_id.IsNull() ) {
00232             int fid = m_db.object_db.file_id;
00233             if (fid && LDS_SetTest(file_ids, fid)) {
00234                 int annot_id = m_db.annot_db.annot_id;
00235                 annotations_deleted->set(annot_id);
00236                 m_db.annot_db.Delete();
00237             }
00238         }
00239     }
00240 
00241     }}
00242 
00243     //
00244     // Delete "seq_id_list"
00245     //
00246     {{
00247 
00248     {{
00249     CLDS_Set::enumerator en = objects_deleted->first();
00250     for ( ; en.valid(); ++en) {
00251         int id = *en;
00252         m_db.seq_id_list.object_id = id;
00253         m_db.seq_id_list.Delete();
00254     }
00255     }}
00256 
00257     CLDS_Set::enumerator en = annotations_deleted->first();
00258     for ( ; en.valid(); ++en) {
00259         int id = *en;
00260         m_db.seq_id_list.object_id = id;
00261         m_db.seq_id_list.Delete();
00262     }
00263 
00264     }}
00265 
00266 }
00267 
00268 
00269 void CLDS_Object::UpdateCascadeFiles(const CLDS_Set& file_ids)
00270 {
00271     if (file_ids.none()) {
00272         return;
00273     }
00274 
00275     CLDS_Set objects_deleted;
00276     CLDS_Set annotations_deleted;
00277     DeleteCascadeFiles(file_ids, &objects_deleted, &annotations_deleted);
00278 
00279     CLDS_Set::enumerator en(file_ids.first());
00280     for ( ; en.valid(); ++en) {
00281         int fid = *en;
00282         m_db.file_db.file_id = fid;
00283 
00284         if (m_db.file_db.Fetch() == eBDB_Ok) {
00285             string fname(m_db.file_db.file_name);
00286             CFormatGuess::EFormat format =
00287                 (CFormatGuess::EFormat)(int)m_db.file_db.format;
00288 
00289             LOG_POST_X(1, Info << "<< Updating file >>: " << fname);
00290 
00291             UpdateFileObjects(fid, fname, format);
00292         }
00293     } // ITERATE
00294 }
00295 
00296 
00297 class CLDS_SkipObjectHook : public CReadObjectHook
00298 {
00299 public:
00300     virtual void ReadObject(CObjectIStream& in,
00301                             const CObjectInfo& obj) {
00302         DefaultSkip(in, obj);
00303     }
00304 };
00305 
00306 
00307 class CLDS_Seq_ids : public CObject
00308 {
00309 public:
00310     typedef vector<CRef<CSeq_id> > TIds;
00311     typedef vector<int> TGis;
00312     void clear()
00313         {
00314             m_Ids.clear();
00315             m_Gis.clear();
00316         }
00317     void AddSeq_id(const CSeq_id& id)
00318         {
00319             if ( id.IsGi() ) {
00320                 AddGi(id.GetGi());
00321             }
00322             else if ( m_Ids.empty() || !m_Ids.back()->Equals(id) ) {
00323                 m_Ids.push_back(Ref(SerialClone(id)));
00324             }
00325         }
00326     void AddGi(int gi)
00327         {
00328             if ( m_Gis.empty() || m_Gis.back() != gi ) {
00329                 m_Gis.push_back(gi);
00330             }
00331         }
00332 
00333     TIds m_Ids;
00334     TGis m_Gis;
00335 };
00336 
00337 class CLDS_CollectSeq_idsReader : public CSkipObjectHook
00338 {
00339 public:
00340     CLDS_CollectSeq_idsReader(void)
00341         : m_Seq_id(new CSeq_id()), m_Collect(0)
00342         {
00343         }
00344 
00345     virtual void SkipObject(CObjectIStream& in,
00346                             const CObjectTypeInfo& type) {
00347         if ( m_Collect ) {
00348             DefaultRead(in, ObjectInfo(*m_Seq_id));
00349             m_Collect->AddSeq_id(*m_Seq_id);
00350         }
00351         else {
00352             DefaultSkip(in, type);
00353         }
00354     }
00355 
00356     void Collect(CLDS_Seq_ids* ids) {
00357         m_Collect = ids;
00358     }
00359 
00360     class CGuard
00361     {
00362     public:
00363         CGuard(CLDS_CollectSeq_idsReader& reader, CLDS_Seq_ids& ids)
00364             : m_Reader(reader)
00365             {
00366                 reader.Collect(&ids);
00367             }
00368         ~CGuard()
00369             {
00370                 m_Reader.Collect(0);
00371             }
00372     private:
00373         CLDS_CollectSeq_idsReader& m_Reader;
00374 
00375         CGuard(const CGuard&);
00376         void operator=(const CGuard&);
00377     };
00378 
00379 private:
00380     CRef<CSeq_id> m_Seq_id;
00381     CLDS_Seq_ids* m_Collect;
00382 };
00383 
00384 
00385 class PLessObjectPtr
00386 {
00387 public:
00388     bool operator()(const CObjectInfo& a, const CObjectInfo& b) const {
00389         return a.GetObjectPtr() < b.GetObjectPtr();
00390     }
00391 };
00392 
00393 
00394 class CLDS_Seq_idsCollector : public CReadClassMemberHook
00395 {
00396 public:
00397     typedef map<CObjectInfo, CRef<CLDS_Seq_ids>, PLessObjectPtr> TIdsMap;
00398 
00399     CLDS_Seq_idsCollector(CLDS_CollectSeq_idsReader* collector)
00400         : m_Collector(collector)
00401         {
00402         }
00403 
00404     virtual void ReadClassMember(CObjectIStream& in,
00405                                  const CObjectInfoMI& member) {
00406         CRef<CLDS_Seq_ids>& ids = m_Ids[member.GetClassObject()];
00407         ids = new CLDS_Seq_ids();
00408         CLDS_CollectSeq_idsReader::CGuard guard(*m_Collector, *ids);
00409         DefaultSkip(in, member);
00410     }
00411 
00412     CLDS_Seq_ids* GetIds(const CObjectInfo& obj) {
00413         TIdsMap::iterator iter = m_Ids.find(obj);
00414         return iter == m_Ids.end()? 0: iter->second.GetPointer();
00415     }
00416     void ClearIds(void) {
00417         m_Ids.clear();
00418     }
00419 
00420 private:
00421     CRef<CLDS_CollectSeq_idsReader> m_Collector;
00422     TIdsMap m_Ids;
00423 };
00424 
00425 
00426 class CLDS_GBReleaseReadHook : public CReadClassMemberHook
00427 {
00428 public:
00429     CLDS_GBReleaseReadHook(CLDS_Object& lobj,
00430                            CLDS_CoreObjectsReader& objects);
00431     ~CLDS_GBReleaseReadHook(void);
00432 
00433     virtual void ReadClassMember(CObjectIStream& in,
00434                                  const CObjectInfoMI& member);
00435 
00436     void Remove(CObjectIStream& in) {
00437         if ( !m_Removed ) {
00438             m_Removed = true;
00439             CObjectTypeInfo type = CType<CBioseq_set>();
00440             type.FindMember("seq-set").ResetLocalReadHook(in);
00441         }
00442     }
00443     bool Separate(void) const {
00444         return m_Separate;
00445     }
00446 
00447 private:
00448     CLDS_Object& m_LObj;
00449     CLDS_CoreObjectsReader& m_Objects;
00450     bool m_Removed;
00451     bool m_Separate;
00452 };
00453 
00454 
00455 CLDS_GBReleaseReadHook::CLDS_GBReleaseReadHook(CLDS_Object& lobj,
00456                                                CLDS_CoreObjectsReader& objects)
00457     : m_LObj(lobj),
00458       m_Objects(objects),
00459       m_Removed(false),
00460       m_Separate(false)
00461 {
00462 }
00463 
00464 
00465 CLDS_GBReleaseReadHook::~CLDS_GBReleaseReadHook(void)
00466 {
00467 }
00468 
00469 
00470 void CLDS_GBReleaseReadHook::ReadClassMember(CObjectIStream& in,
00471                                              const CObjectInfoMI& member)
00472 {
00473     Remove(in);
00474     CBioseq_set* seq_set = CType<CBioseq_set>::Get(member.GetClassObject());
00475     _ASSERT(seq_set);
00476     if ( seq_set ) {
00477         switch ( m_LObj.GetGBReleasMode() ) {
00478         case CLDS_Object::eForceGBRelease:
00479             m_Separate = true;
00480             break;
00481         case CLDS_Object::eGuessGBRelease:
00482             if ( (!seq_set->IsSetClass() ||
00483                   seq_set->GetClass() == CBioseq_set::eClass_genbank) &&
00484                  //!seq_set->IsSetId() &&
00485                  //!seq_set->IsSetColl() &&
00486                  //!seq_set->IsSetLevel() &&
00487                  //!seq_set->IsSetRelease() &&
00488                  //!seq_set->IsSetDate() &&
00489                  !seq_set->IsSetDescr() ) {
00490                 m_Separate = true;
00491             }
00492             break;
00493         default:
00494             break;
00495         }
00496     }
00497     if ( m_Separate ) {
00498         m_Objects.Reset();
00499         LOG_POST_X(3, Info << CTime(CTime::eCurrent) <<
00500                    ": Scanning combined Bioseq-set found in: " <<
00501                    m_Objects.GetFileName());
00502         int entry_count = 0, object_count = 0;
00503         // iterate over the sequence of entries
00504         CRef<CSeq_entry> se(new CSeq_entry);
00505         for ( CIStreamContainerIterator it(in, member); it; ++it ) {
00506             CNcbiStreampos pos = in.GetStreamPos();
00507             it >> *se;
00508             ++entry_count;
00509             m_LObj.SaveObject(&m_Objects, &m_Objects.GetObjectsVector()[0]);
00510             object_count += m_LObj.SaveObjects(m_Objects, true);
00511         }
00512         LOG_POST_X(3, Info << CTime(CTime::eCurrent) << ": LDS: "
00513                    << object_count
00514                    << " object(s) found in "
00515                    << entry_count << " Seq-entries in: "
00516                    << m_Objects.GetFileName());
00517     }
00518     else {
00519         DefaultRead(in, member);
00520     }
00521 }
00522 
00523 
00524 bool CLDS_Object::UpdateBinaryASNObject(CObjectIStream& in,
00525                                         CLDS_CoreObjectsReader& objects,
00526                                         CObjectTypeInfo type)
00527 {
00528     CNcbiStreampos start_pos = in.GetStreamPos();
00529     objects.Reset();
00530     LOG_POST_X(4, Info
00531                << "Trying ASN.1 binary top level object:"
00532                << type.GetName() );
00533     CRef<CLDS_GBReleaseReadHook> hook;
00534     try {
00535         if ( m_GBReleaseMode != eNoGBRelease &&
00536              type == CType<CBioseq_set>() ) {
00537             // try to avoid loading full GenBank release Bioseq-set
00538             hook = new CLDS_GBReleaseReadHook(*this, objects);
00539             type.FindMember("seq-set").SetLocalReadHook(in, hook);
00540         }
00541         CObjectInfo object_info(type);
00542         CStopWatch sw(CStopWatch::eStart);
00543         in.Read(object_info);
00544         if ( hook && hook->Separate() ) {
00545             LOG_POST_X(5, Info
00546                        << "Binary ASN.1 combined object found: "
00547                        << type.GetName()
00548                        << " in " << sw.Elapsed());
00549         }
00550         else {
00551             LOG_POST_X(5, Info
00552                        << "Binary ASN.1 top level object found: "
00553                        << type.GetName()
00554                        << " in " << sw.Elapsed());
00555         }
00556         if ( hook ) {
00557             hook->Remove(in);
00558         }
00559         return true;
00560     }
00561     catch (CEofException& ) {
00562     }
00563     catch (CException& _DEBUG_ARG(e)) {
00564         _TRACE("  failed to read: " << e.GetMsg());
00565     }
00566     if ( hook ) {
00567         hook->Remove(in);
00568     }
00569     in.SetStreamPos(start_pos);
00570     return false;
00571 }
00572 
00573 
00574 int CLDS_Object::SaveObjects(CLDS_CoreObjectsReader& objects,
00575                              bool internal)
00576 {
00577     int ret = 0;
00578     CLDS_CoreObjectsReader::TObjectVector& objs = objects.GetObjectsVector();
00579     if ( !objs.empty() ) {
00580         size_t count = objs.size();
00581         if ( !internal ) {
00582             LOG_POST_X(3, Info << CTime(CTime::eCurrent) <<
00583                        ": Saving " << count <<
00584                        " object(s) found in: " << objects.GetFileName());
00585         }
00586         for (size_t i = 0; i < count; ++i) {
00587             CLDS_CoreObjectsReader::SObjectDetails& obj_info = objs[i];
00588             // If object is not in the database yet.
00589             if (obj_info.ext_id == 0) {
00590                 SaveObject(&objects, &obj_info);
00591                 ++ret;
00592             }
00593         }
00594         if ( !internal ) {
00595             LOG_POST_X(3, Info << CTime(CTime::eCurrent) << ": LDS: "
00596                        << count
00597                        << " object(s) found in: "<<objects.GetFileName());
00598         }
00599         objects.ClearObjectsVector();
00600     }
00601     else {
00602         if ( !internal ) {
00603             if ( objects.GetTotalObjects() == 0 ) {
00604                 LOG_POST_X(4, Info <<
00605                            "LDS: No objects found in:" <<
00606                            objects.GetFileName());
00607             }
00608             else {
00609                 LOG_POST_X(4, Info <<
00610                            "LDS: No more objects found in:" <<
00611                            objects.GetFileName());
00612             }
00613         }
00614     }
00615     if ( m_Seq_idsCollector ) {
00616         m_Seq_idsCollector->ClearIds();
00617     }
00618     return ret;
00619 }
00620 
00621 
00622 void CLDS_Object::UpdateBinaryASNObjects(int file_id,
00623                                          const string& file_name)
00624 {
00625     vector<CObjectTypeInfo> types;
00626     types.push_back(CType<CBioseq_set>());
00627     types.push_back(CType<CSeq_entry>());
00628     types.push_back(CType<CBioseq>());
00629     types.push_back(CType<CSeq_annot>());
00630     types.push_back(CType<CSeq_align>());
00631     types.push_back(CType<CSeq_align_set>());
00632     vector<CObjectTypeInfo> skip_types;
00633     skip_types.push_back(CType<CSeq_data>());
00634     skip_types.push_back(CType<CSeq_ext>());
00635     skip_types.push_back(CType<CSeq_hist>());
00636 
00637     LOG_POST_X(2, Info << CTime(CTime::eCurrent) <<
00638                ": Scanning file: " << file_name);
00639 
00640     CRef<CLDS_CollectSeq_idsReader> seq_id_hook(new CLDS_CollectSeq_idsReader);
00641     m_Seq_idsCollector = new CLDS_Seq_idsCollector(seq_id_hook);
00642     CRef<CLDS_CoreObjectsReader> objects
00643         (new CLDS_CoreObjectsReader(file_id, file_name));
00644     auto_ptr<CObjectIStream>
00645         in(CObjectIStream::Open(file_name, eSerial_AsnBinary));
00646 
00647     {{ // setup hooks
00648         ITERATE ( vector<CObjectTypeInfo>, it, types ) {
00649             it->SetLocalReadHook(*in, objects);
00650         }
00651         CRef<CLDS_SkipObjectHook> skipper(new CLDS_SkipObjectHook);
00652         ITERATE ( vector<CObjectTypeInfo>, it, skip_types ) {
00653             it->SetLocalReadHook(*in, skipper);
00654         }
00655         CObjectTypeInfo seq_id_type = CType<CSeq_id>();
00656         seq_id_type.SetLocalSkipHook(*in, seq_id_hook);
00657         CObjectTypeInfo annot_type = CType<CSeq_annot>();
00658         annot_type.FindMember("data").SetLocalReadHook(*in, m_Seq_idsCollector);
00659     }}
00660 
00661     size_t last_type = 0;
00662     while ( in->HaveMoreData() ) {
00663         // first try previous type
00664         bool found = UpdateBinaryASNObject(*in, *objects, types[last_type]);
00665         if ( !found ) {
00666             // then all remaining possible types
00667             for ( size_t i = 0; i < types.size(); ++i ) {
00668                 if ( i != last_type ) { // already tried
00669                     if ( UpdateBinaryASNObject(*in, *objects, types[i]) ) {
00670                         found = true;
00671                         last_type = i;
00672                         break;
00673                     }
00674                 }
00675             }
00676         }
00677         if ( !found ) {
00678             break;
00679         }
00680         SaveObjects(*objects, false);
00681     }
00682 }
00683 
00684 
00685 void CLDS_Object::UpdateFileObjects(int file_id,
00686                                     const string& file_name,
00687                                     CFormatGuess::EFormat format)
00688 {
00689     FindMaxObjRecId();
00690 
00691     if (format == CFormatGuess::eBinaryASN ) {
00692         UpdateBinaryASNObjects(file_id, file_name);
00693     }
00694     else if (format == CFormatGuess::eTextASN ||
00695              format == CFormatGuess::eXml) {
00696 
00697         LOG_POST_X(2, Info << CTime(CTime::eCurrent) <<
00698                    ": Scanning file: " << file_name);
00699 
00700         CLDS_CoreObjectsReader sniffer(file_id, file_name);
00701         ESerialDataFormat stream_format = FormatGuess2Serial(format);
00702 
00703         CNcbiIfstream str_input(file_name.c_str(), IOS_BASE::binary);
00704         auto_ptr<CObjectIStream> input(CObjectIStream::Open(stream_format,
00705                                                             str_input));
00706         CRef<CLDS_CollectSeq_idsReader> seq_id_hook(new CLDS_CollectSeq_idsReader);
00707         m_Seq_idsCollector = new CLDS_Seq_idsCollector(seq_id_hook);
00708         CObjectTypeInfo seq_id_type = CType<CSeq_id>();
00709         seq_id_type.SetLocalSkipHook(*input, seq_id_hook);
00710         CObjectTypeInfo annot_type = CType<CSeq_annot>();
00711         annot_type.FindMember("data").SetLocalReadHook(*input, m_Seq_idsCollector);
00712 
00713         sniffer.Probe(*input);
00714 
00715         SaveObjects(sniffer, false);
00716     } else if ( format == CFormatGuess::eFasta ){
00717 
00718         int type_id;
00719         {{
00720         map<string, int>::const_iterator it = m_ObjTypeMap.find("FastaEntry");
00721         _ASSERT(it != m_ObjTypeMap.end());
00722         type_id = it->second;
00723         }}
00724 
00725         CNcbiIfstream input(file_name.c_str(), IOS_BASE::binary);
00726 
00727         CLDS_FastaScanner fscan(*this, file_id, type_id);
00728         ScanFastaFile(&fscan,
00729                       input,
00730                       CFastaReader::fAssumeNuc  |
00731                       CFastaReader::fAllSeqIds  |
00732                       CFastaReader::fOneSeq     |
00733                       CFastaReader::fNoSeqData  |
00734                       CFastaReader::fParseRawID);
00735     } else {
00736         LOG_POST_X(5, Info << "Unsupported file format: " << file_name);
00737     }
00738 
00739 
00740 }
00741 
00742 
00743 int CLDS_Object::SaveObject(int file_id,
00744                             const string& seq_id,
00745                             const string& description,
00746                             const string& seq_ids,
00747                             CNcbiStreampos pos,
00748                             int type_id)
00749 {
00750     ++m_MaxObjRecId;
00751     EBDB_ErrCode err;
00752 /*
00753     m_db.object_attr_db.object_attr_id = m_MaxObjRecId;
00754     m_db.object_attr_db.object_title = description;
00755     EBDB_ErrCode err = m_db.object_attr_db.Insert();
00756     BDB_CHECK(err, "LDS::ObjectAttribute");
00757 */
00758     m_db.object_db.object_id = m_MaxObjRecId;
00759     m_db.object_db.file_id = file_id;
00760     m_db.object_db.seqlist_id = 0;
00761     m_db.object_db.object_type = type_id;
00762     Int8 ipos = NcbiStreamposToInt8(pos);
00763     m_db.object_db.file_pos = ipos;
00764 //    m_db.object_db.object_attr_id = m_MaxObjRecId;
00765     m_db.object_db.TSE_object_id = 0;
00766     m_db.object_db.parent_object_id = 0;
00767     m_db.object_db.object_title.Set(description.c_str(),
00768         CBDB_FieldStringBase::eTruncateOnOverflowLogError);
00769     m_db.object_db.seq_ids = seq_ids;
00770 
00771     string ups = seq_id;
00772     NStr::ToUpper(ups);
00773     m_db.object_db.primary_seqid = ups;
00774 
00775     LOG_POST_X(6, Info << "Saving Fasta object: " << seq_id);
00776 
00777     err = m_db.object_db.Insert();
00778     BDB_CHECK(err, "LDS::Object");
00779 
00780     return m_MaxObjRecId;
00781 }
00782 
00783 
00784 int CLDS_Object::SaveObject(CLDS_CoreObjectsReader* objects,
00785                             CLDS_CoreObjectsReader::SObjectDetails* obj_info,
00786                             bool force_object)
00787 {
00788     int top_level_id, parent_id;
00789 
00790     _ASSERT(obj_info->ext_id == 0);  // Making sure the object is not in the DB yet
00791 
00792     if (obj_info->is_top_level) {
00793         top_level_id = parent_id = 0;
00794     }
00795     else {
00796         // Find the direct parent
00797         {{
00798             CLDS_CoreObjectsReader::SObjectDetails* parent_obj_info
00799                 = objects->FindObjectInfo(obj_info->parent_offset);
00800             _ASSERT(parent_obj_info);
00801             parent_id = parent_obj_info->ext_id;
00802             if ( parent_id == 0 ) { // not yet in the database
00803                 // Recursively save the parent
00804                 parent_id = SaveObject(objects, parent_obj_info, true);
00805             }
00806         }}
00807 
00808         // Find the top level grand parent
00809         {{
00810             CLDS_CoreObjectsReader::SObjectDetails* top_obj_info
00811                 = objects->FindObjectInfo(obj_info->top_level_offset);
00812             _ASSERT(top_obj_info);
00813             top_level_id = top_obj_info->ext_id;
00814             if ( top_level_id == 0 ) { // not yet in the database
00815                 // Recursively save the parent
00816                 top_level_id = SaveObject(objects, top_obj_info, true);
00817             }
00818         }}
00819 
00820     }
00821 
00822     const string& type_name = obj_info->info.GetName();
00823 
00824     map<string, int>::const_iterator it = m_ObjTypeMap.find(type_name);
00825     if (it == m_ObjTypeMap.end()) {
00826         LOG_POST_X(7, Info << "Unrecognized type: " << type_name);
00827         return 0;
00828     }
00829     int type_id = it->second;
00830 
00831 
00832     string id_str;
00833     string title;
00834     string all_ids;
00835 
00836     ++m_MaxObjRecId;
00837 
00838     if ( IsObject(*obj_info, &id_str, &title, &all_ids) || force_object ) {
00839         m_db.object_db.primary_seqid = NStr::ToUpper(id_str);
00840 
00841         obj_info->ext_id = m_MaxObjRecId; // Keep external id for the next scan
00842         EBDB_ErrCode err;
00843 /*
00844         m_db.object_attr_db.object_attr_id = m_MaxObjRecId;
00845         m_db.object_attr_db.object_title = molecule_title;
00846         m_db.object_attr_db.seq_ids = NStr::ToUpper(all_seq_id);
00847         EBDB_ErrCode err = m_db.object_attr_db.Insert();
00848         BDB_CHECK(err, "LDS::ObjectAttr");
00849 */
00850         m_db.object_db.object_id = m_MaxObjRecId;
00851         m_db.object_db.file_id = objects->GetFileId();
00852         m_db.object_db.seqlist_id = 0;  // TODO:
00853         m_db.object_db.object_type = type_id;
00854         Int8 i8 = NcbiStreamposToInt8(obj_info->offset);
00855         m_db.object_db.file_pos = i8;
00856 //        m_db.object_db.object_attr_id = m_MaxObjRecId;
00857         m_db.object_db.TSE_object_id = top_level_id;
00858         m_db.object_db.parent_object_id = parent_id;
00859         m_db.object_db.object_title = title;
00860         m_db.object_db.seq_ids = NStr::ToUpper(all_ids);
00861 
00862 
00863 //        LOG_POST_X(8, Info<<"Saving object: " << type_name << " " << id_str);
00864 
00865         err = m_db.object_db.Insert();
00866         BDB_CHECK(err, "LDS::Object");
00867 
00868     }
00869     else if ( CSeq_annot* annot = CType<CSeq_annot>().Get(obj_info->info)) {
00870         // Set of seq ids referenced in the annotation
00871         //
00872         set<string> ref_seq_ids;
00873         CLDS_Seq_ids *ids =
00874             m_Seq_idsCollector? m_Seq_idsCollector->GetIds(obj_info->info): 0;
00875         if ( ids ) {
00876             ITERATE ( CLDS_Seq_ids::TIds, it, ids->m_Ids ) {
00877                 const CSeq_id& id = **it;
00878                 ref_seq_ids.insert(id.AsFastaString());
00879             }
00880 
00881             CLDS_Seq_ids::TGis& gis = ids->m_Gis;
00882             sort(gis.begin(), gis.end());
00883             gis.erase(unique(gis.begin(), gis.end()), gis.end());
00884             CSeq_id id;
00885             ITERATE ( CLDS_Seq_ids::TGis, it, gis ) {
00886                 id.SetGi(*it);
00887                 ref_seq_ids.insert(id.AsFastaString());
00888             }
00889             //LOG_POST_X(9, Info <<
00890             //           "Saving " << ref_seq_ids.size() <<
00891             //           " Seq-ids in Seq-annot");
00892         }
00893         else if ( annot->CanGetData()) {
00894             // Check for alignment in annotation
00895             //
00896             const CSeq_annot_Base::C_Data& adata = annot->GetData();
00897             if (adata.Which() == CSeq_annot_Base::C_Data::e_Align) {
00898                 const CSeq_annot_Base::C_Data::TAlign& al_list =
00899                     adata.GetAlign();
00900                 ITERATE (CSeq_annot_Base::C_Data::TAlign, it, al_list){
00901                     if (!(*it)->CanGetSegs())
00902                         continue;
00903 
00904                     const CSeq_align::TSegs& segs = (*it)->GetSegs();
00905                     switch (segs.Which())
00906                         {
00907                         case CSeq_align::C_Segs::e_Std:
00908                         {
00909                             const CSeq_align_Base::C_Segs::TStd& std_list =
00910                                 segs.GetStd();
00911                             ITERATE (CSeq_align_Base::C_Segs::TStd, it2, std_list) {
00912                                 const CRef<CStd_seg>& seg = *it2;
00913                                 const CStd_seg::TIds& ids = seg->GetIds();
00914 
00915                                 ITERATE(CStd_seg::TIds, it3, ids) {
00916                                     ref_seq_ids.insert((*it3)->AsFastaString());
00917 
00918                                 } // ITERATE
00919 
00920                             } // ITERATE
00921                         }
00922                         break;
00923                         case CSeq_align::C_Segs::e_Denseg:
00924                         {
00925                             const CSeq_align_Base::C_Segs::TDenseg& denseg =
00926                                 segs.GetDenseg();
00927                             const CDense_seg::TIds& ids = denseg.GetIds();
00928 
00929                             ITERATE (CDense_seg::TIds, it3, ids) {
00930                                 ref_seq_ids.insert((*it3)->AsFastaString());
00931                             } // ITERATE
00932 
00933                         }
00934                         break;
00935                         //case CSeq_align::C_Segs::e_Packed:
00936                         //case CSeq_align::C_Segs::e_Disc:
00937                         default:
00938                             break;
00939                         }
00940 
00941                 } // ITERATE
00942             }
00943         }
00944 
00945         // Save all seq ids referred by the alignment
00946         //
00947         ITERATE (set<string>, it, ref_seq_ids) {
00948             m_db.seq_id_list.object_id = m_MaxObjRecId;
00949             m_db.seq_id_list.seq_id = it->c_str();
00950 
00951             EBDB_ErrCode err = m_db.seq_id_list.Insert();
00952             BDB_CHECK(err, "LDS::seq_id_list");
00953         }
00954 
00955         obj_info->ext_id = m_MaxObjRecId; // Keep external id for the next scan
00956 
00957         m_db.annot_db.annot_id = m_MaxObjRecId;
00958         m_db.annot_db.file_id = objects->GetFileId();
00959         m_db.annot_db.annot_type = type_id;
00960         Int8 i8 = NcbiStreamposToInt8(obj_info->offset);
00961         m_db.annot_db.file_pos = i8;
00962         m_db.annot_db.TSE_object_id = top_level_id;
00963         m_db.annot_db.parent_object_id = parent_id;
00964 /*
00965         LOG_POST_X(9, Info << "Saving annotation: "
00966                            << type_name
00967                            << " "
00968                            << id_str
00969                            << " "
00970                            << (!top_level_id ? "Top Level. " : " ")
00971                            << "offs="
00972                            << obj_info->offset
00973                   );
00974 */
00975 
00976         EBDB_ErrCode err = m_db.annot_db.Insert();
00977         BDB_CHECK(err, "LDS::Annotation");
00978 
00979         m_db.annot2obj_db.object_id = parent_id;
00980         m_db.annot2obj_db.annot_id = m_MaxObjRecId;
00981 
00982         err = m_db.annot2obj_db.Insert();
00983         BDB_CHECK(err, "LDS::Annot2Obj");
00984 
00985     }
00986 
00987     return obj_info->ext_id;
00988 }
00989 
00990 
00991 CScope* CLDS_Object::GetScope(void)
00992 {
00993     if ( !m_Scope && m_TSE ) {
00994         m_Scope = new CScope(*m_TSE_Manager);
00995         m_Scope->AddTopLevelSeqEntry(*m_TSE);
00996     }
00997     return m_Scope;
00998 }
00999 
01000 
01001 bool
01002 CLDS_Object::IsObject(const CLDS_CoreObjectsReader::SObjectDetails& parse_info,
01003                       string* object_str_id,
01004                       string* object_title,
01005                       string* object_all_ids)
01006 {
01007     if ( CREATE_SCOPES && parse_info.is_top_level ) {
01008         m_TSE_Manager = CObjectManager::GetInstance();
01009         m_Scope.Reset();
01010         m_TSE_Info = parse_info.info;
01011         m_TSE.Reset();
01012         if ( CSeq_entry* obj = CType<CSeq_entry>().Get(m_TSE_Info) ) {
01013             m_TSE = obj;
01014             m_TSE->Parentize();
01015             return true;
01016         }
01017         else if ( CBioseq_set* obj = CType<CBioseq_set>().Get(m_TSE_Info) ) {
01018             m_TSE = new CSeq_entry;
01019             m_TSE->SetSet(*obj);
01020             m_TSE->Parentize();
01021             return true;
01022         }
01023         else if ( CBioseq* obj = CType<CBioseq>().Get(m_TSE_Info) ) {
01024             m_TSE = new CSeq_entry;
01025             m_TSE->SetSeq(*obj);
01026             m_TSE->Parentize();
01027             GetBioseqInfo(parse_info, *obj,
01028                           object_str_id, object_title, object_all_ids);
01029             return true;
01030         }
01031         else if ( CSeq_annot* obj = CType<CSeq_annot>().Get(m_TSE_Info) ) {
01032             m_TSE = new CSeq_entry;
01033             m_TSE->SetSet().SetSeq_set();
01034             m_TSE->SetSet().SetAnnot().push_back(Ref(obj));
01035             m_TSE->Parentize();
01036             GetAnnotInfo(parse_info, *obj,
01037                          object_str_id, object_title, object_all_ids);
01038             return true;
01039         }
01040         else if ( CSeq_align* obj = CType<CSeq_align>().Get(m_TSE_Info) ) {
01041             CRef<CSeq_annot> annot(new CSeq_annot);
01042             CSeq_annot::TData::TAlign& arr = annot->SetData().SetAlign();
01043             arr.push_back(Ref(obj));
01044             m_TSE = new CSeq_entry;
01045             m_TSE->SetSet().SetSeq_set();
01046             m_TSE->SetSet().SetAnnot().push_back(annot);
01047             m_TSE->Parentize();
01048             GetAnnotInfo(parse_info, *annot,
01049                          object_str_id, object_title, object_all_ids);
01050             return true;
01051         }
01052         else if (CSeq_align_set* obj=CType<CSeq_align_set>().Get(m_TSE_Info)) {
01053             CRef<CSeq_annot> annot(new CSeq_annot);
01054             CSeq_annot::TData::TAlign& arr = annot->SetData().SetAlign();
01055             NON_CONST_ITERATE ( CSeq_align_set::Tdata, it, obj->Set() ) {
01056                 arr.push_back(*it);
01057             }
01058             m_TSE = new CSeq_entry;
01059             m_TSE->SetSet().SetSeq_set();
01060             m_TSE->SetSet().SetAnnot().push_back(annot);
01061             m_TSE->Parentize();
01062             GetAnnotInfo(parse_info, *annot,
01063                          object_str_id, object_title, object_all_ids);
01064             return true;
01065         }
01066     }
01067 
01068     if ( CBioseq* obj = CType<CBioseq>().Get(parse_info.info) ) {
01069         GetBioseqInfo(parse_info, *obj,
01070                       object_str_id, object_title, object_all_ids);
01071         return true;
01072     }
01073     else if ( CType<CSeq_annot>().Get(parse_info.info) ||
01074               CType<CSeq_align>().Get(parse_info.info) ||
01075               CType<CSeq_align_set>().Get(parse_info.info) ) {
01076         return false;
01077     }
01078     return true;
01079 }
01080 
01081 
01082 void CLDS_Object::GetBioseqInfo(const CLDS_CoreObjectsReader::SObjectDetails& /*obj_info*/,
01083                                 const CBioseq& bioseq,
01084                                 string* object_str_id,
01085                                 string* object_title,
01086                                 string* object_all_ids)
01087 {
01088     const CSeq_id* seq_id = bioseq.GetFirstId();
01089     if ( seq_id ) {
01090         *object_str_id = seq_id->AsFastaString();
01091     }
01092 
01093     if ( TRY_FAST_TITLE && sequence::GetTitle(bioseq, object_title) ) {
01094         // Good, we've got title fast way.
01095     }
01096     else if (CScope* scope = GetScope()) { // we are under OM here
01097         CBioseq_Handle bio_handle = scope->GetBioseqHandle(bioseq);
01098         if ( bio_handle ) {
01099             *object_title = sequence::GetTitle(bio_handle);
01100             //LOG_POST_X(10, Info<<"object title: "<<*molecule_title);
01101         }
01102         else {
01103             // the last resort
01104             bioseq.GetLabel(object_title, CBioseq::eBoth);
01105         }
01106 
01107     }
01108     else {  // non-OM controlled object
01109         bioseq.GetLabel(object_title, CBioseq::eBoth);
01110     }
01111 
01112     ITERATE ( CBioseq::TId, it, bioseq.GetId() ) {
01113         const CSeq_id* seq_id = *it;
01114         if ( seq_id ) {
01115             object_all_ids->append(seq_id->AsFastaString());
01116             object_all_ids->append(" ");
01117         }
01118     }
01119 }
01120 
01121 
01122 void CLDS_Object::GetAnnotInfo(const CLDS_CoreObjectsReader::SObjectDetails& obj_info,
01123                                const CSeq_annot& annot,
01124                                string* object_str_id,
01125                                string* object_title,
01126                                string* object_all_ids)
01127 {
01128     set<string> ref_seq_ids;
01129     CLDS_Seq_ids *ids =
01130         m_Seq_idsCollector? m_Seq_idsCollector->GetIds(obj_info.info): 0;
01131     if ( ids ) {
01132         ITERATE ( CLDS_Seq_ids::TIds, it, ids->m_Ids ) {
01133             const CSeq_id& id = **it;
01134             string str_id = id.AsFastaString();
01135             ref_seq_ids.insert(NStr::ToUpper(str_id));
01136         }
01137 
01138         CLDS_Seq_ids::TGis& gis = ids->m_Gis;
01139         sort(gis.begin(), gis.end());
01140         gis.erase(unique(gis.begin(), gis.end()), gis.end());
01141         CSeq_id id;
01142         ITERATE ( CLDS_Seq_ids::TGis, it, gis ) {
01143             id.SetGi(*it);
01144             string str_id = id.AsFastaString();
01145             ref_seq_ids.insert(NStr::ToUpper(str_id));
01146         }
01147         //LOG_POST_X(9, Info <<
01148         //           "Saving " << ref_seq_ids.size() <<
01149         //           " Seq-ids in Seq-annot");
01150     }
01151     else if ( annot.CanGetData() ) {
01152         // Check for alignment in annotation
01153         //
01154         const CSeq_annot_Base::C_Data& adata = annot.GetData();
01155         if ( adata.IsAlign() ) {
01156             const CSeq_annot_Base::C_Data::TAlign& al_list = adata.GetAlign();
01157             ITERATE (CSeq_annot_Base::C_Data::TAlign, it, al_list){
01158                 if (!(*it)->CanGetSegs())
01159                     continue;
01160 
01161                 const CSeq_align::TSegs& segs = (*it)->GetSegs();
01162                 switch (segs.Which())
01163                     {
01164                     case CSeq_align::C_Segs::e_Std:
01165                     {
01166                         const CSeq_align_Base::C_Segs::TStd& std_list =
01167                             segs.GetStd();
01168                         ITERATE (CSeq_align_Base::C_Segs::TStd, it2, std_list) {
01169                             const CRef<CStd_seg>& seg = *it2;
01170                             const CStd_seg::TIds& ids = seg->GetIds();
01171 
01172                             ITERATE(CStd_seg::TIds, it3, ids) {
01173                                 string str_id = (*it3)->AsFastaString();
01174                                 ref_seq_ids.insert(NStr::ToUpper(str_id));
01175 
01176                             } // ITERATE
01177 
01178                         } // ITERATE
01179                     }
01180                     break;
01181                     case CSeq_align::C_Segs::e_Denseg:
01182                     {
01183                         const CSeq_align_Base::C_Segs::TDenseg& denseg =
01184                             segs.GetDenseg();
01185                         const CDense_seg::TIds& ids = denseg.GetIds();
01186 
01187                         ITERATE (CDense_seg::TIds, it3, ids) {
01188                             string str_id = (*it3)->AsFastaString();
01189                             ref_seq_ids.insert(NStr::ToUpper(str_id));
01190                         } // ITERATE
01191 
01192                     }
01193                     break;
01194                     //case CSeq_align::C_Segs::e_Packed:
01195                     //case CSeq_align::C_Segs::e_Disc:
01196                     default:
01197                         break;
01198                     }
01199 
01200             } // ITERATE
01201         }
01202         else {
01203             for (CTypeConstIterator<CSeq_id> it(ConstBegin(annot)); it; ++it) {
01204                 const CSeq_id& id = *it;
01205                 string str_id = id.AsFastaString();
01206                 ref_seq_ids.insert(NStr::ToUpper(str_id));
01207             }
01208         }
01209     }
01210 
01211     // Save all seq ids referred by the alignment
01212     //
01213     ITERATE (set<string>, it, ref_seq_ids) {
01214         object_all_ids->append(*it);
01215         object_all_ids->append(" ");
01216     }
01217 }
01218 
01219 
01220 int CLDS_Object::FindMaxObjRecId()
01221 {
01222     if (m_MaxObjRecId) {
01223         return m_MaxObjRecId;
01224     }
01225 
01226     LDS_GETMAXID(m_MaxObjRecId, m_db.object_db, object_id);
01227 
01228     int ann_rec_id = 0;
01229     LDS_GETMAXID(ann_rec_id, m_db.annot_db, annot_id);
01230 
01231     if (ann_rec_id > m_MaxObjRecId) {
01232         m_MaxObjRecId = ann_rec_id;
01233     }
01234 
01235     return m_MaxObjRecId;
01236 }
01237 
01238 
01239 static bool s_GetSequenceBase(const CObject_id& obj_id,
01240                               SLDS_SeqIdBase*  seqid_base)
01241 {
01242     switch (obj_id.Which()) {
01243     case CObject_id::e_Id:
01244         seqid_base->int_id = obj_id.GetId();
01245         seqid_base->str_id.erase();
01246         return true;
01247     case CObject_id::e_Str:
01248         seqid_base->int_id = 0;
01249         seqid_base->str_id = obj_id.GetStr();
01250         return true;
01251     default:
01252         break;
01253     }
01254     return false;
01255 }
01256 
01257 
01258 static bool s_GetSequenceBase(const CPDB_seq_id& pdb_id,
01259                               SLDS_SeqIdBase*  seqid_base)
01260 {
01261     seqid_base->int_id = 0;
01262     seqid_base->str_id = pdb_id.GetMol().Get();
01263     seqid_base->str_id += '|';
01264     char chain = (char) pdb_id.GetChain();
01265     if ( chain == '|' ) {
01266         seqid_base->str_id += "VB";
01267     }
01268     else if ( chain == '\0' ) {
01269         seqid_base->str_id += ' ';
01270     }
01271     else if ( islower((unsigned char)chain) ) {
01272         seqid_base->str_id.append(2, chain);
01273     }
01274     else {
01275         seqid_base->str_id += chain;
01276     }
01277     return true;
01278 }
01279 
01280 
01281 void LDS_GetSequenceBase(const CSeq_id&   seq_id,
01282                          SLDS_SeqIdBase*  seqid_base)
01283 {
01284     _ASSERT(seqid_base);
01285 
01286     int   obj_id_int = 0;
01287     const CTextseq_id* obj_id_txt = 0;
01288 
01289     switch (seq_id.Which()) {
01290     case CSeq_id::e_Local:
01291         if ( s_GetSequenceBase(seq_id.GetLocal(), seqid_base) ) {
01292             return;
01293         }
01294         break;
01295     case CSeq_id::e_Gibbsq:
01296         obj_id_int = seq_id.GetGibbsq();
01297         break;
01298     case CSeq_id::e_Gibbmt:
01299         obj_id_int = seq_id.GetGibbmt();
01300         break;
01301     case CSeq_id::e_Giim:
01302         obj_id_int = seq_id.GetGiim().GetId();
01303         break;
01304     case CSeq_id::e_Genbank:
01305         obj_id_txt = &seq_id.GetGenbank();
01306         break;
01307     case CSeq_id::e_Embl:
01308         obj_id_txt = &seq_id.GetEmbl();
01309         break;
01310     case CSeq_id::e_Pir:
01311         obj_id_txt = &seq_id.GetPir();
01312         break;
01313     case CSeq_id::e_Swissprot:
01314         obj_id_txt = &seq_id.GetSwissprot();
01315         break;
01316     case CSeq_id::e_Patent:
01317         {{
01318             seqid_base->int_id = 0;
01319             seqid_base->str_id = "";
01320             const CId_pat& pat = seq_id.GetPatent().GetCit();
01321             pat.GetLabel(&seqid_base->str_id);
01322         }}
01323         return;
01324     case CSeq_id::e_Other:
01325         obj_id_txt = &seq_id.GetOther();
01326         break;
01327     case CSeq_id::e_General:
01328         {{
01329             seqid_base->int_id = 0;
01330             seqid_base->str_id = "";
01331             seq_id.GetGeneral().GetLabel(&seqid_base->str_id);
01332         }}
01333         return;
01334     case CSeq_id::e_Gi:
01335         obj_id_int = seq_id.GetGi();
01336         break;
01337     case CSeq_id::e_Ddbj:
01338         obj_id_txt = &seq_id.GetDdbj();
01339         break;
01340     case CSeq_id::e_Prf:
01341         obj_id_txt = &seq_id.GetPrf();
01342         break;
01343     case CSeq_id::e_Pdb:
01344         if ( s_GetSequenceBase(seq_id.GetPdb(), seqid_base) ) {
01345             return;
01346         }
01347         break;
01348     case CSeq_id::e_Tpg:
01349         obj_id_txt = &seq_id.GetTpg();
01350         break;
01351     case CSeq_id::e_Tpe:
01352         obj_id_txt = &seq_id.GetTpe();
01353         break;
01354     case CSeq_id::e_Tpd:
01355         obj_id_txt = &seq_id.GetTpd();
01356         break;
01357     case CSeq_id::e_Gpipe:
01358         obj_id_txt = &seq_id.GetGpipe();
01359         break;
01360     default:
01361         _ASSERT(0);
01362         break;
01363     }
01364 
01365     const string* id_str = 0;
01366 
01367     if (obj_id_int) {
01368         seqid_base->int_id = obj_id_int;
01369         seqid_base->str_id.erase();
01370         return;
01371     }
01372 
01373     if (obj_id_txt) {
01374         if (obj_id_txt->CanGetAccession()) {
01375             const CTextseq_id::TAccession& acc =
01376                                 obj_id_txt->GetAccession();
01377             id_str = &acc;
01378         } else {
01379             if (obj_id_txt->CanGetName()) {
01380                 const CTextseq_id::TName& name =
01381                     obj_id_txt->GetName();
01382                 id_str = &name;
01383             }
01384         }
01385     }
01386 
01387     if (id_str) {
01388         seqid_base->int_id = 0;
01389         seqid_base->str_id = *id_str;
01390         return;
01391     }
01392 
01393     LOG_POST_X(11, Warning
01394                << "SeqId indexer: unsupported type "
01395                << seq_id.AsFastaString());
01396 
01397     seqid_base->Init();
01398 
01399 }
01400 
01401 bool LDS_GetSequenceBase(const string&   seq_id_str,
01402                          SLDS_SeqIdBase* seqid_base,
01403                          CSeq_id*        conv_seq_id)
01404 {
01405     if ( seq_id_str.empty() ) {
01406         return false;
01407     }
01408 
01409     _ASSERT(seqid_base);
01410 
01411     CRef<CSeq_id> tmp_seq_id;
01412 
01413     if (conv_seq_id == 0) {
01414         tmp_seq_id.Reset((conv_seq_id = new CSeq_id));
01415 
01416     }
01417 
01418     bool can_convert = true;
01419 
01420     try {
01421         conv_seq_id->Set(seq_id_str);
01422     } catch (CSeqIdException&) {
01423         try {
01424             conv_seq_id->Set(CSeq_id::e_Local, seq_id_str);
01425         } catch (CSeqIdException&) {
01426             can_convert = false;
01427             LOG_POST_X(12, Error <<
01428                        "Cannot convert seq id string: " << seq_id_str);
01429             seqid_base->Init();
01430         }
01431     }
01432 
01433     if (can_convert) {
01434         LDS_GetSequenceBase(*conv_seq_id, seqid_base);
01435     }
01436 
01437     return can_convert;
01438 }
01439 
01440 
01441 /// Scanner functor to build id index
01442 ///
01443 /// @internal
01444 ///
01445 class CLDS_BuildIdIdx
01446 {
01447 public:
01448     CLDS_BuildIdIdx(CLDS_Database& db, bool control_dups)
01449     : m_db(db),
01450         m_coll(db.GetTables()),
01451         m_SeqId(new CSeq_id),
01452         m_ControlDups(control_dups),
01453         m_Query(new CLDS_Query(db)),
01454         m_ObjIds(bm::BM_GAP)
01455     {
01456         if (m_ControlDups) {
01457             m_SequenceFind.reset(new CLDS_Query::CSequenceFinder(*m_Query));
01458         }
01459     }
01460 
01461     void operator()(SLDS_ObjectDB& dbf)
01462     {
01463         int object_id = dbf.object_id; // PK
01464 
01465         if (!dbf.primary_seqid.IsNull()) {
01466             dbf.primary_seqid.ToString(m_PriSeqId_Str);
01467 
01468             x_AddToIdx(m_PriSeqId_Str, object_id);
01469         }
01470 
01471         dbf.seq_ids.ToString(m_SeqId_Str);
01472         vector<string> seq_id_arr;
01473         NStr::Tokenize(m_SeqId_Str, " ", seq_id_arr, NStr::eMergeDelims);
01474         ITERATE (vector<string>, it, seq_id_arr) {
01475             const string& seq_id_str = *it;
01476             if (NStr::CompareNocase(seq_id_str,m_PriSeqId_Str)==0) {
01477                 continue;
01478             }
01479             x_AddToIdx(seq_id_str, object_id);
01480         }
01481     }
01482 
01483 private:
01484     void x_AddToIdx(const string& seq_id_str, int rec_id)
01485     {
01486         bool can_convert =
01487             LDS_GetSequenceBase(seq_id_str, &m_SBase, &*m_SeqId);
01488         if (can_convert) {
01489             if (m_ControlDups) {
01490                 _ASSERT(m_SequenceFind.get());
01491                 CLDS_Set& cand = m_SequenceFind->GetCandidates();
01492                 cand.clear();
01493                 m_SequenceFind->Screen(m_SBase);
01494                 if (cand.any()) {
01495                     CLDS_Set dup_ids(bm::BM_GAP);
01496                     m_SequenceFind->FindInCandidates(seq_id_str, &dup_ids);
01497 
01498                     if (dup_ids.any()) {
01499                         unsigned id = dup_ids.get_first();
01500                         m_Query->ReportDuplicateObjectSeqId(seq_id_str,
01501                                                             id,
01502                                                             rec_id);
01503                     }
01504                 }
01505             }
01506 
01507             x_AddToIdx(m_SBase, rec_id);
01508         }
01509     }
01510 
01511     void x_AddToIdx(const SLDS_SeqIdBase& sbase, int rec_id)
01512     {
01513         if (sbase.int_id) {
01514             _TRACE("int id: "<<sbase.int_id<<" -> "<<rec_id);
01515             m_coll.obj_seqid_int_idx.id = sbase.int_id;
01516             m_coll.obj_seqid_int_idx.row_id = rec_id;
01517             m_coll.obj_seqid_int_idx.Insert();
01518         }
01519         else if (!sbase.str_id.empty()) {
01520             _TRACE("str id: "<<sbase.str_id<<" -> "<<rec_id);
01521             m_coll.obj_seqid_txt_idx.id = sbase.str_id;
01522             m_coll.obj_seqid_txt_idx.row_id = rec_id;
01523             m_coll.obj_seqid_txt_idx.Insert();
01524         }
01525     }
01526 
01527 private:
01528     CLDS_BuildIdIdx(const CLDS_BuildIdIdx&);
01529     CLDS_BuildIdIdx& operator=(const CLDS_BuildIdIdx&);
01530 
01531 private:
01532     CLDS_Database&         m_db;
01533     SLDS_TablesCollection& m_coll;
01534     string                 m_PriSeqId_Str;
01535     string                 m_SeqId_Str;
01536     CRef<CSeq_id>          m_SeqId;
01537     SLDS_SeqIdBase         m_SBase;
01538     bool                   m_ControlDups; ///< Control id duplicates
01539     auto_ptr<CLDS_Query>                  m_Query;
01540     auto_ptr<CLDS_Query::CSequenceFinder> m_SequenceFind;
01541     CLDS_Set               m_ObjIds;  ///< id set for duplicate search
01542 };
01543 
01544 void CLDS_Object::BuildSeqIdIdx()
01545 {
01546     m_db.obj_seqid_int_idx.Truncate();
01547     m_db.obj_seqid_txt_idx.Truncate();
01548 
01549     LOG_POST_X(13, Info << "Building sequence id index on objects...");
01550 
01551     CLDS_BuildIdIdx func(m_DataBase, m_ControlDupIds);
01552     BDB_iterate_file(m_db.object_db, func);
01553 }
01554 
01555 
01556 CLDS_Object::CLDS_Object(CLDS_Database& db,
01557                          const map<string, int>& obj_map)
01558 : m_DataBase(db),
01559   m_db(db.GetTables()),
01560   m_ObjTypeMap(obj_map),
01561   m_MaxObjRecId(0),
01562   m_ControlDupIds(false),
01563   m_GBReleaseMode(eDefaultGBReleaseMode)
01564 {
01565 }
01566 
01567 
01568 CLDS_Object::~CLDS_Object()
01569 {
01570 }
01571 
01572 
01573 END_SCOPE(objects)
01574 END_NCBI_SCOPE
Modified on Wed May 23 13:03:22 2012 by modify_doxy.py rev. 337098