NCBI C++ ToolKit
alnmulti_ds_builder.cpp
Go to the documentation of this file.
00001 /*  $Id: alnmulti_ds_builder.cpp 22179 2010-09-28 20:29:55Z katargir $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Authors:  Andrey Yazhuk
00027  *
00028  * File Description:
00029  *
00030  */
00031 
00032 #include <ncbi_pch.hpp>
00033 
00034 #include <gui/widgets/aln_multiple/alnmulti_ds_builder.hpp>
00035 
00036 #include <gui/widgets/data/sparse_functions.hpp>
00037 
00038 #include <gui/widgets/aln_multiple/sparse_multi_ds.hpp>
00039 #include <gui/widgets/aln_multiple/alnvec_multi_ds.hpp>
00040 
00041 #include <gui/objutils/utils.hpp>
00042 #include <objects/seq/Seq_annot.hpp>
00043 #include <objects/seqalign/Seq_align.hpp>
00044 #include <objects/seq/seq_id_handle.hpp>
00045 
00046 #include <corelib/ncbitime.hpp>
00047 #include <objmgr/align_ci.hpp>
00048 #include <objmgr/bioseq_handle.hpp>
00049 
00050 #include <objtools/alnmgr/aln_converters.hpp>
00051 #include <objtools/alnmgr/aln_builders.hpp>
00052 
00053 
00054 BEGIN_NCBI_SCOPE
00055 USING_SCOPE(ncbi::objects);
00056 
00057 
00058 CAlnMultiDSBuilder::CAlnMultiDSBuilder()
00059     : m_CreateSparse( true )
00060     , m_SyncCreate( false )
00061 {
00062 }
00063 
00064 
00065 CAlnMultiDSBuilder::~CAlnMultiDSBuilder()
00066 {
00067     x_Clear();
00068 }
00069 
00070 
00071 void CAlnMultiDSBuilder::Init(objects::CScope& scope, const objects::CSeq_align& align)
00072 {
00073     x_Clear();
00074 
00075     m_Scope.Reset(&scope);
00076     m_OrigAligns.push_back(CConstRef<CSeq_align>(&align));
00077 }
00078 
00079 
00080 void CAlnMultiDSBuilder::Init(objects::CScope& scope, const objects::CSeq_annot& annot)
00081 {
00082     x_Clear();
00083 
00084     m_Scope.Reset(&scope);
00085     ExtractSeqAligns(annot, m_OrigAligns);
00086 }
00087 
00088 
00089 void CAlnMultiDSBuilder::Init(objects::CScope& scope, const objects::CBioseq& bioseq)
00090 {
00091     x_Clear();
00092 
00093     m_Scope.Reset(&scope);
00094     ExtractSeqAligns(bioseq, m_OrigAligns);
00095 }
00096 
00097 void CAlnMultiDSBuilder::Init(objects::CScope& scope, const objects::CSeq_entry& seq_entry)
00098 {
00099     x_Clear();
00100 
00101     m_Scope.Reset(&scope);
00102     ExtractSeqAligns(seq_entry, m_OrigAligns);
00103 
00104     CTypeConstIterator<objects::CBioseq> it(seq_entry);
00105     while(it)   {
00106         m_Scope->AddBioseq(*it);
00107         ++it;
00108     }
00109 }
00110 
00111 void CAlnMultiDSBuilder::Init(objects::CScope& scope, const objects::CBioseq_Handle& handle)
00112 {
00113     x_Clear();
00114 
00115     m_Scope.Reset(&scope);
00116 
00117     SAnnotSelector sel = CSeqUtils::GetAnnotSelector(CSeq_annot::TData::e_Align);
00118     CAlign_CI it(handle, sel);
00119     int i = 0;
00120     for ( ;  it;  ++it) {
00121         m_OrigAligns.push_back(CConstRef<CSeq_align>(&*it));
00122         if(i++ > 20)
00123             return;
00124     }
00125 }
00126 
00127 
00128 void CAlnMultiDSBuilder::Init(objects::CScope& scope, TAlignVector& aligns)
00129 {
00130     x_Clear();
00131 
00132     m_Scope.Reset(&scope);
00133     m_OrigAligns = aligns;
00134 }
00135 
00136 
00137 /// initial data set from which an alignment will be build
00138 void CAlnMultiDSBuilder::Init(CScope& scope, TAnnotVector& annots)
00139 {
00140     x_Clear();
00141 
00142     m_Scope.Reset(&scope);
00143 
00144     ITERATE(TAnnotVector, it_annot, annots) {
00145         const CSeq_annot& annot = **it_annot;
00146         if(annot.GetData().IsAlign() ) {
00147             ExtractSeqAligns(annot, m_OrigAligns);
00148         }
00149     }
00150 }
00151 
00152 
00153 void CAlnMultiDSBuilder::x_Clear()
00154 {
00155     m_Scope.Reset();
00156     m_OrigAligns.clear();
00157     m_AlnStats.Reset();
00158     m_AnchoredAlns.clear();
00159     m_MasterId.Reset();
00160     m_CreateSparse = true;
00161 }
00162 
00163 
00164 void CAlnMultiDSBuilder::PreCreateDataSource(bool sparse)
00165 {
00166     m_CreateSparse = sparse;
00167     if(m_CreateSparse)  {
00168         x_PreCreateSparseDataSource();
00169     }
00170 }
00171 
00172 
00173 CRef<IAlnMultiDataSource>
00174     CAlnMultiDSBuilder::CreateDataSource()
00175 {
00176     _TRACE( (unsigned long)m_OrigAligns.size() << " m_OrigAligns");
00177 
00178     x_TestAlignments();
00179 
00180     if(m_CreateSparse)  {
00181         return x_CreateSparseDataSource();
00182     } else {
00183         return x_CreateAlnVecDataSource();
00184     }
00185 }
00186 
00187 
00188 void CAlnMultiDSBuilder::GetBioseqHandles(vector<CBioseq_Handle>& handles)
00189 {
00190     if(m_AlnStats)    {
00191         typedef TAlnStats::TIdVec TIdVec;
00192 
00193         ITERATE(TIdVec, it, m_AlnStats->GetIdVec()) {
00194             const CSeq_id& sid = (*it)->GetSeqId();
00195             const CBioseq_Handle h = m_Scope->GetBioseqHandle(sid);
00196             handles.push_back(h);
00197         }
00198     }
00199 }
00200 
00201 
00202 void CAlnMultiDSBuilder::x_PreCreateSparseDataSource()
00203 {
00204     m_AlnStats.Reset();
00205     m_AnchoredAlns.clear();
00206 
00207     if(! m_OrigAligns.empty())    {
00208         TIdExtract extractor;
00209         TAlnIdMap aln_id_map(extractor, m_OrigAligns.size());
00210         ITERATE (TAlignVector, it, m_OrigAligns) {
00211             aln_id_map.push_back(**it);
00212         }
00213 
00214         /// Crete align statistics object
00215         m_AlnStats.Reset(new TAlnStats(aln_id_map));
00216         //TAlnStats aln_stats(aln_vector, ,
00217         //                    m_SeqIdAlnBitmap->GetAnchorRows(),
00218         //                    m_SeqIdAlnBitmap->GetBaseWidths());
00219 
00220         /// Construct a vector of anchored alignments
00221         CreateAnchoredAlnVec(*m_AlnStats, m_AnchoredAlns, m_Options);
00222 
00223         //TODO use aln_stats to init m_Options
00224     }
00225 }
00226 
00227 
00228 CRef<IAlnMultiDataSource>
00229     CAlnMultiDSBuilder::x_CreateSparseDataSource()
00230 {
00231     CStopWatch sw;
00232     sw.Start();
00233 
00234     CRef<IAlnMultiDataSource> ds;
00235     if(m_MasterId.GetPointer()  &&  ! m_OrigAligns.empty())    {
00236         _TRACE("Creating CSparseMultiDataSource");
00237         CSparseMultiDataSource* sp_ds = new CSparseMultiDataSource(*m_Scope);
00238         ds.Reset(sp_ds);
00239         sp_ds->Init(m_AnchoredAlns, m_Options, m_SyncCreate);
00240     }
00241     _TRACE( 1000 * sw.Elapsed() << " ms" );
00242     return ds;
00243 }
00244 
00245 
00246 CRef<IAlnMultiDataSource>
00247     CAlnMultiDSBuilder::x_CreateAlnVecDataSource()
00248 {
00249     CRef<CAlnVecMultiDataSource> ds(new CAlnVecMultiDataSource(*m_Scope));
00250     ds->Init(m_OrigAligns, m_SyncCreate);
00251     return CRef<IAlnMultiDataSource>(ds.GetPointer());
00252 }
00253 
00254 
00255 /// Analyzes m_OrigAligns and decides how to build an alignment from it
00256 void CAlnMultiDSBuilder::x_TestAlignments()
00257 {
00258     TAlignVector good_aligns;
00259     x_GetLinearAlignments(good_aligns);
00260 
00261     // build alignment map
00262     typedef set<const CSeq_align*> TAlignSet;
00263     typedef map<CSeq_id_Handle, TAlignSet> TIDToAligns;
00264     TIDToAligns align_map;
00265 
00266     ITERATE(TAlignVector, it_al, good_aligns) {
00267         const CSeq_align& al = **it_al;
00268 
00269         for ( CTypeConstIterator<CSeq_id> it_id(al); it_id;  ++it_id) {
00270             CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(*it_id);
00271 
00272             TIDToAligns::const_iterator it = align_map.find(idh);
00273             if(it == align_map.end())   {
00274                 it = align_map.insert(TIDToAligns::value_type(idh, TAlignSet())).first;
00275             }
00276             TAlignSet& aln_set = const_cast<TAlignSet&>(it->second);
00277             aln_set.insert(&al);
00278         }
00279     }
00280 
00281     /// select the ID that exist in max number of alignments
00282     size_t max_al = 0; /// max number of alignments for a sinegle ID
00283     CSeq_id_Handle max_h;
00284     ITERATE(TIDToAligns, it_map, align_map) {
00285         const TAlignSet& aln_set = it_map->second;
00286         if(aln_set.size() > max_al) {
00287             max_al = aln_set.size();
00288             max_h = it_map->first;
00289         }
00290     }
00291 
00292     if(max_h)   {
00293         m_MasterId = max_h.GetSeqId();
00294     }
00295     string s_id = m_MasterId ? m_MasterId->GetSeqIdString() : "NULL";
00296     _TRACE("Master ID " << s_id);
00297 }
00298 
00299 
00300 // selects alignments that have the same length on all sequences
00301 void CAlnMultiDSBuilder::x_GetLinearAlignments(TAlignVector& aligns)
00302 {
00303     typedef CSeq_align::C_Segs  TSegs;
00304     aligns.reserve(m_OrigAligns.size());
00305 
00306     // test every CSeq-aling
00307     ITERATE(TAlignVector, it, m_OrigAligns) {
00308         const CSeq_align& align = **it;
00309         const TSegs& segs = align.GetSegs();
00310         bool linear = true;
00311 
00312         switch(segs.Which()) {
00313         case TSegs::e_Denseg:
00314         case TSegs::e_Dendiag:
00315             break;
00316         case TSegs::e_Std: {
00317             ITERATE(TSegs::TStd, it_s, segs.GetStd())   {
00318                 const CStd_seg& std_seg = **it_s;
00319                 if(! x_IsLinear(std_seg))   {
00320                     linear = false;
00321                     break;
00322                 }
00323             }
00324             break;
00325         }
00326         case TSegs::e_Sparse:
00327         case TSegs::e_Spliced:
00328             break;
00329         default:
00330             linear = false; // other types currently not supported
00331             break;
00332         }
00333         if(linear)  {
00334             aligns.push_back(*it);
00335         }
00336     }
00337 }
00338 
00339 /// returns true if CStd_seg is linear
00340 bool CAlnMultiDSBuilder::x_IsLinear(const CStd_seg& /*seg*/)
00341 {
00342     return false;
00343 }
00344 
00345 
00346 END_NCBI_SCOPE
Modified on Wed May 23 13:18:52 2012 by modify_doxy.py rev. 337098