|
NCBI C++ ToolKit
|
00001 /* $Id: alnmulti_ds_builder.cpp 22179 2010-09-28 20:29:55Z katargir $ 00002 * =========================================================================== 00003 * 00004 * PUBLIC DOMAIN NOTICE 00005 * National Center for Biotechnology Information 00006 * 00007 * This software/database is a "United States Government Work" under the 00008 * terms of the United States Copyright Act. It was written as part of 00009 * the author's official duties as a United States Government employee and 00010 * thus cannot be copyrighted. This software/database is freely available 00011 * to the public for use. The National Library of Medicine and the U.S. 00012 * Government have not placed any restriction on its use or reproduction. 00013 * 00014 * Although all reasonable efforts have been taken to ensure the accuracy 00015 * and reliability of the software and data, the NLM and the U.S. 00016 * Government do not and cannot warrant the performance or results that 00017 * may be obtained by using this software or data. The NLM and the U.S. 00018 * Government disclaim all warranties, express or implied, including 00019 * warranties of performance, merchantability or fitness for any particular 00020 * purpose. 00021 * 00022 * Please cite the author in any work or product based on this material. 00023 * 00024 * =========================================================================== 00025 * 00026 * Authors: Andrey Yazhuk 00027 * 00028 * File Description: 00029 * 00030 */ 00031 00032 #include <ncbi_pch.hpp> 00033 00034 #include <gui/widgets/aln_multiple/alnmulti_ds_builder.hpp> 00035 00036 #include <gui/widgets/data/sparse_functions.hpp> 00037 00038 #include <gui/widgets/aln_multiple/sparse_multi_ds.hpp> 00039 #include <gui/widgets/aln_multiple/alnvec_multi_ds.hpp> 00040 00041 #include <gui/objutils/utils.hpp> 00042 #include <objects/seq/Seq_annot.hpp> 00043 #include <objects/seqalign/Seq_align.hpp> 00044 #include <objects/seq/seq_id_handle.hpp> 00045 00046 #include <corelib/ncbitime.hpp> 00047 #include <objmgr/align_ci.hpp> 00048 #include <objmgr/bioseq_handle.hpp> 00049 00050 #include <objtools/alnmgr/aln_converters.hpp> 00051 #include <objtools/alnmgr/aln_builders.hpp> 00052 00053 00054 BEGIN_NCBI_SCOPE 00055 USING_SCOPE(ncbi::objects); 00056 00057 00058 CAlnMultiDSBuilder::CAlnMultiDSBuilder() 00059 : m_CreateSparse( true ) 00060 , m_SyncCreate( false ) 00061 { 00062 } 00063 00064 00065 CAlnMultiDSBuilder::~CAlnMultiDSBuilder() 00066 { 00067 x_Clear(); 00068 } 00069 00070 00071 void CAlnMultiDSBuilder::Init(objects::CScope& scope, const objects::CSeq_align& align) 00072 { 00073 x_Clear(); 00074 00075 m_Scope.Reset(&scope); 00076 m_OrigAligns.push_back(CConstRef<CSeq_align>(&align)); 00077 } 00078 00079 00080 void CAlnMultiDSBuilder::Init(objects::CScope& scope, const objects::CSeq_annot& annot) 00081 { 00082 x_Clear(); 00083 00084 m_Scope.Reset(&scope); 00085 ExtractSeqAligns(annot, m_OrigAligns); 00086 } 00087 00088 00089 void CAlnMultiDSBuilder::Init(objects::CScope& scope, const objects::CBioseq& bioseq) 00090 { 00091 x_Clear(); 00092 00093 m_Scope.Reset(&scope); 00094 ExtractSeqAligns(bioseq, m_OrigAligns); 00095 } 00096 00097 void CAlnMultiDSBuilder::Init(objects::CScope& scope, const objects::CSeq_entry& seq_entry) 00098 { 00099 x_Clear(); 00100 00101 m_Scope.Reset(&scope); 00102 ExtractSeqAligns(seq_entry, m_OrigAligns); 00103 00104 CTypeConstIterator<objects::CBioseq> it(seq_entry); 00105 while(it) { 00106 m_Scope->AddBioseq(*it); 00107 ++it; 00108 } 00109 } 00110 00111 void CAlnMultiDSBuilder::Init(objects::CScope& scope, const objects::CBioseq_Handle& handle) 00112 { 00113 x_Clear(); 00114 00115 m_Scope.Reset(&scope); 00116 00117 SAnnotSelector sel = CSeqUtils::GetAnnotSelector(CSeq_annot::TData::e_Align); 00118 CAlign_CI it(handle, sel); 00119 int i = 0; 00120 for ( ; it; ++it) { 00121 m_OrigAligns.push_back(CConstRef<CSeq_align>(&*it)); 00122 if(i++ > 20) 00123 return; 00124 } 00125 } 00126 00127 00128 void CAlnMultiDSBuilder::Init(objects::CScope& scope, TAlignVector& aligns) 00129 { 00130 x_Clear(); 00131 00132 m_Scope.Reset(&scope); 00133 m_OrigAligns = aligns; 00134 } 00135 00136 00137 /// initial data set from which an alignment will be build 00138 void CAlnMultiDSBuilder::Init(CScope& scope, TAnnotVector& annots) 00139 { 00140 x_Clear(); 00141 00142 m_Scope.Reset(&scope); 00143 00144 ITERATE(TAnnotVector, it_annot, annots) { 00145 const CSeq_annot& annot = **it_annot; 00146 if(annot.GetData().IsAlign() ) { 00147 ExtractSeqAligns(annot, m_OrigAligns); 00148 } 00149 } 00150 } 00151 00152 00153 void CAlnMultiDSBuilder::x_Clear() 00154 { 00155 m_Scope.Reset(); 00156 m_OrigAligns.clear(); 00157 m_AlnStats.Reset(); 00158 m_AnchoredAlns.clear(); 00159 m_MasterId.Reset(); 00160 m_CreateSparse = true; 00161 } 00162 00163 00164 void CAlnMultiDSBuilder::PreCreateDataSource(bool sparse) 00165 { 00166 m_CreateSparse = sparse; 00167 if(m_CreateSparse) { 00168 x_PreCreateSparseDataSource(); 00169 } 00170 } 00171 00172 00173 CRef<IAlnMultiDataSource> 00174 CAlnMultiDSBuilder::CreateDataSource() 00175 { 00176 _TRACE( (unsigned long)m_OrigAligns.size() << " m_OrigAligns"); 00177 00178 x_TestAlignments(); 00179 00180 if(m_CreateSparse) { 00181 return x_CreateSparseDataSource(); 00182 } else { 00183 return x_CreateAlnVecDataSource(); 00184 } 00185 } 00186 00187 00188 void CAlnMultiDSBuilder::GetBioseqHandles(vector<CBioseq_Handle>& handles) 00189 { 00190 if(m_AlnStats) { 00191 typedef TAlnStats::TIdVec TIdVec; 00192 00193 ITERATE(TIdVec, it, m_AlnStats->GetIdVec()) { 00194 const CSeq_id& sid = (*it)->GetSeqId(); 00195 const CBioseq_Handle h = m_Scope->GetBioseqHandle(sid); 00196 handles.push_back(h); 00197 } 00198 } 00199 } 00200 00201 00202 void CAlnMultiDSBuilder::x_PreCreateSparseDataSource() 00203 { 00204 m_AlnStats.Reset(); 00205 m_AnchoredAlns.clear(); 00206 00207 if(! m_OrigAligns.empty()) { 00208 TIdExtract extractor; 00209 TAlnIdMap aln_id_map(extractor, m_OrigAligns.size()); 00210 ITERATE (TAlignVector, it, m_OrigAligns) { 00211 aln_id_map.push_back(**it); 00212 } 00213 00214 /// Crete align statistics object 00215 m_AlnStats.Reset(new TAlnStats(aln_id_map)); 00216 //TAlnStats aln_stats(aln_vector, , 00217 // m_SeqIdAlnBitmap->GetAnchorRows(), 00218 // m_SeqIdAlnBitmap->GetBaseWidths()); 00219 00220 /// Construct a vector of anchored alignments 00221 CreateAnchoredAlnVec(*m_AlnStats, m_AnchoredAlns, m_Options); 00222 00223 //TODO use aln_stats to init m_Options 00224 } 00225 } 00226 00227 00228 CRef<IAlnMultiDataSource> 00229 CAlnMultiDSBuilder::x_CreateSparseDataSource() 00230 { 00231 CStopWatch sw; 00232 sw.Start(); 00233 00234 CRef<IAlnMultiDataSource> ds; 00235 if(m_MasterId.GetPointer() && ! m_OrigAligns.empty()) { 00236 _TRACE("Creating CSparseMultiDataSource"); 00237 CSparseMultiDataSource* sp_ds = new CSparseMultiDataSource(*m_Scope); 00238 ds.Reset(sp_ds); 00239 sp_ds->Init(m_AnchoredAlns, m_Options, m_SyncCreate); 00240 } 00241 _TRACE( 1000 * sw.Elapsed() << " ms" ); 00242 return ds; 00243 } 00244 00245 00246 CRef<IAlnMultiDataSource> 00247 CAlnMultiDSBuilder::x_CreateAlnVecDataSource() 00248 { 00249 CRef<CAlnVecMultiDataSource> ds(new CAlnVecMultiDataSource(*m_Scope)); 00250 ds->Init(m_OrigAligns, m_SyncCreate); 00251 return CRef<IAlnMultiDataSource>(ds.GetPointer()); 00252 } 00253 00254 00255 /// Analyzes m_OrigAligns and decides how to build an alignment from it 00256 void CAlnMultiDSBuilder::x_TestAlignments() 00257 { 00258 TAlignVector good_aligns; 00259 x_GetLinearAlignments(good_aligns); 00260 00261 // build alignment map 00262 typedef set<const CSeq_align*> TAlignSet; 00263 typedef map<CSeq_id_Handle, TAlignSet> TIDToAligns; 00264 TIDToAligns align_map; 00265 00266 ITERATE(TAlignVector, it_al, good_aligns) { 00267 const CSeq_align& al = **it_al; 00268 00269 for ( CTypeConstIterator<CSeq_id> it_id(al); it_id; ++it_id) { 00270 CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(*it_id); 00271 00272 TIDToAligns::const_iterator it = align_map.find(idh); 00273 if(it == align_map.end()) { 00274 it = align_map.insert(TIDToAligns::value_type(idh, TAlignSet())).first; 00275 } 00276 TAlignSet& aln_set = const_cast<TAlignSet&>(it->second); 00277 aln_set.insert(&al); 00278 } 00279 } 00280 00281 /// select the ID that exist in max number of alignments 00282 size_t max_al = 0; /// max number of alignments for a sinegle ID 00283 CSeq_id_Handle max_h; 00284 ITERATE(TIDToAligns, it_map, align_map) { 00285 const TAlignSet& aln_set = it_map->second; 00286 if(aln_set.size() > max_al) { 00287 max_al = aln_set.size(); 00288 max_h = it_map->first; 00289 } 00290 } 00291 00292 if(max_h) { 00293 m_MasterId = max_h.GetSeqId(); 00294 } 00295 string s_id = m_MasterId ? m_MasterId->GetSeqIdString() : "NULL"; 00296 _TRACE("Master ID " << s_id); 00297 } 00298 00299 00300 // selects alignments that have the same length on all sequences 00301 void CAlnMultiDSBuilder::x_GetLinearAlignments(TAlignVector& aligns) 00302 { 00303 typedef CSeq_align::C_Segs TSegs; 00304 aligns.reserve(m_OrigAligns.size()); 00305 00306 // test every CSeq-aling 00307 ITERATE(TAlignVector, it, m_OrigAligns) { 00308 const CSeq_align& align = **it; 00309 const TSegs& segs = align.GetSegs(); 00310 bool linear = true; 00311 00312 switch(segs.Which()) { 00313 case TSegs::e_Denseg: 00314 case TSegs::e_Dendiag: 00315 break; 00316 case TSegs::e_Std: { 00317 ITERATE(TSegs::TStd, it_s, segs.GetStd()) { 00318 const CStd_seg& std_seg = **it_s; 00319 if(! x_IsLinear(std_seg)) { 00320 linear = false; 00321 break; 00322 } 00323 } 00324 break; 00325 } 00326 case TSegs::e_Sparse: 00327 case TSegs::e_Spliced: 00328 break; 00329 default: 00330 linear = false; // other types currently not supported 00331 break; 00332 } 00333 if(linear) { 00334 aligns.push_back(*it); 00335 } 00336 } 00337 } 00338 00339 /// returns true if CStd_seg is linear 00340 bool CAlnMultiDSBuilder::x_IsLinear(const CStd_seg& /*seg*/) 00341 { 00342 return false; 00343 } 00344 00345 00346 END_NCBI_SCOPE
1.7.5.1
Modified on Wed May 23 13:18:52 2012 by modify_doxy.py rev. 337098