src/algo/align/splign/splign.cpp

Go to the documentation of this file.
00001 /* $Id: splign.cpp 176197 2009-11-16 19:50:35Z kapustin $
00002 * ===========================================================================
00003 *
00004 *                            public DOMAIN NOTICE                          
00005 *               National Center for Biotechnology Information
00006 *                                                                          
00007 *  This software/database is a "United States Government Work" under the   
00008 *  terms of the United States Copyright Act.  It was written as part of    
00009 *  the author's official duties as a United States Government employee and 
00010 *  thus cannot be copyrighted.  This software/database is freely available 
00011 *  to the public for use. The National Library of Medicine and the U.S.    
00012 *  Government have not placed any restriction on its use or reproduction.  
00013 *                                                                          
00014 *  Although all reasonable efforts have been taken to ensure the accuracy  
00015 *  and reliability of the software and data, the NLM and the U.S.          
00016 *  Government do not and cannot warrant the performance or results that    
00017 *  may be obtained by using this software or data. The NLM and the U.S.    
00018 *  Government disclaim all warranties, express or implied, including       
00019 *  warranties of performance, merchantability or fitness for any particular
00020 *  purpose.                                                                
00021 *                                                                          
00022 *  Please cite the author in any work or product based on this material.   
00023 *
00024 * ===========================================================================
00025 *
00026 * Author:  Yuri Kapustin
00027 *
00028 * File Description:
00029 *   CSplign class implementation
00030 *
00031 */
00032 
00033 
00034 #include <ncbi_pch.hpp>
00035 
00036 #include "splign_util.hpp"
00037 #include "messages.hpp"
00038 #include <algo/align/util/hit_comparator.hpp>
00039 #include <algo/align/util/compartment_finder.hpp>
00040 #include <algo/align/nw/nw_band_aligner.hpp>
00041 #include <algo/align/nw/nw_spliced_aligner16.hpp>
00042 #include <algo/align/nw/nw_formatter.hpp>
00043 #include <algo/align/nw/align_exception.hpp>
00044 #include <algo/align/splign/splign.hpp>
00045 
00046 #include <algo/sequence/orf.hpp>
00047 
00048 #include <objmgr/scope.hpp>
00049 #include <objmgr/bioseq_handle.hpp>
00050 #include <objmgr/seq_vector.hpp>
00051 #include <objmgr/util/seq_loc_util.hpp>
00052 
00053 #include <objects/seqloc/Seq_interval.hpp>
00054 #include <objects/seqalign/Seq_align.hpp>
00055 #include <objects/seqalign/Seq_align_set.hpp>
00056 #include <objects/seqalign/Spliced_seg.hpp>
00057 #include <objects/seqalign/Spliced_exon.hpp>
00058 #include <objects/seqalign/Spliced_exon_chunk.hpp>
00059 #include <objects/seqalign/Product_pos.hpp>
00060 #include <objects/seqalign/Splice_site.hpp>
00061 #include <objects/seqalign/Score.hpp>
00062 #include <objects/seqalign/Score_set.hpp>
00063 #include <objects/general/Object_id.hpp>
00064 
00065 #include <math.h>
00066 #include <algorithm>
00067 
00068 BEGIN_NCBI_SCOPE
00069 
00070 namespace {
00071 
00072     // define cut-off strategy at the terminii:
00073     
00074     // (1) - pre-processing
00075     // For non-covered ends longer than kNonCoveredEndThreshold use
00076     // m_max_genomic_ext. For shorter ends use k * query_len^(1/kPower)
00077     
00078     const Uint4  kNonCoveredEndThreshold (55);
00079     const double kPower (2.5);
00080     
00081     // (2) - post-processing
00082     // term exons shorter than kMinTermExonSize with identity lower than
00083     // kMinTermExonIdty will be converted to gaps
00084     const size_t kMinTermExonSize (28);
00085     const double kMinTermExonIdty (0.9);
00086 
00087     const CSplign::THit::TCoord
00088                  kMaxCoord (numeric_limits<CSplign::THit::TCoord>::max());
00089 }
00090 
00091 
00092 CSplign::CSplign( void )
00093 {
00094     m_CompartmentPenalty = s_GetDefaultCompartmentPenalty();
00095     m_MinExonIdty = s_GetDefaultMinExonIdty();
00096     m_MinSingletonIdty = m_MinCompartmentIdty =s_GetDefaultMinCompartmentIdty();
00097     m_MinSingletonIdtyBps = numeric_limits<size_t>::max();
00098     m_max_genomic_ext = s_GetDefaultMaxGenomicExtent();
00099     m_MaxIntron = CCompartmentFinder<THit>::s_GetDefaultMaxIntron();
00100     m_endgaps = true;
00101     m_strand = true;
00102     m_nopolya = false;
00103     m_cds_start = m_cds_stop = 0;
00104     m_model_id = 0;
00105     m_MaxCompsPerQuery = 0;
00106     m_MinPatternHitLength = 13;
00107 }
00108 
00109 CSplign::~CSplign()
00110 {
00111 }
00112 
00113 
00114 CRef<CSplign::TAligner>& CSplign::SetAligner( void ) {
00115     return m_aligner;
00116 }
00117 
00118 CConstRef<CSplign::TAligner> CSplign::GetAligner( void ) const {
00119     return m_aligner;
00120 }
00121 
00122 
00123 CRef<CSplicedAligner> CSplign::s_CreateDefaultAligner(bool low_query_quality)
00124 {
00125     CRef<CSplicedAligner> aligner(
00126         static_cast<CSplicedAligner*>(new CSplicedAligner16));
00127         
00128 #ifdef ALGOALIGN_NW_SPLIGN_MAKE_PUBLIC_BINARY
00129     if(false == low_query_quality) {
00130         aligner->SetWm  (1000);
00131         aligner->SetWms (-1044);
00132         aligner->SetWg  (-3070);
00133         aligner->SetWs  (-173);
00134         aligner->SetScoreMatrix(NULL);
00135 
00136         aligner->SetWi(0, -4270);
00137         aligner->SetWi(1, -5314);
00138         aligner->SetWi(2, -6358);
00139         aligner->SetWi(3, -7395);
00140     }
00141     else {
00142         aligner->SetWm  (1000);
00143         aligner->SetWms (-1011);
00144         aligner->SetWg  (-1460);
00145         aligner->SetWs  (-464);
00146         aligner->SetScoreMatrix(NULL);
00147 
00148         aligner->SetWi(0, -4988);
00149         aligner->SetWi(1, -5999);
00150         aligner->SetWi(2, -7010);
00151         aligner->SetWi(3, -13060);
00152     }
00153 #endif
00154 
00155     return aligner;
00156 }
00157 
00158 
00159 void CSplign::SetEndGapDetection( bool on ) {
00160     m_endgaps = on;
00161 }
00162 
00163 bool CSplign::GetEndGapDetection( void ) const {
00164     return m_endgaps;
00165 }
00166 
00167 void CSplign::SetPolyaDetection( bool on ) {
00168     m_nopolya = !on;
00169 }
00170 
00171 bool CSplign::GetPolyaDetection( void ) const {
00172     return !m_nopolya;
00173 }
00174 
00175 void CSplign::SetStrand( bool strand ) {
00176     m_strand = strand;
00177 }
00178 
00179 bool CSplign::GetStrand( void ) const {
00180     return m_strand;
00181 }
00182 
00183 void CSplign::SetMinExonIdentity( double idty )
00184 {
00185     if(!(0 <= idty && idty <= 1)) {
00186         NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
00187     }
00188     else {
00189         m_MinExonIdty = idty;
00190     }
00191 }
00192 
00193 void CSplign::SetMinCompartmentIdentity( double idty )
00194 {
00195     if(!(0 <= idty && idty <= 1)) {
00196         NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
00197     }
00198     else {
00199         m_MinCompartmentIdty = idty;
00200     }
00201 }
00202 
00203 void CSplign::SetMinSingletonIdentity(double idty)
00204 {
00205     if(!(0 <= idty && idty <= 1)) {
00206         NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
00207     }
00208     else {
00209         m_MinSingletonIdty = idty;
00210     }
00211 }
00212 
00213 void CSplign::SetMinSingletonIdentityBps(size_t idty_bps)
00214 {
00215     m_MinSingletonIdtyBps = idty_bps;
00216 }
00217 
00218 size_t CSplign::s_GetDefaultMaxGenomicExtent(void)
00219 {
00220     return 35000;
00221 }
00222 
00223 
00224 void CSplign::SetMaxGenomicExtent(size_t mge)
00225 {
00226     m_max_genomic_ext = mge;
00227 }
00228 
00229 
00230 size_t CSplign::GetMaxGenomicExtent(void) const
00231 {
00232     return m_max_genomic_ext;
00233 }
00234 
00235 
00236 void CSplign::SetMaxIntron(size_t max_intron)
00237 {
00238     m_MaxIntron = max_intron;
00239 }
00240 
00241 
00242 size_t CSplign::GetMaxIntron(void) const
00243 {
00244     return m_MaxIntron;
00245 }
00246 
00247 
00248 double CSplign::GetMinExonIdentity( void ) const {
00249     return m_MinExonIdty;
00250 }
00251 
00252 double CSplign::s_GetDefaultMinExonIdty(void)
00253 {
00254     return 0.75;
00255 }
00256 
00257 
00258 double CSplign::GetMinCompartmentIdentity(void) const {
00259     return m_MinCompartmentIdty;
00260 }
00261 
00262 double CSplign::GetMinSingletonIdentity(void) const {
00263     return m_MinSingletonIdty;
00264 }
00265 
00266 size_t CSplign::GetMinSingletonIdentityBps(void) const {
00267     return m_MinSingletonIdtyBps;
00268 }
00269 
00270 double CSplign::s_GetDefaultMinCompartmentIdty(void)
00271 {
00272     return 0.70;
00273 }
00274 
00275 void CSplign::SetMaxCompsPerQuery(size_t m) {
00276     m_MaxCompsPerQuery = m;
00277 }
00278 
00279 size_t CSplign::GetMaxCompsPerQuery(void) const {
00280     return m_MaxCompsPerQuery;
00281 }
00282 
00283 
00284 
00285 CRef<objects::CScope> CSplign::GetScope(void) const
00286 {
00287     return m_Scope;
00288 }
00289 
00290 
00291 CRef<objects::CScope>& CSplign::SetScope(void)
00292 {
00293     return m_Scope;
00294 }
00295 
00296 
00297 void CSplign::PreserveScope(bool preserve_scope)
00298 {
00299     m_CanResetHistory = ! preserve_scope;
00300 }
00301 
00302 
00303 void CSplign::SetCompartmentPenalty(double penalty)
00304 {
00305     if(penalty < 0 || penalty > 1) {
00306         NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_QueryCoverageOutOfRange);
00307     }
00308     m_CompartmentPenalty = penalty;
00309 }
00310 
00311 double CSplign::s_GetDefaultCompartmentPenalty(void)
00312 {
00313     return 0.55;
00314 }
00315 
00316 double CSplign::GetCompartmentPenalty( void ) const 
00317 {
00318     return m_CompartmentPenalty;
00319 }
00320 
00321 void CSplign::x_LoadSequence(vector<char>* seq, 
00322                              const objects::CSeq_id& seqid,
00323                              THit::TCoord start,
00324                              THit::TCoord finish,
00325                              bool retain)
00326 {
00327     USING_SCOPE(objects);
00328 
00329     try {
00330     
00331         if(m_Scope.IsNull()) {
00332             NCBI_THROW(CAlgoAlignException, eInternal, "Splign scope not set");
00333         }
00334 
00335         CBioseq_Handle bh = m_Scope->GetBioseqHandle(seqid);
00336 
00337         if(retain && m_CanResetHistory) {
00338             m_Scope->ResetHistory();
00339         }
00340 
00341         if(bh) {
00342 
00343             CSeqVector sv = bh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
00344             const TSeqPos dim = sv.size();
00345             if(dim == 0) {
00346                 NCBI_THROW(CAlgoAlignException,
00347                            eNoSeqData, 
00348                            string("Sequence is empty: ") 
00349                            + seqid.AsFastaString());
00350             }
00351             if(finish >= dim) {
00352                 finish = dim - 1;
00353             }
00354 
00355             if(start > finish) {
00356                 CNcbiOstrstream ostr;
00357                 ostr << "Invalid sequence interval requested for "
00358                      << seqid.GetSeqIdString(true) << ":\t"
00359                      << start << '\t' << finish;
00360                 const string err = CNcbiOstrstreamToString(ostr);
00361                 NCBI_THROW(CAlgoAlignException, eNoSeqData, err);
00362             }
00363             
00364             string s;
00365             sv.GetSeqData(start, finish + 1, s);
00366             seq->resize(1 + finish - start);
00367             copy(s.begin(), s.end(), seq->begin());
00368         }
00369         else {
00370             NCBI_THROW(CAlgoAlignException, eNoSeqData, 
00371                        string("ID not found: ") + seqid.AsFastaString());
00372         }
00373         
00374         if(retain == false && m_CanResetHistory) {
00375             m_Scope->RemoveFromHistory(bh);
00376         }       
00377     }
00378     
00379     catch(CAlgoAlignException& e) {
00380         NCBI_RETHROW_SAME(e, "CSplign::x_LoadSequence(): Sequence data problem");
00381     }
00382 }
00383 
00384 
00385 void CSplign::ClearMem(void)
00386 {
00387     m_Scope.Reset(NULL);
00388     m_pattern.clear();
00389     m_alnmap.clear();
00390     m_genomic.clear();
00391     m_mrna.clear();
00392 }
00393 
00394 
00395 CSplign::THitRef CSplign::sx_NewHit(THit::TCoord q0, THit::TCoord q,
00396                                     THit::TCoord s0, THit::TCoord s)
00397 {
00398     THitRef hr (new THit);
00399     hr->SetQueryStart(q0);
00400     hr->SetSubjStart(s0);
00401     hr->SetQueryStop(q - 1);
00402     hr->SetSubjStop(s - 1);
00403     hr->SetLength(q - q0);
00404     hr->SetMismatches(0);
00405     hr->SetGaps(0);
00406     hr->SetEValue(0);
00407     hr->SetScore(2*(q - q0));
00408     hr->SetIdentity(1);
00409     return hr;
00410 }
00411 
00412 
00413 void CSplign::x_SplitQualifyingHits(THitRefs* phitrefs)
00414 {
00415     const char * Seq1 (&m_mrna.front());
00416     const char * Seq2 (&m_genomic.front());
00417     THitRefs rv;
00418     ITERATE(THitRefs, ii, (*phitrefs)) {
00419 
00420         const THitRef & h (*ii);
00421         const double idty (h->GetIdentity());
00422         const bool diag (h->GetGaps() == 0 && h->GetQuerySpan() == h->GetSubjSpan());
00423         if(idty == 1 || idty < .95 || h->GetLength() < 100 || !diag) {
00424             rv.push_back(h);
00425         }
00426         else {
00427 
00428             int q0 (-1), s0 (-1), q1 (h->GetQueryMax());
00429             int q (h->GetQueryMin()), s (h->GetSubjMin());
00430             size_t new_hits (0);
00431             while(q <= q1) {
00432                 if(Seq1[q++] != Seq2[s++]) {
00433                     if(q0 != -1 && q >= q0 + int(m_MinPatternHitLength)) {
00434                         THitRef hr (sx_NewHit(q0, q, s0, s));
00435                         hr->SetQueryId(h->GetQueryId());
00436                         hr->SetSubjId(h->GetSubjId());
00437                         rv.push_back(hr);
00438                         ++new_hits;
00439                     }
00440                     q0 = s0 = -1;
00441                 }
00442                 else {
00443                     if(q0 == -1) {
00444                         q0 = q;
00445                         s0 = s;
00446                     }
00447                 }
00448             }
00449 
00450             if(q0 != -1 && q >= q0 + int(m_MinPatternHitLength)) {
00451                 THitRef hr (sx_NewHit(q0, q, s0, s));
00452                 hr->SetQueryId(h->GetQueryId());
00453                 hr->SetSubjId(h->GetSubjId());
00454                 rv.push_back(hr);
00455                 ++new_hits;
00456             }
00457 
00458             if(new_hits == 0) {
00459                 rv.push_back(h);
00460             }
00461         }
00462     }
00463 
00464     *phitrefs = rv;
00465 }
00466 
00467 
00468 void CSplign::x_SetPattern(THitRefs* phitrefs)
00469 {
00470     m_alnmap.clear();
00471     m_pattern.clear();
00472 
00473     // sort the input by min query coordinate
00474     typedef CHitComparator<THit> THitComparator;
00475     THitComparator sorter (THitComparator::eQueryMin);
00476     stable_sort(phitrefs->begin(), phitrefs->end(), sorter);
00477 
00478     // check that no two consecutive hits are farther than the max intron
00479     // (extra short hits skipped)
00480     size_t prev (0);
00481     NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
00482 
00483         THitRef& h (*ii);
00484 
00485         if(h->GetQuerySpan() < m_MinPatternHitLength) {
00486             h.Reset(0);
00487             continue;
00488         }
00489 
00490         if(prev > 0) {
00491             
00492             const bool consistent (h->GetSubjStrand()?
00493                                   (h->GetSubjStart() < prev + m_MaxIntron):
00494                                   (h->GetSubjStart() + m_MaxIntron > prev));
00495 
00496             if(!consistent) {
00497                 const string errmsg (g_msg_CompartmentInconsistent
00498                                      + string(" (extra long introns)"));
00499                 NCBI_THROW(CAlgoAlignException, eIntronTooLong, errmsg);
00500             }
00501         }
00502 
00503         prev = h->GetSubjStop();
00504     }
00505 
00506     phitrefs->erase(remove_if(phitrefs->begin(), phitrefs->end(),
00507                               CHitFilter<THit>::s_PNullRef),
00508                     phitrefs->end());
00509 
00510     // save each hit longer than the minimum and test whether the hit is perfect
00511     vector<size_t> pattern0;
00512     vector<pair<bool,double> > imperfect;
00513     double max_idty (0.0);
00514     for(size_t i (0), n (phitrefs->size()); i < n; ++i) {
00515 
00516         const THitRef & h ((*phitrefs)[i]);
00517         const bool valid (true);
00518         if(valid) {
00519 
00520             pattern0.push_back(h->GetQueryMin());
00521             pattern0.push_back(h->GetQueryMax());
00522             pattern0.push_back(h->GetSubjMin());
00523             pattern0.push_back(h->GetSubjMax());
00524             const double idty (h->GetIdentity());
00525             const bool imprf  (idty < 1.00 
00526                                || h->GetQuerySpan() != h->GetSubjSpan()
00527                                || h->GetMismatches() > 0
00528                                || h->GetGaps() > 0);
00529             imperfect.push_back(pair<bool,double>(imprf, idty));
00530             if(idty > max_idty) {
00531                 max_idty = idty;
00532             }
00533         }
00534     }
00535 
00536     if(max_idty < .85 && pattern0.size() >= 4) {
00537         m_BoundingRange = pair<size_t, size_t>(pattern0[2], pattern0.back());
00538     }
00539     else {
00540         m_BoundingRange = pair<size_t, size_t>(0, 0);
00541     }
00542 
00543     const size_t dim (pattern0.size());
00544 
00545     const char* Seq1 (&m_mrna.front());
00546     const size_t SeqLen1 (m_polya_start < kMax_UInt? m_polya_start: m_mrna.size());
00547     const char* Seq2 (&m_genomic.front());
00548     const size_t SeqLen2 (m_genomic.size());
00549     
00550     // verify conditions on the input hit pattern
00551     CNcbiOstrstream ostr_err;
00552     bool some_error (false), bad_input (false);
00553     if(dim % 4 == 0) {
00554 
00555         for(size_t i (0); i < dim; i += 4) {
00556 
00557             if(pattern0[i] > pattern0[i+1] || pattern0[i+2] > pattern0[i+3]) {
00558                 ostr_err << "Pattern hits must be specified in plus strand";
00559                 some_error = bad_input = true;
00560                 break;
00561             }
00562             
00563             if(i > 4) {
00564                 if(pattern0[i] <= pattern0[i-3] || pattern0[i+2] <= pattern0[i-1]) {
00565                     ostr_err << g_msg_CompartmentInconsistent
00566                              << string(" (hits not sorted)");
00567                     some_error = true;
00568                     break;
00569                 }
00570             }
00571             
00572             const bool br1 (pattern0[i+1] >= SeqLen1);
00573             const bool br2 (pattern0[i+3] >= SeqLen2);
00574             if(br1 || br2) {
00575 
00576                 ostr_err << "Pattern hits out of range ("
00577                          << "query = " 
00578                          << phitrefs->front()->GetQueryId()->GetSeqIdString(true)
00579                          << "subj = " 
00580                          << phitrefs->front()->GetSubjId()->GetSeqIdString(true)
00581                          << "):" << endl;
00582 
00583                 if(br1) {
00584                     ostr_err << "\tquery_pattern_max = " << pattern0[i+1]
00585                              << "; query_len = " << SeqLen1 << endl;
00586                 }
00587 
00588                 if(br2) {
00589                     ostr_err << "\tsubj_pattern_max = " << pattern0[i+3]
00590                              << "; subj_len = " << SeqLen2 << endl;
00591                 }
00592 
00593                 some_error= true;
00594                 break;
00595             }
00596         }
00597 
00598     }
00599     else {
00600         ostr_err << "Pattern dimension must be a multiple of four";
00601         some_error = bad_input = true;
00602     }
00603     
00604     if(some_error) {
00605         ostr_err << " (query = " 
00606                  << phitrefs->front()->GetQueryId()->AsFastaString() 
00607                  << " , subj = "
00608                  << phitrefs->front()->GetSubjId()->AsFastaString() << ')'
00609                  << endl;
00610     }
00611 
00612     const string err = CNcbiOstrstreamToString(ostr_err);
00613     if(err.size() > 0) {
00614         if(bad_input) {
00615             NCBI_THROW(CAlgoAlignException, eBadParameter, err);
00616         }
00617         else {
00618             NCBI_THROW(CAlgoAlignException, ePattern, err);
00619         }
00620     }
00621 
00622     SAlnMapElem map_elem;
00623     map_elem.m_box[0] = map_elem.m_box[2] = 0;
00624     map_elem.m_pattern_start = map_elem.m_pattern_end = -1;
00625 
00626     // build the alignment map
00627     CBandAligner nwa;
00628     for(size_t i = 0; i < dim; i += 4) {    
00629             
00630         size_t L1, R1, L2, R2;
00631         size_t max_seg_size (0);
00632 
00633         const bool imprf (imperfect[i/4].first);
00634         if(imprf) {                
00635 
00636             // TODO:
00637             // a better approach is to find indels and mismatches
00638             // and split at the SplitQualifyingHits() stage
00639             // to pass here only perfect diags
00640             const size_t len1 (pattern0[i+1] - pattern0[i] + 1);
00641             const size_t len2 (pattern0[i+3] - pattern0[i+2] + 1);
00642             const size_t maxlen (max(len1, len2));
00643             const size_t lendif (len1 < len2? len2 - len1: len1 - len2);
00644             size_t band (size_t((1 - imperfect[i/4].second) * maxlen) + 2);
00645             if(band < lendif) band += lendif;
00646             nwa.SetBand(band);
00647             nwa.SetSequences(Seq1 + pattern0[i],   len1,
00648                              Seq2 + pattern0[i+2], len2,
00649                              false);
00650             nwa.Run();
00651             max_seg_size = nwa.GetLongestSeg(&L1, &R1, &L2, &R2);
00652         }
00653         else {
00654 
00655             L1 = 1;
00656             R1 = pattern0[i+1] - pattern0[i] - 1;
00657             L2 = 1;
00658             R2 = pattern0[i+3] - pattern0[i+2] - 1;
00659             max_seg_size = R1 - L1 + 1;
00660         }
00661 
00662         if(max_seg_size) {
00663 
00664             // make the core
00665             {{
00666                 size_t cut ((1 + R1 - L1) / 5);
00667                 if(cut > 20) cut = 20;
00668 
00669                 const size_t l1 (L1 + cut), l2 (L2 + cut);
00670                 const size_t r1 (R1 - cut), r2 (R2 - cut);
00671                 if(l1 < r1 && l2 < r2) {
00672                     L1 = l1; L2 = l2; 
00673                     R1 = r1; R2 = r2;
00674                 }
00675             }}
00676 
00677             size_t q0 (pattern0[i] + L1);
00678             size_t s0 (pattern0[i+2] + L2);
00679             size_t q1 (pattern0[i] + R1);
00680             size_t s1 (pattern0[i+2] + R2);
00681 
00682             if(imprf) {
00683 
00684                 const size_t hitlen_q (pattern0[i + 1] - pattern0[i] + 1);
00685                 const size_t sh (size_t(hitlen_q / 4));
00686                 
00687                 size_t delta (sh > L1? sh - L1: 0);
00688                 q0 += delta;
00689                 s0 += delta;
00690                     
00691                 const size_t h2s_right (hitlen_q - R1 - 1);
00692                 delta = sh > h2s_right? sh - h2s_right: 0;
00693                 q1 -= delta;
00694                 s1 -= delta;
00695                 
00696                 if(q0 > q1 || s0 > s1) {
00697                     
00698                     // the longest segment too short
00699                     q0 = pattern0[i] + L1;
00700                     s0 = pattern0[i+2] + L2;
00701                     q1 = pattern0[i] + R1;
00702                     s1 = pattern0[i+2] + R2;
00703                 }
00704             }
00705 
00706             m_pattern.push_back(q0); m_pattern.push_back(q1);
00707             m_pattern.push_back(s0); m_pattern.push_back(s1);
00708                 
00709             const size_t pattern_dim = m_pattern.size();
00710             if(map_elem.m_pattern_start == -1) {
00711                 map_elem.m_pattern_start = pattern_dim - 4;
00712             }
00713             map_elem.m_pattern_end = pattern_dim - 1;
00714         }
00715             
00716         map_elem.m_box[1] = pattern0[i+1];
00717         map_elem.m_box[3] = pattern0[i+3];
00718     }
00719         
00720     map_elem.m_box[1] = SeqLen1 - 1;
00721     map_elem.m_box[3] = SeqLen2 - 1;
00722     m_alnmap.push_back(map_elem);
00723 }
00724 
00725 
00726 CSplign::TOrfPair CSplign::GetCds(const THit::TId& id, const vector<char> * seq_data)
00727 {
00728     TOrfPair rv (TOrf(0, 0), TOrf(0, 0));
00729 
00730     TStrIdToOrfs::const_iterator ie (m_OrfMap.end());
00731     const string strid (id->AsFastaString());
00732     TStrIdToOrfs::const_iterator ii (m_OrfMap.find(strid));
00733 
00734     if(ii != ie) {
00735         rv = ii->second;
00736     }
00737     else {
00738 
00739         USING_SCOPE(objects);
00740 
00741         vector<char> seq;
00742         if(seq_data == 0) {
00743             x_LoadSequence(&seq, *id, 0, kMaxCoord, false);
00744             seq_data = & seq;
00745         }
00746 
00747         // Assign CDS to the max ORF longer than 90 bps and starting from ATG
00748         //
00749         vector<CRef<CSeq_loc> > orfs;
00750         vector<string> start_codon;
00751         start_codon.push_back("ATG");
00752 
00753         COrf::FindOrfs(*seq_data, orfs, 90, 1, start_codon);
00754         TSeqPos max_len_plus  (0), max_len_minus  (0);
00755         TSeqPos max_from_plus (0), max_from_minus (0);
00756         TSeqPos max_to_plus   (0), max_to_minus   (0);
00757         ITERATE (vector<CRef<CSeq_loc> >, orf, orfs) {
00758 
00759             const TSeqPos len           (sequence::GetLength(**orf, NULL));
00760             const ENa_strand orf_strand ((*orf)->GetInt().GetStrand());
00761             const bool antisense        (orf_strand == eNa_strand_minus);
00762         
00763             if(antisense) {
00764                 if(len > max_len_minus) {
00765                     max_len_minus    = len;
00766                     max_from_minus   = (*orf)->GetInt().GetTo();
00767                     max_to_minus     = (*orf)->GetInt().GetFrom();
00768                 }
00769             }
00770             else {
00771                 if(len > max_len_plus) {
00772                     max_len_plus    = len;
00773                     max_from_plus   = (*orf)->GetInt().GetFrom();
00774                     max_to_plus     = (*orf)->GetInt().GetTo();
00775                 }
00776             }
00777         }
00778 
00779         if(max_len_plus > 0) {
00780             rv.first = TOrf(max_from_plus, max_to_plus);
00781         }
00782 
00783         if(max_len_minus > 0) {
00784             rv.second = TOrf(max_from_minus, max_to_minus);
00785         }
00786 
00787         m_OrfMap[strid] = rv;
00788     }
00789 
00790     return rv;
00791 }
00792 
00793 
00794 void CSplign::x_FinalizeAlignedCompartment(SAlignedCompartment & ac)
00795 {
00796     ac.m_Id         = ++m_model_id;
00797     ac.m_Segments   = m_segments;
00798     ac.m_Status     = SAlignedCompartment::eStatus_Ok;
00799     ac.m_Msg        = "Ok";
00800     ac.m_Cds_start  = m_cds_start;
00801     ac.m_Cds_stop   = m_cds_stop;
00802     ac.m_QueryLen   = m_mrna.size();
00803     ac.m_PolyA      = (m_polya_start < kMax_UInt? m_polya_start : 0);
00804 }
00805 
00806 
00807 // PRE:  Input Blast hits.
00808 // POST: TResults - a vector of aligned compartments.
00809 void CSplign::Run(THitRefs* phitrefs)
00810 {
00811     if(!phitrefs) {
00812         NCBI_THROW(CAlgoAlignException, eInternal, "Unexpected NULL pointers");
00813     }
00814 
00815     THitRefs& hitrefs = *phitrefs;
00816     
00817     // make sure query hit is in plus strand
00818     NON_CONST_ITERATE(THitRefs, ii, hitrefs) {
00819 
00820         THitRef& h = *ii;
00821         if(h.NotNull() && h->GetQueryStrand() == false) {
00822             h->FlipStrands();
00823         }
00824     }
00825   
00826     if(m_aligner.IsNull()) {
00827         NCBI_THROW(CAlgoAlignException, eNotInitialized, g_msg_AlignedNotSpecified);
00828     }
00829     
00830     if(hitrefs.size() == 0) {
00831         NCBI_THROW(CAlgoAlignException, eNoHits, g_msg_EmptyHitVectorPassed);
00832     }
00833 
00834     m_result.clear();
00835 
00836     THit::TId id_query (hitrefs.front()->GetQueryId());
00837 
00838     const THit::TCoord mrna_size (objects::sequence::GetLength(*id_query, m_Scope));
00839     if(mrna_size == kMaxCoord) {
00840         NCBI_THROW(CAlgoAlignException, eNoSeqData, 
00841                    string("Sequence not found: ") + id_query->AsFastaString());
00842     }
00843     
00844     // iterate through compartments
00845     const THit::TCoord min_singleton_idty_final (
00846                        min(size_t(m_MinSingletonIdty * mrna_size),
00847                            m_MinSingletonIdtyBps));
00848 
00849     CCompartmentAccessor<THit> comps (THit::TCoord(m_CompartmentPenalty * mrna_size),
00850                                       THit::TCoord(m_MinCompartmentIdty * mrna_size),
00851                                       min_singleton_idty_final,
00852                                       true);
00853     comps.SetMaxIntron(m_MaxIntron);
00854     comps.Run(hitrefs.begin(), hitrefs.end());
00855 
00856     pair<size_t,size_t> dim (comps.GetCounts()); // (count_total, count_unmasked)
00857     if(dim.second > 0) {
00858  
00859         // pre-load cDNA
00860         m_mrna.clear();
00861         x_LoadSequence(&m_mrna, *id_query, 0, kMaxCoord, false);
00862 
00863         const TOrfPair orfs (GetCds(id_query, & m_mrna));
00864         if(m_strand) {
00865             m_cds_start  = orfs.first.first;
00866             m_cds_stop = orfs.first.second;
00867         }
00868         else {
00869             m_cds_start  = orfs.second.first;
00870             m_cds_stop = orfs.second.second;
00871         }
00872 
00873         if(!m_strand) {
00874             // make a reverse complimentary
00875             reverse (m_mrna.begin(), m_mrna.end());
00876             transform(m_mrna.begin(), m_mrna.end(), m_mrna.begin(), SCompliment());
00877         }
00878 
00879         // compartments share the space between them
00880         size_t smin (0), smax (kMax_UInt);
00881         bool same_strand (false);
00882 
00883         const THit::TCoord* box (comps.GetBox(0));
00884         if(m_MaxCompsPerQuery > 0 && dim.second > m_MaxCompsPerQuery) {
00885             dim.second = m_MaxCompsPerQuery;
00886         }
00887         
00888         for(size_t i (0); i < dim.first; ++i, box += 4) {
00889             
00890             if(i + 1 == dim.first) {
00891                 smax = kMax_UInt;
00892                 same_strand = false;
00893             }
00894             else {            
00895                 bool strand_this (comps.GetStrand(i));
00896                 bool strand_next (comps.GetStrand(i+1));
00897                 same_strand = strand_this == strand_next;
00898                 smax = same_strand? (box + 4)[2]: kMax_UInt;
00899             }
00900      
00901             try {
00902 
00903                 if(smax < box[3]) {
00904                     // alert if not ordered by lower subject coordinate
00905                     NCBI_THROW(CAlgoAlignException, eInternal, 
00906                                "Unexpected order of compartments");                    
00907                 }
00908 
00909                 if(comps.GetStatus(i)) {
00910                     THitRefs comp_hits;
00911                     comps.Get(i, comp_hits);
00912 
00913                     if(smax < box[3]) smax = box[3];
00914                     if(smin > box[2]) smin = box[2];
00915 
00916                     SAlignedCompartment ac (x_RunOnCompartment(&comp_hits, smin,smax));
00917                     x_FinalizeAlignedCompartment(ac);
00918                     m_result.push_back(ac);
00919                 }
00920             }
00921 
00922             catch(CAlgoAlignException& e) {
00923                 
00924                 if(e.GetSeverity() == eDiag_Fatal) {
00925                     throw;
00926                 }
00927                 
00928                 m_result.push_back(SAlignedCompartment(0, e.GetMsg().c_str()));
00929 
00930                 const CException::TErrCode errcode (e.GetErrCode());
00931                 if(errcode != CAlgoAlignException::eNoAlignment) {
00932                     m_result.back().m_Status = SAlignedCompartment::eStatus_Error;
00933                 }
00934 
00935                 ++m_model_id;
00936             }
00937 
00938             smin = same_strand? box[3]: 0;
00939         }
00940     }
00941 }
00942 
00943 
00944 bool CSplign::AlignSingleCompartment(CRef<objects::CSeq_align> compartment,
00945                                      SAlignedCompartment* result)
00946 {
00947     USING_SCOPE(objects);
00948     const CRef<CSeq_loc> seqloc (compartment->GetBounds().front());
00949     const size_t subj_min (seqloc->GetStart(eExtreme_Positional));
00950     const size_t subj_max (seqloc->GetStop(eExtreme_Positional));
00951 
00952     THitRefs hitrefs;
00953     ITERATE(CSeq_align_set::Tdata, ii, compartment->GetSegs().GetDisc().Get()) {
00954     }
00955 
00956     return AlignSingleCompartment(&hitrefs, subj_min, subj_max, result);
00957 }
00958 
00959 
00960 bool CSplign::AlignSingleCompartment(THitRefs* phitrefs,
00961                                      size_t subj_min, size_t subj_max,
00962                                      SAlignedCompartment* result)
00963 {
00964     m_mrna.resize(0);
00965 
00966     THit::TId id_query (phitrefs->front()->GetQueryId());
00967 
00968     x_LoadSequence(&m_mrna, *id_query, 0, kMaxCoord, false);
00969 
00970     const TOrfPair orfs (GetCds(id_query, & m_mrna));
00971     if(m_strand) {
00972         m_cds_start  = orfs.first.first;
00973         m_cds_stop   = orfs.first.second;
00974     }
00975     else {
00976         m_cds_start  = orfs.second.first;
00977         m_cds_stop   = orfs.second.second;
00978     }
00979 
00980     if(!m_strand) {
00981         reverse (m_mrna.begin(), m_mrna.end());
00982         transform(m_mrna.begin(), m_mrna.end(), m_mrna.begin(), SCompliment());
00983     }
00984 
00985     bool rv (true);
00986     try {
00987 
00988         SAlignedCompartment ac (x_RunOnCompartment(phitrefs, subj_min, subj_max));
00989         x_FinalizeAlignedCompartment(ac);
00990         *result = ac;        
00991         m_mrna.resize(0);
00992     }
00993 
00994     catch(CAlgoAlignException& e) {
00995 
00996         m_mrna.resize(0);        
00997 
00998         if(e.GetSeverity() == eDiag_Fatal) {
00999             throw;
01000         }
01001         
01002         *result = SAlignedCompartment(0, e.GetMsg().c_str());
01003 
01004         const CException::TErrCode errcode (e.GetErrCode());
01005         if(errcode != CAlgoAlignException::eNoAlignment) {
01006             result->m_Status = SAlignedCompartment::eStatus_Error;
01007         }
01008 
01009         ++m_model_id;
01010         rv = false;
01011     }
01012 
01013     return rv;
01014 }
01015 
01016 // naive polya detection; sense direction assumed
01017 size_t CSplign::s_TestPolyA(const char * seq, size_t dim, size_t cds_stop)
01018 {
01019     const size_t kMaxNonA (3), kMinAstreak (5);
01020     int i (dim - 1), i0 (dim);
01021     for(size_t count_non_a (0), astreak (0); i >= 0 && count_non_a < kMaxNonA; --i) {
01022 
01023         if(seq[i] != 'A') {
01024             ++count_non_a;
01025             astreak = 0;
01026         }
01027         else {
01028             if(++astreak >= kMinAstreak) {
01029                 i0 = i;
01030             }
01031         }
01032     }
01033 
01034     const size_t len (dim - i0);
01035     size_t rv;
01036     if(len >= kMinAstreak) {
01037         rv = i0;
01038         if(0 < cds_stop && cds_stop < dim && rv <= cds_stop) {
01039             rv = cds_stop + 1;
01040         }
01041     }
01042     else {
01043         rv = kMax_UInt;
01044     }
01045 
01046     return rv;
01047 }
01048 
01049 
01050 // PRE:  Hits (initial, not transformed) representing the compartment; 
01051 //       maximum genomic sequence span;
01052 //       pre-loaded and appropriately transformed query sequence.
01053 // POST: A set of segments packed into the aligned compartment.
01054 
01055 CSplign::SAlignedCompartment CSplign::x_RunOnCompartment(THitRefs* phitrefs, 
01056                                                          size_t range_left, 
01057                                                          size_t range_right)
01058 {    
01059     SAlignedCompartment rv;
01060  
01061     try {
01062         m_segments.clear();
01063     
01064         if(range_left > range_right) {
01065             NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidRange);
01066         }
01067 
01068         if(phitrefs->size() == 0) {
01069             NCBI_THROW(CAlgoAlignException, eNoHits, g_msg_NoHitsAfterFiltering);
01070         }
01071     
01072         const size_t mrna_size (m_mrna.size());
01073     
01074         if(m_strand == false) {
01075         
01076             // adjust the hits
01077             for(size_t i (0), n (phitrefs->size()); i < n; ++i) {
01078 
01079                 THitRef& h ((*phitrefs)[i]);
01080                 THit::TCoord a0 (mrna_size - h->GetQueryMin() - 1);
01081                 THit::TCoord a1 (mrna_size - h->GetQueryMax() - 1);
01082                 const bool new_strand (!(h->GetSubjStrand()));
01083                 h->SetQueryStart(a1);
01084                 h->SetQueryStop(a0);
01085                 h->SetSubjStrand(new_strand);
01086             }
01087         }
01088         
01089         m_polya_start = m_nopolya?
01090             kMax_UInt:
01091             s_TestPolyA(&m_mrna.front(), m_mrna.size(), m_cds_stop);
01092     
01093         // cleave off hits beyond polya
01094         if(m_polya_start < kMax_UInt) {
01095             CleaveOffByTail(phitrefs, m_polya_start); 
01096         }
01097 
01098         // keep short terminal hits out of the pattern
01099         THitRefs::iterator ii (phitrefs->begin()), jj (phitrefs->end() - 1);
01100         const size_t min_termhitlen1 (m_MinPatternHitLength);
01101         const size_t min_termhitlen2 (2*m_MinPatternHitLength);
01102         bool b0 (true), b1 (true);
01103         while(b0 && b1 && ii < jj) {
01104             
01105             while(ii->IsNull() && ii < jj) ++ii;
01106             while(jj->IsNull() && ii < jj) --jj;
01107             
01108             if(ii < jj) {
01109                 
01110                 const double hit_idty ((*ii)->GetIdentity());
01111                 const size_t min_termhitlen (
01112                    hit_idty < .9999? min_termhitlen2: min_termhitlen1);
01113 
01114                 if((*ii)->GetQuerySpan() < min_termhitlen) {
01115                     ii++ -> Reset(0);
01116                 }
01117                 else {
01118                     b0 = false;
01119                 }
01120             }
01121             
01122             if(ii < jj) {
01123                 
01124                 const double hit_idty ((*jj)->GetIdentity());
01125                 const size_t min_termhitlen (
01126                    hit_idty < .9999? min_termhitlen2: min_termhitlen1);
01127 
01128                 if((*jj)->GetQuerySpan() < min_termhitlen) {
01129                     jj-- -> Reset(0);
01130                 }
01131                 else {
01132                     b1 = false;
01133                 }
01134             }
01135         }
01136         
01137         phitrefs->erase(remove_if(phitrefs->begin(), phitrefs->end(),
01138                                   CHitFilter<THit>::s_PNullRef),
01139                         phitrefs->end());
01140 
01141         if(phitrefs->size() == 0) {
01142             NCBI_THROW(CAlgoAlignException, eNoHits,
01143                        g_msg_NoHitsAfterFiltering);
01144         }
01145 
01146 
01147         // find regions of interest on mRna (query) and contig (subj)
01148         THit::TCoord span [4];
01149         CHitFilter<THit>::s_GetSpan(*phitrefs, span);
01150         THit::TCoord qmin (span[0]), qmax (span[1]), smin (span[2]), smax (span[3]);
01151 
01152         const bool ctg_strand (phitrefs->front()->GetSubjStrand());
01153 
01154         // m1: estimate terminal genomic extents based on uncovered end sizes
01155         const THit::TCoord extent_left_m1 (x_GetGenomicExtent(qmin));
01156         const THit::TCoord rspace   ((m_polya_start < kMax_UInt?
01157                                       m_polya_start: mrna_size) - qmax + 1);
01158         const THit::TCoord extent_right_m1 (x_GetGenomicExtent(rspace));
01159         
01160         // m2: estimate genomic extents using compartment hits
01161         THit::TCoord fixed_left (kMaxCoord / 2), fixed_right(fixed_left);
01162 
01163         const size_t kTermLenCutOff_m2 (10);
01164         const bool fix_left  (qmin   <= kTermLenCutOff_m2);
01165         const bool fix_right (rspace <= kTermLenCutOff_m2);
01166         if(fix_left || fix_right) {
01167 
01168             if(phitrefs->size() > 1) {
01169 
01170                 // select based on the max intron length 
01171                 THit::TCoord max_intron (0);
01172                 THit::TCoord prev_start (phitrefs->front()->GetSubjStart());
01173 
01174                 ITERATE(THitRefs, ii, (*phitrefs)) {
01175 
01176                     const THit::TCoord cur_start ((*ii)->GetSubjStart());
01177                     const THit::TCoord intron    (cur_start >= prev_start?
01178                                                   cur_start - prev_start:
01179                                                   prev_start - cur_start);
01180                     if(intron > max_intron) {
01181                         max_intron = intron;
01182                     }
01183                     prev_start = cur_start;
01184                 }
01185 
01186                 const double factor (2.5);
01187                 if(fix_left)  { fixed_left  = THit::TCoord(max_intron * factor); }
01188                 if(fix_right) { fixed_right = THit::TCoord(max_intron * factor); }
01189             }
01190             else {
01191                 // stay conservative for single-hit compartments
01192                 const THit::TCoord single_hit_extent (300);
01193                 if(fix_left)  { fixed_left  = single_hit_extent; }
01194                 if(fix_right) { fixed_right = single_hit_extent; }
01195             }
01196         }
01197 
01198         const THit::TCoord extent_left_m2  (100 + max(fixed_left,  qmin));
01199         const THit::TCoord extent_right_m2 (100 + max(fixed_right, rspace));
01200 
01201         const THit::TCoord extent_left (min(extent_left_m1, extent_left_m2));
01202         const THit::TCoord extent_right (min(extent_right_m1, extent_right_m2));
01203 
01204         if(ctg_strand) {
01205             smin = max(0, int(smin - extent_left));
01206             smax += extent_right;
01207         }
01208         else {
01209             smin = max(0, int(smin - extent_right));
01210             smax += extent_left;
01211         }
01212 
01213         // regardless of hits, entire cDNA is aligned (without the tail, if any)
01214         qmin = 0;
01215         qmax = m_polya_start < kMax_UInt? m_polya_start - 1: mrna_size - 1;
01216     
01217         // make sure to obey the genomic range specified
01218         if(smin < range_left) {
01219             smin = range_left;
01220         }
01221         if(smax > range_right) {
01222             smax = range_right;
01223         }
01224 
01225         m_genomic.clear();
01226         x_LoadSequence(&m_genomic, *(phitrefs->front()->GetSubjId()), 
01227                        smin, smax, true);
01228 
01229         // adjust smax if beyond the end
01230         const THit::TCoord ctg_end (smin + m_genomic.size());
01231         if(smax >= ctg_end) {
01232             smax = ctg_end > 0? ctg_end - 1: 0;
01233         }
01234 
01235         if(ctg_strand == false) {
01236 
01237             // make reverse complementary
01238             // for the contig's area of interest
01239             reverse (m_genomic.begin(), m_genomic.end());
01240             transform(m_genomic.begin(), m_genomic.end(), m_genomic.begin(),
01241                       SCompliment());
01242         }
01243 
01244         NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
01245 
01246             THitRef& h (*ii);
01247 
01248             const THit::TCoord hsmin (h->GetSubjMin());
01249             const THit::TCoord hsmax (h->GetSubjMax());
01250             if(!(smin <= hsmin && hsmax <= smax)) {
01251                 CNcbiOstrstream ostr;
01252                 ostr << "Invalid pattern hit:\n" << *h
01253                      << "\n Interval = (" << smin << ", " << smax << ')';
01254                 const string errmsg = CNcbiOstrstreamToString(ostr);
01255                 NCBI_THROW(CAlgoAlignException, ePattern, errmsg);
01256             }
01257             
01258             if(ctg_strand == false) {
01259 
01260                 THit::TCoord a2 (smax - (hsmax - smin));
01261                 THit::TCoord a3 (smax - (hsmin - smin));
01262                 h->SetSubjStart(a2);
01263                 h->SetSubjStop(a3);
01264             }
01265         }
01266 
01267         rv.m_QueryStrand = m_strand;
01268         rv.m_SubjStrand  = ctg_strand;
01269     
01270         // shift hits so that they originate from zero
01271         NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
01272             (*ii)->Shift(-qmin, -smin);
01273         }
01274 
01275         x_SplitQualifyingHits(phitrefs);
01276         x_SetPattern(phitrefs);
01277         rv.m_Score = x_Run(&m_mrna.front(), &m_genomic.front());
01278 
01279         const size_t seg_dim (m_segments.size());
01280         if(seg_dim == 0) {
01281             NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);
01282         }
01283 
01284         // try to extend the last segment as long as it's a perfect match
01285         if(m_polya_start < kMax_UInt && seg_dim && m_segments[seg_dim-1].m_exon) {
01286 
01287             TSegment& s (const_cast<TSegment&>(m_segments[seg_dim-1]));
01288 
01289             const char* p0 = &m_mrna.front() + s.m_box[1] + 1;
01290             const char* q = &m_genomic.front() + s.m_box[3] + 1;
01291             const char* p = p0;
01292             const char* pe = &m_mrna.front() + mrna_size;
01293             const char* qe = &m_genomic.front() + m_genomic.size();
01294             for(; p < pe && q < qe; ++p, ++q) {
01295                 if(*p != *q) break;
01296             }
01297         
01298             const size_t sh (p - p0);
01299             if(sh) {
01300                 // resize
01301                 s.m_box[1] += sh;
01302                 s.m_box[3] += sh;
01303                 s.m_details.append(sh, 'M');
01304                 s.Update(m_aligner);
01305             
01306                 // fix annotation
01307                 const size_t ann_dim = s.m_annot.size();
01308                 if(ann_dim > 2 && s.m_annot[ann_dim - 3] == '>') {
01309 
01310                     s.m_annot[ann_dim - 2] = q < qe? *q: ' ';
01311                     ++q;
01312                     s.m_annot[ann_dim - 1] = q < qe? *q: ' ';
01313                 }
01314             
01315                 m_polya_start += sh;
01316             }
01317         }
01318     
01319         // look for PolyA in trailing segments:
01320         // if a segment is mostly 'A's then we add it to PolyA
01321 
01322         int j (seg_dim - 1), j0 (j);
01323         for(; j >= 0; --j) {
01324         
01325             const TSegment& s (m_segments[j]);
01326 
01327             const char* p0 (&m_mrna[qmin] + s.m_box[0]);
01328             const char* p1 (&m_mrna[qmin] + s.m_box[1] + 1);
01329             const size_t len_chars (p1 - p0);
01330             size_t count (0);
01331             for(const char* pc (p0); pc != p1; ++pc) {
01332                 if(*pc == 'A') ++count;
01333             }
01334 
01335             double min_a_content (0.76); // min 'A' content in a polyA
01336 
01337             // also check splices
01338             if(s.m_exon && j > 0 && m_segments[j-1].m_exon) {
01339 
01340                 bool consensus (TSegment::s_IsConsensusSplice(
01341                                 m_segments[j-1].GetDonor(), s.GetAcceptor()));
01342                 if(!consensus || len_chars <= 6) {
01343                     min_a_content = 0.599;
01344                 }
01345                 if(len_chars < 3) {
01346                     min_a_content = 0.49;
01347                 }
01348             }
01349 
01350             if(!s.m_exon) {
01351                 min_a_content = (s.m_len <= 4)? 0.49: 0.599;
01352             }
01353         
01354             if(double(count)/len_chars < min_a_content) {
01355                 if(s.m_exon && s.m_len > 4) break;
01356             }
01357             else {
01358                 j0 = j - 1;
01359             }
01360         }
01361 
01362         if(j0 >= 0 && j0 < int(seg_dim - 1)) {
01363             m_polya_start = m_segments[j0].m_box[1] + 1;
01364         }
01365 
01366         // test if we have at least one exon before poly(a)
01367         bool some_exons (false);
01368         for(int i = 0; i <= j0; ++i ) {
01369             if(m_segments[i].m_exon) {
01370                 some_exons = true;
01371                 break;
01372             }
01373         }
01374 
01375         if(!some_exons) {
01376             NCBI_THROW(CAlgoAlignException, eNoAlignment,g_msg_NoExonsAboveIdtyLimit);
01377         }
01378     
01379         m_segments.resize(j0 + 1);
01380 
01381         // scratch it if the total coverage is too low
01382         double mcount (0);
01383         ITERATE(TSegments, jj, m_segments) {
01384             if(jj->m_exon) {
01385                 mcount += jj->m_idty * jj->m_len;
01386             }
01387         }
01388 
01389         const THit::TCoord min_singleton_idty_final (
01390                  min(size_t(m_MinSingletonIdty * qmax), m_MinSingletonIdtyBps));
01391 
01392         if(mcount < min_singleton_idty_final) {
01393             NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);
01394         }
01395 
01396         // convert coordinates back to original
01397         NON_CONST_ITERATE(TSegments, jj, m_segments) {
01398         
01399             if(rv.m_QueryStrand) {
01400                 jj->m_box[0] += qmin;
01401                 jj->m_box[1] += qmin;
01402             }
01403             else {
01404                 jj->m_box[0] = mrna_size - jj->m_box[0] - 1;
01405                 jj->m_box[1] = mrna_size - jj->m_box[1] - 1;
01406             }
01407         
01408             if(jj->m_exon) {
01409                 if(rv.m_SubjStrand) {
01410                     jj->m_box[2] += smin;
01411                     jj->m_box[3] += smin;
01412                 }
01413                 else {
01414                     jj->m_box[2] = smax - jj->m_box[2];
01415                     jj->m_box[3] = smax - jj->m_box[3];
01416                 }
01417             }
01418         }
01419 
01420         if(!rv.m_QueryStrand && m_polya_start > 0 && m_polya_start < mrna_size) {
01421             m_polya_start = mrna_size - m_polya_start - 1;
01422         }
01423     } // try
01424 
01425     catch(CAlgoAlignException& e) {
01426         
01427         const CException::TErrCode errcode (e.GetErrCode());
01428         bool severe (true);
01429         switch(errcode) {
01430         case CAlgoAlignException::eNoAlignment:
01431         case CAlgoAlignException::eMemoryLimit:
01432         case CAlgoAlignException::eNoHits:
01433         case CAlgoAlignException::eIntronTooLong:
01434         // case CAlgoAlignException::ePattern:
01435             severe = false;
01436             break;
01437         }
01438 
01439         if(severe) {
01440             e.SetSeverity(eDiag_Fatal);
01441         }
01442         throw;
01443     }
01444 
01445     return rv;
01446 }
01447 
01448 
01449 static const char s_kGap [] = "<GAP>";
01450 
01451 // at this level and below, plus strand is assumed for both sequences
01452 float CSplign::x_Run(const char* Seq1, const char* Seq2)
01453 {
01454     typedef deque<TSegment> TSegmentDeque;
01455     TSegmentDeque segments;
01456 
01457 //#define DBG_DUMP_PATTERN
01458 #ifdef  DBG_DUMP_PATTERN
01459     cerr << "Pattern:" << endl;  
01460 #endif
01461 
01462     const size_t map_dim (m_alnmap.size());
01463     if(map_dim != 1) {
01464         NCBI_THROW(CAlgoAlignException, eInternal, "Multiple maps not supported");
01465     }
01466 
01467     float rv (0);
01468     size_t cds_start (0), cds_stop (0);
01469     const int kMaxShot (5);
01470     for(size_t i (0); i < map_dim; ++i) {
01471 
01472         const SAlnMapElem& zone (m_alnmap[i]);
01473 
01474         // setup sequences
01475         const size_t len1 (zone.m_box[1] - zone.m_box[0] + 1);
01476         const size_t len2 (zone.m_box[3] - zone.m_box[2] + 1);
01477         
01478         // remap cds if antisense
01479         if(m_strand) {
01480             cds_start = m_cds_start;
01481             cds_stop  = m_cds_stop;
01482         }
01483         else {
01484             cds_start = len1 - m_cds_start - 1;
01485             cds_stop  = len1 - m_cds_stop - 1;
01486         }
01487 
01488         m_aligner->SetSequences(Seq1 + zone.m_box[0], len1,
01489                                 Seq2 + zone.m_box[2], len2,
01490                                 false);
01491 
01492         // prepare the pattern
01493         vector<size_t> pattern;
01494         if(m_pattern.size() > 0) {
01495 
01496             if(zone.m_pattern_start < 0) {
01497                 NCBI_THROW(CAlgoAlignException, eInternal,
01498                            "CSplign::x_Run(): Invalid alignment pattern");
01499             }
01500             
01501             copy(m_pattern.begin() + zone.m_pattern_start,
01502                  m_pattern.begin() + zone.m_pattern_end + 1,
01503                  back_inserter(pattern));
01504         }
01505         
01506         for(size_t j (0), pt_dim (pattern.size()); j < pt_dim; j += 4) {
01507 
01508 #ifdef  DBG_DUMP_PATTERN
01509             cerr << (1 + pattern[j]) << '\t' << (1 + pattern[j+1]) << '\t'
01510                  << "(len = " << (pattern[j+1] - pattern[j] + 1) << ")\t"
01511                  << (1 + pattern[j+2]) << '\t' << (1 + pattern[j+3]) 
01512                  << "(len = " << (pattern[j+3] - pattern[j+2] + 1) << ")\t"
01513                  << endl;
01514 #undef DBG_DUMP_PATTERN
01515 #endif
01516 
01517             pattern[j]   -= zone.m_box[0];
01518             pattern[j+1] -= zone.m_box[0];
01519             pattern[j+2] -= zone.m_box[2];
01520             pattern[j+3] -= zone.m_box[2];
01521         }
01522 
01523         // setup the aligner
01524         m_aligner->SetPattern(pattern);
01525         m_aligner->SetEndSpaceFree(true, true, true, true);
01526         m_aligner->SetCDS(cds_start, cds_stop);
01527 
01528         rv += m_aligner->Run();
01529         m_aligner->CheckPreferences();
01530 
01531 // #define DBG_DUMP_TYPE2
01532 #ifdef  DBG_DUMP_TYPE2
01533         {{
01534             CNWFormatter fmt (*m_aligner);
01535             string txt;
01536             fmt.AsText(&txt, CNWFormatter::eFormatType2);
01537             cerr << txt;
01538         }}  
01539 #undef DBG_DUMP_TYPE2
01540 #endif
01541 
01542         CNWFormatter formatter (*m_aligner);
01543         formatter.MakeSegments(&segments);
01544 
01545         // append a gap
01546         if(i + 1 < map_dim) {
01547             segments.push_back(TSegment());
01548             TSegment& g (segments.back());
01549             g.m_exon = false;
01550             g.m_box[0] = zone.m_box[1] + 1;
01551             g.m_box[1] = m_alnmap[i+1].m_box[0] - 1;
01552             g.m_box[2] = zone.m_box[3] + 1;
01553             g.m_box[3] = m_alnmap[i+1].m_box[2] - 1;
01554             g.m_idty = 0;
01555             g.m_len = g.m_box[1] - g.m_box[0] + 1;
01556             g.m_annot = s_kGap;
01557             g.m_details.resize(0);
01558             g.m_score = 0; // no score for <Gap>s
01559         }
01560     } // zone iterations end
01561 
01562 
01563 //#define DUMP_ORIG_SEGS
01564 #ifdef DUMP_ORIG_SEGS
01565     cerr << "Orig segments:" << endl;
01566     ITERATE(TSegmentDeque, ii, segments) {
01567         cerr << ii->m_exon << '\t' << ii->m_idty << '\t' << ii->m_len << '\t'
01568              << ii->m_box[0] << '\t' << ii->m_box[1] << '\t'
01569              << ii->m_box[2] << '\t' << ii->m_box[3] << '\t'
01570              << ii->m_annot  << '\t' << ii->m_score  << endl;
01571     }
01572 #endif
01573 
01574     if(segments.size() == 0) {
01575         NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);
01576     }
01577 
01578     // segment-level postprocessing
01579 
01580     const size_t SeqLen2 (m_genomic.size());
01581     const size_t SeqLen1 (m_polya_start == kMax_UInt?
01582                           m_mrna.size():
01583                           m_polya_start);
01584 
01585     // if the limiting range is set, clear all segments beyond the range
01586     if(m_BoundingRange.second > 0) {
01587 
01588         NON_CONST_ITERATE(TSegmentDeque, ii, segments) {
01589             if(ii->m_exon  &&
01590                (ii->m_box[3] < m_BoundingRange.first
01591                 || ii->m_box[2] > m_BoundingRange.second))
01592             {
01593                 ii->m_exon = false;
01594                 ii->m_idty = 0;
01595                 ii->m_details.resize(0);
01596                 ii->m_annot = s_kGap;
01597                 ii->m_score = 0;
01598             }
01599         }
01600     }
01601 
01602     // may need to catch up with cds at terms
01603     if(cds_start < cds_stop) {
01604         {{
01605         TSegmentDeque::iterator ib (segments.begin()),
01606             ie (segments.end()), ii (ib);
01607         while(ii != ie && !ii->m_exon) ++ii;
01608         if(ii != ie) {
01609             const int start_undershot (ii->m_box[0] - cds_start);
01610             if(0 < start_undershot && start_undershot < kMaxShot) {
01611                 const int extent (min(start_undershot, int(ii->m_box[2])));
01612                 if(extent > 0) {
01613                     ii->ExtendLeft(extent, Seq1, Seq2, m_aligner);
01614                 }
01615             }
01616         }
01617         }}
01618 
01619         {{
01620         TSegmentDeque::reverse_iterator irb (segments.rbegin()),
01621             ire (segments.rend()), iir (irb);
01622         while(iir != ire && !iir->m_exon) ++iir;
01623         if(iir != ire) {
01624             const int stop_undershot (cds_stop - iir->m_box[1]);
01625             if(0 < stop_undershot && stop_undershot < kMaxShot) {
01626                 const int extent (min(stop_undershot,
01627                                       int(SeqLen2 - iir->m_box[3] - 1)));
01628                 if(extent > 0) {
01629                     iir->ExtendRight(extent, Seq1, Seq2, m_aligner);
01630                 }
01631             }
01632         }
01633         }}
01634     }
01635 
01636     m_segments.resize(0);
01637     while(true) {
01638 
01639         if(m_segments.size() > 0) {
01640             segments.resize(m_segments.size());
01641             copy(m_segments.begin(), m_segments.end(), segments.begin());
01642             m_segments.resize(0);
01643         }
01644 
01645         size_t seg_dim (segments.size());
01646         if(seg_dim == 0) {
01647             return 0;
01648         }
01649 
01650         size_t exon_count0 (0);
01651         ITERATE(TSegmentDeque, ii, segments) {
01652             if(ii->m_exon) ++exon_count0;
01653         }
01654 
01655         // Go from the ends and see if we can improve term exons
01656         size_t k0 (0);
01657         while(k0 < seg_dim) {
01658 
01659             TSegment& s = segments[k0];
01660             if(s.m_exon) {
01661 
01662                 const size_t len (1 + s.m_box[1] - s.m_box[0]);
01663                 const double min_idty (len >= kMinTermExonSize?
01664                                        m_MinExonIdty:
01665                                        max(m_MinExonIdty, kMinTermExonIdty));
01666                 
01667                 const bool b1 (s.m_idty < min_idty || m_endgaps);
01668                 const bool b2 (cds_start == cds_stop
01669                                || cds_start > s.m_box[0] + kMaxShot);
01670                 if(b1 && b2) {
01671                     s.ImproveFromLeft(Seq1, Seq2, m_aligner);
01672                 }
01673 
01674                 if(s.m_idty >= min_idty) {
01675                     break;
01676                 }
01677             }
01678             ++k0;
01679         }
01680 
01681         int k1 (seg_dim - 1);
01682         while(k1 >= int(k0)) {
01683 
01684             TSegment& s (segments[k1]);
01685             if(s.m_exon) {
01686 
01687                 const size_t len (1 + s.m_box[1] - s.m_box[0]);
01688                 const double min_idty (len >= kMinTermExonSize?
01689                                        m_MinExonIdty:
01690                                        max(m_MinExonIdty, kMinTermExonIdty));
01691 
01692                 const bool b1 (s.m_idty < min_idty || m_endgaps);
01693                 const bool b2 (cds_start == cds_stop 
01694                                || cds_stop + kMaxShot < s.m_box[1]);
01695 
01696                 if(b1 && b2) {
01697                     s.ImproveFromRight(Seq1, Seq2, m_aligner);
01698                 }
01699 
01700                 if(s.m_idty >= min_idty) {
01701                     break;
01702                 }
01703             }
01704             --k1;
01705         }
01706 
01707         // turn to gaps exons with low identity
01708         for(size_t k (0); k < seg_dim; ++k) {
01709 
01710             TSegment& s (segments[k]);
01711             if(s.m_exon == false) continue;
01712 
01713             bool drop (false);
01714             if(s.m_idty < m_MinExonIdty) {
01715                 drop = true; // always make gaps on low identity
01716             }
01717             else if (s.m_idty < .9 && s.m_len < 20) {
01718 
01719                 // same for short exons preceded/followed by non-consensus splices
01720                 bool nc_prev (false), nc_next (false);
01721                 if(k > 0 && segments[k-1].m_exon) {
01722                     nc_prev = ! TSegment::s_IsConsensusSplice(
01723                                  segments[k-1].GetDonor(),
01724                                  s.GetAcceptor());
01725                 }
01726 
01727                 if(k + 1 < seg_dim && segments[k+1].m_exon) {
01728                     nc_next = ! TSegment::s_IsConsensusSplice(
01729                                  s.GetDonor(),
01730                                  segments[k+1].GetAcceptor());
01731                 }
01732 
01733                 drop = nc_prev || nc_next;
01734             }
01735 
01736             if(drop) {
01737                 s.m_exon = false;
01738                 s.m_idty = 0;
01739                 s.m_len = s.m_box[1] - s.m_box[0] + 1;
01740                 s.m_annot = s_kGap;
01741                 s.m_details.resize(0);
01742                 s.m_score = 0;
01743             }
01744         }
01745 
01746         // turn to gaps short weak terminal exons
01747         {{
01748             // find the two leftmost exons
01749             size_t exon_count (0);
01750             TSegment* term_segs[] = {0, 0};
01751             for(size_t i = 0; i < seg_dim; ++i) {
01752                 TSegment& s = segments[i];
01753                 if(s.m_exon) {
01754                     term_segs[exon_count] = &s;
01755                     if(++exon_count == 2) {
01756                         break;
01757                     }
01758                 }
01759             }
01760 
01761             if(exon_count == 2) {
01762                 x_ProcessTermSegm(term_segs, 0);
01763             }
01764         }}
01765 
01766         {{
01767             // find the two rightmost exons
01768             size_t exon_count (0);
01769             TSegment* term_segs[] = {0, 0};
01770             for(int i = seg_dim - 1; i >= 0; --i) {
01771                 TSegment& s = segments[i];
01772                 if(s.m_exon) {
01773                     term_segs[exon_count] = &s;
01774                     if(++exon_count == 2) {
01775                         break;
01776                     }
01777                 }
01778             }
01779 
01780             if(exon_count == 2) {
01781                 x_ProcessTermSegm(term_segs, 1);
01782             }
01783         }}
01784 
01785         // turn to gaps extra-short exons preceded/followed by gaps
01786         bool gap_prev (false);
01787         for(size_t k (0); k < seg_dim; ++k) {
01788 
01789             TSegment& s (segments[k]);
01790             if(s.m_exon == false) {
01791                 gap_prev = true;
01792             }
01793             else {
01794                 size_t length (s.m_box[1] - s.m_box[0] + 1);
01795                 bool gap_next (false);
01796                 if(k + 1 < seg_dim) {
01797                     gap_next = !segments[k+1].m_exon;
01798                 }
01799                 if(length <= 10 && (gap_prev || gap_next)) {
01800                     s.m_exon = false;
01801                     s.m_idty = 0;
01802                     s.m_len = s.m_box[1] - s.m_box[0] + 1;
01803                     s.m_annot = s_kGap;
01804                     s.m_details.resize(0);
01805                     s.m_score = 0;
01806                 }
01807                 gap_prev = false;
01808             }
01809         }
01810 
01811         // indicate any slack space on the left
01812         if(segments[0].m_box[0] > 0) {
01813             
01814             TSegment g;
01815             g.m_exon = false;
01816             g.m_box[0] = 0;
01817             g.m_box[1] = segments[0].m_box[0] - 1;
01818             g.m_box[2] = 0;
01819             g.m_box[3] = segments[0].m_box[2] - 1;
01820 
01821             g.m_idty = 0;
01822             g.m_len = segments[0].m_box[0];
01823             g.m_annot = s_kGap;
01824             g.m_details.resize(0);
01825             g.m_score = 0;
01826 
01827             segments.push_front(g);
01828             ++seg_dim;
01829         }
01830 
01831         // same on the right
01832         TSegment& seg_last (segments.back());
01833 
01834         if(seg_last.m_box[1] + 1 < SeqLen1) {
01835             
01836             TSegment g;
01837             g.m_exon = false;
01838             g.m_box[0] = seg_last.m_box[1] + 1;
01839             g.m_box[1] = SeqLen1 - 1;
01840             g.m_box[2] = seg_last.m_box[3] + 1;
01841             g.m_box[3] = SeqLen2 - 1;
01842             g.m_idty = 0;
01843             g.m_len = g.m_box[1] - g.m_box[0] + 1;
01844             g.m_annot = s_kGap;
01845             g.m_details.resize(0);
01846             g.m_score = 0;
01847 
01848             segments.push_back(g);
01849             ++seg_dim;
01850         }
01851 
01852         // merge all adjacent gaps
01853         int gap_start_idx (-1);
01854         if(seg_dim && segments[0].m_exon == false) {
01855             gap_start_idx = 0;
01856         }
01857 
01858         for(size_t k (0); k < seg_dim; ++k) {
01859             TSegment& s (segments[k]);
01860             if(!s.m_exon) {
01861                 if(gap_start_idx == -1) {
01862                     gap_start_idx = int(k);
01863                     if(k > 0) {
01864                         s.m_box[0] = segments[k-1].m_box[1] + 1;
01865                         s.m_box[2] = segments[k-1].m_box[3] + 1;
01866                     }
01867                 }
01868             }
01869             else {
01870                 if(gap_start_idx >= 0) {
01871                     TSegment& g = segments[gap_start_idx];
01872                     g.m_box[1] = s.m_box[0] - 1;
01873                     g.m_box[3] = s.m_box[2] - 1;
01874                     g.m_len = g.m_box[1] - g.m_box[0] + 1;
01875                     g.m_details.resize(0);
01876                     m_segments.push_back(g);
01877                     gap_start_idx = -1;
01878                 }
01879                 m_segments.push_back(s);
01880             } 
01881         }
01882 
01883         if(gap_start_idx >= 0) {
01884             TSegment& g (segments[gap_start_idx]);
01885             g.m_box[1] = segments[seg_dim-1].m_box[1];
01886             g.m_box[3] = segments[seg_dim-1].m_box[3];
01887             g.m_len = g.m_box[1] - g.m_box[0] + 1;
01888             g.m_details.resize(0);
01889             m_segments.push_back(g);
01890         }
01891 
01892         size_t exon_count1 (0);
01893         ITERATE(TSegments, ii, m_segments) {
01894             if(ii->m_exon) ++exon_count1;
01895         }
01896 
01897         if(exon_count0 == exon_count1) break;
01898     }
01899 
01900 //#define DUMP_PROCESSED_SEGS
01901 #ifdef DUMP_PROCESSED_SEGS
01902     cerr << "Processed segments:" << endl;
01903     ITERATE(TSegments, ii, m_segments) {
01904         cerr << ii->m_box[0] << '\t' << ii->m_box[1] << '\t'
01905              << ii->m_box[2] << '\t' << ii->m_box[3] << '\t'
01906              << ii->m_annot << '\t' << ii->m_score << endl;
01907     }
01908 #endif
01909 
01910     rv /= m_aligner->GetWm();
01911     return rv;
01912 }
01913 
01914 
01915 double CSplign::SAlignedCompartment::GetIdentity() const
01916 {
01917     string trans;
01918     for(size_t i (0), dim (m_Segments.size()); i < dim; ++i) {
01919         const TSegment & s (m_Segments[i]);
01920         if(s.m_exon) {
01921             trans.append(s.m_details);
01922         }
01923         else {
01924             trans.append(s.m_len, 'D');
01925         }
01926     }
01927     size_t matches = 0;
01928     ITERATE(string, ii, trans) {
01929         if(*ii == 'M') {
01930             ++matches;
01931         }
01932     }
01933     return double(matches) / trans.size();
01934 }
01935 
01936 
01937 
01938 void CSplign::SAlignedCompartment::GetBox(Uint4* box) const
01939 {
01940     box[0] = box[2] = kMax_UInt;
01941     box[1] = box[3] = 0;
01942     ITERATE(TSegments, ii, m_Segments) {
01943         const TSegment& s (*ii);
01944         if(s.m_exon) {
01945             
01946             Uint4 a, b;
01947             if(s.m_box[0] <= s.m_box[1]) {
01948                 a = s.m_box[0];
01949                 b = s.m_box[1];
01950             }
01951             else {
01952                 b = s.m_box[0];
01953                 a = s.m_box[1];
01954             }
01955             if(a < box[0]) {
01956                 box[0] = a;
01957             }
01958             if(b > box[1]) {
01959                 box[1] = b;
01960             }
01961 
01962             if(s.m_box[2] <= s.m_box[3]) {
01963                 a = s.m_box[2];
01964                 b = s.m_box[3];
01965             }
01966             else {
01967                 b = s.m_box[2];
01968                 a = s.m_box[3];
01969             }
01970             if(a < box[2]) {
01971                 box[2] = a;
01972             }
01973             if(b > box[3]) {
01974                 box[3] = b;
01975             }
01976         }
01977     }
01978 }
01979 
01980 
01981 bool CSplign::x_ProcessTermSegm(TSegment** term_segs, Uint1 side) const
01982 {            
01983     bool turn2gap (false);
01984 
01985     const size_t exon_size (1 + term_segs[0]->m_box[1] -
01986                             term_segs[0]->m_box[0]);
01987 
01988     if(exon_size < kMinTermExonSize) {
01989 
01990         const double idty (term_segs[0]->m_idty);
01991         if(idty < kMinTermExonIdty) {
01992             turn2gap = true;
01993         }
01994         else {
01995             
01996             // verify that the intron is not too long
01997 
01998             size_t a, b;
01999             const char *dnr, *acc;
02000             if(side == 0) {
02001                 a = term_segs[0]->m_box[3];
02002                 b = term_segs[1]->m_box[2];
02003                 dnr = term_segs[0]->GetDonor();
02004                 acc = term_segs[1]->GetAcceptor();
02005             }
02006             else {
02007                 a = term_segs[1]->m_box[3];
02008                 b = term_segs[0]->m_box[2];
02009                 dnr = term_segs[1]->GetDonor();
02010                 acc = term_segs[0]->GetAcceptor();
02011             }
02012 
02013             const size_t intron_len (b - a);
02014 
02015             const bool consensus (TSegment::s_IsConsensusSplice(dnr, acc));
02016 
02017             size_t max_ext ((idty < .96 || !consensus || exon_size < 16)? 
02018                             m_max_genomic_ext: (5000 *  kMinTermExonSize));
02019 
02020             if(consensus) {
02021                 if(exon_size < 8) {
02022                     max_ext = 10 * exon_size;
02023                 }
02024             }
02025             else if(exon_size < 16) {
02026                 max_ext = 1;
02027             }
02028 
02029             const size_t max_intron_len (x_GetGenomicExtent(exon_size, max_ext));
02030             if(intron_len > max_intron_len) {
02031                 turn2gap = true;
02032             }
02033         }
02034       
02035         if(turn2gap) {
02036 
02037             // turn the segment into a gap
02038             TSegment& s = *(term_segs[0]);
02039             s.m_exon = false;
02040             s.m_idty = 0;
02041             s.m_len = exon_size;
02042             s.m_annot = s_kGap;
02043             s.m_details.resize(0);
02044             s.m_score = 0;
02045         }
02046     }
02047 
02048     return turn2gap;
02049 }
02050 
02051 
02052 Uint4 CSplign::x_GetGenomicExtent(const Uint4 query_len, Uint4 max_ext) const 
02053 {
02054     if(max_ext == 0) {
02055         max_ext = m_max_genomic_ext;
02056     }
02057 
02058     Uint4 rv (0);
02059     if(query_len >= kNonCoveredEndThreshold) {
02060         rv = m_max_genomic_ext;
02061     }
02062     else {
02063         const double k (pow(kNonCoveredEndThreshold, - 1. / kPower) * max_ext);
02064         const double drv (k * pow(query_len, 1. / kPower));
02065         rv = Uint4(drv);
02066     }
02067 
02068     return rv;
02069 }
02070 
02071 
02072 ////////////////////////////////////
02073 
02074 namespace splign_local {
02075 
02076     template<typename T>
02077     void ElemToBuffer(const T& n, char*& p)
02078     {
02079         *(reinterpret_cast<T*>(p)) = n;
02080         p += sizeof(n);
02081     }
02082     
02083     template<>
02084     void ElemToBuffer(const string& s, char*& p)
02085     {
02086         copy(s.begin(), s.end(), p);
02087         p += s.size();
02088         *p++ = 0;
02089     }
02090     
02091     template<typename T>
02092     void ElemFromBuffer(T& n, const char*& p)
02093     {
02094         n = *(reinterpret_cast<const T*>(p));
02095         p += sizeof(n);
02096     }
02097     
02098     template<>
02099     void ElemFromBuffer(string& s, const char*& p)
02100     {
02101         s = p;
02102         p += s.size() + 1;
02103     }
02104 }
02105 
02106 
02107 void CNWFormatter::SSegment::ToBuffer(TNetCacheBuffer* target) const
02108 {
02109     using namespace splign_local;
02110 
02111     if(target == 0) {
02112         NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_NullPointerPassed );
02113     }
02114     
02115     const size_t total_size = sizeof m_exon + sizeof m_idty + 
02116         sizeof m_len + sizeof m_box + m_annot.size() + 1 +
02117         m_details.size() + 1 + sizeof m_score;
02118 
02119     target->resize(total_size);
02120     
02121     char* p = &target->front();
02122     ElemToBuffer(m_exon, p);
02123     ElemToBuffer(m_idty, p);
02124     ElemToBuffer(m_len, p);
02125     for(size_t i = 0; i < 4; ++i) {
02126         ElemToBuffer(m_box[i], p);
02127     }
02128     ElemToBuffer(m_annot, p);
02129     ElemToBuffer(m_details, p);
02130     ElemToBuffer(m_score, p);
02131 }
02132 
02133 
02134 void CNWFormatter::SSegment::FromBuffer(const TNetCacheBuffer& source)
02135 {
02136     using namespace splign_local;
02137 
02138     const size_t min_size = sizeof m_exon + sizeof m_idty + sizeof m_len + 
02139         + sizeof m_box + 1 + 1 + sizeof m_score;
02140 
02141     if(source.size() < min_size) {
02142         NCBI_THROW(CAlgoAlignException, eInternal, g_msg_NetCacheBufferIncomplete);
02143     }
02144     
02145     const char* p = &source.front();
02146     ElemFromBuffer(m_exon, p);
02147     ElemFromBuffer(m_idty, p);
02148     ElemFromBuffer(m_len, p);
02149 
02150     for(size_t i = 0; i < 4; ++i) {
02151         ElemFromBuffer(m_box[i], p);
02152     }
02153     
02154     ElemFromBuffer(m_annot, p);
02155     ElemFromBuffer(m_details, p);
02156     ElemFromBuffer(m_score, p);
02157 }
02158 
02159 
02160 void CSplign::SAlignedCompartment::ToBuffer(TNetCacheBuffer* target) const
02161 {
02162     using namespace splign_local;
02163 
02164     if(target == 0) {
02165         NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_NullPointerPassed);
02166     }
02167 
02168     const size_t core_size (
02169         sizeof m_Id + sizeof m_Status + m_Msg.size() + 1
02170         + sizeof m_QueryStrand + sizeof m_SubjStrand
02171         + sizeof m_Cds_start + sizeof m_Cds_stop
02172         + sizeof m_QueryLen
02173         + sizeof m_PolyA
02174         + sizeof m_Score);
02175 
02176     vector<char> core (core_size);
02177 
02178     char* p = &core.front();
02179     ElemToBuffer(m_Id, p);
02180     ElemToBuffer(m_Status, p);
02181     ElemToBuffer(m_Msg, p);
02182     ElemToBuffer(m_QueryStrand, p);
02183     ElemToBuffer(m_SubjStrand, p);
02184     ElemToBuffer(m_Cds_start, p);
02185     ElemToBuffer(m_Cds_stop, p);
02186     ElemToBuffer(m_QueryLen, p);
02187     ElemToBuffer(m_PolyA, p);
02188     ElemToBuffer(m_Score, p);
02189     
02190     typedef vector<TNetCacheBuffer> TBuffers;
02191     TBuffers vb (m_Segments.size());
02192     size_t ibuf (0);
02193     ITERATE(TSegments, ii, m_Segments) {
02194         ii->ToBuffer(&vb[ibuf++]);
02195     }
02196 
02197     size_t total_size (core_size + sizeof(size_t) * m_Segments.size());
02198     ITERATE(TBuffers, ii, vb) {
02199         total_size += ii->size();
02200     }
02201 
02202     target->resize(total_size);
02203     TNetCacheBuffer::iterator it = target->begin();
02204     copy(core.begin(), core.end(), it);
02205     it += core_size;
02206 
02207     ITERATE(TBuffers, ii, vb) {
02208         char* p = &(*it);
02209         const size_t seg_buf_size = ii->size();
02210         *((size_t*)p) = seg_buf_size;
02211         it += sizeof (size_t);
02212         copy(ii->begin(), ii->end(), it);
02213         it += seg_buf_size;
02214     }
02215 }
02216 
02217 
02218 void CSplign::SAlignedCompartment::FromBuffer(const TNetCacheBuffer& source)
02219 {
02220     using namespace splign_local;
02221 
02222     const size_t min_size (
02223           sizeof m_Id 
02224           + sizeof m_Status
02225           + 1 
02226           + sizeof m_QueryStrand + sizeof m_SubjStrand
02227           + sizeof m_Cds_start + sizeof m_Cds_stop
02228           + sizeof m_QueryLen
02229           + sizeof m_PolyA
02230           + sizeof m_Score );
02231 
02232     if(source.size() < min_size) {
02233         NCBI_THROW(CAlgoAlignException, eInternal, g_msg_NetCacheBufferIncomplete);
02234     }
02235     
02236     const char* p (&source.front());
02237     ElemFromBuffer(m_Id, p);
02238     ElemFromBuffer(m_Status, p);
02239     ElemFromBuffer(m_Msg, p);
02240     ElemFromBuffer(m_QueryStrand, p);
02241     ElemFromBuffer(m_SubjStrand, p);
02242     ElemFromBuffer(m_Cds_start, p);
02243     ElemFromBuffer(m_Cds_stop, p);
02244     ElemFromBuffer(m_QueryLen, p);
02245     ElemFromBuffer(m_PolyA, p);
02246     ElemFromBuffer(m_Score, p);
02247 
02248     const char* pe (&source.back());
02249     while(p <= pe) {
02250         size_t seg_buf_size (0);
02251         ElemFromBuffer(seg_buf_size, p);
02252         m_Segments.push_back(TSegment());
02253         TSegment& seg (m_Segments.back());
02254         seg.FromBuffer(TNetCacheBuffer(p, p + seg_buf_size));
02255         p += seg_buf_size;
02256     }
02257 }
02258 
02259 
02260 bool IsConsDonor(const objects::CSpliced_exon& exon) {
02261     USING_SCOPE(objects);
02262     const CSpliced_exon::TDonor_after_exon & splice (exon.GetDonor_after_exon());
02263     const string bases (splice.GetBases());
02264     return (bases.size() >= 2 && bases[0] == 'G' && bases[1] == 'T');
02265 }
02266 
02267 
02268 bool IsConsAcceptor(const objects::CSpliced_exon& exon) {
02269 
02270     USING_SCOPE(objects);
02271     const CSpliced_exon::TAcceptor_before_exon &
02272         splice (exon.GetAcceptor_before_exon());
02273     const string bases (splice.GetBases());
02274     const size_t dim (bases.size());
02275     return (dim >= 2 && bases[dim - 2] == 'A' && bases[dim - 1] == 'G');
02276 }
02277 
02278 
02279 size_t CSplign::s_ComputeStats(CRef<objects::CSeq_align_set> sas,
02280                                TScoreSets * output_stats,
02281                                TOrf cds,
02282                                EStatFlags flags)
02283 {
02284     USING_SCOPE(objects);
02285 
02286     const
02287     bool valid_input (sas.GetPointer() && sas->CanGet() && sas->Get().size() 
02288                       && sas->Get().front()->CanGetSegs()
02289                       && sas->Get().front()->GetSegs().IsSpliced()
02290                       && sas->Get().front()->GetSegs().GetSpliced().GetProduct_type() 
02291                       == CSpliced_seg::eProduct_type_transcript
02292                       && output_stats);
02293 
02294     if(!valid_input) {
02295         NCBI_THROW(CAlgoAlignException, eBadParameter,
02296                    "CSplign::s_ComputeStats(): Invalid input");
02297     }
02298 
02299     output_stats->resize(0);
02300     
02301     ITERATE(CSeq_align_set::Tdata, ii1, sas->Get()) {
02302         CRef<CScore_set> ss (s_ComputeStats(*ii1, false, cds, flags));
02303         output_stats->push_back(ss);
02304     }
02305 
02306     return output_stats->size();
02307 }
02308 
02309 
02310 namespace {
02311     const int kFrame_not_set (-10);
02312     const int kFrame_end     (-5);
02313     const int kFrame_lost    (-20);
02314 }
02315 
02316 
02317 CRef<objects::CScore_set> CSplign::s_ComputeStats(CRef<objects::CSeq_align> sa,
02318                                                   bool embed_scoreset,
02319                                                   TOrf cds,
02320                                                   EStatFlags flags)
02321 {
02322     USING_SCOPE(objects);
02323 
02324 
02325     if(!(flags & (eSF_BasicNonCds | eSF_BasicCds))) {
02326         NCBI_THROW(CException, eUnknown,
02327                    "CSplign::s_ComputeStats(): mode not yet supported.");
02328     }
02329 
02330     typedef CSeq_align::TSegs::TSpliced TSpliced;
02331     const TSpliced & spliced (sa->GetSegs().GetSpliced());
02332     if(spliced.GetProduct_type() != CSpliced_seg::eProduct_type_transcript) {
02333         NCBI_THROW(CAlgoAlignException, eBadParameter,
02334                    "CSplign::s_ComputeStats(): Unsupported product type");
02335     }
02336 
02337     const bool cds_stats ((flags & eSF_BasicCds) && (cds.first + cds.second > 0));
02338     const bool qstrand (spliced.GetProduct_strand() != eNa_strand_minus);
02339     if(cds_stats) {
02340         const bool cds_strand (cds.first < cds.second);
02341         if(qstrand ^ cds_strand) {
02342             NCBI_THROW(CAlgoAlignException, eBadParameter,
02343                        "CSplign::s_ComputeStats(): Transcript orientation not "
02344                        "matching specified CDS orientation.");
02345         }
02346     }
02347 
02348     typedef TSpliced::TExons TExons;
02349     const TExons & exons (spliced.GetExons());
02350 
02351     size_t matches (0),
02352         aligned_query_bases (0),  // matches, mismatches and indels
02353         aln_length_exons (0),
02354         aln_length_gaps (0),
02355         splices_total (0),        // twice the number of introns
02356         splices_consensus (0);
02357 
02358     const TSeqPos  qlen (spliced.GetProduct_length());
02359     const TSeqPos polya (spliced.CanGetPoly_a()?
02360                          spliced.GetPoly_a(): (qstrand? qlen: TSeqPos(-1)));
02361     const TSeqPos prod_length_no_polya (qstrand? polya: qlen - 1 - polya);
02362         
02363     typedef CSpliced_exon TExon;
02364     TSeqPos qprev (qstrand? TSeqPos(-1): qlen);
02365     bool cons_dnr (false);
02366     string xcript;
02367     ITERATE(TExons, ii2, exons) {
02368 
02369         const TExon & exon (**ii2);
02370         const TSeqPos qmin (exon.GetProduct_start().GetNucpos()),
02371             qmax (exon.GetProduct_end().GetNucpos());
02372 
02373         const TSeqPos qgap (qstrand? qmin - qprev - 1: qprev - qmax - 1);
02374 
02375         if(qgap > 0) {
02376             aln_length_gaps += qgap;
02377             cons_dnr = false;
02378             if(cds_stats) xcript.append(qgap, 'X');
02379         }
02380         else if (ii2 != exons.begin()) {
02381             splices_total += 2;
02382             if(cons_dnr && IsConsAcceptor(exon)) { splices_consensus += 2; }
02383         }
02384 
02385         typedef TExon::TParts TParts;
02386         const TParts & parts (exon.GetParts());
02387         string errmsg;
02388         ITERATE(TParts, ii3, parts) {
02389             const CSpliced_exon_chunk & part (**ii3);
02390             const CSpliced_exon_chunk::E_Choice choice (part.Which());
02391             TSeqPos len (0);
02392             switch(choice) {
02393             case CSpliced_exon_chunk::e_Match:
02394                 len = part.GetMatch();
02395                 matches += len;
02396                 aligned_query_bases += len;
02397                 if(cds_stats) xcript.append(len, 'M');
02398                 break;
02399             case CSpliced_exon_chunk::e_Mismatch:
02400                 len = part.GetMismatch();
02401                 aligned_query_bases += len;
02402                 if(cds_stats) xcript.append(len, 'R');
02403                 break;
02404             case CSpliced_exon_chunk::e_Product_ins:
02405                 len = part.GetProduct_ins();
02406                 aligned_query_bases += len;
02407                 if(cds_stats) xcript.append(len, 'D');
02408                 break;
02409             case CSpliced_exon_chunk::e_Genomic_ins:
02410                 len = part.GetGenomic_ins();
02411                 if(cds_stats) xcript.append(len, 'I');
02412                 break;
02413             default:
02414                 errmsg = "Unexpected spliced exon chunk part: "
02415                     + part.SelectionName(choice);
02416                 NCBI_THROW(CAlgoAlignException, eBadParameter, errmsg);
02417             }
02418             aln_length_exons += len;
02419         }
02420 
02421         cons_dnr = IsConsDonor(exon);
02422         qprev = qstrand? qmax: qmin;
02423     } // TExons
02424 
02425     const TSeqPos qgap (qstrand? polya - qprev - 1: qprev - polya - 1);
02426     aln_length_gaps += qgap;
02427     if(cds_stats) xcript.append(qgap, 'X');
02428 
02429     // set individual scores
02430     CRef<CScore_set> ss (new CScore_set);
02431     CScore_set::Tdata & scores (ss->Set());
02432         
02433     if(flags & eSF_BasicNonCds) {
02434         {
02435             CRef<CScore> score_matches (new CScore());
02436             score_matches->SetId().SetId(eCS_Matches);
02437             score_matches->SetValue().SetInt(matches);
02438             scores.push_back(score_matches);
02439         }
02440         {
02441             CRef<CScore> score_overall_identity (new CScore());
02442             score_overall_identity->SetId().SetId(eCS_OverallIdentity);
02443             score_overall_identity->SetValue().
02444                 SetReal(double(matches)/(aln_length_exons + aln_length_gaps));
02445             scores.push_back(score_overall_identity);
02446         }
02447         {
02448             CRef<CScore> score_splices (new CScore());
02449             score_splices->SetId().SetId(eCS_Splices);
02450             score_splices->SetValue().SetInt(splices_total);
02451             scores.push_back(score_splices);
02452         }
02453         {
02454             CRef<CScore> score_splices_consensus (new CScore());
02455             score_splices_consensus->SetId().SetId(eCS_ConsensusSplices);
02456             score_splices_consensus->SetValue().SetInt(splices_consensus);
02457             scores.push_back(score_splices_consensus);
02458         }
02459         {
02460             CRef<CScore> score_coverage (new CScore());
02461             score_coverage->SetId().SetId(eCS_ProductCoverage);
02462             score_coverage->SetValue().
02463                 SetReal(double(aligned_query_bases) / prod_length_no_polya);
02464             scores.push_back(score_coverage);
02465         }
02466         {
02467             CRef<CScore> score_exon_identity (new CScore());
02468             score_exon_identity->SetId().SetId(eCS_ExonIdentity);
02469             score_exon_identity->SetValue().
02470                 SetReal(double(matches) / aln_length_exons);
02471             scores.push_back(score_exon_identity);
02472         }
02473     }
02474 
02475     if(cds_stats) {
02476 
02477         if(!qstrand && qlen <= 0) {
02478             NCBI_THROW(CAlgoAlignException, eBadParameter,
02479                        "CSplign::s_ComputeStats(): Cannot compute "
02480                        "inframe stats - transcript length not set.");
02481         }
02482 
02483         int qpos (qstrand? -1: int(qlen));
02484         int qinc (qstrand? +1: -1);
02485         int frame (kFrame_not_set);
02486         size_t aln_length_cds (0);
02487         size_t matches_frame[] = {0, 0, 0, 0, 0};
02488         const int cds_start (cds.first), cds_stop (cds.second);
02489         for(string::const_iterator ie (xcript.end()), ii(xcript.begin());
02490             ii != ie && frame != kFrame_end; ++ii)
02491         {
02492 
02493             switch(*ii) {
02494 
02495             case 'M':
02496                 qpos += qinc;
02497                 if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
02498                 if(qpos == cds_stop) frame = kFrame_end;
02499                 if(frame >= -2) {
02500                     ++aln_length_cds;
02501                     ++matches_frame[frame + 2];
02502                 }
02503                 break;
02504 
02505             case 'R':
02506                 qpos += qinc;
02507                 if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
02508                 if(qpos == cds_stop) frame = kFrame_end;
02509                 if(frame >= -2) ++aln_length_cds;
02510                 break;
02511 
02512             case 'D':
02513                 qpos += qinc;
02514                 if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
02515                 if(qpos == cds_stop) frame = kFrame_end;
02516                 if(frame >= -2) {
02517                     ++aln_length_cds;
02518                     frame = (frame + 1) % 3;
02519                 }
02520                 break;
02521 
02522             case 'I':
02523                 if(frame >= -2) {
02524                     ++aln_length_cds;
02525                     frame = (frame - 1) % 3;
02526                 }
02527                 break;
02528 
02529             case 'X':
02530                 qpos += qinc;
02531                 if( qstrand && cds_start <= qpos && qpos < cds_stop ||
02532                     !qstrand && cds_start >= qpos && qpos > cds_stop )
02533                 {
02534                     frame = kFrame_lost;
02535                     ++aln_length_cds;
02536                 }
02537                 break;
02538             }
02539         }
02540 
02541         {
02542             CRef<CScore> score_matches_inframe (new CScore());
02543             score_matches_inframe->SetId().SetId(eCS_InframeMatches);
02544             score_matches_inframe->SetValue().SetInt(matches_frame[2]);
02545             scores.push_back(score_matches_inframe);
02546         }
02547 
02548         {
02549             CRef<CScore> score_inframe_identity (new CScore());
02550             score_inframe_identity->SetId().SetId(eCS_InframeIdentity);
02551             score_inframe_identity->SetValue().
02552                 SetReal(double(matches_frame[2]) / aln_length_cds);
02553             scores.push_back(score_inframe_identity);
02554         }
02555     }
02556 
02557 
02558     if(embed_scoreset) {
02559         CSeq_align::TScore & sa_score (sa->SetScore());
02560         sa_score.resize(scores.size());
02561         copy(scores.begin(), scores.end(), sa_score.begin());
02562     }
02563 
02564     return ss;
02565 }
02566 
02567 
02568 
02569 END_NCBI_SCOPE
02570 
02571 

Generated on Sun Dec 6 22:16:44 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Mon Dec 07 16:20:49 2009 by modify_doxy.py rev. 173732