00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034 #include <ncbi_pch.hpp>
00035
00036 #include "splign_util.hpp"
00037 #include "messages.hpp"
00038 #include <algo/align/util/hit_comparator.hpp>
00039 #include <algo/align/util/compartment_finder.hpp>
00040 #include <algo/align/nw/nw_band_aligner.hpp>
00041 #include <algo/align/nw/nw_spliced_aligner16.hpp>
00042 #include <algo/align/nw/nw_formatter.hpp>
00043 #include <algo/align/nw/align_exception.hpp>
00044 #include <algo/align/splign/splign.hpp>
00045
00046 #include <algo/sequence/orf.hpp>
00047
00048 #include <objmgr/scope.hpp>
00049 #include <objmgr/bioseq_handle.hpp>
00050 #include <objmgr/seq_vector.hpp>
00051 #include <objmgr/util/seq_loc_util.hpp>
00052
00053 #include <objects/seqloc/Seq_interval.hpp>
00054 #include <objects/seqalign/Seq_align.hpp>
00055 #include <objects/seqalign/Seq_align_set.hpp>
00056 #include <objects/seqalign/Spliced_seg.hpp>
00057 #include <objects/seqalign/Spliced_exon.hpp>
00058 #include <objects/seqalign/Spliced_exon_chunk.hpp>
00059 #include <objects/seqalign/Product_pos.hpp>
00060 #include <objects/seqalign/Splice_site.hpp>
00061 #include <objects/seqalign/Score.hpp>
00062 #include <objects/seqalign/Score_set.hpp>
00063 #include <objects/general/Object_id.hpp>
00064
00065 #include <math.h>
00066 #include <algorithm>
00067
00068 BEGIN_NCBI_SCOPE
00069
00070 namespace {
00071
00072
00073
00074
00075
00076
00077
00078 const Uint4 kNonCoveredEndThreshold (55);
00079 const double kPower (2.5);
00080
00081
00082
00083
00084 const size_t kMinTermExonSize (28);
00085 const double kMinTermExonIdty (0.9);
00086
00087 const CSplign::THit::TCoord
00088 kMaxCoord (numeric_limits<CSplign::THit::TCoord>::max());
00089 }
00090
00091
00092 CSplign::CSplign( void )
00093 {
00094 m_CompartmentPenalty = s_GetDefaultCompartmentPenalty();
00095 m_MinExonIdty = s_GetDefaultMinExonIdty();
00096 m_MinSingletonIdty = m_MinCompartmentIdty =s_GetDefaultMinCompartmentIdty();
00097 m_MinSingletonIdtyBps = numeric_limits<size_t>::max();
00098 m_max_genomic_ext = s_GetDefaultMaxGenomicExtent();
00099 m_MaxIntron = CCompartmentFinder<THit>::s_GetDefaultMaxIntron();
00100 m_endgaps = true;
00101 m_strand = true;
00102 m_nopolya = false;
00103 m_cds_start = m_cds_stop = 0;
00104 m_model_id = 0;
00105 m_MaxCompsPerQuery = 0;
00106 m_MinPatternHitLength = 13;
00107 }
00108
00109 CSplign::~CSplign()
00110 {
00111 }
00112
00113
00114 CRef<CSplign::TAligner>& CSplign::SetAligner( void ) {
00115 return m_aligner;
00116 }
00117
00118 CConstRef<CSplign::TAligner> CSplign::GetAligner( void ) const {
00119 return m_aligner;
00120 }
00121
00122
00123 CRef<CSplicedAligner> CSplign::s_CreateDefaultAligner(bool low_query_quality)
00124 {
00125 CRef<CSplicedAligner> aligner(
00126 static_cast<CSplicedAligner*>(new CSplicedAligner16));
00127
00128 #ifdef ALGOALIGN_NW_SPLIGN_MAKE_PUBLIC_BINARY
00129 if(false == low_query_quality) {
00130 aligner->SetWm (1000);
00131 aligner->SetWms (-1044);
00132 aligner->SetWg (-3070);
00133 aligner->SetWs (-173);
00134 aligner->SetScoreMatrix(NULL);
00135
00136 aligner->SetWi(0, -4270);
00137 aligner->SetWi(1, -5314);
00138 aligner->SetWi(2, -6358);
00139 aligner->SetWi(3, -7395);
00140 }
00141 else {
00142 aligner->SetWm (1000);
00143 aligner->SetWms (-1011);
00144 aligner->SetWg (-1460);
00145 aligner->SetWs (-464);
00146 aligner->SetScoreMatrix(NULL);
00147
00148 aligner->SetWi(0, -4988);
00149 aligner->SetWi(1, -5999);
00150 aligner->SetWi(2, -7010);
00151 aligner->SetWi(3, -13060);
00152 }
00153 #endif
00154
00155 return aligner;
00156 }
00157
00158
00159 void CSplign::SetEndGapDetection( bool on ) {
00160 m_endgaps = on;
00161 }
00162
00163 bool CSplign::GetEndGapDetection( void ) const {
00164 return m_endgaps;
00165 }
00166
00167 void CSplign::SetPolyaDetection( bool on ) {
00168 m_nopolya = !on;
00169 }
00170
00171 bool CSplign::GetPolyaDetection( void ) const {
00172 return !m_nopolya;
00173 }
00174
00175 void CSplign::SetStrand( bool strand ) {
00176 m_strand = strand;
00177 }
00178
00179 bool CSplign::GetStrand( void ) const {
00180 return m_strand;
00181 }
00182
00183 void CSplign::SetMinExonIdentity( double idty )
00184 {
00185 if(!(0 <= idty && idty <= 1)) {
00186 NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
00187 }
00188 else {
00189 m_MinExonIdty = idty;
00190 }
00191 }
00192
00193 void CSplign::SetMinCompartmentIdentity( double idty )
00194 {
00195 if(!(0 <= idty && idty <= 1)) {
00196 NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
00197 }
00198 else {
00199 m_MinCompartmentIdty = idty;
00200 }
00201 }
00202
00203 void CSplign::SetMinSingletonIdentity(double idty)
00204 {
00205 if(!(0 <= idty && idty <= 1)) {
00206 NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
00207 }
00208 else {
00209 m_MinSingletonIdty = idty;
00210 }
00211 }
00212
00213 void CSplign::SetMinSingletonIdentityBps(size_t idty_bps)
00214 {
00215 m_MinSingletonIdtyBps = idty_bps;
00216 }
00217
00218 size_t CSplign::s_GetDefaultMaxGenomicExtent(void)
00219 {
00220 return 35000;
00221 }
00222
00223
00224 void CSplign::SetMaxGenomicExtent(size_t mge)
00225 {
00226 m_max_genomic_ext = mge;
00227 }
00228
00229
00230 size_t CSplign::GetMaxGenomicExtent(void) const
00231 {
00232 return m_max_genomic_ext;
00233 }
00234
00235
00236 void CSplign::SetMaxIntron(size_t max_intron)
00237 {
00238 m_MaxIntron = max_intron;
00239 }
00240
00241
00242 size_t CSplign::GetMaxIntron(void) const
00243 {
00244 return m_MaxIntron;
00245 }
00246
00247
00248 double CSplign::GetMinExonIdentity( void ) const {
00249 return m_MinExonIdty;
00250 }
00251
00252 double CSplign::s_GetDefaultMinExonIdty(void)
00253 {
00254 return 0.75;
00255 }
00256
00257
00258 double CSplign::GetMinCompartmentIdentity(void) const {
00259 return m_MinCompartmentIdty;
00260 }
00261
00262 double CSplign::GetMinSingletonIdentity(void) const {
00263 return m_MinSingletonIdty;
00264 }
00265
00266 size_t CSplign::GetMinSingletonIdentityBps(void) const {
00267 return m_MinSingletonIdtyBps;
00268 }
00269
00270 double CSplign::s_GetDefaultMinCompartmentIdty(void)
00271 {
00272 return 0.70;
00273 }
00274
00275 void CSplign::SetMaxCompsPerQuery(size_t m) {
00276 m_MaxCompsPerQuery = m;
00277 }
00278
00279 size_t CSplign::GetMaxCompsPerQuery(void) const {
00280 return m_MaxCompsPerQuery;
00281 }
00282
00283
00284
00285 CRef<objects::CScope> CSplign::GetScope(void) const
00286 {
00287 return m_Scope;
00288 }
00289
00290
00291 CRef<objects::CScope>& CSplign::SetScope(void)
00292 {
00293 return m_Scope;
00294 }
00295
00296
00297 void CSplign::PreserveScope(bool preserve_scope)
00298 {
00299 m_CanResetHistory = ! preserve_scope;
00300 }
00301
00302
00303 void CSplign::SetCompartmentPenalty(double penalty)
00304 {
00305 if(penalty < 0 || penalty > 1) {
00306 NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_QueryCoverageOutOfRange);
00307 }
00308 m_CompartmentPenalty = penalty;
00309 }
00310
00311 double CSplign::s_GetDefaultCompartmentPenalty(void)
00312 {
00313 return 0.55;
00314 }
00315
00316 double CSplign::GetCompartmentPenalty( void ) const
00317 {
00318 return m_CompartmentPenalty;
00319 }
00320
00321 void CSplign::x_LoadSequence(vector<char>* seq,
00322 const objects::CSeq_id& seqid,
00323 THit::TCoord start,
00324 THit::TCoord finish,
00325 bool retain)
00326 {
00327 USING_SCOPE(objects);
00328
00329 try {
00330
00331 if(m_Scope.IsNull()) {
00332 NCBI_THROW(CAlgoAlignException, eInternal, "Splign scope not set");
00333 }
00334
00335 CBioseq_Handle bh = m_Scope->GetBioseqHandle(seqid);
00336
00337 if(retain && m_CanResetHistory) {
00338 m_Scope->ResetHistory();
00339 }
00340
00341 if(bh) {
00342
00343 CSeqVector sv = bh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
00344 const TSeqPos dim = sv.size();
00345 if(dim == 0) {
00346 NCBI_THROW(CAlgoAlignException,
00347 eNoSeqData,
00348 string("Sequence is empty: ")
00349 + seqid.AsFastaString());
00350 }
00351 if(finish >= dim) {
00352 finish = dim - 1;
00353 }
00354
00355 if(start > finish) {
00356 CNcbiOstrstream ostr;
00357 ostr << "Invalid sequence interval requested for "
00358 << seqid.GetSeqIdString(true) << ":\t"
00359 << start << '\t' << finish;
00360 const string err = CNcbiOstrstreamToString(ostr);
00361 NCBI_THROW(CAlgoAlignException, eNoSeqData, err);
00362 }
00363
00364 string s;
00365 sv.GetSeqData(start, finish + 1, s);
00366 seq->resize(1 + finish - start);
00367 copy(s.begin(), s.end(), seq->begin());
00368 }
00369 else {
00370 NCBI_THROW(CAlgoAlignException, eNoSeqData,
00371 string("ID not found: ") + seqid.AsFastaString());
00372 }
00373
00374 if(retain == false && m_CanResetHistory) {
00375 m_Scope->RemoveFromHistory(bh);
00376 }
00377 }
00378
00379 catch(CAlgoAlignException& e) {
00380 NCBI_RETHROW_SAME(e, "CSplign::x_LoadSequence(): Sequence data problem");
00381 }
00382 }
00383
00384
00385 void CSplign::ClearMem(void)
00386 {
00387 m_Scope.Reset(NULL);
00388 m_pattern.clear();
00389 m_alnmap.clear();
00390 m_genomic.clear();
00391 m_mrna.clear();
00392 }
00393
00394
00395 CSplign::THitRef CSplign::sx_NewHit(THit::TCoord q0, THit::TCoord q,
00396 THit::TCoord s0, THit::TCoord s)
00397 {
00398 THitRef hr (new THit);
00399 hr->SetQueryStart(q0);
00400 hr->SetSubjStart(s0);
00401 hr->SetQueryStop(q - 1);
00402 hr->SetSubjStop(s - 1);
00403 hr->SetLength(q - q0);
00404 hr->SetMismatches(0);
00405 hr->SetGaps(0);
00406 hr->SetEValue(0);
00407 hr->SetScore(2*(q - q0));
00408 hr->SetIdentity(1);
00409 return hr;
00410 }
00411
00412
00413 void CSplign::x_SplitQualifyingHits(THitRefs* phitrefs)
00414 {
00415 const char * Seq1 (&m_mrna.front());
00416 const char * Seq2 (&m_genomic.front());
00417 THitRefs rv;
00418 ITERATE(THitRefs, ii, (*phitrefs)) {
00419
00420 const THitRef & h (*ii);
00421 const double idty (h->GetIdentity());
00422 const bool diag (h->GetGaps() == 0 && h->GetQuerySpan() == h->GetSubjSpan());
00423 if(idty == 1 || idty < .95 || h->GetLength() < 100 || !diag) {
00424 rv.push_back(h);
00425 }
00426 else {
00427
00428 int q0 (-1), s0 (-1), q1 (h->GetQueryMax());
00429 int q (h->GetQueryMin()), s (h->GetSubjMin());
00430 size_t new_hits (0);
00431 while(q <= q1) {
00432 if(Seq1[q++] != Seq2[s++]) {
00433 if(q0 != -1 && q >= q0 + int(m_MinPatternHitLength)) {
00434 THitRef hr (sx_NewHit(q0, q, s0, s));
00435 hr->SetQueryId(h->GetQueryId());
00436 hr->SetSubjId(h->GetSubjId());
00437 rv.push_back(hr);
00438 ++new_hits;
00439 }
00440 q0 = s0 = -1;
00441 }
00442 else {
00443 if(q0 == -1) {
00444 q0 = q;
00445 s0 = s;
00446 }
00447 }
00448 }
00449
00450 if(q0 != -1 && q >= q0 + int(m_MinPatternHitLength)) {
00451 THitRef hr (sx_NewHit(q0, q, s0, s));
00452 hr->SetQueryId(h->GetQueryId());
00453 hr->SetSubjId(h->GetSubjId());
00454 rv.push_back(hr);
00455 ++new_hits;
00456 }
00457
00458 if(new_hits == 0) {
00459 rv.push_back(h);
00460 }
00461 }
00462 }
00463
00464 *phitrefs = rv;
00465 }
00466
00467
00468 void CSplign::x_SetPattern(THitRefs* phitrefs)
00469 {
00470 m_alnmap.clear();
00471 m_pattern.clear();
00472
00473
00474 typedef CHitComparator<THit> THitComparator;
00475 THitComparator sorter (THitComparator::eQueryMin);
00476 stable_sort(phitrefs->begin(), phitrefs->end(), sorter);
00477
00478
00479
00480 size_t prev (0);
00481 NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
00482
00483 THitRef& h (*ii);
00484
00485 if(h->GetQuerySpan() < m_MinPatternHitLength) {
00486 h.Reset(0);
00487 continue;
00488 }
00489
00490 if(prev > 0) {
00491
00492 const bool consistent (h->GetSubjStrand()?
00493 (h->GetSubjStart() < prev + m_MaxIntron):
00494 (h->GetSubjStart() + m_MaxIntron > prev));
00495
00496 if(!consistent) {
00497 const string errmsg (g_msg_CompartmentInconsistent
00498 + string(" (extra long introns)"));
00499 NCBI_THROW(CAlgoAlignException, eIntronTooLong, errmsg);
00500 }
00501 }
00502
00503 prev = h->GetSubjStop();
00504 }
00505
00506 phitrefs->erase(remove_if(phitrefs->begin(), phitrefs->end(),
00507 CHitFilter<THit>::s_PNullRef),
00508 phitrefs->end());
00509
00510
00511 vector<size_t> pattern0;
00512 vector<pair<bool,double> > imperfect;
00513 double max_idty (0.0);
00514 for(size_t i (0), n (phitrefs->size()); i < n; ++i) {
00515
00516 const THitRef & h ((*phitrefs)[i]);
00517 const bool valid (true);
00518 if(valid) {
00519
00520 pattern0.push_back(h->GetQueryMin());
00521 pattern0.push_back(h->GetQueryMax());
00522 pattern0.push_back(h->GetSubjMin());
00523 pattern0.push_back(h->GetSubjMax());
00524 const double idty (h->GetIdentity());
00525 const bool imprf (idty < 1.00
00526 || h->GetQuerySpan() != h->GetSubjSpan()
00527 || h->GetMismatches() > 0
00528 || h->GetGaps() > 0);
00529 imperfect.push_back(pair<bool,double>(imprf, idty));
00530 if(idty > max_idty) {
00531 max_idty = idty;
00532 }
00533 }
00534 }
00535
00536 if(max_idty < .85 && pattern0.size() >= 4) {
00537 m_BoundingRange = pair<size_t, size_t>(pattern0[2], pattern0.back());
00538 }
00539 else {
00540 m_BoundingRange = pair<size_t, size_t>(0, 0);
00541 }
00542
00543 const size_t dim (pattern0.size());
00544
00545 const char* Seq1 (&m_mrna.front());
00546 const size_t SeqLen1 (m_polya_start < kMax_UInt? m_polya_start: m_mrna.size());
00547 const char* Seq2 (&m_genomic.front());
00548 const size_t SeqLen2 (m_genomic.size());
00549
00550
00551 CNcbiOstrstream ostr_err;
00552 bool some_error (false), bad_input (false);
00553 if(dim % 4 == 0) {
00554
00555 for(size_t i (0); i < dim; i += 4) {
00556
00557 if(pattern0[i] > pattern0[i+1] || pattern0[i+2] > pattern0[i+3]) {
00558 ostr_err << "Pattern hits must be specified in plus strand";
00559 some_error = bad_input = true;
00560 break;
00561 }
00562
00563 if(i > 4) {
00564 if(pattern0[i] <= pattern0[i-3] || pattern0[i+2] <= pattern0[i-1]) {
00565 ostr_err << g_msg_CompartmentInconsistent
00566 << string(" (hits not sorted)");
00567 some_error = true;
00568 break;
00569 }
00570 }
00571
00572 const bool br1 (pattern0[i+1] >= SeqLen1);
00573 const bool br2 (pattern0[i+3] >= SeqLen2);
00574 if(br1 || br2) {
00575
00576 ostr_err << "Pattern hits out of range ("
00577 << "query = "
00578 << phitrefs->front()->GetQueryId()->GetSeqIdString(true)
00579 << "subj = "
00580 << phitrefs->front()->GetSubjId()->GetSeqIdString(true)
00581 << "):" << endl;
00582
00583 if(br1) {
00584 ostr_err << "\tquery_pattern_max = " << pattern0[i+1]
00585 << "; query_len = " << SeqLen1 << endl;
00586 }
00587
00588 if(br2) {
00589 ostr_err << "\tsubj_pattern_max = " << pattern0[i+3]
00590 << "; subj_len = " << SeqLen2 << endl;
00591 }
00592
00593 some_error= true;
00594 break;
00595 }
00596 }
00597
00598 }
00599 else {
00600 ostr_err << "Pattern dimension must be a multiple of four";
00601 some_error = bad_input = true;
00602 }
00603
00604 if(some_error) {
00605 ostr_err << " (query = "
00606 << phitrefs->front()->GetQueryId()->AsFastaString()
00607 << " , subj = "
00608 << phitrefs->front()->GetSubjId()->AsFastaString() << ')'
00609 << endl;
00610 }
00611
00612 const string err = CNcbiOstrstreamToString(ostr_err);
00613 if(err.size() > 0) {
00614 if(bad_input) {
00615 NCBI_THROW(CAlgoAlignException, eBadParameter, err);
00616 }
00617 else {
00618 NCBI_THROW(CAlgoAlignException, ePattern, err);
00619 }
00620 }
00621
00622 SAlnMapElem map_elem;
00623 map_elem.m_box[0] = map_elem.m_box[2] = 0;
00624 map_elem.m_pattern_start = map_elem.m_pattern_end = -1;
00625
00626
00627 CBandAligner nwa;
00628 for(size_t i = 0; i < dim; i += 4) {
00629
00630 size_t L1, R1, L2, R2;
00631 size_t max_seg_size (0);
00632
00633 const bool imprf (imperfect[i/4].first);
00634 if(imprf) {
00635
00636
00637
00638
00639
00640 const size_t len1 (pattern0[i+1] - pattern0[i] + 1);
00641 const size_t len2 (pattern0[i+3] - pattern0[i+2] + 1);
00642 const size_t maxlen (max(len1, len2));
00643 const size_t lendif (len1 < len2? len2 - len1: len1 - len2);
00644 size_t band (size_t((1 - imperfect[i/4].second) * maxlen) + 2);
00645 if(band < lendif) band += lendif;
00646 nwa.SetBand(band);
00647 nwa.SetSequences(Seq1 + pattern0[i], len1,
00648 Seq2 + pattern0[i+2], len2,
00649 false);
00650 nwa.Run();
00651 max_seg_size = nwa.GetLongestSeg(&L1, &R1, &L2, &R2);
00652 }
00653 else {
00654
00655 L1 = 1;
00656 R1 = pattern0[i+1] - pattern0[i] - 1;
00657 L2 = 1;
00658 R2 = pattern0[i+3] - pattern0[i+2] - 1;
00659 max_seg_size = R1 - L1 + 1;
00660 }
00661
00662 if(max_seg_size) {
00663
00664
00665 {{
00666 size_t cut ((1 + R1 - L1) / 5);
00667 if(cut > 20) cut = 20;
00668
00669 const size_t l1 (L1 + cut), l2 (L2 + cut);
00670 const size_t r1 (R1 - cut), r2 (R2 - cut);
00671 if(l1 < r1 && l2 < r2) {
00672 L1 = l1; L2 = l2;
00673 R1 = r1; R2 = r2;
00674 }
00675 }}
00676
00677 size_t q0 (pattern0[i] + L1);
00678 size_t s0 (pattern0[i+2] + L2);
00679 size_t q1 (pattern0[i] + R1);
00680 size_t s1 (pattern0[i+2] + R2);
00681
00682 if(imprf) {
00683
00684 const size_t hitlen_q (pattern0[i + 1] - pattern0[i] + 1);
00685 const size_t sh (size_t(hitlen_q / 4));
00686
00687 size_t delta (sh > L1? sh - L1: 0);
00688 q0 += delta;
00689 s0 += delta;
00690
00691 const size_t h2s_right (hitlen_q - R1 - 1);
00692 delta = sh > h2s_right? sh - h2s_right: 0;
00693 q1 -= delta;
00694 s1 -= delta;
00695
00696 if(q0 > q1 || s0 > s1) {
00697
00698
00699 q0 = pattern0[i] + L1;
00700 s0 = pattern0[i+2] + L2;
00701 q1 = pattern0[i] + R1;
00702 s1 = pattern0[i+2] + R2;
00703 }
00704 }
00705
00706 m_pattern.push_back(q0); m_pattern.push_back(q1);
00707 m_pattern.push_back(s0); m_pattern.push_back(s1);
00708
00709 const size_t pattern_dim = m_pattern.size();
00710 if(map_elem.m_pattern_start == -1) {
00711 map_elem.m_pattern_start = pattern_dim - 4;
00712 }
00713 map_elem.m_pattern_end = pattern_dim - 1;
00714 }
00715
00716 map_elem.m_box[1] = pattern0[i+1];
00717 map_elem.m_box[3] = pattern0[i+3];
00718 }
00719
00720 map_elem.m_box[1] = SeqLen1 - 1;
00721 map_elem.m_box[3] = SeqLen2 - 1;
00722 m_alnmap.push_back(map_elem);
00723 }
00724
00725
00726 CSplign::TOrfPair CSplign::GetCds(const THit::TId& id, const vector<char> * seq_data)
00727 {
00728 TOrfPair rv (TOrf(0, 0), TOrf(0, 0));
00729
00730 TStrIdToOrfs::const_iterator ie (m_OrfMap.end());
00731 const string strid (id->AsFastaString());
00732 TStrIdToOrfs::const_iterator ii (m_OrfMap.find(strid));
00733
00734 if(ii != ie) {
00735 rv = ii->second;
00736 }
00737 else {
00738
00739 USING_SCOPE(objects);
00740
00741 vector<char> seq;
00742 if(seq_data == 0) {
00743 x_LoadSequence(&seq, *id, 0, kMaxCoord, false);
00744 seq_data = & seq;
00745 }
00746
00747
00748
00749 vector<CRef<CSeq_loc> > orfs;
00750 vector<string> start_codon;
00751 start_codon.push_back("ATG");
00752
00753 COrf::FindOrfs(*seq_data, orfs, 90, 1, start_codon);
00754 TSeqPos max_len_plus (0), max_len_minus (0);
00755 TSeqPos max_from_plus (0), max_from_minus (0);
00756 TSeqPos max_to_plus (0), max_to_minus (0);
00757 ITERATE (vector<CRef<CSeq_loc> >, orf, orfs) {
00758
00759 const TSeqPos len (sequence::GetLength(**orf, NULL));
00760 const ENa_strand orf_strand ((*orf)->GetInt().GetStrand());
00761 const bool antisense (orf_strand == eNa_strand_minus);
00762
00763 if(antisense) {
00764 if(len > max_len_minus) {
00765 max_len_minus = len;
00766 max_from_minus = (*orf)->GetInt().GetTo();
00767 max_to_minus = (*orf)->GetInt().GetFrom();
00768 }
00769 }
00770 else {
00771 if(len > max_len_plus) {
00772 max_len_plus = len;
00773 max_from_plus = (*orf)->GetInt().GetFrom();
00774 max_to_plus = (*orf)->GetInt().GetTo();
00775 }
00776 }
00777 }
00778
00779 if(max_len_plus > 0) {
00780 rv.first = TOrf(max_from_plus, max_to_plus);
00781 }
00782
00783 if(max_len_minus > 0) {
00784 rv.second = TOrf(max_from_minus, max_to_minus);
00785 }
00786
00787 m_OrfMap[strid] = rv;
00788 }
00789
00790 return rv;
00791 }
00792
00793
00794 void CSplign::x_FinalizeAlignedCompartment(SAlignedCompartment & ac)
00795 {
00796 ac.m_Id = ++m_model_id;
00797 ac.m_Segments = m_segments;
00798 ac.m_Status = SAlignedCompartment::eStatus_Ok;
00799 ac.m_Msg = "Ok";
00800 ac.m_Cds_start = m_cds_start;
00801 ac.m_Cds_stop = m_cds_stop;
00802 ac.m_QueryLen = m_mrna.size();
00803 ac.m_PolyA = (m_polya_start < kMax_UInt? m_polya_start : 0);
00804 }
00805
00806
00807
00808
00809 void CSplign::Run(THitRefs* phitrefs)
00810 {
00811 if(!phitrefs) {
00812 NCBI_THROW(CAlgoAlignException, eInternal, "Unexpected NULL pointers");
00813 }
00814
00815 THitRefs& hitrefs = *phitrefs;
00816
00817
00818 NON_CONST_ITERATE(THitRefs, ii, hitrefs) {
00819
00820 THitRef& h = *ii;
00821 if(h.NotNull() && h->GetQueryStrand() == false) {
00822 h->FlipStrands();
00823 }
00824 }
00825
00826 if(m_aligner.IsNull()) {
00827 NCBI_THROW(CAlgoAlignException, eNotInitialized, g_msg_AlignedNotSpecified);
00828 }
00829
00830 if(hitrefs.size() == 0) {
00831 NCBI_THROW(CAlgoAlignException, eNoHits, g_msg_EmptyHitVectorPassed);
00832 }
00833
00834 m_result.clear();
00835
00836 THit::TId id_query (hitrefs.front()->GetQueryId());
00837
00838 const THit::TCoord mrna_size (objects::sequence::GetLength(*id_query, m_Scope));
00839 if(mrna_size == kMaxCoord) {
00840 NCBI_THROW(CAlgoAlignException, eNoSeqData,
00841 string("Sequence not found: ") + id_query->AsFastaString());
00842 }
00843
00844
00845 const THit::TCoord min_singleton_idty_final (
00846 min(size_t(m_MinSingletonIdty * mrna_size),
00847 m_MinSingletonIdtyBps));
00848
00849 CCompartmentAccessor<THit> comps (THit::TCoord(m_CompartmentPenalty * mrna_size),
00850 THit::TCoord(m_MinCompartmentIdty * mrna_size),
00851 min_singleton_idty_final,
00852 true);
00853 comps.SetMaxIntron(m_MaxIntron);
00854 comps.Run(hitrefs.begin(), hitrefs.end());
00855
00856 pair<size_t,size_t> dim (comps.GetCounts());
00857 if(dim.second > 0) {
00858
00859
00860 m_mrna.clear();
00861 x_LoadSequence(&m_mrna, *id_query, 0, kMaxCoord, false);
00862
00863 const TOrfPair orfs (GetCds(id_query, & m_mrna));
00864 if(m_strand) {
00865 m_cds_start = orfs.first.first;
00866 m_cds_stop = orfs.first.second;
00867 }
00868 else {
00869 m_cds_start = orfs.second.first;
00870 m_cds_stop = orfs.second.second;
00871 }
00872
00873 if(!m_strand) {
00874
00875 reverse (m_mrna.begin(), m_mrna.end());
00876 transform(m_mrna.begin(), m_mrna.end(), m_mrna.begin(), SCompliment());
00877 }
00878
00879
00880 size_t smin (0), smax (kMax_UInt);
00881 bool same_strand (false);
00882
00883 const THit::TCoord* box (comps.GetBox(0));
00884 if(m_MaxCompsPerQuery > 0 && dim.second > m_MaxCompsPerQuery) {
00885 dim.second = m_MaxCompsPerQuery;
00886 }
00887
00888 for(size_t i (0); i < dim.first; ++i, box += 4) {
00889
00890 if(i + 1 == dim.first) {
00891 smax = kMax_UInt;
00892 same_strand = false;
00893 }
00894 else {
00895 bool strand_this (comps.GetStrand(i));
00896 bool strand_next (comps.GetStrand(i+1));
00897 same_strand = strand_this == strand_next;
00898 smax = same_strand? (box + 4)[2]: kMax_UInt;
00899 }
00900
00901 try {
00902
00903 if(smax < box[3]) {
00904
00905 NCBI_THROW(CAlgoAlignException, eInternal,
00906 "Unexpected order of compartments");
00907 }
00908
00909 if(comps.GetStatus(i)) {
00910 THitRefs comp_hits;
00911 comps.Get(i, comp_hits);
00912
00913 if(smax < box[3]) smax = box[3];
00914 if(smin > box[2]) smin = box[2];
00915
00916 SAlignedCompartment ac (x_RunOnCompartment(&comp_hits, smin,smax));
00917 x_FinalizeAlignedCompartment(ac);
00918 m_result.push_back(ac);
00919 }
00920 }
00921
00922 catch(CAlgoAlignException& e) {
00923
00924 if(e.GetSeverity() == eDiag_Fatal) {
00925 throw;
00926 }
00927
00928 m_result.push_back(SAlignedCompartment(0, e.GetMsg().c_str()));
00929
00930 const CException::TErrCode errcode (e.GetErrCode());
00931 if(errcode != CAlgoAlignException::eNoAlignment) {
00932 m_result.back().m_Status = SAlignedCompartment::eStatus_Error;
00933 }
00934
00935 ++m_model_id;
00936 }
00937
00938 smin = same_strand? box[3]: 0;
00939 }
00940 }
00941 }
00942
00943
00944 bool CSplign::AlignSingleCompartment(CRef<objects::CSeq_align> compartment,
00945 SAlignedCompartment* result)
00946 {
00947 USING_SCOPE(objects);
00948 const CRef<CSeq_loc> seqloc (compartment->GetBounds().front());
00949 const size_t subj_min (seqloc->GetStart(eExtreme_Positional));
00950 const size_t subj_max (seqloc->GetStop(eExtreme_Positional));
00951
00952 THitRefs hitrefs;
00953 ITERATE(CSeq_align_set::Tdata, ii, compartment->GetSegs().GetDisc().Get()) {
00954 }
00955
00956 return AlignSingleCompartment(&hitrefs, subj_min, subj_max, result);
00957 }
00958
00959
00960 bool CSplign::AlignSingleCompartment(THitRefs* phitrefs,
00961 size_t subj_min, size_t subj_max,
00962 SAlignedCompartment* result)
00963 {
00964 m_mrna.resize(0);
00965
00966 THit::TId id_query (phitrefs->front()->GetQueryId());
00967
00968 x_LoadSequence(&m_mrna, *id_query, 0, kMaxCoord, false);
00969
00970 const TOrfPair orfs (GetCds(id_query, & m_mrna));
00971 if(m_strand) {
00972 m_cds_start = orfs.first.first;
00973 m_cds_stop = orfs.first.second;
00974 }
00975 else {
00976 m_cds_start = orfs.second.first;
00977 m_cds_stop = orfs.second.second;
00978 }
00979
00980 if(!m_strand) {
00981 reverse (m_mrna.begin(), m_mrna.end());
00982 transform(m_mrna.begin(), m_mrna.end(), m_mrna.begin(), SCompliment());
00983 }
00984
00985 bool rv (true);
00986 try {
00987
00988 SAlignedCompartment ac (x_RunOnCompartment(phitrefs, subj_min, subj_max));
00989 x_FinalizeAlignedCompartment(ac);
00990 *result = ac;
00991 m_mrna.resize(0);
00992 }
00993
00994 catch(CAlgoAlignException& e) {
00995
00996 m_mrna.resize(0);
00997
00998 if(e.GetSeverity() == eDiag_Fatal) {
00999 throw;
01000 }
01001
01002 *result = SAlignedCompartment(0, e.GetMsg().c_str());
01003
01004 const CException::TErrCode errcode (e.GetErrCode());
01005 if(errcode != CAlgoAlignException::eNoAlignment) {
01006 result->m_Status = SAlignedCompartment::eStatus_Error;
01007 }
01008
01009 ++m_model_id;
01010 rv = false;
01011 }
01012
01013 return rv;
01014 }
01015
01016
01017 size_t CSplign::s_TestPolyA(const char * seq, size_t dim, size_t cds_stop)
01018 {
01019 const size_t kMaxNonA (3), kMinAstreak (5);
01020 int i (dim - 1), i0 (dim);
01021 for(size_t count_non_a (0), astreak (0); i >= 0 && count_non_a < kMaxNonA; --i) {
01022
01023 if(seq[i] != 'A') {
01024 ++count_non_a;
01025 astreak = 0;
01026 }
01027 else {
01028 if(++astreak >= kMinAstreak) {
01029 i0 = i;
01030 }
01031 }
01032 }
01033
01034 const size_t len (dim - i0);
01035 size_t rv;
01036 if(len >= kMinAstreak) {
01037 rv = i0;
01038 if(0 < cds_stop && cds_stop < dim && rv <= cds_stop) {
01039 rv = cds_stop + 1;
01040 }
01041 }
01042 else {
01043 rv = kMax_UInt;
01044 }
01045
01046 return rv;
01047 }
01048
01049
01050
01051
01052
01053
01054
01055 CSplign::SAlignedCompartment CSplign::x_RunOnCompartment(THitRefs* phitrefs,
01056 size_t range_left,
01057 size_t range_right)
01058 {
01059 SAlignedCompartment rv;
01060
01061 try {
01062 m_segments.clear();
01063
01064 if(range_left > range_right) {
01065 NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidRange);
01066 }
01067
01068 if(phitrefs->size() == 0) {
01069 NCBI_THROW(CAlgoAlignException, eNoHits, g_msg_NoHitsAfterFiltering);
01070 }
01071
01072 const size_t mrna_size (m_mrna.size());
01073
01074 if(m_strand == false) {
01075
01076
01077 for(size_t i (0), n (phitrefs->size()); i < n; ++i) {
01078
01079 THitRef& h ((*phitrefs)[i]);
01080 THit::TCoord a0 (mrna_size - h->GetQueryMin() - 1);
01081 THit::TCoord a1 (mrna_size - h->GetQueryMax() - 1);
01082 const bool new_strand (!(h->GetSubjStrand()));
01083 h->SetQueryStart(a1);
01084 h->SetQueryStop(a0);
01085 h->SetSubjStrand(new_strand);
01086 }
01087 }
01088
01089 m_polya_start = m_nopolya?
01090 kMax_UInt:
01091 s_TestPolyA(&m_mrna.front(), m_mrna.size(), m_cds_stop);
01092
01093
01094 if(m_polya_start < kMax_UInt) {
01095 CleaveOffByTail(phitrefs, m_polya_start);
01096 }
01097
01098
01099 THitRefs::iterator ii (phitrefs->begin()), jj (phitrefs->end() - 1);
01100 const size_t min_termhitlen1 (m_MinPatternHitLength);
01101 const size_t min_termhitlen2 (2*m_MinPatternHitLength);
01102 bool b0 (true), b1 (true);
01103 while(b0 && b1 && ii < jj) {
01104
01105 while(ii->IsNull() && ii < jj) ++ii;
01106 while(jj->IsNull() && ii < jj) --jj;
01107
01108 if(ii < jj) {
01109
01110 const double hit_idty ((*ii)->GetIdentity());
01111 const size_t min_termhitlen (
01112 hit_idty < .9999? min_termhitlen2: min_termhitlen1);
01113
01114 if((*ii)->GetQuerySpan() < min_termhitlen) {
01115 ii++ -> Reset(0);
01116 }
01117 else {
01118 b0 = false;
01119 }
01120 }
01121
01122 if(ii < jj) {
01123
01124 const double hit_idty ((*jj)->GetIdentity());
01125 const size_t min_termhitlen (
01126 hit_idty < .9999? min_termhitlen2: min_termhitlen1);
01127
01128 if((*jj)->GetQuerySpan() < min_termhitlen) {
01129 jj-- -> Reset(0);
01130 }
01131 else {
01132 b1 = false;
01133 }
01134 }
01135 }
01136
01137 phitrefs->erase(remove_if(phitrefs->begin(), phitrefs->end(),
01138 CHitFilter<THit>::s_PNullRef),
01139 phitrefs->end());
01140
01141 if(phitrefs->size() == 0) {
01142 NCBI_THROW(CAlgoAlignException, eNoHits,
01143 g_msg_NoHitsAfterFiltering);
01144 }
01145
01146
01147
01148 THit::TCoord span [4];
01149 CHitFilter<THit>::s_GetSpan(*phitrefs, span);
01150 THit::TCoord qmin (span[0]), qmax (span[1]), smin (span[2]), smax (span[3]);
01151
01152 const bool ctg_strand (phitrefs->front()->GetSubjStrand());
01153
01154
01155 const THit::TCoord extent_left_m1 (x_GetGenomicExtent(qmin));
01156 const THit::TCoord rspace ((m_polya_start < kMax_UInt?
01157 m_polya_start: mrna_size) - qmax + 1);
01158 const THit::TCoord extent_right_m1 (x_GetGenomicExtent(rspace));
01159
01160
01161 THit::TCoord fixed_left (kMaxCoord / 2), fixed_right(fixed_left);
01162
01163 const size_t kTermLenCutOff_m2 (10);
01164 const bool fix_left (qmin <= kTermLenCutOff_m2);
01165 const bool fix_right (rspace <= kTermLenCutOff_m2);
01166 if(fix_left || fix_right) {
01167
01168 if(phitrefs->size() > 1) {
01169
01170
01171 THit::TCoord max_intron (0);
01172 THit::TCoord prev_start (phitrefs->front()->GetSubjStart());
01173
01174 ITERATE(THitRefs, ii, (*phitrefs)) {
01175
01176 const THit::TCoord cur_start ((*ii)->GetSubjStart());
01177 const THit::TCoord intron (cur_start >= prev_start?
01178 cur_start - prev_start:
01179 prev_start - cur_start);
01180 if(intron > max_intron) {
01181 max_intron = intron;
01182 }
01183 prev_start = cur_start;
01184 }
01185
01186 const double factor (2.5);
01187 if(fix_left) { fixed_left = THit::TCoord(max_intron * factor); }
01188 if(fix_right) { fixed_right = THit::TCoord(max_intron * factor); }
01189 }
01190 else {
01191
01192 const THit::TCoord single_hit_extent (300);
01193 if(fix_left) { fixed_left = single_hit_extent; }
01194 if(fix_right) { fixed_right = single_hit_extent; }
01195 }
01196 }
01197
01198 const THit::TCoord extent_left_m2 (100 + max(fixed_left, qmin));
01199 const THit::TCoord extent_right_m2 (100 + max(fixed_right, rspace));
01200
01201 const THit::TCoord extent_left (min(extent_left_m1, extent_left_m2));
01202 const THit::TCoord extent_right (min(extent_right_m1, extent_right_m2));
01203
01204 if(ctg_strand) {
01205 smin = max(0, int(smin - extent_left));
01206 smax += extent_right;
01207 }
01208 else {
01209 smin = max(0, int(smin - extent_right));
01210 smax += extent_left;
01211 }
01212
01213
01214 qmin = 0;
01215 qmax = m_polya_start < kMax_UInt? m_polya_start - 1: mrna_size - 1;
01216
01217
01218 if(smin < range_left) {
01219 smin = range_left;
01220 }
01221 if(smax > range_right) {
01222 smax = range_right;
01223 }
01224
01225 m_genomic.clear();
01226 x_LoadSequence(&m_genomic, *(phitrefs->front()->GetSubjId()),
01227 smin, smax, true);
01228
01229
01230 const THit::TCoord ctg_end (smin + m_genomic.size());
01231 if(smax >= ctg_end) {
01232 smax = ctg_end > 0? ctg_end - 1: 0;
01233 }
01234
01235 if(ctg_strand == false) {
01236
01237
01238
01239 reverse (m_genomic.begin(), m_genomic.end());
01240 transform(m_genomic.begin(), m_genomic.end(), m_genomic.begin(),
01241 SCompliment());
01242 }
01243
01244 NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
01245
01246 THitRef& h (*ii);
01247
01248 const THit::TCoord hsmin (h->GetSubjMin());
01249 const THit::TCoord hsmax (h->GetSubjMax());
01250 if(!(smin <= hsmin && hsmax <= smax)) {
01251 CNcbiOstrstream ostr;
01252 ostr << "Invalid pattern hit:\n" << *h
01253 << "\n Interval = (" << smin << ", " << smax << ')';
01254 const string errmsg = CNcbiOstrstreamToString(ostr);
01255 NCBI_THROW(CAlgoAlignException, ePattern, errmsg);
01256 }
01257
01258 if(ctg_strand == false) {
01259
01260 THit::TCoord a2 (smax - (hsmax - smin));
01261 THit::TCoord a3 (smax - (hsmin - smin));
01262 h->SetSubjStart(a2);
01263 h->SetSubjStop(a3);
01264 }
01265 }
01266
01267 rv.m_QueryStrand = m_strand;
01268 rv.m_SubjStrand = ctg_strand;
01269
01270
01271 NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
01272 (*ii)->Shift(-qmin, -smin);
01273 }
01274
01275 x_SplitQualifyingHits(phitrefs);
01276 x_SetPattern(phitrefs);
01277 rv.m_Score = x_Run(&m_mrna.front(), &m_genomic.front());
01278
01279 const size_t seg_dim (m_segments.size());
01280 if(seg_dim == 0) {
01281 NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);
01282 }
01283
01284
01285 if(m_polya_start < kMax_UInt && seg_dim && m_segments[seg_dim-1].m_exon) {
01286
01287 TSegment& s (const_cast<TSegment&>(m_segments[seg_dim-1]));
01288
01289 const char* p0 = &m_mrna.front() + s.m_box[1] + 1;
01290 const char* q = &m_genomic.front() + s.m_box[3] + 1;
01291 const char* p = p0;
01292 const char* pe = &m_mrna.front() + mrna_size;
01293 const char* qe = &m_genomic.front() + m_genomic.size();
01294 for(; p < pe && q < qe; ++p, ++q) {
01295 if(*p != *q) break;
01296 }
01297
01298 const size_t sh (p - p0);
01299 if(sh) {
01300
01301 s.m_box[1] += sh;
01302 s.m_box[3] += sh;
01303 s.m_details.append(sh, 'M');
01304 s.Update(m_aligner);
01305
01306
01307 const size_t ann_dim = s.m_annot.size();
01308 if(ann_dim > 2 && s.m_annot[ann_dim - 3] == '>') {
01309
01310 s.m_annot[ann_dim - 2] = q < qe? *q: ' ';
01311 ++q;
01312 s.m_annot[ann_dim - 1] = q < qe? *q: ' ';
01313 }
01314
01315 m_polya_start += sh;
01316 }
01317 }
01318
01319
01320
01321
01322 int j (seg_dim - 1), j0 (j);
01323 for(; j >= 0; --j) {
01324
01325 const TSegment& s (m_segments[j]);
01326
01327 const char* p0 (&m_mrna[qmin] + s.m_box[0]);
01328 const char* p1 (&m_mrna[qmin] + s.m_box[1] + 1);
01329 const size_t len_chars (p1 - p0);
01330 size_t count (0);
01331 for(const char* pc (p0); pc != p1; ++pc) {
01332 if(*pc == 'A') ++count;
01333 }
01334
01335 double min_a_content (0.76);
01336
01337
01338 if(s.m_exon && j > 0 && m_segments[j-1].m_exon) {
01339
01340 bool consensus (TSegment::s_IsConsensusSplice(
01341 m_segments[j-1].GetDonor(), s.GetAcceptor()));
01342 if(!consensus || len_chars <= 6) {
01343 min_a_content = 0.599;
01344 }
01345 if(len_chars < 3) {
01346 min_a_content = 0.49;
01347 }
01348 }
01349
01350 if(!s.m_exon) {
01351 min_a_content = (s.m_len <= 4)? 0.49: 0.599;
01352 }
01353
01354 if(double(count)/len_chars < min_a_content) {
01355 if(s.m_exon && s.m_len > 4) break;
01356 }
01357 else {
01358 j0 = j - 1;
01359 }
01360 }
01361
01362 if(j0 >= 0 && j0 < int(seg_dim - 1)) {
01363 m_polya_start = m_segments[j0].m_box[1] + 1;
01364 }
01365
01366
01367 bool some_exons (false);
01368 for(int i = 0; i <= j0; ++i ) {
01369 if(m_segments[i].m_exon) {
01370 some_exons = true;
01371 break;
01372 }
01373 }
01374
01375 if(!some_exons) {
01376 NCBI_THROW(CAlgoAlignException, eNoAlignment,g_msg_NoExonsAboveIdtyLimit);
01377 }
01378
01379 m_segments.resize(j0 + 1);
01380
01381
01382 double mcount (0);
01383 ITERATE(TSegments, jj, m_segments) {
01384 if(jj->m_exon) {
01385 mcount += jj->m_idty * jj->m_len;
01386 }
01387 }
01388
01389 const THit::TCoord min_singleton_idty_final (
01390 min(size_t(m_MinSingletonIdty * qmax), m_MinSingletonIdtyBps));
01391
01392 if(mcount < min_singleton_idty_final) {
01393 NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);
01394 }
01395
01396
01397 NON_CONST_ITERATE(TSegments, jj, m_segments) {
01398
01399 if(rv.m_QueryStrand) {
01400 jj->m_box[0] += qmin;
01401 jj->m_box[1] += qmin;
01402 }
01403 else {
01404 jj->m_box[0] = mrna_size - jj->m_box[0] - 1;
01405 jj->m_box[1] = mrna_size - jj->m_box[1] - 1;
01406 }
01407
01408 if(jj->m_exon) {
01409 if(rv.m_SubjStrand) {
01410 jj->m_box[2] += smin;
01411 jj->m_box[3] += smin;
01412 }
01413 else {
01414 jj->m_box[2] = smax - jj->m_box[2];
01415 jj->m_box[3] = smax - jj->m_box[3];
01416 }
01417 }
01418 }
01419
01420 if(!rv.m_QueryStrand && m_polya_start > 0 && m_polya_start < mrna_size) {
01421 m_polya_start = mrna_size - m_polya_start - 1;
01422 }
01423 }
01424
01425 catch(CAlgoAlignException& e) {
01426
01427 const CException::TErrCode errcode (e.GetErrCode());
01428 bool severe (true);
01429 switch(errcode) {
01430 case CAlgoAlignException::eNoAlignment:
01431 case CAlgoAlignException::eMemoryLimit:
01432 case CAlgoAlignException::eNoHits:
01433 case CAlgoAlignException::eIntronTooLong:
01434
01435 severe = false;
01436 break;
01437 }
01438
01439 if(severe) {
01440 e.SetSeverity(eDiag_Fatal);
01441 }
01442 throw;
01443 }
01444
01445 return rv;
01446 }
01447
01448
01449 static const char s_kGap [] = "<GAP>";
01450
01451
01452 float CSplign::x_Run(const char* Seq1, const char* Seq2)
01453 {
01454 typedef deque<TSegment> TSegmentDeque;
01455 TSegmentDeque segments;
01456
01457
01458 #ifdef DBG_DUMP_PATTERN
01459 cerr << "Pattern:" << endl;
01460 #endif
01461
01462 const size_t map_dim (m_alnmap.size());
01463 if(map_dim != 1) {
01464 NCBI_THROW(CAlgoAlignException, eInternal, "Multiple maps not supported");
01465 }
01466
01467 float rv (0);
01468 size_t cds_start (0), cds_stop (0);
01469 const int kMaxShot (5);
01470 for(size_t i (0); i < map_dim; ++i) {
01471
01472 const SAlnMapElem& zone (m_alnmap[i]);
01473
01474
01475 const size_t len1 (zone.m_box[1] - zone.m_box[0] + 1);
01476 const size_t len2 (zone.m_box[3] - zone.m_box[2] + 1);
01477
01478
01479 if(m_strand) {
01480 cds_start = m_cds_start;
01481 cds_stop = m_cds_stop;
01482 }
01483 else {
01484 cds_start = len1 - m_cds_start - 1;
01485 cds_stop = len1 - m_cds_stop - 1;
01486 }
01487
01488 m_aligner->SetSequences(Seq1 + zone.m_box[0], len1,
01489 Seq2 + zone.m_box[2], len2,
01490 false);
01491
01492
01493 vector<size_t> pattern;
01494 if(m_pattern.size() > 0) {
01495
01496 if(zone.m_pattern_start < 0) {
01497 NCBI_THROW(CAlgoAlignException, eInternal,
01498 "CSplign::x_Run(): Invalid alignment pattern");
01499 }
01500
01501 copy(m_pattern.begin() + zone.m_pattern_start,
01502 m_pattern.begin() + zone.m_pattern_end + 1,
01503 back_inserter(pattern));
01504 }
01505
01506 for(size_t j (0), pt_dim (pattern.size()); j < pt_dim; j += 4) {
01507
01508 #ifdef DBG_DUMP_PATTERN
01509 cerr << (1 + pattern[j]) << '\t' << (1 + pattern[j+1]) << '\t'
01510 << "(len = " << (pattern[j+1] - pattern[j] + 1) << ")\t"
01511 << (1 + pattern[j+2]) << '\t' << (1 + pattern[j+3])
01512 << "(len = " << (pattern[j+3] - pattern[j+2] + 1) << ")\t"
01513 << endl;
01514 #undef DBG_DUMP_PATTERN
01515 #endif
01516
01517 pattern[j] -= zone.m_box[0];
01518 pattern[j+1] -= zone.m_box[0];
01519 pattern[j+2] -= zone.m_box[2];
01520 pattern[j+3] -= zone.m_box[2];
01521 }
01522
01523
01524 m_aligner->SetPattern(pattern);
01525 m_aligner->SetEndSpaceFree(true, true, true, true);
01526 m_aligner->SetCDS(cds_start, cds_stop);
01527
01528 rv += m_aligner->Run();
01529 m_aligner->CheckPreferences();
01530
01531
01532 #ifdef DBG_DUMP_TYPE2
01533 {{
01534 CNWFormatter fmt (*m_aligner);
01535 string txt;
01536 fmt.AsText(&txt, CNWFormatter::eFormatType2);
01537 cerr << txt;
01538 }}
01539 #undef DBG_DUMP_TYPE2
01540 #endif
01541
01542 CNWFormatter formatter (*m_aligner);
01543 formatter.MakeSegments(&segments);
01544
01545
01546 if(i + 1 < map_dim) {
01547 segments.push_back(TSegment());
01548 TSegment& g (segments.back());
01549 g.m_exon = false;
01550 g.m_box[0] = zone.m_box[1] + 1;
01551 g.m_box[1] = m_alnmap[i+1].m_box[0] - 1;
01552 g.m_box[2] = zone.m_box[3] + 1;
01553 g.m_box[3] = m_alnmap[i+1].m_box[2] - 1;
01554 g.m_idty = 0;
01555 g.m_len = g.m_box[1] - g.m_box[0] + 1;
01556 g.m_annot = s_kGap;
01557 g.m_details.resize(0);
01558 g.m_score = 0;
01559 }
01560 }
01561
01562
01563
01564 #ifdef DUMP_ORIG_SEGS
01565 cerr << "Orig segments:" << endl;
01566 ITERATE(TSegmentDeque, ii, segments) {
01567 cerr << ii->m_exon << '\t' << ii->m_idty << '\t' << ii->m_len << '\t'
01568 << ii->m_box[0] << '\t' << ii->m_box[1] << '\t'
01569 << ii->m_box[2] << '\t' << ii->m_box[3] << '\t'
01570 << ii->m_annot << '\t' << ii->m_score << endl;
01571 }
01572 #endif
01573
01574 if(segments.size() == 0) {
01575 NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);
01576 }
01577
01578
01579
01580 const size_t SeqLen2 (m_genomic.size());
01581 const size_t SeqLen1 (m_polya_start == kMax_UInt?
01582 m_mrna.size():
01583 m_polya_start);
01584
01585
01586 if(m_BoundingRange.second > 0) {
01587
01588 NON_CONST_ITERATE(TSegmentDeque, ii, segments) {
01589 if(ii->m_exon &&
01590 (ii->m_box[3] < m_BoundingRange.first
01591 || ii->m_box[2] > m_BoundingRange.second))
01592 {
01593 ii->m_exon = false;
01594 ii->m_idty = 0;
01595 ii->m_details.resize(0);
01596 ii->m_annot = s_kGap;
01597 ii->m_score = 0;
01598 }
01599 }
01600 }
01601
01602
01603 if(cds_start < cds_stop) {
01604 {{
01605 TSegmentDeque::iterator ib (segments.begin()),
01606 ie (segments.end()), ii (ib);
01607 while(ii != ie && !ii->m_exon) ++ii;
01608 if(ii != ie) {
01609 const int start_undershot (ii->m_box[0] - cds_start);
01610 if(0 < start_undershot && start_undershot < kMaxShot) {
01611 const int extent (min(start_undershot, int(ii->m_box[2])));
01612 if(extent > 0) {
01613 ii->ExtendLeft(extent, Seq1, Seq2, m_aligner);
01614 }
01615 }
01616 }
01617 }}
01618
01619 {{
01620 TSegmentDeque::reverse_iterator irb (segments.rbegin()),
01621 ire (segments.rend()), iir (irb);
01622 while(iir != ire && !iir->m_exon) ++iir;
01623 if(iir != ire) {
01624 const int stop_undershot (cds_stop - iir->m_box[1]);
01625 if(0 < stop_undershot && stop_undershot < kMaxShot) {
01626 const int extent (min(stop_undershot,
01627 int(SeqLen2 - iir->m_box[3] - 1)));
01628 if(extent > 0) {
01629 iir->ExtendRight(extent, Seq1, Seq2, m_aligner);
01630 }
01631 }
01632 }
01633 }}
01634 }
01635
01636 m_segments.resize(0);
01637 while(true) {
01638
01639 if(m_segments.size() > 0) {
01640 segments.resize(m_segments.size());
01641 copy(m_segments.begin(), m_segments.end(), segments.begin());
01642 m_segments.resize(0);
01643 }
01644
01645 size_t seg_dim (segments.size());
01646 if(seg_dim == 0) {
01647 return 0;
01648 }
01649
01650 size_t exon_count0 (0);
01651 ITERATE(TSegmentDeque, ii, segments) {
01652 if(ii->m_exon) ++exon_count0;
01653 }
01654
01655
01656 size_t k0 (0);
01657 while(k0 < seg_dim) {
01658
01659 TSegment& s = segments[k0];
01660 if(s.m_exon) {
01661
01662 const size_t len (1 + s.m_box[1] - s.m_box[0]);
01663 const double min_idty (len >= kMinTermExonSize?
01664 m_MinExonIdty:
01665 max(m_MinExonIdty, kMinTermExonIdty));
01666
01667 const bool b1 (s.m_idty < min_idty || m_endgaps);
01668 const bool b2 (cds_start == cds_stop
01669 || cds_start > s.m_box[0] + kMaxShot);
01670 if(b1 && b2) {
01671 s.ImproveFromLeft(Seq1, Seq2, m_aligner);
01672 }
01673
01674 if(s.m_idty >= min_idty) {
01675 break;
01676 }
01677 }
01678 ++k0;
01679 }
01680
01681 int k1 (seg_dim - 1);
01682 while(k1 >= int(k0)) {
01683
01684 TSegment& s (segments[k1]);
01685 if(s.m_exon) {
01686
01687 const size_t len (1 + s.m_box[1] - s.m_box[0]);
01688 const double min_idty (len >= kMinTermExonSize?
01689 m_MinExonIdty:
01690 max(m_MinExonIdty, kMinTermExonIdty));
01691
01692 const bool b1 (s.m_idty < min_idty || m_endgaps);
01693 const bool b2 (cds_start == cds_stop
01694 || cds_stop + kMaxShot < s.m_box[1]);
01695
01696 if(b1 && b2) {
01697 s.ImproveFromRight(Seq1, Seq2, m_aligner);
01698 }
01699
01700 if(s.m_idty >= min_idty) {
01701 break;
01702 }
01703 }
01704 --k1;
01705 }
01706
01707
01708 for(size_t k (0); k < seg_dim; ++k) {
01709
01710 TSegment& s (segments[k]);
01711 if(s.m_exon == false) continue;
01712
01713 bool drop (false);
01714 if(s.m_idty < m_MinExonIdty) {
01715 drop = true;
01716 }
01717 else if (s.m_idty < .9 && s.m_len < 20) {
01718
01719
01720 bool nc_prev (false), nc_next (false);
01721 if(k > 0 && segments[k-1].m_exon) {
01722 nc_prev = ! TSegment::s_IsConsensusSplice(
01723 segments[k-1].GetDonor(),
01724 s.GetAcceptor());
01725 }
01726
01727 if(k + 1 < seg_dim && segments[k+1].m_exon) {
01728 nc_next = ! TSegment::s_IsConsensusSplice(
01729 s.GetDonor(),
01730 segments[k+1].GetAcceptor());
01731 }
01732
01733 drop = nc_prev || nc_next;
01734 }
01735
01736 if(drop) {
01737 s.m_exon = false;
01738 s.m_idty = 0;
01739 s.m_len = s.m_box[1] - s.m_box[0] + 1;
01740 s.m_annot = s_kGap;
01741 s.m_details.resize(0);
01742 s.m_score = 0;
01743 }
01744 }
01745
01746
01747 {{
01748
01749 size_t exon_count (0);
01750 TSegment* term_segs[] = {0, 0};
01751 for(size_t i = 0; i < seg_dim; ++i) {
01752 TSegment& s = segments[i];
01753 if(s.m_exon) {
01754 term_segs[exon_count] = &s;
01755 if(++exon_count == 2) {
01756 break;
01757 }
01758 }
01759 }
01760
01761 if(exon_count == 2) {
01762 x_ProcessTermSegm(term_segs, 0);
01763 }
01764 }}
01765
01766 {{
01767
01768 size_t exon_count (0);
01769 TSegment* term_segs[] = {0, 0};
01770 for(int i = seg_dim - 1; i >= 0; --i) {
01771 TSegment& s = segments[i];
01772 if(s.m_exon) {
01773 term_segs[exon_count] = &s;
01774 if(++exon_count == 2) {
01775 break;
01776 }
01777 }
01778 }
01779
01780 if(exon_count == 2) {
01781 x_ProcessTermSegm(term_segs, 1);
01782 }
01783 }}
01784
01785
01786 bool gap_prev (false);
01787 for(size_t k (0); k < seg_dim; ++k) {
01788
01789 TSegment& s (segments[k]);
01790 if(s.m_exon == false) {
01791 gap_prev = true;
01792 }
01793 else {
01794 size_t length (s.m_box[1] - s.m_box[0] + 1);
01795 bool gap_next (false);
01796 if(k + 1 < seg_dim) {
01797 gap_next = !segments[k+1].m_exon;
01798 }
01799 if(length <= 10 && (gap_prev || gap_next)) {
01800 s.m_exon = false;
01801 s.m_idty = 0;
01802 s.m_len = s.m_box[1] - s.m_box[0] + 1;
01803 s.m_annot = s_kGap;
01804 s.m_details.resize(0);
01805 s.m_score = 0;
01806 }
01807 gap_prev = false;
01808 }
01809 }
01810
01811
01812 if(segments[0].m_box[0] > 0) {
01813
01814 TSegment g;
01815 g.m_exon = false;
01816 g.m_box[0] = 0;
01817 g.m_box[1] = segments[0].m_box[0] - 1;
01818 g.m_box[2] = 0;
01819 g.m_box[3] = segments[0].m_box[2] - 1;
01820
01821 g.m_idty = 0;
01822 g.m_len = segments[0].m_box[0];
01823 g.m_annot = s_kGap;
01824 g.m_details.resize(0);
01825 g.m_score = 0;
01826
01827 segments.push_front(g);
01828 ++seg_dim;
01829 }
01830
01831
01832 TSegment& seg_last (segments.back());
01833
01834 if(seg_last.m_box[1] + 1 < SeqLen1) {
01835
01836 TSegment g;
01837 g.m_exon = false;
01838 g.m_box[0] = seg_last.m_box[1] + 1;
01839 g.m_box[1] = SeqLen1 - 1;
01840 g.m_box[2] = seg_last.m_box[3] + 1;
01841 g.m_box[3] = SeqLen2 - 1;
01842 g.m_idty = 0;
01843 g.m_len = g.m_box[1] - g.m_box[0] + 1;
01844 g.m_annot = s_kGap;
01845 g.m_details.resize(0);
01846 g.m_score = 0;
01847
01848 segments.push_back(g);
01849 ++seg_dim;
01850 }
01851
01852
01853 int gap_start_idx (-1);
01854 if(seg_dim && segments[0].m_exon == false) {
01855 gap_start_idx = 0;
01856 }
01857
01858 for(size_t k (0); k < seg_dim; ++k) {
01859 TSegment& s (segments[k]);
01860 if(!s.m_exon) {
01861 if(gap_start_idx == -1) {
01862 gap_start_idx = int(k);
01863 if(k > 0) {
01864 s.m_box[0] = segments[k-1].m_box[1] + 1;
01865 s.m_box[2] = segments[k-1].m_box[3] + 1;
01866 }
01867 }
01868 }
01869 else {
01870 if(gap_start_idx >= 0) {
01871 TSegment& g = segments[gap_start_idx];
01872 g.m_box[1] = s.m_box[0] - 1;
01873 g.m_box[3] = s.m_box[2] - 1;
01874 g.m_len = g.m_box[1] - g.m_box[0] + 1;
01875 g.m_details.resize(0);
01876 m_segments.push_back(g);
01877 gap_start_idx = -1;
01878 }
01879 m_segments.push_back(s);
01880 }
01881 }
01882
01883 if(gap_start_idx >= 0) {
01884 TSegment& g (segments[gap_start_idx]);
01885 g.m_box[1] = segments[seg_dim-1].m_box[1];
01886 g.m_box[3] = segments[seg_dim-1].m_box[3];
01887 g.m_len = g.m_box[1] - g.m_box[0] + 1;
01888 g.m_details.resize(0);
01889 m_segments.push_back(g);
01890 }
01891
01892 size_t exon_count1 (0);
01893 ITERATE(TSegments, ii, m_segments) {
01894 if(ii->m_exon) ++exon_count1;
01895 }
01896
01897 if(exon_count0 == exon_count1) break;
01898 }
01899
01900
01901 #ifdef DUMP_PROCESSED_SEGS
01902 cerr << "Processed segments:" << endl;
01903 ITERATE(TSegments, ii, m_segments) {
01904 cerr << ii->m_box[0] << '\t' << ii->m_box[1] << '\t'
01905 << ii->m_box[2] << '\t' << ii->m_box[3] << '\t'
01906 << ii->m_annot << '\t' << ii->m_score << endl;
01907 }
01908 #endif
01909
01910 rv /= m_aligner->GetWm();
01911 return rv;
01912 }
01913
01914
01915 double CSplign::SAlignedCompartment::GetIdentity() const
01916 {
01917 string trans;
01918 for(size_t i (0), dim (m_Segments.size()); i < dim; ++i) {
01919 const TSegment & s (m_Segments[i]);
01920 if(s.m_exon) {
01921 trans.append(s.m_details);
01922 }
01923 else {
01924 trans.append(s.m_len, 'D');
01925 }
01926 }
01927 size_t matches = 0;
01928 ITERATE(string, ii, trans) {
01929 if(*ii == 'M') {
01930 ++matches;
01931 }
01932 }
01933 return double(matches) / trans.size();
01934 }
01935
01936
01937
01938 void CSplign::SAlignedCompartment::GetBox(Uint4* box) const
01939 {
01940 box[0] = box[2] = kMax_UInt;
01941 box[1] = box[3] = 0;
01942 ITERATE(TSegments, ii, m_Segments) {
01943 const TSegment& s (*ii);
01944 if(s.m_exon) {
01945
01946 Uint4 a, b;
01947 if(s.m_box[0] <= s.m_box[1]) {
01948 a = s.m_box[0];
01949 b = s.m_box[1];
01950 }
01951 else {
01952 b = s.m_box[0];
01953 a = s.m_box[1];
01954 }
01955 if(a < box[0]) {
01956 box[0] = a;
01957 }
01958 if(b > box[1]) {
01959 box[1] = b;
01960 }
01961
01962 if(s.m_box[2] <= s.m_box[3]) {
01963 a = s.m_box[2];
01964 b = s.m_box[3];
01965 }
01966 else {
01967 b = s.m_box[2];
01968 a = s.m_box[3];
01969 }
01970 if(a < box[2]) {
01971 box[2] = a;
01972 }
01973 if(b > box[3]) {
01974 box[3] = b;
01975 }
01976 }
01977 }
01978 }
01979
01980
01981 bool CSplign::x_ProcessTermSegm(TSegment** term_segs, Uint1 side) const
01982 {
01983 bool turn2gap (false);
01984
01985 const size_t exon_size (1 + term_segs[0]->m_box[1] -
01986 term_segs[0]->m_box[0]);
01987
01988 if(exon_size < kMinTermExonSize) {
01989
01990 const double idty (term_segs[0]->m_idty);
01991 if(idty < kMinTermExonIdty) {
01992 turn2gap = true;
01993 }
01994 else {
01995
01996
01997
01998 size_t a, b;
01999 const char *dnr, *acc;
02000 if(side == 0) {
02001 a = term_segs[0]->m_box[3];
02002 b = term_segs[1]->m_box[2];
02003 dnr = term_segs[0]->GetDonor();
02004 acc = term_segs[1]->GetAcceptor();
02005 }
02006 else {
02007 a = term_segs[1]->m_box[3];
02008 b = term_segs[0]->m_box[2];
02009 dnr = term_segs[1]->GetDonor();
02010 acc = term_segs[0]->GetAcceptor();
02011 }
02012
02013 const size_t intron_len (b - a);
02014
02015 const bool consensus (TSegment::s_IsConsensusSplice(dnr, acc));
02016
02017 size_t max_ext ((idty < .96 || !consensus || exon_size < 16)?
02018 m_max_genomic_ext: (5000 * kMinTermExonSize));
02019
02020 if(consensus) {
02021 if(exon_size < 8) {
02022 max_ext = 10 * exon_size;
02023 }
02024 }
02025 else if(exon_size < 16) {
02026 max_ext = 1;
02027 }
02028
02029 const size_t max_intron_len (x_GetGenomicExtent(exon_size, max_ext));
02030 if(intron_len > max_intron_len) {
02031 turn2gap = true;
02032 }
02033 }
02034
02035 if(turn2gap) {
02036
02037
02038 TSegment& s = *(term_segs[0]);
02039 s.m_exon = false;
02040 s.m_idty = 0;
02041 s.m_len = exon_size;
02042 s.m_annot = s_kGap;
02043 s.m_details.resize(0);
02044 s.m_score = 0;
02045 }
02046 }
02047
02048 return turn2gap;
02049 }
02050
02051
02052 Uint4 CSplign::x_GetGenomicExtent(const Uint4 query_len, Uint4 max_ext) const
02053 {
02054 if(max_ext == 0) {
02055 max_ext = m_max_genomic_ext;
02056 }
02057
02058 Uint4 rv (0);
02059 if(query_len >= kNonCoveredEndThreshold) {
02060 rv = m_max_genomic_ext;
02061 }
02062 else {
02063 const double k (pow(kNonCoveredEndThreshold, - 1. / kPower) * max_ext);
02064 const double drv (k * pow(query_len, 1. / kPower));
02065 rv = Uint4(drv);
02066 }
02067
02068 return rv;
02069 }
02070
02071
02072
02073
02074 namespace splign_local {
02075
02076 template<typename T>
02077 void ElemToBuffer(const T& n, char*& p)
02078 {
02079 *(reinterpret_cast<T*>(p)) = n;
02080 p += sizeof(n);
02081 }
02082
02083 template<>
02084 void ElemToBuffer(const string& s, char*& p)
02085 {
02086 copy(s.begin(), s.end(), p);
02087 p += s.size();
02088 *p++ = 0;
02089 }
02090
02091 template<typename T>
02092 void ElemFromBuffer(T& n, const char*& p)
02093 {
02094 n = *(reinterpret_cast<const T*>(p));
02095 p += sizeof(n);
02096 }
02097
02098 template<>
02099 void ElemFromBuffer(string& s, const char*& p)
02100 {
02101 s = p;
02102 p += s.size() + 1;
02103 }
02104 }
02105
02106
02107 void CNWFormatter::SSegment::ToBuffer(TNetCacheBuffer* target) const
02108 {
02109 using namespace splign_local;
02110
02111 if(target == 0) {
02112 NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_NullPointerPassed );
02113 }
02114
02115 const size_t total_size = sizeof m_exon + sizeof m_idty +
02116 sizeof m_len + sizeof m_box + m_annot.size() + 1 +
02117 m_details.size() + 1 + sizeof m_score;
02118
02119 target->resize(total_size);
02120
02121 char* p = &target->front();
02122 ElemToBuffer(m_exon, p);
02123 ElemToBuffer(m_idty, p);
02124 ElemToBuffer(m_len, p);
02125 for(size_t i = 0; i < 4; ++i) {
02126 ElemToBuffer(m_box[i], p);
02127 }
02128 ElemToBuffer(m_annot, p);
02129 ElemToBuffer(m_details, p);
02130 ElemToBuffer(m_score, p);
02131 }
02132
02133
02134 void CNWFormatter::SSegment::FromBuffer(const TNetCacheBuffer& source)
02135 {
02136 using namespace splign_local;
02137
02138 const size_t min_size = sizeof m_exon + sizeof m_idty + sizeof m_len +
02139 + sizeof m_box + 1 + 1 + sizeof m_score;
02140
02141 if(source.size() < min_size) {
02142 NCBI_THROW(CAlgoAlignException, eInternal, g_msg_NetCacheBufferIncomplete);
02143 }
02144
02145 const char* p = &source.front();
02146 ElemFromBuffer(m_exon, p);
02147 ElemFromBuffer(m_idty, p);
02148 ElemFromBuffer(m_len, p);
02149
02150 for(size_t i = 0; i < 4; ++i) {
02151 ElemFromBuffer(m_box[i], p);
02152 }
02153
02154 ElemFromBuffer(m_annot, p);
02155 ElemFromBuffer(m_details, p);
02156 ElemFromBuffer(m_score, p);
02157 }
02158
02159
02160 void CSplign::SAlignedCompartment::ToBuffer(TNetCacheBuffer* target) const
02161 {
02162 using namespace splign_local;
02163
02164 if(target == 0) {
02165 NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_NullPointerPassed);
02166 }
02167
02168 const size_t core_size (
02169 sizeof m_Id + sizeof m_Status + m_Msg.size() + 1
02170 + sizeof m_QueryStrand + sizeof m_SubjStrand
02171 + sizeof m_Cds_start + sizeof m_Cds_stop
02172 + sizeof m_QueryLen
02173 + sizeof m_PolyA
02174 + sizeof m_Score);
02175
02176 vector<char> core (core_size);
02177
02178 char* p = &core.front();
02179 ElemToBuffer(m_Id, p);
02180 ElemToBuffer(m_Status, p);
02181 ElemToBuffer(m_Msg, p);
02182 ElemToBuffer(m_QueryStrand, p);
02183 ElemToBuffer(m_SubjStrand, p);
02184 ElemToBuffer(m_Cds_start, p);
02185 ElemToBuffer(m_Cds_stop, p);
02186 ElemToBuffer(m_QueryLen, p);
02187 ElemToBuffer(m_PolyA, p);
02188 ElemToBuffer(m_Score, p);
02189
02190 typedef vector<TNetCacheBuffer> TBuffers;
02191 TBuffers vb (m_Segments.size());
02192 size_t ibuf (0);
02193 ITERATE(TSegments, ii, m_Segments) {
02194 ii->ToBuffer(&vb[ibuf++]);
02195 }
02196
02197 size_t total_size (core_size + sizeof(size_t) * m_Segments.size());
02198 ITERATE(TBuffers, ii, vb) {
02199 total_size += ii->size();
02200 }
02201
02202 target->resize(total_size);
02203 TNetCacheBuffer::iterator it = target->begin();
02204 copy(core.begin(), core.end(), it);
02205 it += core_size;
02206
02207 ITERATE(TBuffers, ii, vb) {
02208 char* p = &(*it);
02209 const size_t seg_buf_size = ii->size();
02210 *((size_t*)p) = seg_buf_size;
02211 it += sizeof (size_t);
02212 copy(ii->begin(), ii->end(), it);
02213 it += seg_buf_size;
02214 }
02215 }
02216
02217
02218 void CSplign::SAlignedCompartment::FromBuffer(const TNetCacheBuffer& source)
02219 {
02220 using namespace splign_local;
02221
02222 const size_t min_size (
02223 sizeof m_Id
02224 + sizeof m_Status
02225 + 1
02226 + sizeof m_QueryStrand + sizeof m_SubjStrand
02227 + sizeof m_Cds_start + sizeof m_Cds_stop
02228 + sizeof m_QueryLen
02229 + sizeof m_PolyA
02230 + sizeof m_Score );
02231
02232 if(source.size() < min_size) {
02233 NCBI_THROW(CAlgoAlignException, eInternal, g_msg_NetCacheBufferIncomplete);
02234 }
02235
02236 const char* p (&source.front());
02237 ElemFromBuffer(m_Id, p);
02238 ElemFromBuffer(m_Status, p);
02239 ElemFromBuffer(m_Msg, p);
02240 ElemFromBuffer(m_QueryStrand, p);
02241 ElemFromBuffer(m_SubjStrand, p);
02242 ElemFromBuffer(m_Cds_start, p);
02243 ElemFromBuffer(m_Cds_stop, p);
02244 ElemFromBuffer(m_QueryLen, p);
02245 ElemFromBuffer(m_PolyA, p);
02246 ElemFromBuffer(m_Score, p);
02247
02248 const char* pe (&source.back());
02249 while(p <= pe) {
02250 size_t seg_buf_size (0);
02251 ElemFromBuffer(seg_buf_size, p);
02252 m_Segments.push_back(TSegment());
02253 TSegment& seg (m_Segments.back());
02254 seg.FromBuffer(TNetCacheBuffer(p, p + seg_buf_size));
02255 p += seg_buf_size;
02256 }
02257 }
02258
02259
02260 bool IsConsDonor(const objects::CSpliced_exon& exon) {
02261 USING_SCOPE(objects);
02262 const CSpliced_exon::TDonor_after_exon & splice (exon.GetDonor_after_exon());
02263 const string bases (splice.GetBases());
02264 return (bases.size() >= 2 && bases[0] == 'G' && bases[1] == 'T');
02265 }
02266
02267
02268 bool IsConsAcceptor(const objects::CSpliced_exon& exon) {
02269
02270 USING_SCOPE(objects);
02271 const CSpliced_exon::TAcceptor_before_exon &
02272 splice (exon.GetAcceptor_before_exon());
02273 const string bases (splice.GetBases());
02274 const size_t dim (bases.size());
02275 return (dim >= 2 && bases[dim - 2] == 'A' && bases[dim - 1] == 'G');
02276 }
02277
02278
02279 size_t CSplign::s_ComputeStats(CRef<objects::CSeq_align_set> sas,
02280 TScoreSets * output_stats,
02281 TOrf cds,
02282 EStatFlags flags)
02283 {
02284 USING_SCOPE(objects);
02285
02286 const
02287 bool valid_input (sas.GetPointer() && sas->CanGet() && sas->Get().size()
02288 && sas->Get().front()->CanGetSegs()
02289 && sas->Get().front()->GetSegs().IsSpliced()
02290 && sas->Get().front()->GetSegs().GetSpliced().GetProduct_type()
02291 == CSpliced_seg::eProduct_type_transcript
02292 && output_stats);
02293
02294 if(!valid_input) {
02295 NCBI_THROW(CAlgoAlignException, eBadParameter,
02296 "CSplign::s_ComputeStats(): Invalid input");
02297 }
02298
02299 output_stats->resize(0);
02300
02301 ITERATE(CSeq_align_set::Tdata, ii1, sas->Get()) {
02302 CRef<CScore_set> ss (s_ComputeStats(*ii1, false, cds, flags));
02303 output_stats->push_back(ss);
02304 }
02305
02306 return output_stats->size();
02307 }
02308
02309
02310 namespace {
02311 const int kFrame_not_set (-10);
02312 const int kFrame_end (-5);
02313 const int kFrame_lost (-20);
02314 }
02315
02316
02317 CRef<objects::CScore_set> CSplign::s_ComputeStats(CRef<objects::CSeq_align> sa,
02318 bool embed_scoreset,
02319 TOrf cds,
02320 EStatFlags flags)
02321 {
02322 USING_SCOPE(objects);
02323
02324
02325 if(!(flags & (eSF_BasicNonCds | eSF_BasicCds))) {
02326 NCBI_THROW(CException, eUnknown,
02327 "CSplign::s_ComputeStats(): mode not yet supported.");
02328 }
02329
02330 typedef CSeq_align::TSegs::TSpliced TSpliced;
02331 const TSpliced & spliced (sa->GetSegs().GetSpliced());
02332 if(spliced.GetProduct_type() != CSpliced_seg::eProduct_type_transcript) {
02333 NCBI_THROW(CAlgoAlignException, eBadParameter,
02334 "CSplign::s_ComputeStats(): Unsupported product type");
02335 }
02336
02337 const bool cds_stats ((flags & eSF_BasicCds) && (cds.first + cds.second > 0));
02338 const bool qstrand (spliced.GetProduct_strand() != eNa_strand_minus);
02339 if(cds_stats) {
02340 const bool cds_strand (cds.first < cds.second);
02341 if(qstrand ^ cds_strand) {
02342 NCBI_THROW(CAlgoAlignException, eBadParameter,
02343 "CSplign::s_ComputeStats(): Transcript orientation not "
02344 "matching specified CDS orientation.");
02345 }
02346 }
02347
02348 typedef TSpliced::TExons TExons;
02349 const TExons & exons (spliced.GetExons());
02350
02351 size_t matches (0),
02352 aligned_query_bases (0),
02353 aln_length_exons (0),
02354 aln_length_gaps (0),
02355 splices_total (0),
02356 splices_consensus (0);
02357
02358 const TSeqPos qlen (spliced.GetProduct_length());
02359 const TSeqPos polya (spliced.CanGetPoly_a()?
02360 spliced.GetPoly_a(): (qstrand? qlen: TSeqPos(-1)));
02361 const TSeqPos prod_length_no_polya (qstrand? polya: qlen - 1 - polya);
02362
02363 typedef CSpliced_exon TExon;
02364 TSeqPos qprev (qstrand? TSeqPos(-1): qlen);
02365 bool cons_dnr (false);
02366 string xcript;
02367 ITERATE(TExons, ii2, exons) {
02368
02369 const TExon & exon (**ii2);
02370 const TSeqPos qmin (exon.GetProduct_start().GetNucpos()),
02371 qmax (exon.GetProduct_end().GetNucpos());
02372
02373 const TSeqPos qgap (qstrand? qmin - qprev - 1: qprev - qmax - 1);
02374
02375 if(qgap > 0) {
02376 aln_length_gaps += qgap;
02377 cons_dnr = false;
02378 if(cds_stats) xcript.append(qgap, 'X');
02379 }
02380 else if (ii2 != exons.begin()) {
02381 splices_total += 2;
02382 if(cons_dnr && IsConsAcceptor(exon)) { splices_consensus += 2; }
02383 }
02384
02385 typedef TExon::TParts TParts;
02386 const TParts & parts (exon.GetParts());
02387 string errmsg;
02388 ITERATE(TParts, ii3, parts) {
02389 const CSpliced_exon_chunk & part (**ii3);
02390 const CSpliced_exon_chunk::E_Choice choice (part.Which());
02391 TSeqPos len (0);
02392 switch(choice) {
02393 case CSpliced_exon_chunk::e_Match:
02394 len = part.GetMatch();
02395 matches += len;
02396 aligned_query_bases += len;
02397 if(cds_stats) xcript.append(len, 'M');
02398 break;
02399 case CSpliced_exon_chunk::e_Mismatch:
02400 len = part.GetMismatch();
02401 aligned_query_bases += len;
02402 if(cds_stats) xcript.append(len, 'R');
02403 break;
02404 case CSpliced_exon_chunk::e_Product_ins:
02405 len = part.GetProduct_ins();
02406 aligned_query_bases += len;
02407 if(cds_stats) xcript.append(len, 'D');
02408 break;
02409 case CSpliced_exon_chunk::e_Genomic_ins:
02410 len = part.GetGenomic_ins();
02411 if(cds_stats) xcript.append(len, 'I');
02412 break;
02413 default:
02414 errmsg = "Unexpected spliced exon chunk part: "
02415 + part.SelectionName(choice);
02416 NCBI_THROW(CAlgoAlignException, eBadParameter, errmsg);
02417 }
02418 aln_length_exons += len;
02419 }
02420
02421 cons_dnr = IsConsDonor(exon);
02422 qprev = qstrand? qmax: qmin;
02423 }
02424
02425 const TSeqPos qgap (qstrand? polya - qprev - 1: qprev - polya - 1);
02426 aln_length_gaps += qgap;
02427 if(cds_stats) xcript.append(qgap, 'X');
02428
02429
02430 CRef<CScore_set> ss (new CScore_set);
02431 CScore_set::Tdata & scores (ss->Set());
02432
02433 if(flags & eSF_BasicNonCds) {
02434 {
02435 CRef<CScore> score_matches (new CScore());
02436 score_matches->SetId().SetId(eCS_Matches);
02437 score_matches->SetValue().SetInt(matches);
02438 scores.push_back(score_matches);
02439 }
02440 {
02441 CRef<CScore> score_overall_identity (new CScore());
02442 score_overall_identity->SetId().SetId(eCS_OverallIdentity);
02443 score_overall_identity->SetValue().
02444 SetReal(double(matches)/(aln_length_exons + aln_length_gaps));
02445 scores.push_back(score_overall_identity);
02446 }
02447 {
02448 CRef<CScore> score_splices (new CScore());
02449 score_splices->SetId().SetId(eCS_Splices);
02450 score_splices->SetValue().SetInt(splices_total);
02451 scores.push_back(score_splices);
02452 }
02453 {
02454 CRef<CScore> score_splices_consensus (new CScore());
02455 score_splices_consensus->SetId().SetId(eCS_ConsensusSplices);
02456 score_splices_consensus->SetValue().SetInt(splices_consensus);
02457 scores.push_back(score_splices_consensus);
02458 }
02459 {
02460 CRef<CScore> score_coverage (new CScore());
02461 score_coverage->SetId().SetId(eCS_ProductCoverage);
02462 score_coverage->SetValue().
02463 SetReal(double(aligned_query_bases) / prod_length_no_polya);
02464 scores.push_back(score_coverage);
02465 }
02466 {
02467 CRef<CScore> score_exon_identity (new CScore());
02468 score_exon_identity->SetId().SetId(eCS_ExonIdentity);
02469 score_exon_identity->SetValue().
02470 SetReal(double(matches) / aln_length_exons);
02471 scores.push_back(score_exon_identity);
02472 }
02473 }
02474
02475 if(cds_stats) {
02476
02477 if(!qstrand && qlen <= 0) {
02478 NCBI_THROW(CAlgoAlignException, eBadParameter,
02479 "CSplign::s_ComputeStats(): Cannot compute "
02480 "inframe stats - transcript length not set.");
02481 }
02482
02483 int qpos (qstrand? -1: int(qlen));
02484 int qinc (qstrand? +1: -1);
02485 int frame (kFrame_not_set);
02486 size_t aln_length_cds (0);
02487 size_t matches_frame[] = {0, 0, 0, 0, 0};
02488 const int cds_start (cds.first), cds_stop (cds.second);
02489 for(string::const_iterator ie (xcript.end()), ii(xcript.begin());
02490 ii != ie && frame != kFrame_end; ++ii)
02491 {
02492
02493 switch(*ii) {
02494
02495 case 'M':
02496 qpos += qinc;
02497 if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
02498 if(qpos == cds_stop) frame = kFrame_end;
02499 if(frame >= -2) {
02500 ++aln_length_cds;
02501 ++matches_frame[frame + 2];
02502 }
02503 break;
02504
02505 case 'R':
02506 qpos += qinc;
02507 if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
02508 if(qpos == cds_stop) frame = kFrame_end;
02509 if(frame >= -2) ++aln_length_cds;
02510 break;
02511
02512 case 'D':
02513 qpos += qinc;
02514 if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
02515 if(qpos == cds_stop) frame = kFrame_end;
02516 if(frame >= -2) {
02517 ++aln_length_cds;
02518 frame = (frame + 1) % 3;
02519 }
02520 break;
02521
02522 case 'I':
02523 if(frame >= -2) {
02524 ++aln_length_cds;
02525 frame = (frame - 1) % 3;
02526 }
02527 break;
02528
02529 case 'X':
02530 qpos += qinc;
02531 if( qstrand && cds_start <= qpos && qpos < cds_stop ||
02532 !qstrand && cds_start >= qpos && qpos > cds_stop )
02533 {
02534 frame = kFrame_lost;
02535 ++aln_length_cds;
02536 }
02537 break;
02538 }
02539 }
02540
02541 {
02542 CRef<CScore> score_matches_inframe (new CScore());
02543 score_matches_inframe->SetId().SetId(eCS_InframeMatches);
02544 score_matches_inframe->SetValue().SetInt(matches_frame[2]);
02545 scores.push_back(score_matches_inframe);
02546 }
02547
02548 {
02549 CRef<CScore> score_inframe_identity (new CScore());
02550 score_inframe_identity->SetId().SetId(eCS_InframeIdentity);
02551 score_inframe_identity->SetValue().
02552 SetReal(double(matches_frame[2]) / aln_length_cds);
02553 scores.push_back(score_inframe_identity);
02554 }
02555 }
02556
02557
02558 if(embed_scoreset) {
02559 CSeq_align::TScore & sa_score (sa->SetScore());
02560 sa_score.resize(scores.size());
02561 copy(scores.begin(), scores.end(), sa_score.begin());
02562 }
02563
02564 return ss;
02565 }
02566
02567
02568
02569 END_NCBI_SCOPE
02570
02571