00001 #ifndef ALGO_ALIGN_UTIL_COMPARTMENT_FINDER__HPP
00002 #define ALGO_ALIGN_UTIL_COMPARTMENT_FINDER__HPP
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044 #include <corelib/ncbi_limits.hpp>
00045
00046 #include <algo/align/nw/align_exception.hpp>
00047 #include <algo/align/util/hit_filter.hpp>
00048
00049 #include <objects/seqalign/Seq_align_set.hpp>
00050 #include <objects/seqalign/Seq_align.hpp>
00051 #include <objects/seqalign/Dense_seg.hpp>
00052 #include <objects/seqloc/Seq_loc.hpp>
00053 #include <objects/seqloc/Na_strand.hpp>
00054
00055 #include <algorithm>
00056 #include <numeric>
00057 #include <vector>
00058
00059
00060 BEGIN_NCBI_SCOPE
00061
00062
00063 template<class THit>
00064 class CCompartmentFinder {
00065
00066 public:
00067
00068 typedef CRef<THit> THitRef;
00069 typedef vector<THitRef> THitRefs;
00070 typedef typename THit::TCoord TCoord;
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082 CCompartmentFinder(typename THitRefs::const_iterator start,
00083 typename THitRefs::const_iterator finish);
00084
00085
00086
00087
00088
00089
00090 void SetMaxIntron (TCoord intr_max) {
00091 m_intron_max = intr_max;
00092 }
00093
00094
00095
00096 static TCoord s_GetDefaultMaxIntron(void) {
00097
00098
00099 return 1200000;
00100 }
00101
00102
00103
00104
00105
00106
00107
00108 void SetPenalty(TCoord penalty) {
00109 m_penalty = penalty;
00110 }
00111
00112
00113
00114
00115
00116
00117 void SetMinMatches(TCoord min_matches) {
00118 m_MinSingletonMatches = m_MinMatches = min_matches;
00119 }
00120
00121
00122
00123
00124
00125
00126 void SetMinSingletonMatches(TCoord min_matches) {
00127 m_MinSingletonMatches = min_matches;
00128 }
00129
00130
00131
00132 static TCoord s_GetDefaultPenalty(void) {
00133 return 500;
00134 }
00135
00136
00137
00138 static TCoord s_GetDefaultMinCoverage(void) {
00139 return 500;
00140 }
00141
00142
00143
00144
00145
00146
00147
00148
00149 size_t Run(bool cross_filter = false);
00150
00151
00152
00153 void OrderCompartments(void);
00154
00155
00156
00157
00158 class CCompartment {
00159
00160 public:
00161
00162
00163
00164 CCompartment(void) {
00165 m_box[0] = m_box[2] = numeric_limits<TCoord>::max();
00166 m_box[1] = m_box[3] = 0;
00167 }
00168
00169
00170
00171 const THitRefs& GetMembers(void) {
00172 return m_members;
00173 }
00174
00175
00176
00177 void AddMember(const THitRef& hitref) {
00178 m_members.push_back(hitref);
00179 }
00180
00181
00182
00183 THitRefs& SetMembers(void) {
00184 return m_members;
00185 }
00186
00187
00188
00189 void UpdateMinMax(void);
00190
00191
00192
00193 bool GetStrand(void) const;
00194
00195
00196
00197
00198
00199
00200
00201 const THitRef GetFirst(void) const {
00202 m_iter = 0;
00203 return GetNext();
00204 }
00205
00206
00207
00208
00209
00210
00211
00212
00213 const THitRef GetNext(void) const {
00214 if(m_iter < m_members.size()) {
00215 return m_members[m_iter++];
00216 }
00217 return THitRef(NULL);
00218 }
00219
00220
00221
00222
00223
00224
00225 const TCoord* GetBox(void) const {
00226 return m_box;
00227 }
00228
00229
00230 static bool s_PLowerSubj(const CCompartment& c1, const CCompartment& c2) {
00231
00232 return c1.m_box[2] < c2.m_box[2];
00233 }
00234
00235 protected:
00236
00237 THitRefs m_members;
00238 TCoord m_box[4];
00239 mutable size_t m_iter;
00240 };
00241
00242
00243
00244 CCompartment* GetFirst(void);
00245 CCompartment* GetNext(void);
00246
00247 private:
00248
00249 TCoord m_intron_max;
00250 TCoord m_penalty;
00251 TCoord m_MinMatches;
00252 TCoord m_MinSingletonMatches;
00253
00254 THitRefs m_hitrefs;
00255 vector<CCompartment> m_compartments;
00256 int m_iter;
00257
00258 struct SHitStatus {
00259
00260 enum EType {
00261 eNone,
00262 eExtension,
00263 eOpening
00264 };
00265
00266 EType m_type;
00267 int m_prev;
00268 double m_score;
00269
00270 SHitStatus(void): m_type(eNone), m_prev(-1), m_score(0)
00271 {}
00272 SHitStatus(EType type, int prev, double score):
00273 m_type(type), m_prev(prev), m_score(score)
00274 {}
00275 };
00276
00277
00278 static size_t sx_XFilter(THitRefs& hitrefs,
00279 typename THitRefs::iterator ihr,
00280 typename THitRefs::iterator ihr_e,
00281 Uint1 w,
00282 size_t min_compartment_hit_len);
00283
00284
00285
00286 static bool s_PNullRef(const THitRef& hr) {
00287 return hr.IsNull();
00288 }
00289 };
00290
00291
00292
00293 template<class THit>
00294 class CCompartmentAccessor
00295 {
00296 public:
00297
00298 typedef CCompartmentFinder<THit> TCompartmentFinder;
00299 typedef typename TCompartmentFinder::THitRefs THitRefs;
00300 typedef typename TCompartmentFinder::TCoord TCoord;
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313 CCompartmentAccessor(TCoord comp_penalty_bps,
00314 TCoord min_matches,
00315 TCoord min_singleton_matches = numeric_limits<TCoord>::max(),
00316 bool cross_filter = false);
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329
00330
00331
00332
00333
00334
00335
00336 CCompartmentAccessor(typename THitRefs::iterator start,
00337 typename THitRefs::iterator finish,
00338 TCoord comp_penalty_bps,
00339 TCoord min_matches,
00340 TCoord min_singleton_matches = numeric_limits<TCoord>::max(),
00341 bool cross_filter = false);
00342
00343
00344
00345
00346
00347
00348
00349
00350
00351 void Run(typename THitRefs::iterator start,
00352 typename THitRefs::iterator finish);
00353
00354
00355
00356 void SetMaxIntron(TCoord mi) { m_MaxIntron = mi; }
00357
00358
00359
00360 TCoord GetMaxIntron(void) const { return m_MaxIntron; }
00361
00362
00363
00364
00365
00366
00367
00368
00369 bool GetFirst(THitRefs& compartment);
00370
00371
00372
00373
00374
00375
00376
00377
00378 bool GetNext(THitRefs& compartment);
00379
00380
00381
00382
00383
00384
00385
00386
00387
00388 pair<size_t,size_t> GetCounts(void) const {
00389
00390
00391 size_t valid_count (0);
00392 for(size_t i(0), n(m_status.size()); i != n; ++i) {
00393 if(m_status[i]) ++valid_count;
00394 }
00395
00396 pair<size_t, size_t> rv (m_pending.size(), valid_count);
00397
00398 return rv;
00399 }
00400
00401
00402
00403
00404
00405
00406
00407
00408 void Get(size_t idx, THitRefs& compartment) const {
00409 compartment = m_pending[idx];
00410 }
00411
00412 const TCoord* GetBox(size_t i) const {
00413 return &m_ranges.front() + i*4;
00414 }
00415
00416 bool GetStrand(size_t i) const {
00417 return m_strands[i];
00418 }
00419
00420 bool GetStatus(size_t i) const {
00421 return m_status[i];
00422 }
00423
00424
00425
00426
00427
00428
00429
00430
00431 CRef<objects::CSeq_align_set> AsSeqAlignSet(void) const;
00432
00433 private:
00434
00435 void x_Init(TCoord comp_penalty, TCoord min_matches,
00436 TCoord min_singleton_matches, bool cross_filter);
00437
00438
00439 TCoord m_Penalty;
00440 TCoord m_MinMatches;
00441 TCoord m_MinSingletonMatches;
00442 TCoord m_MaxIntron;
00443 bool m_CrossFiltering;
00444
00445
00446 vector<THitRefs> m_pending;
00447 vector<TCoord> m_ranges;
00448 vector<bool> m_strands;
00449 vector<bool> m_status;
00450 size_t m_iter;
00451
00452 void x_Copy2Pending(TCompartmentFinder& finder);
00453 };
00454
00455
00456
00457
00458
00459
00460
00461 const double kPenaltyPerIntronBase (-2e-5);
00462
00463
00464 const double kPenaltyPerIntronPos (-1e-9);
00465
00466
00467 template<class THit>
00468 void CCompartmentFinder<THit>::CCompartment::UpdateMinMax()
00469 {
00470 typedef CHitFilter<THit> THitFilter;
00471 THitFilter::s_GetSpan(m_members, m_box);
00472 }
00473
00474
00475 template<class THit>
00476 bool CCompartmentFinder<THit>::CCompartment::GetStrand() const {
00477
00478 if(m_members.size()) {
00479 return m_members.front()->GetSubjStrand();
00480 }
00481 else {
00482 NCBI_THROW( CAlgoAlignException,
00483 eInternal,
00484 "Strand requested on an empty compartment" );
00485 }
00486 }
00487
00488
00489 template<class THit>
00490 CCompartmentFinder<THit>::CCompartmentFinder(
00491 typename THitRefs::const_iterator start,
00492 typename THitRefs::const_iterator finish ):
00493
00494 m_intron_max(s_GetDefaultMaxIntron()),
00495 m_penalty(s_GetDefaultPenalty()),
00496 m_MinMatches(1),
00497 m_MinSingletonMatches(1),
00498 m_iter(-1)
00499 {
00500 m_hitrefs.resize(finish - start);
00501 copy(start, finish, m_hitrefs.begin());
00502 }
00503
00504
00505 template<class THit>
00506 class CQueryMatchAccumulator:
00507 public binary_function<double, CRef<THit>, double>
00508 {
00509 public:
00510
00511 CQueryMatchAccumulator(void): m_Finish(-1.0)
00512 {}
00513
00514 double operator() (double acm, CRef<THit> ph)
00515 {
00516 const typename THit::TCoord qmin (ph->GetQueryMin()),
00517 qmax (ph->GetQueryMax());
00518 if(qmin > m_Finish)
00519 return acm + ph->GetIdentity() * ((m_Finish = qmax) - qmin + 1);
00520 else {
00521 if(qmax > m_Finish) {
00522 const double finish0 (m_Finish);
00523 return acm + ph->GetIdentity() * ((m_Finish = qmax) - finish0);
00524 }
00525 else {
00526 return acm;
00527 }
00528 }
00529 }
00530
00531 private:
00532
00533 double m_Finish;
00534 };
00535
00536
00537 template<class THit>
00538 double GetTotalMatches(
00539 const typename CCompartmentFinder<THit>::THitRefs& hitrefs0)
00540 {
00541 typename CCompartmentFinder<THit>::THitRefs hitrefs (hitrefs0);
00542
00543 typedef CHitComparator<THit> THitComparator;
00544 THitComparator sorter (THitComparator::eQueryMin);
00545 stable_sort(hitrefs.begin(), hitrefs.end(), sorter);
00546
00547 const double rv (accumulate(hitrefs.begin(), hitrefs.end(), 0.0,
00548 CQueryMatchAccumulator<THit>()));
00549 return rv;
00550 }
00551
00552
00553 template<class THit>
00554 size_t CCompartmentFinder<THit>::Run(bool cross_filter)
00555 {
00556 const double kMinusInf (-1e12);
00557
00558 m_compartments.clear();
00559 const int dimhits = m_hitrefs.size();
00560 if(dimhits == 0) {
00561 return 0;
00562 }
00563
00564
00565
00566
00567
00568 typedef CHitComparator<THit> THitComparator;
00569 stable_sort(m_hitrefs.begin(), m_hitrefs.end(),
00570 THitComparator(THitComparator::eSubjMaxQueryMax));
00571
00572
00573
00574
00575
00576
00577
00578 typedef vector<SHitStatus> THitData;
00579 THitData hitstatus (dimhits);
00580
00581 const TCoord subj_min_global (m_hitrefs.front()->GetSubjMin());
00582
00583 int i_bestsofar (0);
00584 for(int i (0); i < dimhits; ++i) {
00585
00586 const THitRef& h (m_hitrefs[i]);
00587 const double identity (h->GetIdentity());
00588 const typename THit::TCoord hbox [4] = {
00589 h->GetQueryMin(), h->GetQueryMax(),
00590 h->GetSubjMin(), h->GetSubjMax()
00591 };
00592
00593
00594 #ifdef CF_DBG_TRACE
00595 cerr << endl << *h << endl;
00596 #endif
00597
00598 double best_ext_score (kMinusInf);
00599 int i_best_ext (-1);
00600
00601 double best_open_score (identity*(hbox[1] - hbox[0] + 1) - m_penalty);
00602 int i_best_open (-1);
00603
00604 if(hbox[2] > m_hitrefs[i_bestsofar]->GetSubjMax()) {
00605
00606 const double score_open (identity*(hbox[1] - hbox[0] + 1)
00607 + hitstatus[i_bestsofar].m_score
00608 - m_penalty);
00609
00610 if(score_open > best_open_score) {
00611 best_open_score = score_open;
00612 i_best_open = i_bestsofar;
00613 }
00614 }
00615 else {
00616
00617 for(int j (i - 1); j >= 0; --j) {
00618
00619 const double score_open (identity * (hbox[1] - hbox[0] + 1)
00620 + hitstatus[j].m_score
00621 - m_penalty);
00622
00623 if(score_open > best_open_score
00624 && hbox[2] > m_hitrefs[j]->GetSubjMax())
00625 {
00626 best_open_score = score_open;
00627 i_best_open = j;
00628 }
00629 }
00630 }
00631
00632 for(int j = i; j >= 0; --j) {
00633
00634 THitRef phc;
00635 typename THit::TCoord phcbox[4];
00636 if(j != i) {
00637
00638 phc = m_hitrefs[j];
00639 phcbox[0] = phc->GetQueryMin();
00640 phcbox[1] = phc->GetQueryMax();
00641 phcbox[2] = phc->GetSubjMin();
00642 phcbox[3] = phc->GetSubjMax();
00643
00644 #ifdef CF_DBG_TRACE
00645 cerr << '\t' << *phc << endl;
00646 #endif
00647 }
00648 #ifdef CF_DBG_TRACE
00649 else {
00650 cerr << "\t[Dummy]" << endl;
00651 }
00652 #endif
00653
00654
00655 {{
00656 typename THit::TCoord q0, s0;
00657 bool good (false);
00658 int subj_space;
00659 TCoord intron_start (0);
00660
00661 if(i != j) {
00662
00663 if(phcbox[1] < hbox[1] && phcbox[0] < hbox[0]) {
00664
00665 if(hbox[0] <= phcbox[1] &&
00666 hbox[2] + phcbox[1] - hbox[0] >= phcbox[3]) {
00667
00668 q0 = phcbox[1] + 1;
00669 s0 = hbox[2] + phcbox[1] - hbox[0];
00670 }
00671 else if(phcbox[3] >= hbox[2]) {
00672
00673 s0 = phcbox[3] + 1;
00674 q0 = hbox[1] - (hbox[3] - s0);
00675 }
00676 else {
00677
00678 q0 = hbox[0];
00679 s0 = hbox[2];
00680 }
00681 subj_space = s0 - phcbox[3] - 1;
00682
00683
00684
00685
00686 const TCoord max_gap (100);
00687
00688 good = (subj_space <= int(m_intron_max))
00689 && (subj_space + max_gap >= q0 - phcbox[1] - 1)
00690 && (s0 < hbox[3] && q0 < hbox[1]);
00691
00692 if(good) {
00693 intron_start = phcbox[3];
00694 }
00695 }
00696 }
00697
00698 if(good) {
00699
00700 const double jscore (hitstatus[j].m_score);
00701 const double intron_penalty ((subj_space > 0)?
00702 (kPenaltyPerIntronPos * (intron_start - subj_min_global)
00703 + subj_space * kPenaltyPerIntronBase): 0.0);
00704
00705 const double ext_score (jscore +
00706 identity * (hbox[1] - q0 + 1) + intron_penalty);
00707
00708 if(ext_score > best_ext_score) {
00709 best_ext_score = ext_score;
00710 i_best_ext = j;
00711 }
00712 #ifdef CF_DBG_TRACE
00713 cerr << "\tGood for extension with score = " << ext_score << endl;
00714 #endif
00715 }
00716 }}
00717 }
00718
00719 typename SHitStatus::EType hit_type;
00720 int prev_hit;
00721 double best_score;
00722 if(best_ext_score > best_open_score) {
00723
00724 hit_type = SHitStatus::eExtension;
00725 prev_hit = i_best_ext;
00726 best_score = best_ext_score;
00727 }
00728 else {
00729
00730 hit_type = SHitStatus::eOpening;
00731 prev_hit = i_best_open;
00732 best_score = best_open_score;
00733 }
00734
00735 hitstatus[i].m_type = hit_type;
00736 hitstatus[i].m_prev = prev_hit;
00737 hitstatus[i].m_score = best_score;
00738
00739 if(best_score > hitstatus[i_bestsofar].m_score) {
00740 i_bestsofar = i;
00741 }
00742
00743 #ifdef CF_DBG_TRACE
00744 cerr << "Status = " << ((hit_type == SHitStatus::eOpening)? "Open": "Extend")
00745 << '\t' << best_score << endl;
00746 if(prev_hit == -1) {
00747 cerr << "[Dummy]" << endl;
00748 }
00749 else {
00750 cerr << *(m_hitrefs[prev_hit]) << endl;
00751 }
00752 #endif
00753 }
00754
00755 #ifdef CF_DBG_TRACE
00756 cerr << "\n\n--- BACKTRACE ---\n";
00757 #endif
00758
00759
00760
00761 const double score_best (hitstatus[i_bestsofar].m_score);
00762 const double min_matches (m_MinSingletonMatches < m_MinMatches?
00763 m_MinSingletonMatches: m_MinMatches);
00764
00765 vector<bool> comp_status;
00766 if(score_best + m_penalty >= min_matches) {
00767
00768 int i (i_bestsofar);
00769 bool new_compartment (true);
00770 THitRefs hitrefs;
00771 while(i != -1) {
00772
00773 if(new_compartment) {
00774
00775 const double mp (GetTotalMatches<THit>(hitrefs));
00776 if(mp > 0) {
00777
00778 m_compartments.push_back(CCompartment());
00779 m_compartments.back().SetMembers() = hitrefs;
00780 comp_status.push_back(mp >= m_MinMatches);
00781 }
00782
00783 hitrefs.resize(0);
00784 new_compartment = false;
00785 }
00786 hitrefs.push_back(m_hitrefs[i]);
00787
00788 #ifdef CF_DBG_TRACE
00789 cerr << *(m_hitrefs[i]) << endl;
00790 #endif
00791
00792 if(hitstatus[i].m_type == SHitStatus::eOpening) {
00793 new_compartment = true;
00794 }
00795 i = hitstatus[i].m_prev;
00796 }
00797
00798 const double mp (GetTotalMatches<THit>(hitrefs));
00799 if(mp > 0) {
00800 bool status (m_compartments.size() == 0 && mp >= m_MinSingletonMatches
00801 || mp >= m_MinMatches);
00802 m_compartments.push_back(CCompartment());
00803 m_compartments.back().SetMembers() = hitrefs;
00804 comp_status.push_back(status);
00805 }
00806 }
00807
00808 if(cross_filter && m_compartments.size()) {
00809
00810 const TCoord kMinCompartmentHitLength (8);
00811
00812
00813 for(size_t icn (m_compartments.size()), ic (0); ic < icn; ++ic) {
00814
00815 CCompartment & comp (m_compartments[ic]);
00816 THitRefs& hitrefs (comp.SetMembers());
00817 size_t nullified (0);
00818 for(int in (hitrefs.size()), i (in - 1); i > 0; --i) {
00819
00820 int j1 (i);
00821 while(j1 < in && hitrefs[j1].IsNull()) ++j1;
00822 if(j1 == in) continue;
00823
00824 THitRef& h1 (hitrefs[j1]);
00825 THitRef& h2 (hitrefs[i-1]);
00826
00827 if(h1.IsNull()) continue;
00828
00829 const TCoord * box1o (h1->GetBox());
00830 TCoord box1 [4] = {box1o[0], box1o[1], box1o[2], box1o[3]};
00831 const TCoord * box2o (h2->GetBox());
00832 TCoord box2 [4] = {box2o[0], box2o[1], box2o[2], box2o[3]};
00833
00834 const int qd (box1[1] - box2[0] + 1);
00835 const int sd (box1[3] - box2[2] + 1);
00836 if(qd > sd && qd > 0) {
00837
00838 if(box2[0] <= box1[0] + kMinCompartmentHitLength) {
00839
00840 #ifdef ALGOALIGNUTIL_COMPARTMENT_FINDER_KEEP_TERMINII
00841 if(i + 1 == in) {
00842 TCoord new_coord (box1[0] + (box1[1] - box1[0])/2);
00843 if(box1[0] + 1 <= new_coord) {
00844 --new_coord;
00845 }
00846 else {
00847 new_coord = box1[0];
00848 }
00849 h1->Modify(1, new_coord);
00850 }
00851 else
00852 #endif
00853 {
00854 h1.Reset(0);
00855 ++nullified;
00856 }
00857 }
00858 else {
00859 h1->Modify(1, box2[0] - 1);
00860 }
00861
00862 if(box2[1] <= box1[1] + kMinCompartmentHitLength) {
00863
00864 #ifdef ALGOALIGNUTIL_COMPARTMENT_FINDER_KEEP_TERMINII
00865 if(i == 1) {
00866 TCoord new_coord (box2[0] + (box2[1] - box2[0])/2);
00867 if(box2[1] >= new_coord + 1) {
00868 ++new_coord;
00869 }
00870 else {
00871 new_coord = box2[1];
00872 }
00873 h2->Modify(0, new_coord);
00874 }
00875 else
00876 #endif
00877 {
00878 h2.Reset(0);
00879 ++nullified;
00880 }
00881 }
00882 else {
00883 h2->Modify(0, box1[1] + 1);
00884 }
00885 }
00886 else if (sd > 0) {
00887
00888 if(box2[2] <= box1[2] + kMinCompartmentHitLength) {
00889
00890 if(i + 1 == in) {
00891 TCoord new_coord (box1[2] + (box1[3] - box1[2])/2);
00892 if(box1[2] + 1 <= new_coord) {
00893 --new_coord;
00894 }
00895 else {
00896 new_coord = box1[2];
00897 }
00898 h1->Modify(3, new_coord);
00899 }
00900 else {
00901 h1.Reset(0);
00902 ++nullified;
00903 }
00904 }
00905 else {
00906 h1->Modify(3, box2[2] - 1);
00907 }
00908
00909 if(box2[3] <= box1[3] + kMinCompartmentHitLength) {
00910
00911 if(i == 1) {
00912 TCoord new_coord (box2[2] + (box2[3] - box2[2])/2);
00913 if(box2[3] >= new_coord + 1) {
00914 ++new_coord;
00915 }
00916 else {
00917 new_coord = box2[3];
00918 }
00919
00920 h2->Modify(2, new_coord);
00921 }
00922 else {
00923 h2.Reset(0);
00924 ++nullified;
00925 }
00926 }
00927 else {
00928 h2->Modify(2, box1[3] + 1);
00929 }
00930 }
00931 }
00932
00933 if(nullified > 0) {
00934 hitrefs.erase(remove_if(hitrefs.begin(), hitrefs.end(),
00935 CCompartmentFinder<THit>::s_PNullRef),
00936 hitrefs.end());
00937 }
00938 }
00939
00940 #define ALGO_ALIGN_COMPARTMENT_FINDER_USE_FULL_XFILTERING
00941 #ifdef ALGO_ALIGN_COMPARTMENT_FINDER_USE_FULL_XFILTERING
00942
00943 typename THitRefs::iterator ihr_b (m_hitrefs.begin()),
00944 ihr_e(m_hitrefs.end()), ihr (ihr_b);
00945
00946 stable_sort(ihr_b, ihr_e, THitComparator(THitComparator::eSubjMinSubjMax));
00947
00948
00949 for(int icn (m_compartments.size()), ic (icn - 1); ic >= 0; --ic) {
00950
00951 CCompartment & comp (m_compartments[ic]);
00952 THitRefs& hitrefs (comp.SetMembers());
00953
00954 if(hitrefs.size() < 3) {
00955 continue;
00956 }
00957 else {
00958 NON_CONST_ITERATE(typename THitRefs, ii, hitrefs) {
00959 (*ii)->SetScore(- (*ii)->GetScore());
00960 }
00961 hitrefs.front()->SetEValue(-1);
00962 hitrefs.back()->SetEValue(-1);
00963 }
00964
00965 const TCoord comp_subj_min (hitrefs.back()->GetSubjStart());
00966 const TCoord comp_subj_max (hitrefs.front()->GetSubjStop());
00967 while(ihr != ihr_e && ((*ihr)->GetSubjStart() < comp_subj_min
00968 || (*ihr)->GetScore() < 0))
00969 {
00970 ++ihr;
00971 }
00972
00973 if(ihr == ihr_e) break;
00974 if((*ihr)->GetSubjStart() > comp_subj_max) continue;
00975 typename THitRefs::iterator ihrc_b (ihr);
00976
00977 typename THitRefs::iterator ihrc_e (ihr);
00978 while(ihrc_e != ihr_e && ((*ihrc_e)->GetSubjStart() < comp_subj_max
00979 || (*ihrc_e)->GetScore() < 0))
00980 {
00981 ++ihrc_e;
00982 }
00983
00984 size_t nullified (sx_XFilter(hitrefs, ihrc_b, ihrc_e, 1,
00985 kMinCompartmentHitLength));
00986 sort(ihrc_b, ihrc_e, THitComparator (THitComparator::eQueryMinQueryMax));
00987
00988 nullified += sx_XFilter(hitrefs, ihrc_b, ihrc_e, 0,
00989 kMinCompartmentHitLength);
00990
00991 ihr = ihrc_e;
00992
00993 if(nullified > 0) {
00994 hitrefs.erase(remove_if(hitrefs.begin(), hitrefs.end(),
00995 CCompartmentFinder<THit>::s_PNullRef),
00996 hitrefs.end());
00997 }
00998 }
00999
01000 for(int icn (m_compartments.size()), ic (icn - 1); ic >= 0; --ic) {
01001
01002 CCompartment & comp (m_compartments[ic]);
01003 THitRefs& hitrefs (comp.SetMembers());
01004 NON_CONST_ITERATE(typename THitRefs, ii, hitrefs) {
01005 const double score ((*ii)->GetScore());
01006 if(score < 0) {
01007 (*ii)->SetScore(-score);
01008 }
01009 const double eval ((*ii)->GetEValue());
01010 if(eval < 0) {
01011 (*ii)->SetEValue(0);
01012 }
01013 }
01014 }
01015 #endif
01016 }
01017
01018
01019 for(size_t i (0), in (m_compartments.size()); i < in; ++i) {
01020 if(comp_status[i] == false) {
01021
01022 THitRefs & hitrefs (m_compartments[i].SetMembers());
01023 NON_CONST_ITERATE(typename THitRefs, ii, hitrefs) {
01024 THitRef & hr (*ii);
01025 hr->SetScore(-hr->GetScore());
01026 }
01027 }
01028 }
01029
01030 return m_compartments.size();
01031 }
01032
01033
01034 template<class THit>
01035 size_t CCompartmentFinder<THit>::sx_XFilter(
01036 THitRefs& hitrefs,
01037 typename THitRefs::iterator ihr,
01038 typename THitRefs::iterator ihr_e,
01039 Uint1 w,
01040 size_t min_compartment_hit_len)
01041 {
01042 size_t nullified (0);
01043 for(int in (hitrefs.size()), i (in - 2); i > 0 && ihr != ihr_e; --i) {
01044
01045 THitRef& h1 (hitrefs[i]);
01046 if(h1.IsNull()) continue;
01047
01048 const TCoord * box1o (h1->GetBox());
01049 TCoord box1 [4] = {box1o[0], box1o[1], box1o[2], box1o[3]};
01050
01051
01052 for(; ihr != ihr_e; ++ihr) {
01053
01054 THitRef hr (*ihr);
01055 if(hr.IsNull()) continue;
01056 if(hr->GetScore() > 0 && hr->GetStop(w) >= box1[2*w]) {
01057 break;
01058 }
01059 }
01060
01061 if(ihr == ihr_e) {
01062 break;
01063 }
01064
01065
01066 TCoord ls0 (box1[2*w+1] + 1), cursegmax (0);
01067 TCoord segmax_start(box1[2*w]), segmax_stop(box1[2*w+1]);
01068
01069 if(box1[2*w] < (*ihr)->GetStart(w)) {
01070
01071 segmax_start = ls0 = box1[2*w];
01072 segmax_stop = (*ihr)->GetStart(w) - 1;
01073 cursegmax = segmax_stop - segmax_start + 1;
01074 }
01075 else {
01076
01077 TCoord shrsmax ((*ihr)->GetStop(w));
01078 for(++ihr; ihr != ihr_e; ++ihr) {
01079
01080 THitRef hr (*ihr);
01081 if(hr.IsNull() || hr->GetScore() < 0) {
01082 continue;
01083 }
01084
01085 const TCoord hrs0 (hr->GetStart(w));
01086 if(hrs0 > box1[2*w+1]) {
01087 segmax_stop = box1[2*w+1];
01088 break;
01089 }
01090
01091 if(hrs0 > shrsmax) {
01092 segmax_stop = hrs0 - 1;
01093 break;
01094 }
01095
01096 const TCoord hrs1 (hr->GetStop(w));
01097 if(hrs1 > shrsmax) {
01098 shrsmax = hrs1;
01099 }
01100 }
01101
01102 if(shrsmax < box1[2*w+1]) {
01103 segmax_start = ls0 = shrsmax + 1;
01104 cursegmax = segmax_stop - segmax_start + 1;
01105 }
01106 }
01107
01108 if(ls0 > box1[2*w+1]) {
01109 h1.Reset(0);
01110 ++nullified;
01111 continue;
01112 }
01113
01114
01115 for(; ihr != ihr_e; ++ihr) {
01116
01117 THitRef hr (*ihr);
01118 if(hr.IsNull() || hr->GetScore() < 0) {
01119 continue;
01120 }
01121
01122 const TCoord hrs0 (hr->GetStart(w));
01123 if(hrs0 > box1[2*w+1]) {
01124 if(ls0 <= box1[2*w+1] && box1[2*w+1] + 1 - ls0 > cursegmax) {
01125 segmax_start = ls0;
01126 segmax_stop = box1[2*w+1];
01127 cursegmax = segmax_stop - segmax_start + 1;
01128 }
01129 break;
01130 }
01131
01132 if(hrs0 > ls0) {
01133 if(hrs0 - ls0 > cursegmax) {
01134 segmax_start = ls0;
01135 segmax_stop = hrs0 - 1;
01136 cursegmax = segmax_stop - segmax_start + 1;
01137 }
01138
01139 ls0 = hr->GetStop(w) + 1;
01140 }
01141 else if(hr->GetStop(w) + 1 > ls0) {
01142 ls0 = hr->GetStop(w) + 1;
01143 }
01144 }
01145
01146 if(box1[2*w+1] > ls0 + cursegmax) {
01147 segmax_start = ls0;
01148 segmax_stop = box1[2*w+1];
01149 cursegmax = segmax_stop - segmax_start + 1;
01150 }
01151
01152 if(cursegmax < min_compartment_hit_len) {
01153 h1.Reset(0);
01154 ++nullified;
01155 continue;
01156 }
01157 else {
01158
01159 if(box1[2*w] < segmax_start) {
01160 h1->Modify(2 * w, segmax_start);
01161 }
01162
01163 if(segmax_stop < box1[2*w+1]) {
01164 h1->Modify(2 * w + 1, segmax_stop);
01165 }
01166 }
01167
01168 if(ihr == ihr_e) break;
01169 }
01170
01171 return nullified;
01172 }
01173
01174
01175 template<class THit>
01176 typename CCompartmentFinder<THit>::CCompartment*
01177 CCompartmentFinder<THit>::GetFirst()
01178 {
01179 if(m_compartments.size()) {
01180 m_iter = 0;
01181 return &m_compartments[m_iter++];
01182 }
01183 else {
01184 m_iter = -1;
01185 return 0;
01186 }
01187 }
01188
01189 template<class THit>
01190 void CCompartmentFinder<THit>::OrderCompartments(void) {
01191
01192 for(size_t i = 0, dim = m_compartments.size(); i < dim; ++i) {
01193 m_compartments[i].UpdateMinMax();
01194 }
01195
01196 stable_sort(m_compartments.begin(), m_compartments.end(),
01197 CCompartmentFinder<THit>::CCompartment::s_PLowerSubj);
01198 }
01199
01200 template<class THit>
01201 typename CCompartmentFinder<THit>::CCompartment*
01202 CCompartmentFinder<THit>::GetNext()
01203 {
01204 const size_t dim = m_compartments.size();
01205 if(m_iter == -1 || m_iter >= int(dim)) {
01206 m_iter = -1;
01207 return 0;
01208 }
01209 else {
01210 return &m_compartments[m_iter++];
01211 }
01212 }
01213
01214
01215 template<class THit>
01216 CCompartmentAccessor<THit>::CCompartmentAccessor(TCoord comp_penalty,
01217 TCoord min_matches,
01218 TCoord min_singleton_matches,
01219 bool cross_filter)
01220 {
01221 x_Init(comp_penalty, min_matches, min_singleton_matches, cross_filter);
01222 }
01223
01224
01225 template<class THit>
01226 CCompartmentAccessor<THit>::CCompartmentAccessor(
01227 typename THitRefs::iterator istart,
01228 typename THitRefs::iterator ifinish,
01229 TCoord comp_penalty,
01230 TCoord min_matches,
01231 TCoord min_singleton_matches,
01232 bool cross_filter)
01233 {
01234 x_Init(comp_penalty, min_matches, min_singleton_matches, cross_filter);
01235 Run(istart, ifinish);
01236 }
01237
01238
01239 template<class THit>
01240 void CCompartmentAccessor<THit>::x_Init(TCoord comp_penalty,
01241 TCoord min_matches,
01242 TCoord min_singleton_matches,
01243 bool cross_filter)
01244 {
01245 m_Penalty = comp_penalty;
01246 m_MinMatches = min_matches;
01247 m_MinSingletonMatches = min_singleton_matches;
01248 m_CrossFiltering = cross_filter;
01249 m_MaxIntron = CCompartmentFinder<THit>::s_GetDefaultMaxIntron();
01250 }
01251
01252
01253 template<class THit>
01254 void CCompartmentAccessor<THit>::Run(typename THitRefs::iterator istart,
01255 typename THitRefs::iterator ifinish)
01256 {
01257 const TCoord kMax_TCoord (numeric_limits<TCoord>::max());
01258
01259
01260 typename THitRefs::iterator ib = istart, ie = ifinish, ii = ib,
01261 iplus_beg = ie;
01262 typedef CHitComparator<THit> THitComparator;
01263 THitComparator sorter (THitComparator::eSubjStrand);
01264 stable_sort(ib, ie, sorter);
01265
01266 TCoord minus_subj_min = kMax_TCoord, minus_subj_max = 0;
01267 for(ii = ib; ii != ie; ++ii) {
01268 if((*ii)->GetSubjStrand()) {
01269 iplus_beg = ii;
01270 break;
01271 }
01272 else {
01273 if((*ii)->GetSubjMin() < minus_subj_min) {
01274 minus_subj_min = (*ii)->GetSubjMin();
01275 }
01276 if((*ii)->GetSubjMax() > minus_subj_max) {
01277 minus_subj_max = (*ii)->GetSubjMax();
01278 }
01279 }
01280 }
01281
01282
01283 {{
01284
01285 for(ii = ib; ii != iplus_beg; ++ii) {
01286
01287 const typename THit::TCoord s0 = minus_subj_min + minus_subj_max
01288 - (*ii)->GetSubjMax();
01289 const typename THit::TCoord s1 = minus_subj_min + minus_subj_max
01290 - (*ii)->GetSubjMin();
01291 (*ii)->SetSubjStart(s0);
01292 (*ii)->SetSubjStop(s1);
01293 }
01294
01295 CCompartmentFinder<THit> finder (ib, iplus_beg);
01296 finder.SetPenalty(m_Penalty);
01297 finder.SetMinMatches(m_MinMatches);
01298 finder.SetMinSingletonMatches(m_MinSingletonMatches);
01299 finder.SetMaxIntron(m_MaxIntron);
01300 finder.Run(m_CrossFiltering);
01301
01302
01303 for(ii = ib; ii != iplus_beg; ++ii) {
01304
01305 const typename THit::TCoord s0 = minus_subj_min + minus_subj_max
01306 - (*ii)->GetSubjMax();
01307 const typename THit::TCoord s1 = minus_subj_min + minus_subj_max
01308 - (*ii)->GetSubjMin();
01309 (*ii)->SetSubjStart(s1);
01310 (*ii)->SetSubjStop(s0);
01311 }
01312
01313 x_Copy2Pending(finder);
01314 }}
01315
01316
01317 {{
01318 CCompartmentFinder<THit> finder (iplus_beg, ie);
01319 finder.SetPenalty(m_Penalty);
01320 finder.SetMinMatches(m_MinMatches);
01321 finder.SetMinSingletonMatches(m_MinSingletonMatches);
01322 finder.SetMaxIntron(m_MaxIntron);
01323 finder.Run(m_CrossFiltering);
01324 x_Copy2Pending(finder);
01325 }}
01326 }
01327
01328
01329 template<class THit>
01330 void CCompartmentAccessor<THit>::x_Copy2Pending(
01331 CCompartmentFinder<THit>& finder)
01332 {
01333 finder.OrderCompartments();
01334
01335 typedef typename CCompartmentFinder<THit>::THitRef THitRef;
01336
01337
01338 for(typename CCompartmentFinder<THit>::CCompartment* compartment =
01339 finder.GetFirst(); compartment; compartment = finder.GetNext()) {
01340
01341 if(compartment->GetMembers().size() > 0) {
01342
01343 m_pending.push_back(THitRefs(0));
01344 THitRefs& vh = m_pending.back();
01345
01346 for(THitRef ph (compartment->GetFirst()); ph; ph = compartment->GetNext())
01347 {
01348 vh.push_back(ph);
01349 }
01350
01351 const TCoord* box = compartment->GetBox();
01352 m_ranges.push_back(box[0]);
01353 m_ranges.push_back(box[1]);
01354 m_ranges.push_back(box[2]);
01355 m_ranges.push_back(box[3]);
01356
01357 m_strands.push_back(compartment->GetStrand());
01358 m_status.push_back(compartment->GetFirst()->GetScore() > 0);
01359 }
01360 }
01361 }
01362
01363
01364 template<class THit>
01365 bool CCompartmentAccessor<THit>::GetFirst(THitRefs& compartment) {
01366
01367 m_iter = 0;
01368 return GetNext(compartment);
01369 }
01370
01371
01372 template<class THit>
01373 bool CCompartmentAccessor<THit>::GetNext(THitRefs& compartment) {
01374
01375 compartment.clear();
01376 if(m_iter < m_pending.size()) {
01377 compartment = m_pending[m_iter++];
01378 return true;
01379 }
01380 else {
01381 return false;
01382 }
01383 }
01384
01385
01386 template<class THit>
01387 CRef<objects::CSeq_align_set> CCompartmentAccessor<THit>::AsSeqAlignSet(void) const
01388 {
01389 USING_SCOPE(objects);
01390
01391 CRef<CSeq_align_set> rv (new CSeq_align_set);
01392
01393
01394 CRef<CSeq_id> seqid_query (new CSeq_id), seqid_subj (new CSeq_id);
01395 if(m_pending.size()) {
01396 if(m_pending.front().size()) {
01397 const THit & h (*m_pending.front().front());
01398 seqid_query->Assign(*(h.GetQueryId()));
01399 seqid_subj->Assign(*(h.GetSubjId()));
01400 }
01401 }
01402
01403 CSeq_align_set::Tdata& sas1_data (rv->Set());
01404
01405 for(size_t i(0), idim(m_pending.size()); i < idim; ++i) {
01406
01407 if(m_status[i]) {
01408
01409
01410 TCoord range_min (i > 0 && m_strands[i-1] == m_strands[i]
01411 ? m_ranges[4*i - 1]
01412 : 0);
01413
01414 TCoord range_max (i + 1 < idim && m_strands[i+1] == m_strands[i]
01415 ? m_ranges[4*i + 6]
01416 : numeric_limits<TCoord>::max());
01417
01418 CRef<CSeq_align> sa2 (new CSeq_align);
01419 sa2->SetType(CSeq_align::eType_disc);
01420
01421 CSeq_align::TBounds & bounds (sa2->SetBounds());
01422 const ENa_strand query_strand (eNa_strand_plus);
01423
01424 const ENa_strand subj_strand (m_strands[i]? eNa_strand_plus:
01425 eNa_strand_minus);
01426 CRef<CSeq_loc> seq_loc (new CSeq_loc (*seqid_subj, range_min,
01427 range_max, subj_strand));
01428 bounds.push_back(seq_loc);
01429
01430
01431 CSeq_align_set::Tdata& sas2_data (sa2->SetSegs().SetDisc().Set());
01432 ITERATE(typename THitRefs, ii, m_pending[i]) {
01433
01434 const THit& h (**ii);
01435 CRef<CSeq_align> sa3 (new CSeq_align);
01436 sa3->SetType(CSeq_align::eType_global);
01437 CDense_seg & ds (sa3->SetSegs().SetDenseg());
01438 ds.FromTranscript(h.GetQueryStart(), query_strand,
01439 h.GetSubjStart(), subj_strand,
01440 h.GetTranscript());
01441 ds.SetIds().push_back(seqid_query);
01442 ds.SetIds().push_back(seqid_subj);
01443 sas2_data.push_back(sa3);
01444 }
01445
01446
01447 sas1_data.push_back(sa2);
01448 }
01449 }
01450
01451 return rv;
01452 }
01453
01454
01455
01456 END_NCBI_SCOPE
01457
01458
01459 #endif
01460
01461