include/algo/align/util/hit_filter.hpp

Go to the documentation of this file.
00001 #ifndef ALGO_ALIGN_UTIL_HITFILTER__HPP
00002 #define ALGO_ALIGN_UTIL_HITFILTER__HPP
00003 
00004 /* $Id: hit_filter.hpp 170328 2009-09-11 13:49:02Z kapustin $
00005 * ===========================================================================
00006 *
00007 *                            public DOMAIN NOTICE                          
00008 *               National Center for Biotechnology Information
00009 *                                                                          
00010 *  This software/database is a "United States Government Work" under the   
00011 *  terms of the United States Copyright Act.  It was written as part of    
00012 *  the author's official duties as a United States Government employee and 
00013 *  thus cannot be copyrighted.  This software/database is freely available 
00014 *  to the public for use. The National Library of Medicine and the U.S.    
00015 *  Government have not placed any restriction on its use or reproduction.  
00016 *                                                                          
00017 *  Although all reasonable efforts have been taken to ensure the accuracy  
00018 *  and reliability of the software and data, the NLM and the U.S.          
00019 *  Government do not and cannot warrant the performance or results that    
00020 *  may be obtained by using this software or data. The NLM and the U.S.    
00021 *  Government disclaim all warranties, express or implied, including       
00022 *  warranties of performance, merchantability or fitness for any particular
00023 *  purpose.                                                                
00024 *                                                                          
00025 *  Please cite the author in any work or product based on this material.   
00026 *
00027 * ===========================================================================
00028 *
00029 * Author:  Yuri Kapustin
00030 *
00031 * File Description: Multiple-sequence alignment uniquification algorithms.
00032 *
00033 */
00034 
00035 #include <algo/align/util/hit_comparator.hpp>
00036 
00037 #include <algorithm>
00038 #include <numeric>
00039 #include <vector>
00040 #include <set>
00041 
00042 
00043 /** @addtogroup AlgoAlignRoot
00044  *
00045  * @{
00046  */
00047 
00048 
00049 BEGIN_NCBI_SCOPE
00050 
00051 /////////////////////////////////////////
00052 // CHitCoverageAccumulator
00053 
00054 template<class THit>
00055 class CHitCoverageAccumulator
00056 {
00057 public:
00058 
00059     typedef CRef<THit>    THitRef;
00060     typedef typename THit::TCoord  TCoord;
00061 
00062     /// Create the object; specify the accumulation sequence
00063     ///
00064     /// @param where
00065     ///    Accumulation source sequence (0 = query, 1 = subject)
00066     CHitCoverageAccumulator(Uint1 where): 
00067         m_Where(where),  
00068         m_Finish(0)
00069     {
00070         if(m_Where == 0) {
00071             m_i1 = 0;
00072             m_i2 = 1;
00073         }
00074         else {
00075             m_i1 = 2;
00076             m_i2 = 3;
00077         }
00078     }
00079 
00080     /// Overloaded function call operator to be used with std::accumulate()
00081     ///
00082     /// @param iVal
00083     ///    Accumulation target
00084     /// @return
00085     ///    Updated accumulation target
00086     TCoord operator() (TCoord iVal, const THitRef& ph)
00087     {
00088         const TCoord box [] = { ph->GetQueryMin(), ph->GetQueryMax(),
00089                                 ph->GetSubjMin(), ph->GetSubjMax()   };
00090 
00091         if(box[m_i1] > m_Finish || m_Finish == 0) {
00092             return iVal + (m_Finish = box[m_i2]) - box[m_i1] + 1;
00093         }
00094         else {
00095             TCoord Finish0 = m_Finish;
00096             return (box[m_i2] > Finish0)?
00097                 (iVal + (m_Finish = box[m_i2]) - Finish0): iVal;
00098         }
00099     }
00100 
00101 private:
00102 
00103     Uint1   m_Where;
00104     TCoord  m_Finish;
00105     Uint1   m_i1, m_i2;
00106 };
00107 
00108 /////////////////////////////////////////////
00109 /////////////////////////////////////////////
00110 
00111 // The multiple-sequence alignment uniquification algorithm.
00112 // The algorithm ensures that every residue on each sequence is either aligned 
00113 // uniquely, or not at all (unless the residue is covered alignments overlapping 
00114 // by longer than retain_overlap).In the process of resolving ambiguities 
00115 // in the input alignments, the algorithm starts from the highest-scoring hit.
00116 // This is a greedy algorithm so no claim is made as of optimality of the overall 
00117 // alignment.
00118 
00119 template<class THit>
00120 class CHitFilter: public CObject
00121 {
00122 public:
00123 
00124     typedef CRef<THit>              THitRef;
00125     typedef vector<THitRef>         THitRefs;
00126     typedef typename THit::TCoord   TCoord;
00127 
00128     /// Collect coverage for the range of hits on the specified source sequence
00129     ///
00130     /// @param where
00131     ///    Accumulation source sequence (0 = query, 1 = subject)
00132     /// @param from
00133     ///    Hit range start
00134     /// @param to
00135     ///    Hit range stop
00136     /// @return
00137     ///    The number of residues on the source sequences covered by at least one
00138     ///    alignment
00139     static  TCoord s_GetCoverage(Uint1 where, 
00140                                  typename THitRefs::const_iterator from,
00141                                  typename THitRefs::const_iterator to)
00142     {
00143         // since we need to sort, create and init a local vector
00144         THitRefs hitrefs (to - from);
00145         typedef typename THitRefs::const_iterator TCI; 
00146         typedef typename THitRefs::iterator TI; 
00147         TCI ii = from;
00148         TI jj = hitrefs.begin();
00149         while(ii != to) {
00150             *jj++ = *ii++;
00151         }
00152         
00153         // prepare a sorter object and sort
00154         typedef CHitComparator<THit> THitComparator;
00155         typename THitComparator::ESortCriterion sort_type (
00156             where == 0? 
00157             THitComparator::eQueryMin:
00158             THitComparator::eSubjMin);
00159 
00160         THitComparator sorter (sort_type);
00161         stable_sort(hitrefs.begin(), hitrefs.end(), sorter);
00162         
00163         // compute coverage
00164         return accumulate(hitrefs.begin(), hitrefs.end(), 
00165                           TCoord(0), 
00166                           CHitCoverageAccumulator<THit>(where));
00167     }
00168 
00169     /// Get sequence span for a set of alignments (hits).
00170     ///
00171     /// @param hitrefs
00172     ///    Source hits
00173     /// @param span
00174     ///    Destination array (0=query min, 1=query max, 2=subj min, 3=subj max)
00175     static void s_GetSpan(const THitRefs& hitrefs, TCoord span [4]) 
00176     {      
00177         span[0] = span[2] = kMax_UInt; 
00178         span[1] = span[3] = 0;
00179           
00180         for(typename THitRefs::const_iterator ii = hitrefs.begin(),
00181                 ii_end = hitrefs.end(); ii != ii_end; ++ii) {
00182             
00183             TCoord x = (*ii)->GetQueryMin();
00184             if(span[0] > x) {
00185                 span[0] = x;
00186             }
00187             x = (*ii)->GetSubjMin();
00188             if(span[2] > x) {
00189                 span[2] = x;
00190             }
00191             x = (*ii)->GetQueryMax();
00192             if(span[1] < x) {
00193                 span[1] = x;
00194             }
00195             x = (*ii)->GetSubjMax();
00196             if(span[3] < x) {
00197                 span[3] = x;
00198             }
00199         }
00200     }
00201 
00202     static TCoord s_GetOverlap(TCoord L1, TCoord R1, TCoord L2, TCoord R2)
00203     {
00204         const TCoord minmax = min(R1, R2);
00205         const TCoord maxmin = max(L1, L2);
00206         const  int n = minmax - maxmin + 1;
00207         return n > 0? n: 0;
00208     }
00209     
00210     /// Multiple-sequences greedy alignment uniquification algorithm
00211     ///
00212     /// @param hri_begin
00213     ///    Input hit range start
00214     /// @param hri_end
00215     ///    Input hit range stop
00216     /// @param phits_new
00217     ///    An external hit vector to keep any extra hits created during the call
00218     /// @param min_hit_len
00219     ///    Minimum alignment length to keep
00220     /// @param min_hit_idty
00221     ///    Minimum alignment identity to keep
00222     /// @param margin
00223     ///    Speed/memory trade-off (0 will use max memory; >0 will slow down a bit but 
00224     //     reduce memory requirements)
00225     /// @retain_overlap
00226     ///    Minimum length of overlaps to keep (0 = no overlaps)
00227 
00228     enum EUnique_type {
00229         e_Strict,
00230         e_Query,
00231         e_Subject
00232     };
00233     
00234     static void s_RunGreedy(typename THitRefs::iterator hri_beg, 
00235                             typename THitRefs::iterator hri_end,
00236                             THitRefs* phits_new,
00237                             TCoord min_hit_len = 100,
00238                             double min_hit_idty = .9,
00239                             TCoord margin = 1,
00240                             TCoord retain_overlap = 0,
00241                             EUnique_type unique_type = e_Strict)
00242     {
00243 
00244         int max_idx = 2;
00245         int min_idx = 0;
00246         switch (unique_type) {
00247             case e_Strict:
00248                 break;
00249             case e_Query:
00250                 max_idx = 1;
00251                 break;
00252             case e_Subject:
00253                 min_idx = 1;
00254                 break;
00255         }
00256                 
00257         if(hri_beg > hri_end) {
00258             NCBI_THROW(CAlgoAlignUtilException, eInternal, 
00259                        "Invalid input iterator order");
00260         }
00261 
00262         const size_t dim0 = hri_end - hri_beg;
00263         THitRefs all;
00264         
00265         size_t dim_factor = 2;
00266         bool restart = true;
00267         
00268         vector<bool> del;
00269 
00270         while(restart) {
00271             
00272             restart = false;
00273 
00274             all.resize(dim0);
00275             copy(hri_beg, hri_end, all.begin());
00276             all.reserve(dim_factor * dim0);
00277 
00278             const typename THitRefs::iterator all_beg = all.begin();
00279 
00280             // compile ordered set of hit ends
00281             typename THitRefs::iterator ii = all_beg;
00282             THitEnds hit_ends;
00283             for(size_t i = 0; i < dim0; ++i, ++ii) {
00284 
00285                 THitRef& h = *ii;
00286                 for(Uint1 point = 0; point < 4; ++point) {
00287                     THitEnd he;
00288                     he.m_Point = point;
00289                     he.m_Ptr = &h;
00290                     he.m_X = h->GetBox()[point];
00291                     hit_ends.insert(he);
00292                 }
00293             }
00294 
00295             // Create additional markers for potential 'hot dogs'.
00296             // A hot dog is a higher-scoring hit within a lower scoring one.
00297             THitEnds hit_mids;
00298             typename THit::TId id;
00299             id = (*(hit_ends.begin()->m_Ptr))->GetId(hit_ends.begin()->m_Point/2);
00300             typedef pair<THitRef*, Uint1> THitEndInfo;
00301             typedef set<THitEndInfo> TOpenHits;
00302             TOpenHits open_hits;
00303             open_hits.insert(
00304                 THitEndInfo(hit_ends.begin()->m_Ptr,hit_ends.begin()->m_Point/2));
00305             ITERATE(typename THitEnds, ii, hit_ends) {
00306 
00307                 const Uint1 where = ii->m_Point/2;
00308                 THitRef* hitrefptr = ii->m_Ptr;
00309                 if((*hitrefptr)->GetId(where)->CompareOrdered(*id) != 0) {
00310                     // new id => reset
00311                     open_hits.clear();
00312                     open_hits.insert(THitEndInfo(hitrefptr,where));
00313                     id = (*hitrefptr)->GetId(where);
00314                 }
00315                 else {
00316                     typename TOpenHits::const_iterator iise = open_hits.end();
00317                     typename TOpenHits::iterator iis 
00318                         = open_hits.find(THitEndInfo(hitrefptr,where));
00319                     if(iis == iise) {
00320                         // new open
00321                         open_hits.insert(THitEndInfo(hitrefptr,where));
00322                     }
00323                     else {
00324                         // hit closed => check for hot fogs
00325                         open_hits.erase(iis);
00326                         const float curscore = (*hitrefptr)->GetScore();
00327                         ITERATE(typename TOpenHits, iis2, open_hits) {
00328                             
00329                             bool hd_score (((*(iis2->first))->GetScore() < curscore));
00330                             // In large-scale alignments, the 'hot dogs' can be 
00331                             // numerous and increase the size of hit_mids 
00332                             // significantly. This is addressed by using the margin,
00333                             // which reduces the number of mid-points stored,
00334                             // but requires to scan through endpoints in a larger
00335                             // vicinity of a current best hit.
00336                             bool hd_xl (margin + (*(iis2->first))->GetMin(iis2->second)
00337                                         < (*hitrefptr)->GetMin(where));
00338                             bool hd_xr (margin + (*hitrefptr)->GetMax(where)
00339                                         < (*(iis2->first))->GetMax(iis2->second));
00340                             if(hd_score && hd_xl && hd_xr) {
00341                                 
00342                                 THitEnd hm;
00343                                 hm.m_Point = iis2->second * 2;
00344                                 hm.m_Ptr = iis2->first;
00345                                 hm.m_X = ( (*hitrefptr)->GetStart(where) 
00346                                            + (*hitrefptr)->GetStop(where) ) / 2;
00347                                 hit_mids.insert(hm);
00348                             }
00349                         }
00350                     }
00351                 }
00352             }
00353 
00354             hit_ends.insert(hit_mids.begin(), hit_mids.end());
00355 
00356             vector<bool> skip (dim0, false);
00357             skip.reserve(dim_factor * dim0);
00358             del.assign(dim0, false);
00359             del.reserve(dim_factor * dim0);
00360 
00361             const THitRef* hitref_firstptr = &(*all_beg);
00362             size_t dim = dim0;
00363             while(restart == false) {
00364 
00365                 // select the best hit
00366                 double score_best = 0;
00367                 typename THitRefs::iterator ii = all_beg, ii_best = all.end();
00368                 for(size_t i = 0; i < dim; ++i, ++ii) {
00369                     if(skip[i] == false) {
00370                         const double score = (*ii)->GetScore();
00371                         if(score > score_best) {
00372                             ii_best = ii;
00373                             score_best = score;
00374                         }
00375                     }
00376                 }
00377 
00378                 if(ii_best == all.end()) {
00379                     break;
00380                 }
00381 
00382                 // truncate overlaps with the current best hit
00383                 const THitRef& hc = *ii_best;
00384                 const bool is_constraint = hc->GetScore() > 1e20;
00385                 THitEnd he [4];
00386                 for(Uint1 point = 0; point < 4; ++point) {
00387 
00388                     he[point].m_Point = point;
00389                     he[point].m_Ptr = const_cast<THitRef*>(&hc);
00390                     if(is_constraint == false) {
00391                         he[point].m_X = hc->GetBox()[point];
00392                     }
00393                     else {
00394                         const TCoord a = 0, b = numeric_limits<TCoord>::max() / 2;
00395                         const bool is_start = point % 2 == 0;
00396                         const bool is_plus  = hc->GetStrand(point / 2);
00397                         he[point].m_X =  (is_start ^ is_plus)? b: a;
00398                     }
00399                 }
00400       
00401                 for(Uint1 where = min_idx; where < max_idx; ++where) {
00402 
00403                     const TCoord cmin = hc->GetMin(where);
00404                     const TCoord cmax = hc->GetMax(where);
00405 
00406                     THitEnd *phe_lo, *phe_hi;
00407                     const Uint1 i1 = 2*where, i2 = i1 + 1;
00408                     if(hc->GetStrand(where)) {
00409                         phe_lo = &he[i1];
00410                         phe_hi = &he[i2];
00411                     }
00412                     else {
00413                         phe_lo = &he[i2];
00414                         phe_hi = &he[i1];
00415                     }
00416 
00417                     if(phe_lo->m_X >= margin) {
00418                         phe_lo->m_X -= margin;
00419                     }
00420                     else {
00421                         phe_lo->m_X = 0;
00422                     }
00423 
00424                     phe_hi->m_X += margin;
00425 
00426                     typedef typename THitEnds::iterator THitEndsIter;
00427                     THitEndsIter ii0 (hit_ends.lower_bound(*phe_lo));
00428                     THitEndsIter ii1 (hit_ends.upper_bound(*phe_hi));
00429 
00430                     // special case: if X is zero, go left as long as it's the same id
00431                     if(phe_lo->m_X == 0) {
00432                         THitRef hitref (*(ii0->m_Ptr));
00433                         const typename THit::TId& id (hitref->GetId(ii0->m_Point/2));
00434                         for(THitEndsIter ii_start(hit_ends.begin()); 
00435                             ii0 != ii_start; --ii0)
00436                         {
00437                             THitRef hr (*(ii0->m_Ptr));
00438                             const typename THit::TId & id2 (hr->GetId(ii0->m_Point/2));
00439                             if(0 != id->CompareOrdered(*id2)) {
00440                                 ++ii0;
00441                                 break;
00442                             }
00443                         }
00444                     }
00445 
00446                     for(typename THitEnds::iterator ii (ii0); ii != ii1; ++ii) {
00447                     
00448                         const THitEnd& he = *ii;                   
00449 
00450                         const size_t hitrefidx = he.m_Ptr - hitref_firstptr;
00451                         const bool alive       = skip[hitrefidx] == false;
00452                         const bool self        = he.m_Ptr == &hc;
00453 
00454                         if(alive && !self) {
00455                             
00456                             THitRef hit_new;
00457                             int newpos 
00458                                 = sx_Cleave(*he.m_Ptr, he.m_Point/2, 
00459                                             cmin, cmax, min_hit_len, 
00460                                             min_hit_idty, & hit_new,
00461                                             retain_overlap);
00462 
00463                             if(newpos <= -2) { // eliminate
00464                                 del[hitrefidx] = skip[hitrefidx] = true;
00465                             }
00466 
00467                             if(newpos >= 0) { // truncated or split
00468 
00469                                 const TCoord* box = (*he.m_Ptr)->GetBox();
00470 
00471                                 THitEnd he2;
00472                                 he2.m_Point = he.m_Point;
00473                                 he2.m_Ptr = he.m_Ptr;
00474                                 he2.m_X = box[he2.m_Point];
00475                                 hit_ends.insert(he2);
00476 
00477                                 THitEnd he3;
00478                                 he3.m_Point = (he.m_Point + 2) % 4;
00479                                 he3.m_Ptr = he.m_Ptr;
00480                                 he3.m_X = box[he3.m_Point];
00481                                 hit_ends.insert(he3);
00482 
00483                                 if(hit_new.NotEmpty()) {
00484                                 
00485                                     if(++dim > 2 * dim0) {
00486                                         dim_factor *= 2;
00487                                         restart = true;
00488                                         break;
00489                                     }
00490 
00491                                     all.push_back(hit_new);
00492                                     THitRef* ptr = & all.back();
00493                                     del.push_back(false);
00494                                     skip.push_back(false);
00495 
00496                                     Uint1 point = he.m_Point;
00497                                     Uint1 where = point / 2;
00498 
00499                                     // only two new ends to add
00500                                     THitEnd he2;
00501                                     he2.m_Point = point;
00502                                     he2.m_Ptr = ptr;
00503                                     he2.m_X = (box[point]== hit_new->GetStart(where))?
00504                                         hit_new->GetStop(where): 
00505                                         hit_new->GetStart(where);
00506                                     hit_ends.insert(he2);
00507 
00508                                     point = (he.m_Point + 2) % 4;
00509                                     where = point / 2;
00510 
00511                                     THitEnd he3;
00512                                     he3.m_Point = point;
00513                                     he3.m_Ptr = ptr;
00514                                     he3.m_X = (box[point]==hit_new->GetStart(where))?
00515                                         hit_new->GetStop(where):
00516                                         hit_new->GetStart(where);
00517                                     hit_ends.insert(he3);
00518 
00519                                     // TODO: add midpoints for potential 
00520                                     // hot dogs as above
00521 
00522                                 } // if(new_hit.NotEmpty())
00523                             } // if(newpos >= 0) 
00524                         } // if(alive && !self)
00525                     } // for(ii ...
00526                 } // for(where ...
00527 
00528                 skip[&hc - hitref_firstptr] = true;
00529             } // while(restart == false)
00530         } // while(restart)
00531 
00532         // execute any pending deletions
00533         typename THitRefs::iterator all_beg = all.begin(), ii = all_beg;
00534         const size_t dim = all.size();
00535         for(size_t i = 0; i < dim; ++i, ++ii) {
00536             if(del[i]) {
00537                 ii->Reset(NULL);
00538             }
00539         }
00540 
00541         // copy hits to the results vector until the results are full or
00542         // there are no more hits to copy
00543         typename THitRefs::iterator idst = hri_beg, isrc = all_beg, 
00544             isrc_end = all.end();
00545         while(isrc != isrc_end && idst != hri_end) {
00546             if(isrc->NotEmpty()) {
00547                 *idst++ = *isrc;
00548             }
00549             ++isrc;
00550         }
00551 
00552         // nullify any slack hits in the results
00553         while(idst != hri_end) {
00554             idst++ -> Reset(NULL);
00555         }
00556 
00557         // if there are hits remaining, write them as new
00558         phits_new->resize(0);
00559         while(isrc != isrc_end) {
00560             if(isrc->NotEmpty()) {
00561                 phits_new->push_back(*isrc);
00562             }
00563             ++isrc;
00564         }
00565     }
00566 
00567 
00568     // merge hits abutting on query(subject) but separated on subject(query).
00569     // maxlenfr is the maximum allowed length of an alignment within a gap
00570     // as a fraction of the minimum length of two merging hits
00571     static void s_MergeAbutting(typename THitRefs::iterator hri_beg,
00572                                 typename THitRefs::iterator hri_end,
00573                                 const double& maxlenfr,
00574                                 THitRefs* pout)
00575     {
00576         if(hri_beg > hri_end) {
00577             NCBI_THROW(CAlgoAlignUtilException, eInternal, 
00578                        "Invalid input iterator order");
00579         }
00580 
00581         // copy input hits and make all queries in plus strand
00582         const size_t dim0 = hri_end - hri_beg;
00583         THitRefs all;
00584         all.reserve(dim0);
00585         for(typename THitRefs::iterator hri = hri_beg; hri != hri_end; ++hri) {
00586             if((*hri)->GetQueryStrand() == false) {
00587                 (*hri)->FlipStrands();
00588             }
00589             all.push_back(*hri);
00590         }
00591 
00592         // compile ordered set of hit ends;
00593         // go from the ends for better balancing
00594         typename THitRefs::iterator ii1 = all.begin();
00595         typename THitRefs::iterator ii2 = all.end(); --ii2;
00596         THitEnds hit_ends;
00597         while(ii1 != ii2) {
00598             
00599             THitRef& h1 = *ii1;
00600             THitRef& h2 = *ii2;
00601             for(Uint1 point = 0; point < 4; ++point) {
00602 
00603                 THitEnd he1;
00604                 he1.m_Point = point;
00605                 he1.m_Ptr = &h1;
00606                 he1.m_X = h1->GetBox()[point];
00607                 hit_ends.insert(he1);
00608 
00609                 THitEnd he2;
00610                 he2.m_Point = point;
00611                 he2.m_Ptr = &h2;
00612                 he2.m_X = h2->GetBox()[point];
00613                 hit_ends.insert(he2);
00614             }
00615 
00616             if(++ii1 == ii2) {
00617                 --ii2;
00618                 break;
00619             }
00620             --ii2;
00621         }
00622 
00623         if(ii1 == ii2) {
00624             THitRef& h = *ii2;
00625             for(Uint1 point = 0; point < 4; ++point) {
00626                 THitEnd he;
00627                 he.m_Point = point;
00628                 he.m_Ptr = &h;
00629                 he.m_X = h->GetBox()[point];
00630                 hit_ends.insert(he);
00631             }
00632         }
00633 
00634         // collate by id pairs and strands
00635         typedef CHitComparator<THit> THitComparator;
00636         typename THitComparator::ESortCriterion sort_type
00637             (THitComparator::eQueryIdSubjIdSubjStrand);
00638         THitComparator sorter (sort_type);
00639         stable_sort(all.begin(), all.end(), sorter);
00640         
00641         // go chunk-by-chunk and merge whatever is suitable
00642         pout->clear();
00643         
00644         THitRef h0 (all.front());
00645         typename THitRefs::iterator ib = all.begin();
00646         NON_CONST_ITERATE(typename THitRefs, ii, all) {
00647 
00648             if ( ! (*ii)->GetQueryId()->Match(*(h0->GetQueryId())) ||
00649                  ! (*ii)->GetSubjId()->Match(*(h0->GetSubjId())) ||
00650                  (*ii)->GetSubjStrand() != h0->GetSubjStrand() )
00651             {
00652                 h0 = *ii;
00653                 sx_TestAndMerge(ib, ii, hit_ends, maxlenfr, pout);
00654                 ib = ii;
00655             }
00656         }
00657         sx_TestAndMerge(ib, all.end(), hit_ends, maxlenfr, pout);
00658     }
00659 
00660     // helper predicate
00661     static bool s_PNullRef(const THitRef& hr) {
00662         return hr.IsNull();
00663     }
00664 
00665 protected:
00666 
00667     struct SHitEnd {
00668 
00669         Uint1    m_Point; // 0 = query start, 1 = query stop, 2 = subj start, 3 = stop
00670         THitRef* m_Ptr;
00671         TCoord   m_X;
00672 
00673         bool operator < (const SHitEnd& rhs) const {
00674             
00675             const Uint1 where1 = m_Point < 2? 0: 1;
00676             const Uint1 where2 = rhs.m_Point < 2? 0: 1;
00677             const THit& h1 = **m_Ptr;
00678             const THit& h2 = **rhs.m_Ptr;
00679             int c = h1.GetId(where1)->CompareOrdered(*h2.GetId(where2));
00680             return c < 0? true: (c > 0? false:
00681              (m_X == rhs.m_X? (h1.GetScore() < h2.GetScore()): m_X < rhs.m_X) );
00682         }
00683 
00684         friend ostream& operator << (ostream& ostr, const SHitEnd& he) {
00685             return 
00686                 ostr << "(Point,Coord) = (" 
00687                      << int(he.m_Point) << ", " 
00688                      << he.m_X << ")\n"
00689                      << **he.m_Ptr;
00690         }
00691     };
00692 
00693     typedef SHitEnd THitEnd;
00694     typedef multiset<THitEnd> THitEnds;
00695 
00696     // return adjusted coordinate; -1 if unaffected; -2 to delete; -3 on exception;
00697     // 0 on split.
00698     static int sx_Cleave(THitRef& h, Uint1 where, 
00699                          TCoord cmin, TCoord cmax, 
00700                          TCoord min_hit_len, double& min_idty,
00701                          THitRef* pnew_hit,
00702                          TCoord retain_overlap)
00703     {
00704         int rv = -1;
00705         pnew_hit->Reset(NULL);
00706 
00707         try {
00708             TCoord lmin = h->GetMin(where), lmax = h->GetMax(where);
00709 
00710             if(retain_overlap > 0) {
00711                 const TCoord overlap = s_GetOverlap(lmin, lmax, cmin, cmax);
00712                 if(overlap >= retain_overlap) {
00713                     return -1;
00714                 }
00715             }
00716 
00717             if(cmin <= lmin && lmax <= cmax) {
00718                 return -2;
00719             }
00720 
00721             int ldif = cmin - lmin, rdif = lmax - cmax;
00722 
00723             if(ldif > int(min_hit_len) && rdif > int(min_hit_len)) {
00724                 
00725                 // create a new hit
00726                 pnew_hit->Reset(new THit (*h));
00727                 h->Modify(2*where + 1, lmax = cmin - 1);
00728                 (*pnew_hit)->Modify(2*where, lmin = cmax + 1);
00729                 if((*pnew_hit)->GetIdentity() < min_idty) {
00730                     pnew_hit->Reset(NULL);
00731                 }
00732                 return (h->GetIdentity() < min_idty)? -2: 0;
00733             }
00734             else if(ldif > rdif) {
00735 
00736                 if(lmin <= cmin && cmin <= lmax) {
00737                     if(cmin == 0) return -2;
00738                     h->Modify(2*where + 1, lmax = cmin - 1);
00739                     rv = lmax;
00740                 }
00741                 if(lmin <= cmax && cmax <= lmax) {
00742                     h->Modify(2*where, lmin = cmax + 1);
00743                     rv = lmin;
00744                 }
00745             }
00746             else {
00747 
00748                 if(lmin <= cmax && cmax <= lmax) {
00749                     h->Modify(2*where, lmin = cmax + 1);
00750                     rv = lmin;
00751                 }
00752                 if(lmin <= cmin && cmin <= lmax) {
00753                     if(cmin == 0) return -2;
00754                     h->Modify(2*where + 1, lmax = cmin - 1);
00755                     rv = lmax;
00756                 }
00757 
00758             }
00759 
00760             if(1 + lmax - lmin < min_hit_len || h->GetIdentity() < min_idty) {
00761                 return -2;
00762             }
00763         }
00764         catch(CAlgoAlignUtilException&) {
00765             rv = -3; // hit would become inconsistent through the adjustment
00766         }
00767         return rv;
00768     }
00769 
00770     
00771     static THitRef sx_Merge(const THitRef& lhs, const THitRef& rhs, Uint1 where)
00772     {
00773         THitRef rv (0);
00774 
00775         const bool sstrand (lhs->GetSubjStrand());
00776         if(sstrand != rhs->GetSubjStrand()) {
00777 
00778             NCBI_THROW(CAlgoAlignUtilException, eInternal, 
00779                        "Cannot merge hits: different strands");
00780         }
00781 
00782         const int qgap ((!sstrand && where == 1)? 
00783                         (int(lhs->GetQueryMin()) - int(rhs->GetQueryMax())):
00784                         (int(rhs->GetQueryMin()) - int(lhs->GetQueryMax())));
00785 
00786         const int sgap ((!sstrand && where == 0)?
00787                         (int(lhs->GetSubjMin()) - int(rhs->GetSubjMax())):
00788                         (int(rhs->GetSubjMin()) - int(lhs->GetSubjMax())));
00789 
00790         const bool bv1 (abs(qgap) == 1 && abs(sgap) > 0);
00791         const bool bv2 (abs(sgap) == 1 && abs(qgap) > 0);
00792 
00793         if(!bv1 && !bv2){
00794             return rv;
00795         }
00796 
00797         typename THit::TTranscript x_lhs = 
00798             THit::s_RunLengthDecode(lhs->GetTranscript());
00799         typename THit::TTranscript x_rhs = 
00800             THit::s_RunLengthDecode(rhs->GetTranscript());
00801 
00802         if(x_lhs.size() && x_rhs.size()) {
00803 
00804             string x_gap;
00805             if(where == 0) {
00806                 x_gap.assign(abs(sgap) - 1, 'I');
00807             }
00808             else {
00809                 x_gap.assign(abs(qgap) - 1, 'D');
00810             }
00811 
00812             string xcript;
00813             THitRef h;
00814             if(where == 0 || sstrand) {
00815                 xcript = x_lhs + x_gap + x_rhs;
00816                 h = lhs;
00817             }
00818             else {
00819                 xcript = x_rhs + x_gap + x_lhs;
00820                 h = rhs;
00821             }
00822 
00823             rv.Reset (new THit(h->GetId(0), h->GetStart(0), h->GetStrand(0), 
00824                                h->GetId(1), h->GetStart(1), h->GetStrand(1), 
00825                                xcript));
00826         }
00827         else {
00828 
00829             rv.Reset(new THit());
00830             rv->SetId(0, lhs->GetId(0));
00831             rv->SetId(1, lhs->GetId(1));
00832             TCoord box [4] = {
00833                 lhs->GetQueryStart(), rhs->GetQueryStop(),
00834                 lhs->GetSubjStart(), rhs->GetSubjStop()
00835             };
00836             rv->SetBox(box);
00837         }
00838 
00839         return rv;
00840     }
00841 
00842     
00843     static bool s_TrackHit(const THit& h)
00844     {
00845         const string xcript (THit::s_RunLengthDecode(h.GetTranscript()));
00846         if(xcript.size()) {
00847 
00848             TCoord q = h.GetQueryStart(), q0 = q, s = h.GetSubjStart(), s0 = s;
00849 
00850             const int qinc = h.GetQueryStrand()? 1 : -1;
00851             const int sinc = h.GetSubjStrand()? 1 : -1;
00852 
00853             ITERATE(string, ii, xcript) {
00854                 switch(*ii) {
00855                 case 'M' : case 'R' : q0 = q; q += qinc; s0 = s; s += sinc; break;
00856                 case 'I' : s0 = s; s += sinc; break;
00857                 case 'D' : q0 = q; q += qinc; break;
00858                 default:
00859                     NCBI_THROW(CAlgoAlignUtilException, eInternal, 
00860                                "Invalid transcript symbol");
00861                 }
00862             }
00863             
00864             const bool rv = (q0 == h.GetQueryStop() && s0 == h.GetSubjStop());
00865             return rv;
00866         }
00867         else {
00868             return true;
00869         }
00870     }
00871 
00872 
00873     static void sx_TM(Uint1 where,
00874                       typename THitRefs::iterator ii_beg,
00875                       typename THitRefs::iterator ii_end,
00876                       const THitEnds& hit_ends,
00877                       const double& maxlenfr,
00878                       map<typename THitRefs::iterator, THitRef>& m)
00879     {
00880         typedef CHitComparator<THit> THitComparator;
00881         typename THitComparator::ESortCriterion sort_type (
00882                  where == 0? 
00883                  THitComparator::eQueryMinScore:
00884                  THitComparator::eSubjMinScore);
00885         THitComparator sorter (sort_type);
00886         stable_sort(ii_beg, ii_end, sorter);
00887 
00888         typedef typename THitRefs::iterator TIter;
00889         typedef map<typename THitRefs::iterator, THitRef> TIterMap;
00890         for(TIter ii = ii_beg; ii != ii_end; ++ii)  {
00891 
00892             TIter jj = ii;
00893             const TCoord cmax = (*ii)->GetMax(where);
00894             TCoord cmin = 0;
00895             for(++jj; jj != ii_end && cmax >= cmin; ++jj) {
00896                 cmin = (*jj)->GetMin(where);
00897             }
00898             
00899             if(cmax + 1 == cmin) {
00900 
00901                 --jj;
00902                 bool can_merge = true;
00903                 const Uint1 where2 ((where + 1) % 2);
00904                 const bool subj_strand ((*ii)->GetSubjStrand());
00905                 if(subj_strand) {
00906                     can_merge = (*ii)->GetMax(where2) < (*jj)->GetMin(where2);
00907                 }
00908                 else {
00909                     can_merge = (*jj)->GetMax(where2) < (*ii)->GetMin(where2);
00910                 }
00911                 
00912                 if(can_merge) {
00913 
00914                     // also test the maxlenfr condition using hit_ends
00915                     
00916                     THitEnd he_ii, he_jj;
00917 
00918                     if(subj_strand) {
00919 
00920                         he_ii.m_Point = 2*where2 + 1;
00921                         he_ii.m_Ptr = &(*ii);
00922                         he_ii.m_X = (*ii)->GetStop(where2) + 1;
00923 
00924                         he_jj.m_Point = 2*where2;
00925                         he_jj.m_Ptr = &(*jj);
00926                         he_jj.m_X = (*jj)->GetStart(where2) - 1;
00927                     }
00928                     else {
00929 
00930                         he_jj.m_Point = 2*where2 + 1;
00931                         he_jj.m_Ptr = &(*ii);
00932                         he_jj.m_X = (*ii)->GetStop(where2) - 1;
00933 
00934                         he_ii.m_Point = 2*where2;
00935                         he_ii.m_Ptr = &(*jj);
00936                         he_ii.m_X = (*jj)->GetStart(where2) + 1;
00937                     }
00938 
00939                     typedef typename THitEnds::const_iterator THitEndsIter;
00940                     const THitEndsIter ii0 = hit_ends.lower_bound(he_ii);
00941                     const THitEndsIter ii1 = hit_ends.upper_bound(he_jj);
00942 
00943                     const size_t max_len
00944                         = size_t(maxlenfr * min((*ii)->GetLength(),
00945                                                 (*jj)->GetLength())
00946                                  + 0.5);
00947 
00948                     for(THitEndsIter ii = ii0; ii != ii1; ++ii) {
00949 
00950                         const THitRef& h = *(ii->m_Ptr);
00951 
00952                         TCoord cmin (h->GetMin(where2));
00953                         if(cmin < he_ii.m_X) cmin = he_ii.m_X;
00954 
00955                         TCoord cmax (h->GetMax(where2));
00956                         if(cmax < he_jj.m_X) cmax = he_jj.m_X;
00957                         
00958                         if(cmax - cmin + 1 > max_len) {
00959                             can_merge = false;
00960                             break;
00961                         }
00962                     }
00963                 }
00964 
00965                 if(can_merge) {
00966                 
00967                     // if (*ii) already merged, merge with the result;
00968                     // if (*jj) already merged, do not merge.
00969                     typename TIterMap::iterator imi = m.find(ii);
00970                     THitRef lhs (imi == m.end()? *ii: imi->second);
00971                     if (m.find(jj) == m.end() && lhs.NotEmpty()) {
00972 
00973                         THitRef rv (sx_Merge(lhs, *jj, where));
00974                         if(rv.NotNull()) {
00975                             m[ii] = THitRef(0);
00976                             m[jj] = rv;
00977                         }
00978                     }                    
00979                 }
00980             }
00981         }
00982     }
00983 
00984 
00985     // merging of abutting hits sharing same ids and subject strand
00986     // (plus query strand assumed)
00987     void static sx_TestAndMerge(typename THitRefs::iterator ii_beg,
00988                                 typename THitRefs::iterator ii_end,
00989                                 const THitEnds& hit_ends,
00990                                 const double& maxlenfr,
00991                                 THitRefs* pout)
00992     {
00993         typedef typename THitRefs::iterator TIter;
00994         typedef map<TIter, THitRef> TIterMap;
00995         TIterMap m;
00996 
00997         sx_TM(0, ii_beg, ii_end, hit_ends, maxlenfr, m);
00998         sx_TM(1, ii_beg, ii_end, hit_ends, maxlenfr, m);
00999 
01000         // copy final merge targets and unmerged hits
01001         for(TIter ii = ii_beg; ii != ii_end; ++ii) {
01002             typename TIterMap::iterator imi = m.find(ii);
01003             THitRef hr (imi == m.end()? *ii: imi->second);
01004             if(hr.NotEmpty()) {
01005                 pout->push_back(hr);
01006             }
01007         }
01008     }
01009 
01010 };
01011 
01012 
01013 END_NCBI_SCOPE
01014 
01015 
01016 /* @} */
01017 
01018 #endif /* ALGO_ALIGN_UTIL_HITFILTER__HPP */
01019 
01020 

Generated on Sun Dec 6 21:55:33 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Mon Dec 07 16:20:32 2009 by modify_doxy.py rev. 173732