include/algo/align/nw/nw_aligner.hpp

Go to the documentation of this file.
00001 #ifndef ALGO_ALIGN_NW_NW_ALIGNER__HPP
00002 #define ALGO_ALIGN_NW_NW_ALIGNER__HPP
00003 
00004 /* $Id: nw_aligner.hpp 124578 2008-04-15 18:30:18Z kapustin $
00005 * ===========================================================================
00006 *
00007 *                            public DOMAIN NOTICE                          
00008 *               National Center for Biotechnology Information
00009 *                                                                          
00010 *  This software/database is a "United States Government Work" under the   
00011 *  terms of the United States Copyright Act.  It was written as part of    
00012 *  the author's official duties as a United States Government employee and 
00013 *  thus cannot be copyrighted.  This software/database is freely available 
00014 *  to the public for use. The National Library of Medicine and the U.S.    
00015 *  Government have not placed any restriction on its use or reproduction.  
00016 *                                                                          
00017 *  Although all reasonable efforts have been taken to ensure the accuracy  
00018 *  and reliability of the software and data, the NLM and the U.S.          
00019 *  Government do not and cannot warrant the performance or results that    
00020 *  may be obtained by using this software or data. The NLM and the U.S.    
00021 *  Government disclaim all warranties, express or implied, including       
00022 *  warranties of performance, merchantability or fitness for any particular
00023 *  purpose.                                                                
00024 *                                                                          
00025 *  Please cite the author in any work or product based on this material.   
00026 *
00027 * ===========================================================================
00028 *
00029 * Author:  Yuri Kapustin, Alexander Souvorov
00030 *
00031 * File Description:
00032 *   CNWAligner class definition
00033 *
00034 *   CNWAligner encapsulates a generic global (Needleman-Wunsch)
00035 *   alignment algorithm with affine gap penalty model.
00036 *
00037 */
00038 
00039 #include <corelib/ncbistd.hpp>
00040 #include <corelib/ncbiobj.hpp>
00041 #include <corelib/ncbi_limits.hpp>
00042 #include <util/tables/raw_scoremat.h>
00043 #include <objects/seqloc/Na_strand.hpp>
00044 
00045 #include <vector>
00046 #include <string>
00047 
00048 
00049 /** @addtogroup AlgoAlignRoot
00050  *
00051  * @{
00052  */
00053 
00054 
00055 BEGIN_NCBI_SCOPE
00056 
00057 BEGIN_SCOPE(objects)
00058     class CDense_seg;
00059     class CSeq_id;
00060 END_SCOPE(objects)
00061 
00062 // Needleman-Wunsch algorithm encapsulation
00063 //
00064 
00065 class  CNWAligner: public CObject
00066 {
00067 public:
00068     typedef int TScore;
00069 
00070     // ctors
00071     CNWAligner(void);
00072 
00073     // Null scoremat pointer indicates IUPACna coding
00074     CNWAligner(const char* seq1, size_t len1,
00075                const char* seq2, size_t len2,
00076                const SNCBIPackedScoreMatrix* scoremat = 0);
00077 
00078     CNWAligner(const string& seq1,
00079                const string& seq2,
00080                const SNCBIPackedScoreMatrix* scoremat = 0);
00081 
00082     virtual ~CNWAligner(void) {}
00083 
00084     // Compute the alignment
00085     virtual TScore Run(void);
00086 
00087     // Setters
00088     virtual void SetSequences(const char* seq1, size_t len1,
00089                               const char* seq2, size_t len2,
00090                               bool verify = true);
00091 
00092     void SetSequences(const string& seq1,
00093                       const string& seq2,
00094                       bool verify = true);
00095   
00096     void SetScoreMatrix(const SNCBIPackedScoreMatrix* scoremat);
00097 
00098     void SetWm  (TScore value);                      // match (na)
00099     void SetWms (TScore value);                      // mismatch (na)
00100     void SetWg  (TScore value)  { m_Wg  = value; }   // gap opening
00101     void SetWs  (TScore value)  { m_Ws  = value; }   // gap extension
00102 
00103     // specify whether end gaps should be penalized
00104     void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2);
00105 
00106     // alignment pattern (guides)
00107     void SetPattern(const vector<size_t>& pattern);
00108 
00109     // max memory to use
00110     void SetSpaceLimit(const size_t& maxmem) { m_MaxMem = maxmem; }
00111 
00112     // progress reporting
00113     struct SProgressInfo
00114     {
00115         SProgressInfo(void): m_iter_done(0), m_iter_total(0), m_data(0) {}
00116         size_t m_iter_done;
00117         size_t m_iter_total;
00118         void*  m_data;
00119         char   m_text_buffer [1024];
00120     };
00121 
00122     // return true to cancel calculation
00123     typedef bool (*FProgressCallback) (SProgressInfo*);
00124     void SetProgressCallback ( FProgressCallback prg_callback, void* data );
00125 
00126     // Getters
00127     static TScore GetDefaultWm  (void) { return  1; }
00128     static TScore GetDefaultWms (void) { return -2; }
00129     static TScore GetDefaultWg  (void) { return -5; }
00130     static TScore GetDefaultWs  (void) { return -2; }
00131 
00132     TScore GetWm  (void) const { return m_Wm; }
00133     TScore GetWms (void) const { return m_Wms; }
00134     TScore GetWg  (void) const { return m_Wg; }
00135     TScore GetWs  (void) const { return m_Ws; }
00136 
00137     const char*   GetSeq1(void) const { return m_Seq1; }
00138     size_t        GetSeqLen1(void) const { return m_SeqLen1; }
00139     const char*   GetSeq2(void) const { return m_Seq2; }
00140     size_t        GetSeqLen2(void) const { return m_SeqLen2; }
00141 
00142     void          GetEndSpaceFree(bool* L1, bool* R1, bool* L2, bool* R2)
00143                       const;
00144 
00145     TScore        GetScore(void) const;
00146 
00147     size_t        GetSpaceLimit(void) const {  return m_MaxMem; }
00148     static size_t GetDefaultSpaceLimit(void) {
00149         return 0xFFFFFFFF;
00150     }
00151     
00152     // alignment transcript
00153     enum ETranscriptSymbol {
00154         eTS_None         = 0   
00155         ,eTS_Delete       = 'D'
00156         ,eTS_Insert       = 'I'
00157         ,eTS_Match        = 'M'
00158         ,eTS_Replace      = 'R'
00159         ,eTS_Intron       = 'Z'
00160 #ifdef ALGOALIGN_NW_SPLIGN_MAKE_PUBLIC_BINARY
00161         ,eTS_SlackDelete // unaligned s-w term
00162         ,eTS_SlackInsert // -- " -- 
00163 #endif
00164     };
00165     typedef vector<ETranscriptSymbol> TTranscript;
00166 
00167     // raw transcript
00168     TTranscript   GetTranscript(bool reversed = true) const;
00169     void          SetTranscript(const TTranscript& transcript);
00170 
00171     // transcript as a string
00172     string GetTranscriptString(void) const;
00173 
00174     // if set, all positively scoring diags will be
00175     // recorded as matches in the alignment transcript;
00176     // only real matches otherwise.
00177     void  SetPositivesAsMatches(bool positives_as_matches = true) {
00178         m_PositivesAsMatches = positives_as_matches;
00179     }
00180     bool  GetPositivesAsMatches(void) const {
00181         return m_PositivesAsMatches;
00182     }
00183 
00184     // transcript parsers
00185     size_t         GetLeftSeg(size_t* q0, size_t* q1,
00186                               size_t* s0, size_t* s1,
00187                               size_t min_size) const;
00188     size_t         GetRightSeg(size_t* q0, size_t* q1,
00189                                size_t* s0, size_t* s1,
00190                                size_t min_size) const;
00191     size_t         GetLongestSeg(size_t* q0, size_t* q1,
00192                                  size_t* s0, size_t* s1) const;
00193  
00194     // returns the size of a single backtrace matrix element
00195     virtual size_t GetElemSize(void) const {
00196         return 1;
00197     }
00198 
00199     // Compute score with the given transcript and sequences offsets.
00200     // if defaults are supplied for start1 and start2,
00201     // compute score using transcript only assuming nucleotide alignment.
00202     virtual TScore ScoreFromTranscript(const TTranscript& transcript,
00203                                        size_t start1 = kMax_UInt,
00204                                        size_t start2 = kMax_UInt ) const;
00205 
00206     void    EnableMultipleThreads(bool enable = true);
00207 
00208     // A naive pattern generator-use cautiously.
00209     // Do not use on sequences with repeats or error.
00210     size_t MakePattern(const size_t hit_size = 100, 
00211                        const size_t core_size = 28);
00212 
00213     // Create a Dense-seg representing the alignment, without ids set
00214     CRef<objects::CDense_seg> GetDense_seg(TSeqPos query_start,
00215                                            objects::ENa_strand query_strand,
00216                                            TSeqPos subj_start,
00217                                            objects::ENa_strand subj_strand)
00218                                            const;
00219 
00220     // Create a Dense-seg representing the alignment, with provided ids set
00221     CRef<objects::CDense_seg> GetDense_seg(TSeqPos query_start,
00222                                            objects::ENa_strand query_strand,
00223                                            const objects::CSeq_id& query_id,
00224                                            TSeqPos subj_start,
00225                                            objects::ENa_strand subj_strand,
00226                                            const objects::CSeq_id& subj_id)
00227                                            const;
00228 
00229 protected:
00230 
00231     // Bonuses and penalties
00232     TScore   m_Wm;   // match bonus (eNucl)
00233     TScore   m_Wms;  // mismatch penalty (eNucl)
00234     TScore   m_Wg;   // gap opening penalty
00235     TScore   m_Ws;   // gap extension penalty
00236 
00237     // end-space free flags
00238     bool     m_esf_L1, m_esf_R1, m_esf_L2, m_esf_R2;
00239 
00240     // alphabet and score matrix
00241     const char*               m_abc;
00242     SNCBIFullScoreMatrix      m_ScoreMatrix;
00243     bool                      m_ScoreMatrixInvalid;
00244 
00245     // progress callback
00246     FProgressCallback         m_prg_callback;
00247 
00248     // progress status
00249     mutable SProgressInfo     m_prg_info;
00250 
00251     // termination flag
00252     mutable  bool             m_terminate;
00253 
00254     // Source sequences
00255     const char*               m_Seq1;
00256     size_t                    m_SeqLen1;
00257     const char*               m_Seq2;
00258     size_t                    m_SeqLen2;
00259     size_t x_CheckSequence(const char* seq, size_t len) const;
00260     virtual bool x_CheckMemoryLimit(void);
00261 
00262     // naive pattern generation helpers (Rabin-Karp approach)
00263     unsigned char   x_CalcFingerPrint64( const char* beg,
00264                                          const char* end,
00265                                          size_t& err_index );
00266     const char*     x_FindFingerPrint64( const char* beg, 
00267                                          const char* end,
00268                                          unsigned char fingerprint,
00269                                          size_t size,
00270                                          size_t& err_index );
00271 
00272     // Transcript, score and guiding hits
00273     TTranscript               m_Transcript;
00274     bool                      m_PositivesAsMatches;
00275     TScore                    m_score;
00276     vector<size_t>            m_guides;
00277 
00278     // multiple threads flag
00279     bool                      m_mt;
00280     size_t                    m_maxthreads;
00281 
00282     // approximate max space to use
00283     size_t                   m_MaxMem;
00284  
00285     // facilitate guide pre- and  post-processing, if applicable
00286     virtual TScore x_Run   (void);
00287 
00288     // core dynamic programming
00289     struct SAlignInOut;
00290     virtual TScore x_Align (SAlignInOut* data);
00291 
00292     // a helper class assuming four bits per backtrace matrix cell
00293     class CBacktraceMatrix4 {
00294     public:
00295 
00296         CBacktraceMatrix4(size_t dim) {
00297             m_Buf = new Uint1 [dim / 2 + 1];
00298             m_Elem = 0;
00299         }
00300 
00301         ~CBacktraceMatrix4() { delete [] m_Buf; }
00302 
00303         void SetAt(size_t i, Uint1 v) {
00304             if(i & 1) {
00305                 m_Buf[i >> 1] = m_Elem | (v << 4);
00306             }
00307             else {
00308                 m_Elem = v;
00309             }
00310         }
00311 
00312         void Purge(size_t i) {
00313             if(i & 1) {
00314                 m_Buf[i >> 1] = m_Elem;
00315             }
00316         }
00317 
00318         Uint1 operator[] (size_t i) const {
00319             return 0x0F & ((m_Buf[i >> 1]) >> ((i & 1) << 2));
00320         }
00321 
00322     private:
00323         
00324         Uint1 * m_Buf;
00325         Uint1   m_Elem;
00326     };
00327 
00328     void x_DoBackTrace(const CBacktraceMatrix4 & backtrace,
00329                        SAlignInOut* data);
00330 
00331     // retrieve transcript symbol for a one-character diag
00332     virtual ETranscriptSymbol x_GetDiagTS(size_t i1, size_t i2) const;
00333 
00334     friend class CNWAlignerThread_Align;
00335 };
00336 
00337 
00338 struct CNWAligner::SAlignInOut {
00339 
00340     SAlignInOut(): m_offset1(0), m_len1(0), 
00341                    m_offset2(0), m_len2(0), 
00342                    m_space(0) {}
00343 
00344     SAlignInOut(size_t offset1, size_t len1, bool esfL1, bool esfR1,
00345                 size_t offset2, size_t len2, bool esfL2, bool esfR2):
00346         m_offset1(offset1), m_len1(len1), m_esf_L1(esfL1), m_esf_R1(esfR1),
00347         m_offset2(offset2), m_len2(len2), m_esf_L2(esfL2), m_esf_R2(esfR2)
00348     {
00349         m_space = m_len1*m_len2;
00350     }
00351 
00352     // [in] first sequence
00353     size_t      m_offset1;
00354     size_t      m_len1;
00355     bool        m_esf_L1, m_esf_R1;
00356 
00357     // [in] second sequence
00358     size_t      m_offset2;
00359     size_t      m_len2;
00360     bool        m_esf_L2, m_esf_R2;
00361 
00362     // [out]
00363     TTranscript m_transcript;
00364 
00365     size_t      GetSpace(void) const {
00366         return m_space;
00367     }
00368 
00369     static bool PSpace(const SAlignInOut* p1, const SAlignInOut* p2) {
00370         return p1->m_space >= p2->m_space;
00371     }
00372 
00373 private:    
00374 
00375     size_t m_space; // required dynprog dimension
00376 };
00377 
00378 
00379 namespace {
00380 
00381     const char g_nwaligner_nucleotides [] = "AGTCBDHKMNRSVWY";
00382 
00383     const CNWAligner::TScore kInfMinus ( -(numeric_limits<CNWAligner::TScore>::
00384                                             max() / 2) );
00385 }
00386 
00387 END_NCBI_SCOPE
00388 
00389 
00390 /* @} */
00391 
00392 #endif  /* ALGO___NW_ALIGNER__HPP */
00393 
00394 

Generated on Wed Dec 9 02:54:19 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Wed Dec 09 08:17:25 2009 by modify_doxy.py rev. 173732