include/algo/align/splign/splign.hpp

Go to the documentation of this file.
00001 #ifndef ALGO_ALIGN_SPLIGN__HPP
00002 #define ALGO_ALIGN_SPLIGN__HPP
00003 
00004 /* $Id: splign.hpp 156369 2009-04-02 18:37:09Z kapustin $
00005 * ===========================================================================
00006 *
00007 *                            PUBLIC DOMAIN NOTICE                          
00008 *               National Center for Biotechnology Information
00009 *                                                                          
00010 *  This software/database is a "United States Government Work" under the   
00011 *  terms of the United States Copyright Act.  It was written as part of    
00012 *  the author's official duties as a United States Government employee and 
00013 *  thus cannot be copyrighted.  This software/database is freely available 
00014 *  to the public for use. The National Library of Medicine and the U.S.    
00015 *  Government have not placed any restriction on its use or reproduction.  
00016 *                                                                          
00017 *  Although all reasonable efforts have been taken to ensure the accuracy  
00018 *  and reliability of the software and data, the NLM and the U.S.          
00019 *  Government do not and cannot warrant the performance or results that    
00020 *  may be obtained by using this software or data. The NLM and the U.S.    
00021 *  Government disclaim all warranties, express or implied, including       
00022 *  warranties of performance, merchantability or fitness for any particular
00023 *  purpose.                                                                
00024 *                                                                          
00025 *  Please cite the author in any work or product based on this material.   
00026 *
00027 * ===========================================================================
00028 *
00029 * Author:  Yuri Kapustin
00030 *
00031 * File Description:
00032 *   CSplign class definition
00033 *
00034 */
00035 
00036 #include <corelib/ncbistd.hpp>
00037 
00038 #include <objmgr/scope.hpp>
00039 #include <algo/align/nw/nw_formatter.hpp>
00040 #include <algo/align/util/blast_tabular.hpp>
00041 
00042 
00043 BEGIN_NCBI_SCOPE
00044 
00045 class CBlastTabular;         // hit type; CSplign aliases it as THit
00046 
00047 BEGIN_SCOPE(objects)         // forward declarations of objects:: types named in CSplign signatures
00048     class CScope;            // held by CSplign as CRef<objects::CScope>
00049     class CSeq_id;           // taken by const reference in CSplign::x_LoadSequence()
00050     class CScore_set;        // element type of CSplign::TScoreSets
00051     class CSeq_align_set;    // input of the seq-align-set overload of CSplign::s_ComputeStats()
00052 END_SCOPE(objects)
00053 
00054 /// CSplign: identifies compartments in a set of hits and aligns each of them using a spliced aligner (see Run()).
00055 class  CSplign: public CObject
00056 {
00057 public:
00058 
00059     typedef CSplicedAligner TAligner;
00060 
00061     CSplign(void);
00062     ~CSplign();
00063 
00064     // setters and getters
00065     CRef<TAligner>&     SetAligner(void);
00066     CConstRef<TAligner> GetAligner(void) const;
00067     static CRef<CSplicedAligner> s_CreateDefaultAligner(bool low_query_quality);
00068 
00069     CRef<objects::CScope>  GetScope(void) const;
00070     CRef<objects::CScope>& SetScope(void);
00071     void   PreserveScope(bool preserve_scope = true);
00072 
00073     void   SetEndGapDetection(bool on);
00074     bool   GetEndGapDetection(void) const;
00075 
00076     void   SetPolyaDetection(bool on);
00077     bool   GetPolyaDetection(void) const;
00078 
00079     void   SetStrand(bool strand);
00080     bool   GetStrand(void) const;
00081 
00082     void   SetMaxGenomicExtent(size_t mge);
00083     static size_t s_GetDefaultMaxGenomicExtent(void);
00084     size_t GetMaxGenomicExtent(void) const;
00085 
00086     void   SetMaxIntron(size_t max_intron);
00087     size_t GetMaxIntron(void) const;
00088 
00089     void   SetCompartmentPenalty(double penalty);
00090     static double s_GetDefaultCompartmentPenalty(void);
00091     double GetCompartmentPenalty(void) const;
00092 
00093     void   SetMinCompartmentIdentity(double idty);
00094     static double s_GetDefaultMinCompartmentIdty(void);
00095     double GetMinCompartmentIdentity(void) const;
00096 
00097     void   SetMinSingletonIdentity(double idty);
00098     double GetMinSingletonIdentity(void) const;
00099 
00100     void   SetMinSingletonIdentityBps(size_t idty);
00101     size_t GetMinSingletonIdentityBps(void) const;
00102 
00103     void   SetMinExonIdentity(double idty);
00104     static double s_GetDefaultMinExonIdty(void);
00105     double GetMinExonIdentity(void) const;
00106     // model ids: m_model_id holds (start id - 1); GetNextModelId() reports m_model_id + 1
00107     void   SetStartModelId(size_t model_id) {
00108         m_model_id = model_id - 1;  // unsigned arithmetic: model_id == 0 wraps, and the getter's +1 wraps back to 0
00109     }
00110     size_t GetNextModelId(void) const {
00111         return m_model_id + 1;
00112     }
00113 
00114     void SetMaxCompsPerQuery(size_t m);
00115     size_t GetMaxCompsPerQuery(void) const;
00116     
00117     typedef CNWFormatter::SSegment   TSegment;
00118     typedef vector<TSegment>         TSegments;
00119 
00120     // aligned compartment representation 
00121     struct  SAlignedCompartment {
00122         
00123         size_t           m_Id;
00124 
00125         enum ECompartmentStatus {
00126             eStatus_Ok,
00127             eStatus_Empty,
00128             eStatus_Error
00129         };
00130 
00131         ECompartmentStatus m_Status;
00132 
00133         string           m_Msg;
00134         bool             m_QueryStrand, m_SubjStrand;
00135         size_t           m_Cds_start, m_Cds_stop;
00136         size_t           m_QueryLen;
00137         size_t           m_PolyA;
00138         float            m_Score;
00139         TSegments        m_Segments;
00140         // default: status eStatus_Empty, numeric fields zeroed, both strand flags set to true
00141         SAlignedCompartment(void):
00142             m_Id(0),
00143             m_Status(eStatus_Empty), m_QueryStrand(true), m_SubjStrand(true),
00144             m_Cds_start(0), m_Cds_stop(0),
00145             m_QueryLen (0),
00146             m_PolyA(0),
00147             m_Score(0)
00148         {}
00149         // status eStatus_Empty with the given id and message; a null msg is stored as an empty string
00150         SAlignedCompartment(size_t id, const char* msg):
00151             m_Id(id),
00152             m_Status(eStatus_Empty),
00153             m_Msg(msg ? msg : ""), m_QueryStrand(true), m_SubjStrand(true),
00154             m_Cds_start(0), m_Cds_stop(0),
00155             m_QueryLen(0),
00156             m_PolyA(0),
00157             m_Score(0)
00158         {}
00159         
00160         // return overall identity (including gaps)
00161         double GetIdentity(void) const;
00162 
00163         // get aligned min/max on query and subject
00164         void GetBox(Uint4* box) const;
00165         
00166         // save to / read from NetCache buffer
00167         typedef vector<char> TNetCacheBuffer;
00168         void ToBuffer   (TNetCacheBuffer* buf) const;
00169         void FromBuffer (const TNetCacheBuffer& buf);
00170     };
00171     
00172     typedef CBlastTabular           THit;
00173     typedef CRef<THit>              THitRef;
00174     typedef vector<THitRef>         THitRefs;
00175 
00176     // identify compartments and align each of them
00177     void Run(THitRefs* hitrefs);
00178     typedef vector<SAlignedCompartment> TResults;
00179     
00180     // retrieve results computed with Run()
00181     const TResults& GetResult(void) const {
00182         return m_result;
00183     }
00184 
00185     // align single compartment within given genomic bounds
00186     bool AlignSingleCompartment(THitRefs* hitrefs,
00187                                 size_t range_left, size_t range_right,
00188                                 SAlignedCompartment* result);
00189 
00190     // align single ASN.1 compartment
00191     bool AlignSingleCompartment(CRef<objects::CSeq_align> compartment,
00192                                 SAlignedCompartment* result);
00193 
00194 
00195     // clear sequence vectors and scope - use with caution
00196     void ClearMem(void);
00197 
00198     typedef pair<size_t,size_t>   TOrf;
00199     typedef pair<TOrf,TOrf>       TOrfPair;
00200     TOrfPair GetCds(const THit::TId & id, const vector<char> * seq_data = 0);
00201 
00202     static size_t s_TestPolyA(const char * seq, size_t dim, size_t cds_stop = 0);
00203 
00204     // alignment statistics
00205 
00206     enum ECompartmentScores {
00207         eCS_Matches             = 6,
00208         eCS_OverallIdentity     = 10,
00209         eCS_InframeMatches      = 20,
00210         eCS_InframeIdentity     = 22,
00211         eCS_Splices             = 23,
00212         eCS_ConsensusSplices    = 24,
00213         eCS_ProductCoverage     = 27,
00214         eCS_ExonIdentity        = 28,
00215         eCS_CombinationIdentity = 32
00216     };
00217     // bit flags selecting the types of statistics s_ComputeStats() includes (its 'flags' parameter)
00218     enum EStatFlags {
00219         eSF_BasicNonCds = 1 << 0,
00220         eSF_BasicCds    = 1 << 1
00221     };
00222 
00223     typedef list<CRef<objects::CScore_set> > TScoreSets;
00224 
00225     /// Generate statistics based on splign-generated seq-align-set,
00226     /// with each seq-align corresponding to an aligned compartment.
00227     ///
00228     /// @param sas
00229     ///   [IN] Seq-align-set describing input alignments.
00230     /// @param output_stats
00231     ///   [OUT] A pointer to the object to be filled in with computed stats.
00232     /// @param cds
00233     ///   [IN] Coding region start and stop to use when computing cds-related stats.
00234     ///   If both are null then no cds-related stats will be computed.
00235     /// @param flags
00236     ///   [IN] Bitwise OR of the eSF_* flags specifying types of statistics to include.
00237     /// @return
00238     ///   The number of elements written in output_stats.
00239     static size_t s_ComputeStats(
00240         CRef<objects::CSeq_align_set> sas,
00241         TScoreSets *                  output_stats,
00242         TOrf                          cds = TOrf(0, 0),
00243         EStatFlags                    flags = eSF_BasicNonCds);
00244 
00245     /// Generate statistics based on splign-generated seq-align corresponding
00246     /// to a single aligned compartment.
00247     ///
00248     /// @param sa
00249     ///   [IN] Seq-align describing one aligned compartment.
00250     /// @param embed_scoreset
00251     ///   [IN] Decorate the input seq-align with the scores.
00252     /// @param cds
00253     ///   [IN] Coding region start and stop to use when computing cds-related stats.
00254     ///   If both are null then no cds-related stats will be computed.
00255     /// @param flags
00256     ///   [IN] Bitwise OR of the eSF_* flags specifying types of statistics to include.
00257     /// @return
00258     ///   A reference to a score-set object with the computed statistics.
00259     static CRef<objects::CScore_set> s_ComputeStats(
00260         CRef<objects::CSeq_align> sa,
00261         bool                      embed_scoreset = true,
00262         TOrf                      cds = TOrf(0, 0),
00263         EStatFlags                flags = eSF_BasicNonCds);
00264 
00265 protected:
00266 
00267     // the spliced alignment computing object
00268     CRef<TAligner> m_aligner;
00269 
00270     // access to sequence data
00271     CRef<objects::CScope> m_Scope;
00272     bool                  m_CanResetHistory;
00273 
00274     // alignment pattern
00275     vector<size_t> m_pattern;
00276 
00277     // min exon idty - others will be marked as gaps
00278     double m_MinExonIdty;
00279 
00280     // compartment penalty as a per cent of the query (mRna) length
00281     double m_CompartmentPenalty;
00282 
00283     // min compartment idty - others will be skipped
00284     double m_MinCompartmentIdty;
00285 
00286     // min single compartment idty (per subject per strand) as a fraction of
00287     // the query length and as an absolute value.
00288     // The final value for the parameter is computed
00289     // as min(m_MinSingletonIdty * query_length, m_MinSingletonIdtyBps)
00290     double m_MinSingletonIdty;
00291 
00292     size_t m_MinSingletonIdtyBps;
00293 
00294 
00295     // mandatory end gap detection flag
00296     bool m_endgaps;
00297 
00298     // alignment map
00299     struct SAlnMapElem {
00300         size_t m_box [4];
00301         int    m_pattern_start, m_pattern_end;
00302     };
00303     vector<SAlnMapElem> m_alnmap;
00304 
00305     typedef map<string,TOrfPair>  TStrIdToOrfs;
00306     TStrIdToOrfs  m_OrfMap;
00307 
00308     // query sequence
00309     vector<char> m_mrna;
00310     bool         m_strand;
00311     size_t       m_polya_start;
00312     bool         m_nopolya;
00313 
00314     size_t       m_cds_start; // in antisense, these are computed based on a reverse-
00315     size_t       m_cds_stop;  // complementary sequence, so start still less than stop
00316 
00317     // genomic sequence
00318     vector<char> m_genomic;
00319 
00320     // max space to look beyond end hits
00321     size_t       m_max_genomic_ext;
00322 
00323     // max intron length
00324     size_t       m_MaxIntron;
00325 
00326     // The limiting range as defined by the compartment hits,
00327     // if the max compartment hit identity is less than a cut-off.
00328     pair<size_t, size_t> m_BoundingRange;
00329 
00330     // output per compartment
00331     TSegments    m_segments;
00332   
00333     // all compartments
00334     size_t       m_model_id;
00335     TResults     m_result;
00336 
00337     size_t       m_MaxCompsPerQuery;
00338 
00339     size_t       m_MinPatternHitLength;
00340 
00341     SAlignedCompartment x_RunOnCompartment( THitRefs* hitrefs,
00342                                             size_t range_left,
00343                                             size_t range_right);
00344 
00345     float  x_Run(const char* seq1, const char* seq2);
00346 
00347     void   x_SplitQualifyingHits(THitRefs* phitrefs);
00348     void   x_SetPattern(THitRefs* hitrefs);
00349     bool   x_ProcessTermSegm(TSegment** term_segs, Uint1 side) const;
00350     Uint4  x_GetGenomicExtent(const Uint4 query_extent, Uint4 max_ext = 0) const;
00351     void   x_FinalizeAlignedCompartment(SAlignedCompartment & ac);
00352 
00353     void   x_LoadSequence(vector<char>* seq, 
00354                           const objects::CSeq_id& seqid,
00355                           THit::TCoord start,
00356                           THit::TCoord finish,
00357                           bool retain);
00358 
00359     static THitRef sx_NewHit(THit::TCoord q0, THit::TCoord q,
00360                              THit::TCoord s0, THit::TCoord s);
00361 
00362     /// forbidden: copy construction and copy assignment are declared non-public
00363     CSplign(const CSplign&);
00364     CSplign& operator=(const CSplign&);
00365 };
00366 
00367 
00368 END_NCBI_SCOPE
00369 
00370 
00371 #endif
00372 
00373 

Generated on Sun Dec 6 21:55:30 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Mon Dec 07 16:20:32 2009 by modify_doxy.py rev. 173732