include/algo/align/prosplign/prosplign.hpp

Go to the documentation of this file.
00001 #ifndef ALGO_ALIGN_PROSPLIGN__HPP
00002 #define ALGO_ALIGN_PROSPLIGN__HPP
00003 
00004 /* $Id: prosplign.hpp 153867 2009-03-04 20:28:09Z chetvern $
00005 * ===========================================================================
00006 *
00007 *                            PUBLIC DOMAIN NOTICE                          
00008 *               National Center for Biotechnology Information
00009 *                                                                          
00010 *  This software/database is a "United States Government Work" under the   
00011 *  terms of the United States Copyright Act.  It was written as part of    
00012 *  the author's official duties as a United States Government employee and 
00013 *  thus cannot be copyrighted.  This software/database is freely available 
00014 *  to the public for use. The National Library of Medicine and the U.S.    
00015 *  Government have not placed any restriction on its use or reproduction.  
00016 *                                                                          
00017 *  Although all reasonable efforts have been taken to ensure the accuracy  
00018 *  and reliability of the software and data, the NLM and the U.S.          
00019 *  Government do not and cannot warrant the performance or results that    
00020 *  may be obtained by using this software or data. The NLM and the U.S.    
00021 *  Government disclaim all warranties, express or implied, including       
00022 *  warranties of performance, merchantability or fitness for any particular
00023 *  purpose.                                                                
00024 *                                                                          
00025 *  Please cite the author in any work or product based on this material.   
00026 *
00027 * ===========================================================================
00028 *
00029 * Author:  Boris Kiryutin (prosplign algorithm and implementation)
00030 * Author:  Vyacheslav Chetvernin (this adapter)
00031 *
00032 * File Description:
00033 *   CProSplign class definition
00034 *   spliced protein to genomic sequence alignment
00035 *
00036 */
00037 
00038 #include <corelib/ncbistd.hpp>
00039 #include <corelib/ncbiargs.hpp>
00040 #include <corelib/ncbiobj.hpp>
00041 #include <objects/seqalign/seqalign__.hpp>
00042 #include <objmgr/seq_vector_ci.hpp>
00043 
00044 #include <list>
00045 
00046 BEGIN_NCBI_SCOPE
00047 
00048 BEGIN_SCOPE(objects)
00049     class CScope; // forward declaration: this header only uses objects::CScope by reference
00050 END_SCOPE(objects)
00051 
00052 /// Common base of the ProSplign option classes below; holds the score matrix name
00053 class  CProSplignOptions_Base: public CObject
00054 {
00055 public:
00056     static void SetupArgDescriptions(CArgDescriptions* argdescr); // NOTE(review): presumably adds this class's arguments to argdescr - defined outside this header, confirm
00057 
00058     /// creates the options object with default values
00059     CProSplignOptions_Base();
00060     CProSplignOptions_Base(const CArgs& args); // NOTE(review): presumably takes its values from the parsed arguments - confirm in the implementation
00061     /// Score matrix name accessors; the setter returns a reference of the class's own type
00062     CProSplignOptions_Base& SetScoreMatrix(const string& matrix_name);
00063     const string& GetScoreMatrix() const;
00064 
00065     static const string default_score_matrix_name; // BLOSUM62
00066 
00067 private:
00068     string score_matrix_name; // see SetScoreMatrix/GetScoreMatrix
00069 };
00070 /// Scoring parameters of ProSplign: minimum intron length plus gap, frameshift and intron costs
00071 class  CProSplignScoring: public CProSplignOptions_Base
00072 {
00073 public:
00074     static void SetupArgDescriptions(CArgDescriptions* argdescr);
00075 
00076     /// creates scoring parameter object with default values
00077     CProSplignScoring();
00078 
00079     CProSplignScoring(const CArgs& args);
00080 
00081     /// Minimum intron length (see default_min_intron_len)
00082     CProSplignScoring& SetMinIntronLen(int);
00083     int GetMinIntronLen() const;
00084 
00085 
00086     /// In addition to the score matrix, ProSplign uses the following costs (negate a cost to get a score)
00087 
00088     CProSplignScoring& SetGapOpeningCost(int);
00089     int GetGapOpeningCost() const;
00090 
00091     /// Gap Extension Cost for one aminoacid (three bases)
00092     CProSplignScoring& SetGapExtensionCost(int);
00093     int GetGapExtensionCost() const;
00094     /// Frameshift opening cost
00095     CProSplignScoring& SetFrameshiftOpeningCost(int);
00096     int GetFrameshiftOpeningCost() const;
00097 
00098     /// GT/AG intron opening cost
00099     CProSplignScoring& SetGTIntronCost(int);
00100     int GetGTIntronCost() const;
00101     /// GC/AG intron opening cost
00102     CProSplignScoring& SetGCIntronCost(int);
00103     int GetGCIntronCost() const;
00104     /// AT/AC intron opening cost
00105     CProSplignScoring& SetATIntronCost(int);
00106     int GetATIntronCost() const;
00107 
00108     /// Non Consensus Intron Cost
00109     /// should not exceed the sum of the two lowest intron opening costs,
00110     /// i.e. intron_non_consensus cost <= intron_GT cost + intron_GC cost
00111     CProSplignScoring& SetNonConsensusIntronCost(int);
00112     int GetNonConsensusIntronCost() const;            
00113 
00114     /// Inverted Intron Extension Cost 
00115     /// intron_extension cost for 1 base = 1/(inverted_intron_extension*3)
00116     CProSplignScoring& SetInvertedIntronExtensionCost(int);
00117     int GetInvertedIntronExtensionCost() const;
00118     // Default values of the parameters above
00119 public:
00120     static const int default_min_intron_len = 30;
00121 
00122     static const int default_gap_opening  = 10;
00123     static const int default_gap_extension = 1;
00124     static const int default_frameshift_opening = 30;
00125 
00126     static const int default_intron_GT = 15;
00127     static const int default_intron_GC = 20;
00128     static const int default_intron_AT = 25;
00129     static const int default_intron_non_consensus = 34; // satisfies the bound above: 34 <= 15 + 20
00130     static const int default_inverted_intron_extension = 1000;
00131     // One member per Set/Get pair above; names match the default_* constants
00132 private:
00133     int min_intron_len;
00134     int gap_opening;
00135     int gap_extension;
00136     int frameshift_opening;
00137     int intron_GT;
00138     int intron_GC;
00139     int intron_AT;
00140     int intron_non_consensus;
00141     int inverted_intron_extension;
00142 };
00143 
00144 /// Output filtering parameters
00145 ///
00146 /// ProSplign always makes a global alignment,
00147 /// i.e. it aligns the whole protein no matter how bad some parts of this alignment might be.
00148 /// Usually we don't want the bad pieces and remove them.
00149 /// The following parameters define the good parts.
00150 class  CProSplignOutputOptions: public CProSplignOptions_Base
00151 {
00152 public:
00153     enum EMode {
00154         /// default filtering parameters
00155         eWithHoles,
00156         /// all zeroes - no filtering
00157         ePassThrough,
00158     };
00159 
00160     static void SetupArgDescriptions(CArgDescriptions* argdescr);
00161 
00162     CProSplignOutputOptions(EMode mode = eWithHoles);
00163     CProSplignOutputOptions(const CArgs& args);
00164     /// NOTE(review): presumably true when no filtering is requested (cf. ePassThrough) - confirm in the implementation
00165     bool IsPassThrough() const;
00166 
00167     /// if possible, do not output frame-preserving gaps, output frameshifts only
00168     /// NOT USED ANYMORE
00169     CProSplignOutputOptions& SetEatGaps(bool);
00170     bool GetEatGaps() const;
00171 
00172     /// any length flank of a good piece should not be worse than this percentage threshold
00173     CProSplignOutputOptions& SetFlankPositives(int);
00174     int GetFlankPositives() const;
00175     /// good piece total percentage threshold
00176     CProSplignOutputOptions& SetTotalPositives(int);
00177     int GetTotalPositives() const;
00178 
00179     /// any part of a good piece longer than max_bad_len should not be worse than min_positives
00180     CProSplignOutputOptions& SetMaxBadLen(int);
00181     int GetMaxBadLen() const;
00182     CProSplignOutputOptions& SetMinPositives(int);
00183     int GetMinPositives() const;
00184 
00185     /// minimum exon identity
00186     CProSplignOutputOptions& SetMinExonId(int);
00187     int GetMinExonId() const;
00188     /// minimum exon positives percentage
00189     CProSplignOutputOptions& SetMinExonPos(int);
00190     int GetMinExonPos() const;
00191 
00192     /// minimum number of bases in the first and last exon
00193     CProSplignOutputOptions& SetMinFlankingExonLen(int);
00194     int GetMinFlankingExonLen() const;
00195     /// good piece should not be shorter than this
00196     CProSplignOutputOptions& SetMinGoodLen(int);
00197     int GetMinGoodLen() const;
00198 
00199 
00200 
00201     /// reward (in # of positives?) for start codon match. Not implemented yet
00202     CProSplignOutputOptions& SetStartBonus(int);
00203     int GetStartBonus() const;
00204     /// reward for stop codon at the end. Not implemented yet
00205     CProSplignOutputOptions& SetStopBonus(int);
00206     int GetStopBonus() const;
00207     // Default values of the filtering parameters above
00208 public:
00209     static const bool default_eat_gaps = true;
00210 
00211     static const int default_flank_positives = 55;
00212     static const int default_total_positives = 70;
00213 
00214     static const int default_max_bad_len = 45;
00215     static const int default_min_positives = 15;
00216 
00217     static const int default_min_exon_id = 30;
00218     static const int default_min_exon_pos = 55;
00219 
00220     static const int default_min_flanking_exon_len = 15;
00221     static const int default_min_good_len = 59;
00222 
00223     static const int default_start_bonus = 8; /// NOTE(review): value marked "???" by the author; the start bonus is documented above as not implemented yet
00224     static const int default_stop_bonus = 8; /// NOTE(review): value marked "???" by the author; the stop bonus is documented above as not implemented yet
00225     // One member per Set/Get pair above; names match the default_* constants
00226 private:
00227     bool eat_gaps;
00228     int flank_positives;
00229     int total_positives;
00230     int max_bad_len;
00231     int min_positives;
00232     int min_exon_id;
00233     int min_exon_pos;
00234     int min_flanking_exon_len;
00235     int min_good_len;
00236     int start_bonus;
00237     int stop_bonus;
00238 };
00239 
00240 class CProSplignText; // forward declaration; the class is defined later in this header
00241 
00242 /// spliced protein to genomic alignment
00243 /// Typical use: FindAlignment, which is FindGlobalAlignment followed by RefineAlignment
00244 class  CProSplign: public CObject
00245 {
00246 public:
00247 
00248     /// By default ProSplign looks for introns.
00249     /// Set intronless mode for protein to mRNA alignments, many viral genomes, etc.
00250     CProSplign( CProSplignScoring scoring = CProSplignScoring(), bool intronless=false );
00251     ~CProSplign();
00252 
00253     /// Aligns protein to a region on genomic sequence.
00254     /// genomic seq_loc should be a continuous region - an interval or a whole sequence
00255     /// Convenience wrapper: calls FindGlobalAlignment and passes its result to RefineAlignment with output_options
00256     /// Returns Spliced-seg
00257     CRef<objects::CSeq_align>
00258     FindAlignment(objects::CScope& scope,
00259                   const objects::CSeq_id& protein,
00260                   const objects::CSeq_loc& genomic, 
00261                   CProSplignOutputOptions output_options = CProSplignOutputOptions())
00262     {
00263         CRef<objects::CSeq_align> align_ref;
00264         align_ref = FindGlobalAlignment(scope, protein, genomic); // stage 1: global alignment
00265         align_ref = RefineAlignment(scope, *align_ref, output_options); // stage 2: filtering; dereferences align_ref - NOTE(review): assumes stage 1 returned a non-empty CRef
00266         return align_ref;
00267     }
00268 
00269     /// Globally aligns protein to a region on genomic sequence.
00270     /// genomic seq_loc should be a continuous region - an interval or a whole sequence
00271     ///
00272     /// Returns Spliced-seg
00273     CRef<objects::CSeq_align>
00274     FindGlobalAlignment(objects::CScope& scope,
00275                         const objects::CSeq_id& protein,
00276                         const objects::CSeq_loc& genomic);
00277 
00278     /// Refines Spliced-seg alignment by removing bad pieces according to output_options.
00279     /// This is an irreversible action - more relaxed parameters will not change the alignment back
00280     CRef<objects::CSeq_align>
00281     RefineAlignment(objects::CScope& scope,
00282                     const objects::CSeq_align& seq_align,
00283                     CProSplignOutputOptions output_options = CProSplignOutputOptions());
00284 
00285     /// deprecated internals
00286     CProSplign( CProSplignScoring scoring, bool intronless, bool one_stage, bool just_second_stage, bool old);
00287     const vector<pair<int, int> >& GetExons() const;
00288     vector<pair<int, int> >& SetExons();
00289     void GetFlanks(bool& lgap, bool& rgap) const;
00290     void SetFlanks(bool lgap, bool rgap);
00291     // CImplementation is only declared in this header; the object is held by m_implementation
00292 public:
00293     class CImplementation;
00294 private:
00295     auto_ptr<CImplementation> m_implementation;
00296     
00297 private:
00298     /// forbidden: copy constructor and copy assignment are declared private
00299     CProSplign(const CProSplign&);
00300     CProSplign& operator=(const CProSplign&);
00301 };
00302 
00303 BEGIN_SCOPE(prosplign)
00304 class CSubstMatrix; // forward declaration; used below only as auto_ptr<prosplign::CSubstMatrix>
00305 class CTranslationTable; // forward declaration; used below only as CRef<prosplign::CTranslationTable>
00306 END_SCOPE(prosplign)
00307 
00308 /// Text representation of ProSplign alignment
00309 // dna        : GATGAAACAGCACTAGTGACAGGTAAA----GATCTAAATATCGTTGA<skip>GGAAGACATCCATTGGCAATGGCAATGGCAT
00310 // translation:  D  E  T  A  L  V  T  G  K        S  K  Y h                hh I  H       
00311 // match      :  |  |     +        |  |  |        |  |  | +                ++ +  | XXXXXbad partXXXXX
00312 // protein    :  D  E  Q  S  F --- T  G  K  E  Y  S  K  Y y.....intron.....yy L  H  D  T  S  T  E  G 
00313 //
00314 // there are no "<skip>", "intron", or "bad part" in actual values
00315 class CProSplignText {
00316 public:
00317     /// Outputs formatted text for seqalign to out; matrix_name defaults to "BLOSUM62". NOTE(review): width is presumably the output line width - confirm
00318     static void Output(const objects::CSeq_align& seqalign, objects::CScope& scope, ostream& out, int width, const string& matrix_name = "BLOSUM62");
00319     /// NOTE(review): presumably builds the four text rows for seqalign - implementation is outside this header, confirm
00320     CProSplignText(objects::CScope& scope, const objects::CSeq_align& seqalign, const string& matrix_name = "BLOSUM62");
00321     ~CProSplignText();
00322     /// Read-only access to the text rows named in the diagram above; const so they can be called on const objects
00323     const string& GetDNA() const { return m_dna; }
00324     const string& GetTranslation() const { return m_translation; }
00325     const string& GetMatch() const { return m_match; }
00326     const string& GetProtein() const { return m_protein; }
00327 
00328 private:
00329     string m_dna;
00330     string m_translation;
00331     string m_match;
00332     string m_protein;
00333     auto_ptr<prosplign::CSubstMatrix> m_matrix;
00334     CRef<prosplign::CTranslationTable> m_trans_table;
00335     // Private helpers; declared here and defined outside this header
00336     void AddDNAText(objects::CSeqVector_CI& genomic_ci, int& nuc_prev, size_t len);
00337     void TranslateDNA(int phase, size_t len, bool is_insertion);
00338     void AddProtText(objects::CSeqVector_CI& protein_ci, int& prot_prev, size_t len);
00339     void MatchText(size_t len, bool is_match=false);
00340     char MatchChar(size_t i);
00341     void AddHoleText(bool prev_3_prime_splice, bool cur_5_prime_splice,
00342                      objects::CSeqVector_CI& genomic_ci, objects::CSeqVector_CI& protein_ci,
00343                      int& nuc_prev, int& prot_prev,
00344                      int nuc_cur_start, int prot_cur_start);
00345     void AddSpliceText(objects::CSeqVector_CI& genomic_ci, int& nuc_prev, char match);
00346 };
00347 
00348 END_NCBI_SCOPE
00349 
00350 
00351 #endif
00352 
00353 

Generated on Sun Dec 6 21:55:30 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Mon Dec 07 16:20:32 2009 by modify_doxy.py rev. 173732