include/algo/align/contig_assembly/contig_assembly.hpp

Go to the documentation of this file.
00001 #ifndef ALGO_ALIGN_CONTIG_ASSEMBLY___CONTIG_ASSEMBLY__HPP
00002 #define ALGO_ALIGN_CONTIG_ASSEMBLY___CONTIG_ASSEMBLY__HPP
00003 /*  $Id: contig_assembly.hpp 167485 2009-08-03 17:09:32Z boukn $
00004  * ===========================================================================
00005  *
00006  *                            PUBLIC DOMAIN NOTICE
00007  *               National Center for Biotechnology Information
00008  *
00009  *  This software/database is a "United States Government Work" under the
00010  *  terms of the United States Copyright Act.  It was written as part of
00011  *  the author's official duties as a United States Government employee and
00012  *  thus cannot be copyrighted.  This software/database is freely available
00013  *  to the public for use. The National Library of Medicine and the U.S.
00014  *  Government have not placed any restriction on its use or reproduction.
00015  *
00016  *  Although all reasonable efforts have been taken to ensure the accuracy
00017  *  and reliability of the software and data, the NLM and the U.S.
00018  *  Government do not and cannot warrant the performance or results that
00019  *  may be obtained by using this software or data. The NLM and the U.S.
00020  *  Government disclaim all warranties, express or implied, including
00021  *  warranties of performance, merchantability or fitness for any particular
00022  *  purpose.
00023  *
00024  *  Please cite the author in any work or product based on this material.
00025  *
00026  * ===========================================================================
00027  *
00028  * Authors:  Josh Cherry
00029  *
00030  * File Description:
00031  *   Declares CContigAssembly: alignment-related functions for finding overlaps for contig assembly.
00032  */
00033 
00034 #include <objects/seqloc/Seq_id.hpp>
00035 #include <objmgr/scope.hpp>
00036 #include <objects/seqloc/Na_strand_.hpp>
00037 #include <util/range.hpp>
00038 
00039 BEGIN_NCBI_SCOPE
00040 
00041 BEGIN_objects_SCOPE  // forward declarations: the declarations below use these types only by reference or inside CRef<>
00042 class CSeq_id;
00043 class CSeq_align;
00044 class CSeq_align_set;
00045 class CDense_seg;
00046 class CScope;
00047 class CAlnVec;
00048 END_objects_SCOPE  // NOTE(review): objects::CSeq_loc (taken by one Blastn overload) is not declared here; presumably supplied by the includes above - confirm
00049 
00050 
00051 /// This class provides alignment-related functions intended
00052 /// for finding overlaps for contig assembly.
00053 class  CContigAssembly
00054 {
00055 public:
00056     class  CAlnStats : public CObject
00057     {
00058     public:
00059         CAlnStats(unsigned int adjusted_len,
00060                   unsigned int mm,
00061                   unsigned int gaps) :
00062             m_AdjustedLen(adjusted_len), m_MM(mm), m_Gaps(gaps), m_PctIdent(0.0) {}
00063         CAlnStats(const objects::CDense_seg& ds, objects::CScope& scope);
00064 
00065         unsigned int GetAdjustedLength() const {return m_AdjustedLen;}
00066         /// Returns a fraction between 0 and 1, not a percentage
00067         double       GetFracIdentity() const { return m_PctIdent; }
00068         unsigned int GetNumMismatches() const {return m_MM;}
00069         unsigned int GetNumGaps() const {return m_Gaps;}
00070     private:
00071         unsigned int m_AdjustedLen;
00072         unsigned int m_MM;
00073         unsigned int m_Gaps;
00074         double m_PctIdent;
00075     };
00076 
00077     /// Most users of the class need only to call this function.
00078     /// It runs blastn and, if the results are not satisfactory,
00079     /// tries a banded dynamic-programming alignment, using a band
00080     /// chosen based on the blast results.
00081     static vector<CRef<objects::CSeq_align> >
00082     Align(const objects::CSeq_id& id0, const objects::CSeq_id& id1,
00083           const string& blast_params, double min_ident,
00084           unsigned int max_end_slop, objects::CScope& scope,
00085           CNcbiOstream* ostr = 0,
00086           const vector<unsigned int>& band_halfwidths
00087               = vector<unsigned int>(1, 200),
00088           unsigned int diag_finding_window = 200,
00089           unsigned int min_align_length = 50,
00090           objects::ENa_strand strand0 = objects::eNa_strand_unknown,
00091           objects::ENa_strand strand1 = objects::eNa_strand_unknown);
00092 
00093     /// Utility for running blastn.
00094     // It accepts a blast parameter string such as
00095     // "-W 28 -r 1 -q -3 -e 1e-5 -Z 200 -F 'm L; R -d rodents.lib'"
00096     // (single quotes are respected)
00097     static CRef<objects::CSeq_align_set>
00098     Blastn(const objects::CSeq_id& query_id,
00099            const objects::CSeq_id& subject_id,
00100            const string& param_string, objects::CScope& scope);
00101 
00102     static CRef<objects::CSeq_align_set>
00103     Blastn(const objects::CSeq_loc& query_loc,
00104            const objects::CSeq_loc& subject_loc,
00105            const string& param_string, objects::CScope& scope);
00106 
00107     /// Given a set of alignments, pick out a diagonal to use as
00108     /// the center of a band in a banded alignment.
00109     static void FindDiagFromAlignSet(const objects::CSeq_align_set& align_set,
00110                                      objects::CScope& scope,
00111                                      unsigned int window_size,
00112                                      objects::ENa_strand& strand,
00113                                      unsigned int& diag);
00114 
00115 
00116     /// Do a banded global alignment using an arbitrary band.
00117     static CRef<objects::CDense_seg>
00118     BandedGlobalAlignment(const objects::CSeq_id& id0,
00119                           const objects::CSeq_id& id1,
00120                           objects::ENa_strand strand,
00121                           unsigned int diag,
00122                           unsigned int half_width,
00123                           objects::CScope& scope);
00124 
00125     /// Find the highest-scoring local subalignment.
00126     /// This function is necessary only because we don't have
00127     /// a banded local alignment algorithm.
00128     static CRef<objects::CDense_seg>
00129     BestLocalSubAlignment(const objects::CDense_seg& ds_in,
00130                           objects::CScope& scope);
00131 
00132     /// Count the cells with "ink" along each diagonal in a
00133     /// dot-matrix-type plot of some set of alignments (e.g., blast results)
00134     static void DiagCounts(const objects::CSeq_align_set& align_set,
00135                            objects::CScope& scope,
00136                            vector<unsigned int>& plus_vec,
00137                            vector<unsigned int>& minus_vec);
00138 
00139     typedef map<unsigned int, unsigned int> TDiagMap;
00140     static void DiagCounts(const objects::CSeq_align_set& align_set,
00141                            objects::CScope& scope,
00142                            TDiagMap& plus_map,
00143                            TDiagMap& minus_map);
00144 
00145     /// Find the range (or more than one tied range) containing
00146     /// the maximal diagonal count, summed over a window.
00147     typedef CRange<unsigned int> TRange;
00148     static void FindMaxRange(const vector<unsigned int>& vec,
00149                              unsigned int window,
00150                              unsigned int& max,
00151                              vector<TRange>& max_range);
00152 
00153 
00154     static void FindMaxRange(const TDiagMap& map,
00155                              unsigned int window,
00156                              unsigned int& max,
00157                              vector<TRange>& max_range);
00158 
00159 
00160     static bool IsDovetail(const objects::CDense_seg& ds,
00161                            unsigned int slop, objects::CScope& scope);
00162     static bool IsAtLeastHalfDovetail(const objects::CDense_seg& ds,
00163                                       unsigned int slop,
00164                                       objects::CScope& scope);
00165     static bool IsContained(const objects::CDense_seg& ds,
00166                             unsigned int slop, objects::CScope& scope);
00167     static double FracIdent(const objects::CDense_seg& ds,
00168                             objects::CScope& scope);
00169 
00170 
00171     /// Alignment characterization
00172 
00173     struct SAlignStats {
00174 
00175         // unaligned tails
00176         struct STails {
00177             TSeqPos left;
00178             TSeqPos right;
00179         };
00180 
00181         // constructor
00182         SAlignStats()
00183                 : total_length(0),
00184                 aligned_length(0),
00185                 gap_count(0),
00186                 mismatches(0),
00187                 pct_identity(0)
00188         {
00189         }
00190 
00191         /// total covered length of the alignment, including gaps
00192         TSeqPos total_length;
00193 
00194         /// total number of bases included in the alignment
00195         TSeqPos aligned_length;
00196 
00197         /// count of total number of gaps
00198         TSeqPos gap_count;
00199 
00200         /// number of mismatched bases
00201         TSeqPos mismatches;
00202 
00203         /// % identity (varies from 0 to 100)
00204         double pct_identity;
00205 
00206         /// unaligned tails
00207         vector<STails> tails;
00208 
00209         /// the set of gap lengths for this alignment
00210         vector<TSeqPos> gaps;
00211 
00212         /// for each gap, whether is consists of "simple sequence"
00213         vector<bool> is_simple;
00214     };
00215     static void GatherAlignStats(const objects::CAlnVec& vec,
00216                                  SAlignStats& align_stats);
00217     static void GatherAlignStats(const objects::CDense_seg& ds,
00218                                  objects::CScope& scope,
00219                                  SAlignStats& align_stats);
00220     static void GatherAlignStats(const objects::CSeq_align& aln,
00221                                  objects::CScope& scope,
00222                                  SAlignStats& align_stats);
00223 
00224     private:
00225         static void x_OrientAlign(objects::CDense_seg& ds, objects::CScope& scope);
00226         static bool x_IsAllowedStrands(const objects::CDense_seg& ds,
00227                                        objects::ENa_strand strand0,
00228                                        objects::ENa_strand strand1);
00229         static TSeqPos x_DensegLength(const objects::CDense_seg& ds);
00230 
00231         static void x_GatherIdentStats(const objects::CAlnVec& vec,
00232                                        SAlignStats& align_stats);
00233 };
00234 
00235 
00236 /*inline
00237 double CContigAssembly::CAlnStats::GetFracIdentity() const
00238 {
00239     return 1.0 - double(m_MM + m_Gaps) / m_AdjustedLen;
00240 }*/
00241 
00242 
00243 END_NCBI_SCOPE
00244 
00245 #endif  // ALGO_ALIGN_CONTIG_ASSEMBLY___CONTIG_ASSEMBLY__HPP
00246 
00247 
00248 

Generated on Wed Dec 9 02:54:14 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Wed Dec 09 08:17:25 2009 by modify_doxy.py rev. 173732