include/algo/cobalt/kmercounts.hpp

Go to the documentation of this file.
00001 #ifndef ALGO_COBALT___KMERCOUNTS__HPP
00002 #define ALGO_COBALT___KMERCOUNTS__HPP
00003 
00004 /* $Id: kmercounts.hpp 152868 2009-02-19 23:19:34Z boratyng $
00005 * ===========================================================================
00006 *
00007 *                            PUBLIC DOMAIN NOTICE
00008 *               National Center for Biotechnology Information
00009 *
00010 *  This software/database is a "United States Government Work" under the
00011 *  terms of the United States Copyright Act.  It was written as part of
00012 *  the author's offical duties as a United States Government employee and
00013 *  thus cannot be copyrighted.  This software/database is freely available
00014 *  to the public for use. The National Library of Medicine and the U.S.
00015 *  Government have not placed any restriction on its use or reproduction.
00016 *
00017 *  Although all reasonable efforts have been taken to ensure the accuracy
00018 *  and reliability of the software and data, the NLM and the U.S.
00019 *  Government do not and cannot warrant the performance or results that
00020 *  may be obtained by using this software or data. The NLM and the U.S.
00021 *  Government disclaim all warranties, express or implied, including
00022 *  warranties of performance, merchantability or fitness for any particular
00023 *  purpose.
00024 *
00025 *  Please cite the author in any work or product based on this material.
00026 *
00027 * ===========================================================================*/
00028 
00029 /*****************************************************************************
00030 
00031 File name: kmercounts.hpp
00032 
00033 Author: Greg Boratyn
00034 
00035 Contents: Interface for k-mer counting
00036 
00037 ******************************************************************************/
00038 
00039 
00040 #include <util/math/matrix.hpp>
00041 #include <objects/seqloc/Seq_loc.hpp>
00042 #include <objmgr/scope.hpp>
00043 #include <algo/cobalt/base.hpp>
00044 #include <algo/blast/core/blast_encoding.h>
00045 #include <vector>
00046 #include <stack>
00047 
00048 
00049 BEGIN_NCBI_SCOPE
00050 BEGIN_SCOPE(cobalt)
00051 
00052 
00053 // TODO: Implement binary k-mer counts vector
00054 
00055 // Compressed alphabets taken from 
00056 // Shiryev et al.(2007),  Bioinformatics, 23:2949-2951
00057 // 23-to-10 letter compressed alphabet. Based on SE-V(10)
00058 static const string kAlphabet10("IJLMV AST BDENZ KQR G FY P H C W");
00059 // 23-to-15 letter compressed alphabet. Based on SE_B(14) 
00060 static const string kAlphabet15("ST IJV LM KR EQZ A G BD P N F Y H C W");
00061 
00062 
00063 /// Kmer counts for alignment free sequence similarity computation
00064 /// implemented as a sparse vector
00065 ///
00066 class  CSparseKmerCounts
00067 {
00068 public:
00069     typedef Uint1 TCount;
00070 
00071 
00072     /// Element of the sparse vector
00073     typedef struct SVectorElement {
00074     Uint4 position;   ///< position of non-zero element
00075     TCount value;     ///< value of non-zero element
00076 
00077     /// Default constructor
00078     SVectorElement(void) {position = 0; value = 0;}
00079 
00080     /// Create vector element
00081     /// @param pos Element position
00082     /// @param val Element value
00083     SVectorElement(Uint4 pos, TCount val) {position = pos; value = val;}
00084     } SVectorElement;
00085 
00086     typedef vector<SVectorElement>::const_iterator TNonZeroCounts_CI;
00087     
00088     
00089 public:
00090     /// Create empty counts vector
00091     ///
00092     CSparseKmerCounts(void) : m_SeqLength(0), m_NumCounts(0) {}
00093 
00094     /// Create k-mer counts vector from SSeqLoc with defalut k-mer length and
00095     /// alphabet size
00096     /// @param seq The sequence to be represented as k-mer counts [in]
00097     /// @param scope Scope
00098     ///
00099     CSparseKmerCounts(const objects::CSeq_loc& seq,
00100                       objects::CScope& scope);
00101 
00102     /// Reset the counts vector
00103     /// @param seq Sequence [in]
00104     /// @param scope Scope [in]
00105     ///
00106     void Reset(const objects::CSeq_loc& seq, objects::CScope& scope);
00107 
00108     /// Get sequence length
00109     /// @return Sequence length
00110     ///
00111     unsigned int GetSeqLength(void) const {return m_SeqLength;}
00112 
00113     /// Get number of all k-mers found in the sequence
00114     /// @return Number of all k-mers
00115     ///
00116     unsigned int GetNumCounts(void) const {return m_NumCounts;}
00117 
00118     /// Get default kmer length
00119     /// @return Default k-mer length
00120     ///
00121     static unsigned int GetKmerLength(void) 
00122     {return CSparseKmerCounts::sm_KmerLength;}
00123 
00124     /// Get default alphabet size
00125     /// @return Default alphabet size
00126     ///
00127     static unsigned int GetAlphabetSize(void) {return sm_AlphabetSize;}
00128 
00129     /// Get non-zero counts iterator
00130     /// @return Non-zero counts iterator pointing to the begining
00131     ///
00132     TNonZeroCounts_CI BeginNonZero(void) const {return m_Counts.begin();}
00133 
00134     /// Get non-zero counts iterator
00135     /// @return Non-zero counts iterator pointing to the end
00136     ///
00137     TNonZeroCounts_CI EndNonZero(void) const {return m_Counts.end();}
00138 
00139     /// Print counts
00140     /// @param ostr Output stream [in|out]
00141     /// @return Output stream
00142     ///
00143     CNcbiOstream& Print(CNcbiOstream& ostr) const;
00144 
00145     /// Set default k-mer length
00146     /// @param len Default k-mer length [in]
00147     ///
00148     static void SetKmerLength(unsigned len)
00149     {sm_KmerLength = len; sm_ForceSmallerMem = false;}
00150 
00151     /// Set Default alphabet size
00152     /// @param size Default alphabet size [in]
00153     ///
00154     static void SetAlphabetSize(unsigned size)
00155     {sm_AlphabetSize = size; sm_ForceSmallerMem = false;}
00156 
00157     /// Set default compressed alphabet letter translation table
00158     /// @return Reference to translation table [in|out]
00159     ///
00160     static vector<Uint1>& SetTransTable(void) {return sm_TransTable;}
00161 
00162     /// Set default option for using compressed alphabet
00163     /// @param use_comp Will compressed alphabet be used [in]
00164     ///
00165     static void SetUseCompressed(bool use_comp) {sm_UseCompressed = use_comp;}
00166 
00167     /// Compute 1 - local fraction of common k-mers between two count vectors
00168     /// normalized by length of shorter sequence
00169     /// @param vect1 K-mer counts vector [in]
00170     /// @param vect2 K-mer counts vector [in]
00171     /// @return Local fraction of common k-mer as distance
00172     ///
00173     /// Computes 1 - F(v1, v2), where 
00174     /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\min \{L_x, L_y\} 
00175     /// - k + 1), where
00176     /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x 
00177     ///  excluding Xaa, k - k-mer length
00178     /// F(x, y) is described in RC Edgar, BMC Bioinformatics 5:113, 2004
00179     static double FractionCommonKmersDist(const CSparseKmerCounts& vect1, 
00180                       const CSparseKmerCounts& vect2);
00181 
00182     /// Compute 1 - global fraction of common k-mers between two count vectors
00183     /// normalized by length of longer sequence
00184     /// @param vect1 K-mer counts vector [in]
00185     /// @param vect2 K-mer counts vector [in]
00186     /// @return Global fraction of common k-mers as distance
00187     ///
00188     /// Computes 1 - F(v1, v2), where 
00189     /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\max \{L_x, L_y\} 
00190     /// - k + 1), where
00191     /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x
00192     /// excluding Xaa, k - k-mer length
00193     /// F(x, y) is modified version of measure presented 
00194     /// RC Edgar, BMC Bioinformatics 5:113, 2004
00195     static double FractionCommonKmersGlobalDist(const CSparseKmerCounts& v1,
00196                         const CSparseKmerCounts& v2);
00197 
00198     /// Copmute number of common kmers between two count vectors
00199     /// @param v1 K-mer counts vector [in]
00200     /// @param v2 K-mer counts vecotr [in]
00201     /// @param repetitions Should multiple copies of the same k-mer be counted
00202     /// @return Number of k-mers that are present in both counts vectors
00203     ///
00204     static unsigned int CountCommonKmers(const CSparseKmerCounts& v1, 
00205                      const CSparseKmerCounts& v2, 
00206                      bool repetitions = true);
00207 
00208     /// Perform preparations before k-mer counting common to all sequences.
00209     /// Allocate buffer for storing temporary counts
00210     ///
00211     static void PreCount(void);
00212 
00213     /// Perform post-kmer counting tasks. Free buffer.
00214     ///
00215     static void PostCount(void);
00216 
00217 
00218 private:
00219     static TCount* ReserveCountsMem(unsigned int num_bits);
00220 
00221     static Uint4 GetAALetter(Uint1 letter)
00222     {
00223         _ASSERT(!sm_UseCompressed || letter < sm_TransTable.size());
00224         return (Uint4)(sm_UseCompressed ? sm_TransTable[(int)letter] : letter);
00225     }
00226 
00227     /// Initializes element index as bit vector for first k letters, 
00228     /// skipping Xaa
00229     /// @param sv Sequence [in]
00230     /// @param pos Element index in sparse vector [out]
00231     /// @param index Index of letter in the sequence where k-mer counting
00232     /// starts. At exit index points to the next letter after first 
00233     /// k-mer [in|out]
00234     /// @param num_bits Number of bits in pos per letter [in]
00235     /// @param kmer_len K-mer length [in]
00236     /// @return True if pos was initialized, false otherwise (if no k-mer
00237     /// without X was found)
00238     static bool InitPosBits(const objects::CSeqVector& sv, Uint4& pos,
00239                             unsigned int& index,  Uint4 num_bits,
00240                             Uint4 kmer_len);
00241     
00242 
00243 protected:
00244     vector<SVectorElement> m_Counts;
00245     unsigned int m_SeqLength;
00246     unsigned int m_NumCounts;
00247     static unsigned int sm_KmerLength;
00248     static unsigned int sm_AlphabetSize;
00249     static vector<Uint1> sm_TransTable;
00250     static bool sm_UseCompressed;
00251     static TCount* sm_Buffer;
00252     static bool sm_ForceSmallerMem;
00253     static const unsigned int kLengthBitsThreshold = 32;
00254 };
00255 
00256 
00257 
00258 /// Exception class for Kmer counts
00259 class CKmerCountsException : public CException
00260 {
00261 public:
00262     enum EErrCode {
00263         eUnsupportedSeqLoc,
00264         eUnsuportedDistMethod,
00265         eInvalidOptions,
00266         eBadSequence,
00267         eMemoryAllocation
00268     };
00269 
00270     NCBI_EXCEPTION_DEFAULT(CKmerCountsException, CException);
00271 };
00272 
00273 
00274 /// Creates translation table for compressed alphabets
00275 /// @param trans_string String with groupped letters [in]
00276 /// @param trans_table Translation table [out]
00277 /// @param alphabet_len Number of letters in compressed alphabet
00278 ///
00279 static void BuildCompressedTranslation(const string& trans_string,
00280                        vector<Uint1>& trans_table,
00281                        unsigned alphabet_len)
00282 {
00283     Uint4 compressed_letter = 1; // this allows for gaps
00284     trans_table.clear();
00285     trans_table.resize(alphabet_len + 1, 0);
00286     for (Uint4 i = 0; i < trans_string.length();i++) {
00287         if (isspace(trans_string[i])) {
00288             compressed_letter++;
00289         }
00290         else if (isalpha(trans_string[i])) {
00291             Uint1 aa_letter = AMINOACID_TO_NCBISTDAA[(int)trans_string[i]];
00292 
00293             _ASSERT(aa_letter < trans_table.size());
00294 
00295             trans_table[aa_letter] = compressed_letter;
00296         }
00297     }
00298 }
00299 
00300 
00301 /// Interface for computing and manipulating k-mer counts vectors that allows
00302 /// for different implementations of K-mer counts vectors
00303 ///
00304 template <class TKmerCounts>
00305 class TKmerMethods
00306 {
00307 public:
00308     enum ECompressedAlphabet {
00309         eRegular, eSE_V10, eSE_B15
00310     };
00311 
00312     enum EDistMeasures {
00313         eFractionCommonKmersGlobal, 
00314         eFractionCommonKmersLocal
00315     };
00316 
00317     typedef CNcbiMatrix<double> TDistMatrix;
00318 
00319 public:
00320 
00321     /// Set default counts vector parameters
00322     /// @param kmer_len K-mer length [in]
00323     /// @param alphabet_size Alphabet size [in]
00324     ///
00325     static void SetParams(unsigned kmer_len, unsigned alphabet_size)
00326     {
00327         TKmerCounts::SetKmerLength(kmer_len);
00328         TKmerCounts::SetAlphabetSize(alphabet_size);
00329         TKmerCounts::SetTransTable().clear();
00330         TKmerCounts::SetUseCompressed(false);
00331     }
00332 
00333     /// Set default counts vector parameters for use with compressed alphabet
00334     /// @param kmer_len K-mer length [in]
00335     /// @param alph Compressed alphabet to use [in]
00336     ///
00337     static void SetParams(unsigned kmer_len, ECompressedAlphabet alph) {
00338         TKmerCounts::SetKmerLength(kmer_len);
00339         unsigned int len;
00340         unsigned int compressed_len;
00341         switch (alph) {
00342         case eSE_V10:
00343             len = 28;
00344             compressed_len = 11; //including gap
00345             BuildCompressedTranslation(kAlphabet10, 
00346                                        TKmerCounts::SetTransTable(), 
00347                                        len);
00348             TKmerCounts::SetAlphabetSize(compressed_len);
00349             TKmerCounts::SetUseCompressed(true);
00350             break;
00351             
00352         case eSE_B15:
00353             len = 28;
00354             compressed_len = 16; //including gap
00355             BuildCompressedTranslation(kAlphabet15,
00356                                        TKmerCounts::SetTransTable(),
00357                                        len);
00358             TKmerCounts::SetAlphabetSize(compressed_len);
00359             TKmerCounts::SetUseCompressed(true);
00360             break;
00361 
00362         case eRegular:
00363             TKmerCounts::SetAlphabetSize(kAlphabetSize);
00364             TKmerCounts::SetTransTable().clear();
00365             TKmerCounts::SetUseCompressed(false);
00366         }
00367     }
00368 
00369     /// Create k-mer counts vectors for given sequences
00370     /// @param seqs List of sequences [in]
00371     /// @param counts List of k-mer counts vectors [out]
00372     ///
00373     static void ComputeCounts(const vector< CRef<objects::CSeq_loc> >& seqs,
00374                               objects::CScope& scope,
00375                               vector<TKmerCounts>& counts)
00376     {
00377         if (seqs.empty()) {
00378             NCBI_THROW(CKmerCountsException, eInvalidOptions,
00379                        "Empty list of sequences");
00380         }
00381 
00382         counts.clear();
00383 
00384         TKmerCounts::PreCount();
00385 
00386         ITERATE(vector< CRef<objects::CSeq_loc> >, it, seqs) {
00387             counts.push_back(TKmerCounts(**it, scope));
00388         }
00389 
00390         TKmerCounts::PostCount();
00391     }
00392 
00393     /// Compute matrix of distances between given counts vectors
00394     /// @param counts List of k-mer counts vectors [in]
00395     /// @param fsim Function that computes distance betwee two vectors [in]
00396     /// @param dmat Distance matrix [out]
00397     ///
00398     static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
00399                   double(*fsim)(const TKmerCounts&, const TKmerCounts&),
00400                   TDistMatrix& dmat)
00401         
00402     {
00403         if (counts.empty()) {
00404             NCBI_THROW(CKmerCountsException, eBadSequence,
00405                        "The list of k-mer counts vectors is empty");
00406         }
00407 
00408         dmat.Resize(counts.size(), counts.size(), 0.0);
00409         for (int i=0;i < (int)counts.size() - 1;i++) {
00410             for (int j=i+1;j < (int)counts.size();j++) {
00411                 dmat(i, j) = fsim(counts[i], counts[j]);
00412                 dmat(j, i) = dmat(i, j);
00413             }
00414         }
00415     }
00416 
00417     /// Compute matrix of distances between given list of counts vectors
00418     /// using distance function with additional normalizing values
00419     /// @param counts List of k-mer counts vectors [in]
00420     /// @param dmat Distance matrix [out]
00421     /// @param fsim Function that computes distance betwee two vectors [in]
00422     /// @param normalizers List of normalizing arguments [in]
00423     ///
00424     static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
00425         TDistMatrix& dmat,
00426         double(*fsim)(const TKmerCounts&, const TKmerCounts&, double, double),
00427         const vector<double>& normalizers);
00428 
00429 
00430     /// Compute distance matrix for given counts vectors and distance measure
00431     /// @param counts List of k-mer counts vecotrs [in]
00432     /// @param dist_method Distance measure [in]
00433     /// @param dmat Distance matrix [out]
00434     ///
00435     static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
00436                                   EDistMeasures dist_method,
00437                                   TDistMatrix& dmat)
00438     {
00439         switch (dist_method) {
00440         case eFractionCommonKmersLocal:
00441             ComputeDistMatrix(counts, TKmerCounts::FractionCommonKmersDist, 
00442                               dmat);
00443             break;
00444         
00445         case eFractionCommonKmersGlobal:
00446             ComputeDistMatrix(counts, 
00447                               TKmerCounts::FractionCommonKmersGlobalDist, 
00448                               dmat);
00449             break;
00450         
00451         default:
00452             NCBI_THROW(CKmerCountsException, eUnsuportedDistMethod,
00453                        "Unrecognised distance measure");
00454         }
00455     }
00456 
00457 
00458     /// Compute distance matrix for given counts vectors and distance measure
00459     /// and avoid copying
00460     /// @param counts List of k-mer counts vecotrs [in]
00461     /// @param dist_method Distance measure [in]
00462     /// @return Distance matrix
00463     ///
00464     static auto_ptr<TDistMatrix> ComputeDistMatrix(
00465                                           const vector<TKmerCounts>& counts,
00466                                           EDistMeasures dist_method)
00467     {
00468         auto_ptr<TDistMatrix> dmat(new TDistMatrix(counts.size(), 
00469                                                    counts.size(), 0));
00470         ComputeDistMatrix(counts, dist_method, *dmat.get());
00471         return dmat;
00472     }
00473 };
00474 
00475 
00476 
00477 END_SCOPE(cobalt)
00478 END_NCBI_SCOPE
00479 
00480 #endif /* ALGO_COBALT___KMERCOUNTS__HPP */
00481 
00482 

Generated on Sun Dec 6 21:56:15 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Mon Dec 07 16:20:32 2009 by modify_doxy.py rev. 173732