00001 #ifndef ALGO_COBALT___KMERCOUNTS__HPP
00002 #define ALGO_COBALT___KMERCOUNTS__HPP
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040 #include <util/math/matrix.hpp>
00041 #include <objects/seqloc/Seq_loc.hpp>
00042 #include <objmgr/scope.hpp>
00043 #include <algo/cobalt/base.hpp>
00044 #include <algo/blast/core/blast_encoding.h>
00045 #include <vector>
00046 #include <stack>
00047
00048
00049 BEGIN_NCBI_SCOPE
00050 BEGIN_SCOPE(cobalt)
00051
00052
00053
00054
00055
00056
00057
00058 static const string kAlphabet10("IJLMV AST BDENZ KQR G FY P H C W");
00059
00060 static const string kAlphabet15("ST IJV LM KR EQZ A G BD P N F Y H C W");
00061
00062
00063
00064
00065
00066 class CSparseKmerCounts
00067 {
00068 public:
00069 typedef Uint1 TCount;
00070
00071
00072
00073 typedef struct SVectorElement {
00074 Uint4 position;
00075 TCount value;
00076
00077
00078 SVectorElement(void) {position = 0; value = 0;}
00079
00080
00081
00082
00083 SVectorElement(Uint4 pos, TCount val) {position = pos; value = val;}
00084 } SVectorElement;
00085
00086 typedef vector<SVectorElement>::const_iterator TNonZeroCounts_CI;
00087
00088
00089 public:
00090
00091
00092 CSparseKmerCounts(void) : m_SeqLength(0), m_NumCounts(0) {}
00093
00094
00095
00096
00097
00098
00099 CSparseKmerCounts(const objects::CSeq_loc& seq,
00100 objects::CScope& scope);
00101
00102
00103
00104
00105
00106 void Reset(const objects::CSeq_loc& seq, objects::CScope& scope);
00107
00108
00109
00110
00111 unsigned int GetSeqLength(void) const {return m_SeqLength;}
00112
00113
00114
00115
00116 unsigned int GetNumCounts(void) const {return m_NumCounts;}
00117
00118
00119
00120
00121 static unsigned int GetKmerLength(void)
00122 {return CSparseKmerCounts::sm_KmerLength;}
00123
00124
00125
00126
00127 static unsigned int GetAlphabetSize(void) {return sm_AlphabetSize;}
00128
00129
00130
00131
00132 TNonZeroCounts_CI BeginNonZero(void) const {return m_Counts.begin();}
00133
00134
00135
00136
00137 TNonZeroCounts_CI EndNonZero(void) const {return m_Counts.end();}
00138
00139
00140
00141
00142
00143 CNcbiOstream& Print(CNcbiOstream& ostr) const;
00144
00145
00146
00147
00148 static void SetKmerLength(unsigned len)
00149 {sm_KmerLength = len; sm_ForceSmallerMem = false;}
00150
00151
00152
00153
00154 static void SetAlphabetSize(unsigned size)
00155 {sm_AlphabetSize = size; sm_ForceSmallerMem = false;}
00156
00157
00158
00159
00160 static vector<Uint1>& SetTransTable(void) {return sm_TransTable;}
00161
00162
00163
00164
00165 static void SetUseCompressed(bool use_comp) {sm_UseCompressed = use_comp;}
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179 static double FractionCommonKmersDist(const CSparseKmerCounts& vect1,
00180 const CSparseKmerCounts& vect2);
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195 static double FractionCommonKmersGlobalDist(const CSparseKmerCounts& v1,
00196 const CSparseKmerCounts& v2);
00197
00198
00199
00200
00201
00202
00203
00204 static unsigned int CountCommonKmers(const CSparseKmerCounts& v1,
00205 const CSparseKmerCounts& v2,
00206 bool repetitions = true);
00207
00208
00209
00210
00211 static void PreCount(void);
00212
00213
00214
00215 static void PostCount(void);
00216
00217
00218 private:
00219 static TCount* ReserveCountsMem(unsigned int num_bits);
00220
00221 static Uint4 GetAALetter(Uint1 letter)
00222 {
00223 _ASSERT(!sm_UseCompressed || letter < sm_TransTable.size());
00224 return (Uint4)(sm_UseCompressed ? sm_TransTable[(int)letter] : letter);
00225 }
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238 static bool InitPosBits(const objects::CSeqVector& sv, Uint4& pos,
00239 unsigned int& index, Uint4 num_bits,
00240 Uint4 kmer_len);
00241
00242
00243 protected:
00244 vector<SVectorElement> m_Counts;
00245 unsigned int m_SeqLength;
00246 unsigned int m_NumCounts;
00247 static unsigned int sm_KmerLength;
00248 static unsigned int sm_AlphabetSize;
00249 static vector<Uint1> sm_TransTable;
00250 static bool sm_UseCompressed;
00251 static TCount* sm_Buffer;
00252 static bool sm_ForceSmallerMem;
00253 static const unsigned int kLengthBitsThreshold = 32;
00254 };
00255
00256
00257
00258
00259 class CKmerCountsException : public CException
00260 {
00261 public:
00262 enum EErrCode {
00263 eUnsupportedSeqLoc,
00264 eUnsuportedDistMethod,
00265 eInvalidOptions,
00266 eBadSequence,
00267 eMemoryAllocation
00268 };
00269
00270 NCBI_EXCEPTION_DEFAULT(CKmerCountsException, CException);
00271 };
00272
00273
00274
00275
00276
00277
00278
00279 static void BuildCompressedTranslation(const string& trans_string,
00280 vector<Uint1>& trans_table,
00281 unsigned alphabet_len)
00282 {
00283 Uint4 compressed_letter = 1;
00284 trans_table.clear();
00285 trans_table.resize(alphabet_len + 1, 0);
00286 for (Uint4 i = 0; i < trans_string.length();i++) {
00287 if (isspace(trans_string[i])) {
00288 compressed_letter++;
00289 }
00290 else if (isalpha(trans_string[i])) {
00291 Uint1 aa_letter = AMINOACID_TO_NCBISTDAA[(int)trans_string[i]];
00292
00293 _ASSERT(aa_letter < trans_table.size());
00294
00295 trans_table[aa_letter] = compressed_letter;
00296 }
00297 }
00298 }
00299
00300
00301
00302
00303
00304 template <class TKmerCounts>
00305 class TKmerMethods
00306 {
00307 public:
00308 enum ECompressedAlphabet {
00309 eRegular, eSE_V10, eSE_B15
00310 };
00311
00312 enum EDistMeasures {
00313 eFractionCommonKmersGlobal,
00314 eFractionCommonKmersLocal
00315 };
00316
00317 typedef CNcbiMatrix<double> TDistMatrix;
00318
00319 public:
00320
00321
00322
00323
00324
00325 static void SetParams(unsigned kmer_len, unsigned alphabet_size)
00326 {
00327 TKmerCounts::SetKmerLength(kmer_len);
00328 TKmerCounts::SetAlphabetSize(alphabet_size);
00329 TKmerCounts::SetTransTable().clear();
00330 TKmerCounts::SetUseCompressed(false);
00331 }
00332
00333
00334
00335
00336
00337 static void SetParams(unsigned kmer_len, ECompressedAlphabet alph) {
00338 TKmerCounts::SetKmerLength(kmer_len);
00339 unsigned int len;
00340 unsigned int compressed_len;
00341 switch (alph) {
00342 case eSE_V10:
00343 len = 28;
00344 compressed_len = 11;
00345 BuildCompressedTranslation(kAlphabet10,
00346 TKmerCounts::SetTransTable(),
00347 len);
00348 TKmerCounts::SetAlphabetSize(compressed_len);
00349 TKmerCounts::SetUseCompressed(true);
00350 break;
00351
00352 case eSE_B15:
00353 len = 28;
00354 compressed_len = 16;
00355 BuildCompressedTranslation(kAlphabet15,
00356 TKmerCounts::SetTransTable(),
00357 len);
00358 TKmerCounts::SetAlphabetSize(compressed_len);
00359 TKmerCounts::SetUseCompressed(true);
00360 break;
00361
00362 case eRegular:
00363 TKmerCounts::SetAlphabetSize(kAlphabetSize);
00364 TKmerCounts::SetTransTable().clear();
00365 TKmerCounts::SetUseCompressed(false);
00366 }
00367 }
00368
00369
00370
00371
00372
00373 static void ComputeCounts(const vector< CRef<objects::CSeq_loc> >& seqs,
00374 objects::CScope& scope,
00375 vector<TKmerCounts>& counts)
00376 {
00377 if (seqs.empty()) {
00378 NCBI_THROW(CKmerCountsException, eInvalidOptions,
00379 "Empty list of sequences");
00380 }
00381
00382 counts.clear();
00383
00384 TKmerCounts::PreCount();
00385
00386 ITERATE(vector< CRef<objects::CSeq_loc> >, it, seqs) {
00387 counts.push_back(TKmerCounts(**it, scope));
00388 }
00389
00390 TKmerCounts::PostCount();
00391 }
00392
00393
00394
00395
00396
00397
00398 static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
00399 double(*fsim)(const TKmerCounts&, const TKmerCounts&),
00400 TDistMatrix& dmat)
00401
00402 {
00403 if (counts.empty()) {
00404 NCBI_THROW(CKmerCountsException, eBadSequence,
00405 "The list of k-mer counts vectors is empty");
00406 }
00407
00408 dmat.Resize(counts.size(), counts.size(), 0.0);
00409 for (int i=0;i < (int)counts.size() - 1;i++) {
00410 for (int j=i+1;j < (int)counts.size();j++) {
00411 dmat(i, j) = fsim(counts[i], counts[j]);
00412 dmat(j, i) = dmat(i, j);
00413 }
00414 }
00415 }
00416
00417
00418
00419
00420
00421
00422
00423
00424 static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
00425 TDistMatrix& dmat,
00426 double(*fsim)(const TKmerCounts&, const TKmerCounts&, double, double),
00427 const vector<double>& normalizers);
00428
00429
00430
00431
00432
00433
00434
00435 static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
00436 EDistMeasures dist_method,
00437 TDistMatrix& dmat)
00438 {
00439 switch (dist_method) {
00440 case eFractionCommonKmersLocal:
00441 ComputeDistMatrix(counts, TKmerCounts::FractionCommonKmersDist,
00442 dmat);
00443 break;
00444
00445 case eFractionCommonKmersGlobal:
00446 ComputeDistMatrix(counts,
00447 TKmerCounts::FractionCommonKmersGlobalDist,
00448 dmat);
00449 break;
00450
00451 default:
00452 NCBI_THROW(CKmerCountsException, eUnsuportedDistMethod,
00453 "Unrecognised distance measure");
00454 }
00455 }
00456
00457
00458
00459
00460
00461
00462
00463
00464 static auto_ptr<TDistMatrix> ComputeDistMatrix(
00465 const vector<TKmerCounts>& counts,
00466 EDistMeasures dist_method)
00467 {
00468 auto_ptr<TDistMatrix> dmat(new TDistMatrix(counts.size(),
00469 counts.size(), 0));
00470 ComputeDistMatrix(counts, dist_method, *dmat.get());
00471 return dmat;
00472 }
00473 };
00474
00475
00476
00477 END_SCOPE(cobalt)
00478 END_NCBI_SCOPE
00479
00480 #endif
00481
00482