00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #ifndef C_SR_SEARCH_HPP
00034 #define C_SR_SEARCH_HPP
00035
00036 #include <objmgr/seq_vector.hpp>
00037 #include <algo/blast/dbindex/dbindex.hpp>
00038
00039 BEGIN_NCBI_SCOPE
00040 BEGIN_SCOPE( dbindex_search )
00041
00042 USING_SCOPE( ncbi::objects );
00043 USING_SCOPE( ncbi::blastdbindex );
00044
00045
00046 class CSRSearch : public CObject
00047 {
00048 public:
00049
00050 enum ELevel { PE, PM, SE, SM, EM };
00051
00052 typedef CDbIndex::TSeqNum TSeqNum;
00053
00054 struct SSearchData
00055 {
00056 const CSeqVector * seq_1;
00057 const CSeqVector * seq_2;
00058 Uint4 num_res;
00059 Uint4 nr_1;
00060 Uint4 nr_2;
00061 ELevel rlevel_1;
00062 ELevel rlevel_2;
00063 bool mismatch;
00064
00065 SSearchData(
00066 const CSeqVector & seq, Uint4 nr,
00067 Uint4 nr1, Uint4 nr2, ELevel rl1, ELevel rl2,
00068 bool exact = false )
00069 : seq_1( &seq ), seq_2( 0 ), num_res( nr ),
00070 nr_1( nr1 ), nr_2( nr2 ),
00071 rlevel_1( rl1 ), rlevel_2( rl2 ),
00072 mismatch( !exact )
00073 {}
00074
00075 SSearchData(
00076 const CSeqVector & seq1,
00077 const CSeqVector & seq2,
00078 Uint4 nr,
00079 Uint4 nr1, Uint4 nr2, ELevel rl1, ELevel rl2,
00080 bool exact = false )
00081 : seq_1( &seq1 ), seq_2( &seq2 ),
00082 num_res( nr ),
00083 nr_1( nr1 ), nr_2( nr2 ),
00084 rlevel_1( rl1 ), rlevel_2( rl2 ),
00085 mismatch( !exact )
00086 {}
00087 };
00088
00089 struct SResultData
00090 {
00091 bool pair;
00092 Uint1 type;
00093 TSeqNum snum;
00094 TSeqPos spos_1;
00095 bool fw_strand_1;
00096 TSeqPos mpos_1;
00097 Uint1 mbase_1;
00098 TSeqPos spos_2;
00099 bool fw_strand_2;
00100 TSeqPos mpos_2;
00101 Uint1 mbase_2;
00102
00103 SResultData(
00104 TSeqNum subject,
00105 TSeqPos sp,
00106 bool fw_strand,
00107 TSeqPos mp = 0,
00108 Uint1 mbase = '-',
00109 Uint1 tp = 0 )
00110 : pair( false ),
00111 type( tp ),
00112 snum( subject ),
00113 spos_1( sp ),
00114 fw_strand_1( fw_strand ),
00115 mpos_1( mp ),
00116 mbase_1( mbase )
00117 {}
00118
00119 SResultData(
00120 void *,
00121 TSeqNum subject,
00122 TSeqPos sp1,
00123 TSeqPos sp2,
00124 bool fw1,
00125 bool fw2,
00126 TSeqPos mp1 = 0,
00127 TSeqPos mp2 = 0,
00128 Uint1 mb1 = '-',
00129 Uint1 mb2 = '-' )
00130 : pair( true ),
00131 type( 0 ),
00132 snum( subject ),
00133 spos_1( sp1 ),
00134 fw_strand_1( fw1 ),
00135 mpos_1( mp1 ),
00136 mbase_1( mb1 ),
00137 spos_2( sp2 ),
00138 fw_strand_2( fw2 ),
00139 mpos_2( mp2 ),
00140 mbase_2( mb2 )
00141 {}
00142 };
00143
00144 struct SResults
00145 {
00146 ELevel level_1;
00147 ELevel level_2;
00148 Uint4 nres_1;
00149 vector< SResultData > res;
00150 };
00151
00152 typedef SResults TResults;
00153
00154 static CRef< CSRSearch > MakeSRSearch(
00155 CRef< CDbIndex > index,
00156 TSeqPos d = 0, TSeqPos dfuzz = 0 );
00157
00158 virtual void search( const SSearchData & sdata, TResults & results ) = 0;
00159
00160 public:
00161
00162 static const Uint4 TSRRESULTS_INISIZE = 1000000UL;
00163 static const Uint4 TSRPAIRS_INISIZE = 10000UL;
00164
00165 protected:
00166
00167 struct SSRResult
00168 {
00169 Uint4 seqnum;
00170 Uint4 soff;
00171 };
00172
00173 public:
00174
00175 typedef vector< SSRResult > TSRResults;
00176 typedef pair< SSRResult, SSRResult > TSRPairedResult;
00177 typedef vector< TSRPairedResult > TSRPairedResults;
00178
00179 class CResCache
00180 {
00181 private:
00182
00183 vector< Uint1 > f_ind_;
00184 vector< Uint1 > r_ind_;
00185 vector< TSRResults > f_res_cache_;
00186 vector< TSRResults > r_res_cache_;
00187
00188 Uint4 size() { return (Uint4)f_ind_.size(); }
00189
00190 void resize( Uint4 new_sz )
00191 {
00192 if( new_sz != size() ) {
00193 f_ind_.resize( new_sz );
00194 r_ind_.resize( new_sz );
00195
00196 Uint4 start = size();
00197 Uint4 end = new_sz;
00198
00199 f_res_cache_.resize( new_sz );
00200 r_res_cache_.resize( new_sz );
00201
00202 for( Uint4 i = start; i < end; ++i ) {
00203 f_res_cache_[i].reserve( TSRRESULTS_INISIZE );
00204 r_res_cache_[i].reserve( TSRRESULTS_INISIZE );
00205 }
00206 }
00207 }
00208
00209 public:
00210
00211 void init( Uint4 sz )
00212 {
00213 resize( sz );
00214 fill( f_ind_.begin(), f_ind_.end(), 0 );
00215 fill( r_ind_.begin(), r_ind_.end(), 0 );
00216 }
00217
00218 bool is_set( Uint4 idx, bool fw_strand ) const
00219 { return fw_strand ? (bool)f_ind_[idx] : (bool)r_ind_[idx]; }
00220
00221 void set( Uint4 idx, bool fw_strand )
00222 {
00223 if( fw_strand ) {
00224 f_ind_[idx] = 1;
00225 f_res_cache_[idx].clear();
00226 }
00227 else {
00228 r_ind_[idx] = 1;
00229 r_res_cache_[idx].clear();
00230 }
00231 }
00232
00233 TSRResults & at( Uint4 idx, bool fw_strand )
00234 { return fw_strand ? f_res_cache_[idx] : r_res_cache_[idx]; }
00235 };
00236
00237 protected:
00238
00239 struct SMismatchInfo
00240 {
00241 Uint4 idx;
00242 Uint4 num_keys;
00243 TSeqPos key_pos[2];
00244 };
00245
00246 public:
00247
00248 struct SMismatchResultsEntry
00249 {
00250 TSRResults results;
00251 TSeqPos mismatch_position;
00252 Uint4 adjustment;
00253 Uint1 mismatch_letter;
00254
00255 SMismatchResultsEntry() {}
00256
00257 SMismatchResultsEntry( TSeqPos pos, Uint4 letter, Uint1 adj )
00258 { init( pos, letter, adj ); }
00259
00260 void init( TSeqPos pos, Uint4 letter, Uint1 adj )
00261 {
00262 mismatch_position = pos;
00263 adjustment = adj;
00264 mismatch_letter = letter;
00265 }
00266 };
00267
00268 typedef vector< SMismatchResultsEntry > TMismatchResults;
00269
00270 class CMismatchResultsInfo
00271 {
00272 static const Uint4 BLOCK_SHIFT = 7UL;
00273 static const Uint4 BLOCK_SIZE = (1UL<<BLOCK_SHIFT);
00274 static const Uint4 BLOCK_MASK = (BLOCK_SIZE - 1);
00275 static const Uint4 BLOCKS_INISIZE = 1024UL;
00276 static const Uint4 RESULTS_INISIZE = 1024UL*10UL;
00277
00278 typedef vector< TMismatchResults > TBlocks;
00279
00280 public:
00281
00282 CMismatchResultsInfo() : size_( 0 )
00283 {
00284 blocks_.reserve( BLOCKS_INISIZE );
00285 }
00286
00287 void clear() { size_ = 0; }
00288 Uint4 size() const { return size_; }
00289 bool empty() const { return size_ == 0; }
00290
00291 SMismatchResultsEntry & operator[]( Uint4 i )
00292 {
00293 if( i >= size_ ) resize( i + 1 );
00294 return at( i );
00295 }
00296
00297 void resize( Uint4 i )
00298 {
00299 size_ = i;
00300
00301 if( size_ == 0 ) return;
00302
00303 Uint4 block = ((i-1)>>BLOCK_SHIFT);
00304
00305 while( block >= blocks_.size() ) {
00306 blocks_.push_back( TMismatchResults( BLOCK_SIZE ) );
00307 TMismatchResults & block = *blocks_.rbegin();
00308
00309 for( Uint4 j = 0; j < BLOCK_SIZE; ++j ) {
00310 block[j].results.reserve( RESULTS_INISIZE );
00311 }
00312 }
00313 }
00314
00315 private:
00316
00317 SMismatchResultsEntry & at( Uint4 i )
00318 { return blocks_[i>>BLOCK_SHIFT][i&BLOCK_MASK]; }
00319
00320 Uint4 size_;
00321 TBlocks blocks_;
00322 };
00323
00324 protected:
00325
00326 struct SHKData
00327 {
00328 SHKData()
00329 {
00330 exact_1[0].reserve( TSRRESULTS_INISIZE );
00331 exact_1[1].reserve( TSRRESULTS_INISIZE );
00332 exact_2[0].reserve( TSRRESULTS_INISIZE );
00333 exact_2[1].reserve( TSRRESULTS_INISIZE );
00334
00335 pres.reserve( TSRPAIRS_INISIZE );
00336 }
00337
00338 void reset()
00339 {
00340 }
00341
00342 CResCache rc1, rc2;
00343 TSRResults exact_1[2];
00344 TSRResults exact_2[2];
00345
00346 CMismatchResultsInfo mm_1[2];
00347 CMismatchResultsInfo mm_2[2];
00348
00349 TSRPairedResults pres;
00350 };
00351
00352 #ifdef NCBI_COMPILER_MIPSPRO
00353 public:
00354 #endif
00355 class InternalException : public CException
00356 {
00357 public:
00358
00359 enum EErrCode
00360 {
00361 eAmbig
00362 };
00363
00364 virtual const char * GetErrCodeString() const;
00365
00366 NCBI_EXCEPTION_DEFAULT( InternalException, CException );
00367 };
00368
00369 protected:
00370 CSRSearch( CRef< CDbIndex > index, TSeqPos d, TSeqPos dfuzz )
00371 : index_( index ), dmax_( d + dfuzz ), dmin_( d - dfuzz )
00372 {
00373 ASSERT( d >= dfuzz );
00374 hkey_width_ = index_->getHKeyWidth();
00375 }
00376
00377 vector< TSeqPos > GetQNmerPositions( TSeqPos sz ) const;
00378
00379 Uint4 getNMer(
00380 const CSeqVector & seq, TSeqPos pos,
00381 bool fw, bool & ambig ) const;
00382
00383 Uint4 getNMer(
00384 const CSeqVector & seq, TSeqPos pos,
00385 bool fw, bool & ambig,
00386 TSeqPos sub_pos, Uint1 sub_letter ) const;
00387
00388 pair< TSeqPos, TSeqPos > Pos2Index(
00389 TSeqPos pos, TSeqPos sz, SMismatchInfo & mismatch_info ) const;
00390
00391 void mergeResults(
00392 TSRResults & results, const TSRResults & src, Int4 step );
00393
00394 void combine( const TSRResults & results_1,
00395 const TSRResults & results_2,
00396 TSRPairedResults & results,
00397 Int4 adjust = 0 );
00398
00399 pair< TSeqNum, TSeqPos > MapSOff(
00400 TSeqNum lid, TSeqPos loff, TSeqPos sz, bool & overlap ) const;
00401
00402 bool reportResults(
00403 TResults & r, Uint4 nr,
00404 TSeqPos qsz1, TSeqPos qsz2,
00405 const TSRPairedResults & results, bool fw1, bool fw2,
00406 bool mismatch1 = false, bool mismatch2 = false,
00407 TSeqPos mismatch_pos1 = 0, TSeqPos mismatch_pos2 = 0,
00408 Uint1 mismatch_letter1 = (Uint1)'-',
00409 Uint1 mismatch_letter2 = (Uint1)'-',
00410 Uint4 adjustment1 = 0, Uint4 adjustment2 = 0 );
00411
00412 bool reportResults(
00413 TResults & r, Uint4 nr,
00414 TSeqPos qsz, const TSRResults & results,
00415 bool fw, bool mismatch = false,
00416 TSeqPos mismatch_pos = 0, Uint1 mismatch_letter = (Uint1)'-',
00417 Uint4 adjustment = 0, Uint4 pos_in_pair = 0 );
00418
00419 unsigned long hkey_width_;
00420 SHKData hk_data_;
00421
00422 private:
00423
00424 CRef< CDbIndex > index_;
00425 TSeqPos dmax_, dmin_;
00426 };
00427
00428 END_SCOPE( dbindex_search )
00429 END_NCBI_SCOPE
00430
00431 #endif
00432
00433
00434