include/algo/blast/dbindex_search/sr_search.hpp

Go to the documentation of this file.
00001 /*  $Id: sr_search.hpp 141028 2008-09-23 17:37:34Z ucko $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Author:  Aleksandr Morgulis
00027  *
00028  * File Description:
00029  *   Header file for CSRSearch and some related declarations.
00030  *
00031  */
00032 
00033 #ifndef C_SR_SEARCH_HPP
00034 #define C_SR_SEARCH_HPP
00035 
00036 #include <objmgr/seq_vector.hpp>
00037 #include <algo/blast/dbindex/dbindex.hpp>
00038 
00039 BEGIN_NCBI_SCOPE
00040 BEGIN_SCOPE( dbindex_search )
00041 
00042 USING_SCOPE( ncbi::objects );
00043 USING_SCOPE( ncbi::blastdbindex );
00044 
00045 //----------------------------------------------------------------------------
00046 class CSRSearch : public CObject
00047 {
00048     public:
00049 
00050         enum ELevel { PE, PM, SE, SM, EM };
00051 
00052         typedef CDbIndex::TSeqNum TSeqNum;
00053 
00054         struct SSearchData
00055         {
00056             const CSeqVector * seq_1;
00057             const CSeqVector * seq_2;
00058             Uint4 num_res;
00059             Uint4 nr_1;
00060             Uint4 nr_2;
00061             ELevel rlevel_1;
00062             ELevel rlevel_2;
00063             bool mismatch;
00064 
00065             SSearchData( 
00066                     const CSeqVector & seq, Uint4 nr, 
00067                     Uint4 nr1, Uint4 nr2, ELevel rl1, ELevel rl2, 
00068                     bool exact = false )
00069                 : seq_1( &seq ), seq_2( 0 ), num_res( nr ), 
00070                   nr_1( nr1 ), nr_2( nr2 ), 
00071                   rlevel_1( rl1 ), rlevel_2( rl2 ),
00072                   mismatch( !exact )
00073             {}
00074 
00075             SSearchData( 
00076                     const CSeqVector & seq1, 
00077                     const CSeqVector & seq2,
00078                     Uint4 nr, 
00079                     Uint4 nr1, Uint4 nr2, ELevel rl1, ELevel rl2, 
00080                     bool exact = false )
00081                 : seq_1( &seq1 ), seq_2( &seq2 ), 
00082                   num_res( nr ),
00083                   nr_1( nr1 ), nr_2( nr2 ), 
00084                   rlevel_1( rl1 ), rlevel_2( rl2 ),
00085                   mismatch( !exact )
00086             {}
00087         };
00088 
00089         struct SResultData
00090         {
00091             bool    pair;
00092             Uint1   type;
00093             TSeqNum snum;
00094             TSeqPos spos_1;
00095             bool    fw_strand_1;
00096             TSeqPos mpos_1;
00097             Uint1   mbase_1;
00098             TSeqPos spos_2;
00099             bool    fw_strand_2;
00100             TSeqPos mpos_2;
00101             Uint1   mbase_2;
00102 
00103             SResultData( 
00104                     TSeqNum subject, 
00105                     TSeqPos sp,
00106                     bool fw_strand,
00107                     TSeqPos mp = 0,
00108                     Uint1 mbase = '-',
00109                     Uint1 tp = 0 )
00110                 : pair( false ),
00111                   type( tp ),
00112                   snum( subject ),
00113                   spos_1( sp ),
00114                   fw_strand_1( fw_strand ),
00115                   mpos_1( mp ),
00116                   mbase_1( mbase )
00117             {}
00118 
00119             SResultData(
00120                     void *,
00121                     TSeqNum subject,
00122                     TSeqPos sp1,
00123                     TSeqPos sp2,
00124                     bool fw1,
00125                     bool fw2, 
00126                     TSeqPos mp1 = 0,
00127                     TSeqPos mp2 = 0,
00128                     Uint1 mb1 = '-',
00129                     Uint1 mb2 = '-' )
00130                 : pair( true ),
00131                   type( 0 ),
00132                   snum( subject ),
00133                   spos_1( sp1 ),
00134                   fw_strand_1( fw1 ),
00135                   mpos_1( mp1 ),
00136                   mbase_1( mb1 ),
00137                   spos_2( sp2 ),
00138                   fw_strand_2( fw2 ),
00139                   mpos_2( mp2 ),
00140                   mbase_2( mb2 )
00141             {}
00142         };
00143         
00144         struct SResults
00145         {
00146             ELevel level_1;
00147             ELevel level_2;
00148             Uint4 nres_1;
00149             vector< SResultData > res;
00150         };
00151 
00152         typedef SResults TResults;
00153 
00154         static CRef< CSRSearch > MakeSRSearch( 
00155                 CRef< CDbIndex > index, 
00156                 TSeqPos d = 0, TSeqPos dfuzz = 0 );
00157 
00158         virtual void search( const SSearchData & sdata, TResults & results ) = 0;
00159 
00160     public: // for Solaris only
00161 
00162         static const Uint4 TSRRESULTS_INISIZE = 1000000UL;
00163         static const Uint4 TSRPAIRS_INISIZE   = 10000UL;
00164 
00165     protected:
00166 
00167         struct SSRResult
00168         {
00169             Uint4 seqnum;
00170             Uint4 soff;
00171         };
00172 
00173     public: // for Solaris only
00174 
00175         typedef vector< SSRResult >          TSRResults;
00176         typedef pair< SSRResult, SSRResult > TSRPairedResult;
00177         typedef vector< TSRPairedResult >    TSRPairedResults;
00178 
00179         class CResCache
00180         {
00181             private:
00182 
00183                 vector< Uint1 > f_ind_;
00184                 vector< Uint1 > r_ind_;
00185                 vector< TSRResults > f_res_cache_;
00186                 vector< TSRResults > r_res_cache_;
00187 
00188                 Uint4 size() { return (Uint4)f_ind_.size(); }
00189 
00190                 void resize( Uint4 new_sz )
00191                 {
00192                     if( new_sz != size() ) {
00193                         f_ind_.resize( new_sz );
00194                         r_ind_.resize( new_sz );
00195 
00196                         Uint4 start = size();
00197                         Uint4 end = new_sz;
00198 
00199                         f_res_cache_.resize( new_sz );
00200                         r_res_cache_.resize( new_sz );
00201                     
00202                         for( Uint4 i = start; i < end; ++i ) {
00203                             f_res_cache_[i].reserve( TSRRESULTS_INISIZE );
00204                             r_res_cache_[i].reserve( TSRRESULTS_INISIZE );
00205                         }
00206                     }
00207                 }
00208 
00209             public:
00210 
00211                 void init( Uint4 sz )
00212                 {
00213                     resize( sz );
00214                     fill( f_ind_.begin(), f_ind_.end(), 0 );
00215                     fill( r_ind_.begin(), r_ind_.end(), 0 );
00216                 }
00217 
00218                 bool is_set( Uint4 idx, bool fw_strand ) const 
00219                 { return fw_strand ? (bool)f_ind_[idx] : (bool)r_ind_[idx]; }
00220 
00221                 void set( Uint4 idx, bool fw_strand )
00222                 {
00223                     if( fw_strand ) {
00224                         f_ind_[idx] = 1;
00225                         f_res_cache_[idx].clear();
00226                     }
00227                     else {
00228                         r_ind_[idx] = 1;
00229                         r_res_cache_[idx].clear();
00230                     }
00231                 }
00232 
00233                 TSRResults & at( Uint4 idx, bool fw_strand ) 
00234                 { return fw_strand ? f_res_cache_[idx] : r_res_cache_[idx]; }
00235         };
00236 
00237     protected:
00238 
00239         struct SMismatchInfo
00240         {
00241             Uint4   idx;
00242             Uint4   num_keys;
00243             TSeqPos key_pos[2];
00244         };
00245 
00246     public: // for Solaris only
00247         
00248         struct SMismatchResultsEntry
00249         {
00250             TSRResults results;
00251             TSeqPos mismatch_position;
00252             Uint4 adjustment;
00253             Uint1 mismatch_letter;
00254 
00255             SMismatchResultsEntry() {}
00256 
00257             SMismatchResultsEntry( TSeqPos pos, Uint4 letter, Uint1 adj )
00258             { init( pos, letter, adj ); }
00259 
00260             void init( TSeqPos pos, Uint4 letter, Uint1 adj )
00261             {
00262                 mismatch_position = pos;
00263                 adjustment = adj;
00264                 mismatch_letter = letter;
00265             }
00266         };
00267 
00268         typedef vector< SMismatchResultsEntry > TMismatchResults;
00269 
00270         class CMismatchResultsInfo
00271         {
00272                 static const Uint4 BLOCK_SHIFT     = 7UL;
00273                 static const Uint4 BLOCK_SIZE      = (1UL<<BLOCK_SHIFT);
00274                 static const Uint4 BLOCK_MASK      = (BLOCK_SIZE - 1);
00275                 static const Uint4 BLOCKS_INISIZE  = 1024UL;
00276                 static const Uint4 RESULTS_INISIZE = 1024UL*10UL;
00277 
00278                 typedef vector< TMismatchResults > TBlocks;
00279 
00280             public:
00281 
00282                 CMismatchResultsInfo() : size_( 0 )
00283                 {
00284                     blocks_.reserve( BLOCKS_INISIZE );
00285                 }
00286 
00287                 void clear() { size_ = 0; }
00288                 Uint4 size() const { return size_; }
00289                 bool empty() const { return size_ == 0; }
00290 
00291                 SMismatchResultsEntry & operator[]( Uint4 i )
00292                 {
00293                     if( i >= size_ ) resize( i + 1 );
00294                     return at( i );
00295                 }
00296 
00297                 void resize( Uint4 i )
00298                 {
00299                     size_ = i;
00300 
00301                     if( size_ == 0 ) return;
00302 
00303                     Uint4 block = ((i-1)>>BLOCK_SHIFT);
00304 
00305                     while( block >= blocks_.size() ) {
00306                         blocks_.push_back( TMismatchResults( BLOCK_SIZE ) );
00307                         TMismatchResults & block = *blocks_.rbegin();
00308 
00309                         for( Uint4 j = 0; j < BLOCK_SIZE; ++j ) {
00310                             block[j].results.reserve( RESULTS_INISIZE );
00311                         }
00312                     }
00313                 }
00314 
00315             private:
00316 
00317                 SMismatchResultsEntry & at( Uint4 i )
00318                 { return blocks_[i>>BLOCK_SHIFT][i&BLOCK_MASK]; }
00319 
00320                 Uint4 size_;
00321                 TBlocks blocks_;
00322         };
00323 
00324     protected:
00325 
00326         struct SHKData
00327         {
00328             SHKData()
00329             {
00330                 exact_1[0].reserve( TSRRESULTS_INISIZE );
00331                 exact_1[1].reserve( TSRRESULTS_INISIZE );
00332                 exact_2[0].reserve( TSRRESULTS_INISIZE );
00333                 exact_2[1].reserve( TSRRESULTS_INISIZE );
00334 
00335                 pres.reserve( TSRPAIRS_INISIZE );
00336             }
00337 
00338             void reset()
00339             {
00340             }
00341 
00342             CResCache rc1, rc2;
00343             TSRResults exact_1[2];
00344             TSRResults exact_2[2];
00345 
00346             CMismatchResultsInfo mm_1[2];
00347             CMismatchResultsInfo mm_2[2];
00348 
00349             TSRPairedResults pres;
00350         };
00351 
00352 #ifdef NCBI_COMPILER_MIPSPRO
00353     public:
00354 #endif
00355         class InternalException : public CException
00356         {
00357             public:
00358 
00359                 enum EErrCode
00360                 {
00361                     eAmbig
00362                 };
00363 
00364                 virtual const char * GetErrCodeString() const;
00365     
00366                 NCBI_EXCEPTION_DEFAULT( InternalException, CException );
00367         };
00368 
00369     protected:
00370         CSRSearch( CRef< CDbIndex > index, TSeqPos d, TSeqPos dfuzz ) 
00371             : index_( index ), dmax_( d + dfuzz ), dmin_( d - dfuzz )
00372         {
00373             ASSERT( d >= dfuzz );
00374             hkey_width_ = index_->getHKeyWidth();
00375         }
00376 
00377         vector< TSeqPos > GetQNmerPositions( TSeqPos sz ) const;
00378 
00379         Uint4 getNMer( 
00380                 const CSeqVector & seq, TSeqPos pos, 
00381                 bool fw, bool & ambig ) const;
00382 
00383         Uint4 getNMer( 
00384                 const CSeqVector & seq, TSeqPos pos, 
00385                 bool fw, bool & ambig,
00386                 TSeqPos sub_pos, Uint1 sub_letter ) const;
00387 
00388         pair< TSeqPos, TSeqPos > Pos2Index( 
00389                 TSeqPos pos, TSeqPos sz, SMismatchInfo & mismatch_info ) const;
00390 
00391         void mergeResults( 
00392                 TSRResults & results, const TSRResults & src, Int4 step );
00393 
00394         void combine( const TSRResults & results_1,
00395                       const TSRResults & results_2,
00396                       TSRPairedResults & results,
00397                       Int4 adjust = 0 );
00398 
00399         pair< TSeqNum, TSeqPos > MapSOff( 
00400                 TSeqNum lid, TSeqPos loff, TSeqPos sz, bool & overlap ) const;
00401 
00402         bool reportResults(
00403                 TResults & r, Uint4 nr,
00404                 TSeqPos qsz1, TSeqPos qsz2, 
00405                 const TSRPairedResults & results, bool fw1, bool fw2, 
00406                 bool mismatch1 = false, bool mismatch2 = false,
00407                 TSeqPos mismatch_pos1 = 0, TSeqPos mismatch_pos2 = 0,
00408                 Uint1 mismatch_letter1 = (Uint1)'-', 
00409                 Uint1 mismatch_letter2 = (Uint1)'-',
00410                 Uint4 adjustment1 = 0, Uint4 adjustment2 = 0 );
00411 
00412         bool reportResults( 
00413                 TResults & r, Uint4 nr,
00414                 TSeqPos qsz, const TSRResults & results, 
00415                 bool fw, bool mismatch = false, 
00416                 TSeqPos mismatch_pos = 0, Uint1 mismatch_letter = (Uint1)'-',
00417                 Uint4 adjustment = 0, Uint4 pos_in_pair = 0 );
00418 
00419         unsigned long hkey_width_;
00420         SHKData hk_data_;
00421 
00422     private:
00423 
00424         CRef< CDbIndex > index_;
00425         TSeqPos dmax_, dmin_;
00426 };
00427 
00428 END_SCOPE( dbindex_search )
00429 END_NCBI_SCOPE
00430 
00431 #endif
00432 
00433 
00434 

Generated on Wed Dec 9 02:55:51 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Wed Dec 09 08:17:25 2009 by modify_doxy.py rev. 173732