NCBI C++ ToolKit
win_mask_dup_table.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

00001 /*  $Id: win_mask_dup_table.cpp 48619 2011-02-10 17:03:08Z mozese2 $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Author:  Aleksandr Morgulis
00027  *
00028  * File Description:
00029  *   Implementation of CheckDuplicates() function.
00030  *
00031  */
00032 
00033 #include <ncbi_pch.hpp>
00034 #include <vector>
00035 #include <string>
00036 #include <map>
00037 
00038 #include <corelib/ncbitype.h>
00039 #include <corelib/ncbistre.hpp>
00040 #include <objects/seq/Bioseq.hpp>
00041 #include <objects/seqloc/Seq_id.hpp>
00042 
00043 #include <objmgr/object_manager.hpp>
00044 #include <objmgr/scope.hpp>
00045 #include <objmgr/seq_entry_handle.hpp>
00046 #include <objmgr/bioseq_ci.hpp>
00047 #include <objmgr/seq_vector.hpp>
00048 #include <objmgr/util/sequence.hpp>
00049 
00050 #include <algo/winmask/win_mask_dup_table.hpp>
00051 #include <algo/winmask/win_mask_util.hpp>
00052 
00053 BEGIN_NCBI_SCOPE
00054 USING_SCOPE(objects);
00055 
00056 
00057 
00058 const Uint4 SAMPLE_LENGTH    = 100; /**<\internal length of a sample segment */
00059 const Uint4 SAMPLE_SKIP      = 10000;   /**<\internal distance between subsequent samples */
00060 const Uint4 MIN_SEQ_LENGTH   = 50000;   /**<\internal sequences of length below MIN_SEQ_LENGTH are not considered for 
00061                                           duplication check */
00062 const Uint4 MAX_OFFSET_ERROR = 5;   /**<\internal fuzziness in distance between consequtive matches */
00063 const Uint4 MIN_MATCH_COUNT  = 4;   /**<\internal minimal number of successful consequtive sample matches that
00064                                       defines a duplication */
00065 
00066 //------------------------------------------------------------------------------
00067 /**\internal
00068  **\brief This class represents a lookup table used to check incoming
00069  **       sequences for duplication.
00070  **/
00071 class dup_lookup_table
00072 {
00073 public:
00074 
00075     /**\internal
00076      **\brief Structure describing a location of a sample in a sequence.
00077      **/
00078     struct sample_loc
00079     {
00080         size_t seqnum; /**<\internal sequence number (in the order the sequences appear in the input) */
00081         Uint4 offset;       /**<\internal offset of the sample in the sequence defined by seqnum */
00082 
00083         /**\internal
00084          **\brief Instance constructor.
00085          **
00086          **\param new_seqnum the sequence number
00087          **\param new_offset the sample offset
00088          **/
00089         sample_loc( size_t new_seqnum, Uint4 new_offset )
00090             : seqnum( new_seqnum ), offset( new_offset ) {}
00091     };
00092 
00093     /**\internal
00094      **\brief Class describing the data about a particular sample value.
00095      **
00096      ** A sample is a segment of sequence data of length SAMPLE_LENGTH.
00097      ** This class encapsulates information about all occurances of the
00098      ** given sample value in the sequences that have been read so far.
00099      **/
00100     class sample
00101     {
00102     private:
00103 
00104         /**\internal \brief Type for lists of sample locations. */
00105         typedef vector< sample_loc > loc_list_type;
00106 
00107     public:
00108 
00109         /**\internal
00110          **\brief Iterator type to traverse lists of sample locations.
00111          **/
00112         typedef loc_list_type::const_iterator iterator;
00113 
00114         /**\internal
00115          **\brief Add a new sample location.
00116          **
00117          **\param loc new location
00118          **/
00119         void add_loc( const sample_loc & loc )
00120         { locs.push_back( loc ); }
00121 
00122         /**\internal\name Location traversal.*/
00123         /**@{*/
00124         const iterator begin() const { return locs.begin(); }
00125         const iterator end() const { return locs.end(); }
00126         /**@}*/
00127 
00128     private:
00129 
00130         loc_list_type locs; /**<\internal the list of sample locations */
00131     };
00132 
00133 private:
00134 
00135     /**\internal
00136      **\brief Type representing a list of sequence id strings for all 
00137      **       sequences that have been read so far.
00138      **/
00139     typedef vector< string > seq_id_list_type;
00140 
00141     /**\internal
00142      **\brief A type mapping sample strings to the information about their
00143      **       occurances in the input.
00144      **/
00145     typedef map< string, sample > sample_set_type;
00146 
00147 public:
00148 
00149     /**\internal \brief Alias for sample::iterator. */
00150     typedef sample::iterator iterator;
00151 
00152     /**\internal
00153      **\brief Augment the data structure with the information about the
00154      **       new sequence.
00155      **
00156      **\param seq_id id string for the new sequence
00157      **\param seq_data new sequence data in IUPACNA format
00158      **/
00159     void add_seq_info( const string & seq_id,
00160                        const objects::CSeqVector & seq_data );
00161 
00162     /**\internal
00163      **\brief Get the sequence id string from the sequence number.
00164      **
00165      **\param seqnum the sequence number
00166      **\return the sequence id string in FASTA format
00167      **/
00168     const string seqid( Uint4 seqnum ) const
00169     { return seq_id_list[seqnum]; }
00170 
00171     /**\internal
00172      **\brief Get the information about positions of the sample in the
00173      **       data base from the sample value.
00174      **
00175      **\param index the sample value
00176      **\return pointer to the corresponding instance of the sample class
00177      **/
00178     const sample * operator[]( const string & index ) const
00179     {
00180         sample_set_type::const_iterator i( sample_set.find( index ) );
00181         return i == sample_set.end() ? 0 : &(i->second);
00182     }
00183 
00184 private:
00185 
00186     /**\internal
00187      **\brief Add a sample location to the sample information structure.
00188      **
00189      **\param sample the sample value
00190      **\param loc the sample location description
00191      **/
00192     void add_loc( const string & sample, const sample_loc & loc )
00193     { sample_set[sample].add_loc( loc ); }
00194 
00195     seq_id_list_type seq_id_list;   /**<\internal the list of sequence id strings */
00196     sample_set_type sample_set;     /**<\internal the sample->sample information map */
00197 };
00198 
00199 //------------------------------------------------------------------------------
00200 /**\internal
00201  **\brief "Less than" comparison between two sample locations.
00202  **
00203  ** Sample locations are compared lexicographically on the (seqnum, offset)
00204  ** pair.
00205  **
00206  **\param lhs the first sample location
00207  **\param rhs the second sample location
00208  **\return true if lhs < rhs; false otherwise
00209  **/
00210 inline bool operator<( const dup_lookup_table::sample_loc & lhs, 
00211                        const dup_lookup_table::sample_loc & rhs )
00212 {
00213     return lhs.seqnum < rhs.seqnum ? true  :
00214         lhs.seqnum > rhs.seqnum ? false :
00215         lhs.offset < rhs.offset ? true  : false;
00216 }
00217 
00218 //------------------------------------------------------------------------------
00219 /**\internal
00220  **\brief "Greater than" comparison of two sample locations.
00221  **
00222  ** Defined as: a > b iff b < a.
00223  **
00224  **\param lhs the first sample location
00225  **\param rhs the second sample location
00226  **\return true if lhs > rhs; false otherwise
00227  **/
00228 inline bool operator>( const dup_lookup_table::sample_loc & lhs, 
00229                        const dup_lookup_table::sample_loc & rhs )
00230 { return rhs < lhs; }
00231 
00232 //------------------------------------------------------------------------------
00233 /**\internal
00234  **\brief Comparisong of two sample locations for equality.
00235  **
00236  ** Defined as !(lhs < rhs) and !(rhs < lhs).
00237  **
00238  **\param lhs the first sample location
00239  **\param rhs the second sample location
00240  **\return true if lhs == rhs; false otherwise
00241  **/
00242 inline bool operator==( const dup_lookup_table::sample_loc & lhs,
00243                         const dup_lookup_table::sample_loc & rhs )
00244 { return !(lhs < rhs) && !(rhs < lhs); }
00245 
00246 //------------------------------------------------------------------------------
00247 void dup_lookup_table::add_seq_info( const string & seq_id, 
00248                                      const objects::CSeqVector & seq_data )
00249 {
00250     static TSeqPos next_offset( 0 );
00251 
00252     seq_id_list.push_back( seq_id );
00253     TSeqPos data_len( seq_data.size() );
00254 
00255     string sample;
00256     while( next_offset < data_len - SAMPLE_LENGTH )
00257     {
00258         sample.erase();
00259         seq_data.GetSeqData(next_offset, next_offset + SAMPLE_LENGTH, sample);
00260         sample_loc loc( seq_id_list.size() - 1, next_offset );
00261         add_loc( sample, loc );
00262         next_offset += SAMPLE_SKIP;
00263     }
00264 
00265     next_offset = (next_offset <= data_len) ? 0 : next_offset - data_len;
00266 }
00267 
00268 //------------------------------------------------------------------------------
00269 /**\internal
00270  **\brief This class encapsulates the state of duplication search process.
00271  **
00272  ** An instance of this class is created for each subject sequence to search
00273  ** for duplicates among the sequences that were processed earlier.
00274  **/
00275 class tracker
00276 {
00277 public:
00278 
00279     /**\internal \brief Alias for the sample location description type. */
00280     typedef dup_lookup_table::sample_loc sample_loc;
00281 
00282 private:
00283 
00284     /**\internal
00285      **\brief Alias for the sample location information iterator type.
00286      **/
00287     typedef dup_lookup_table::iterator iterator;
00288 
00289     /**\internal
00290      **\brief Type representing a possible match currently being tracked
00291      **       by the tracker instance.
00292      **/
00293     struct result
00294     {
00295         Uint4 count;            /**<\internal current number of consequtive sample matches */
00296         sample_loc loc;         /**<\internal information about query location of the last sample match */
00297         string::size_type s_offset;    /**<\internal location in the subject of the last sample match */
00298 
00299         /**\internal
00300          **\brief Object constructor.
00301          **
00302          **\param newloc location in the query
00303          **\param new_offset location in the subject
00304          **\param new_count initial value of match count
00305          **/
00306         result( const sample_loc & newloc,
00307                 string::size_type new_offset, 
00308                 Uint4 new_count = 1 )
00309             : count( new_count ), loc( newloc ), s_offset( new_offset ) {}
00310     };
00311 
00312     /**\internal
00313      **\brief type used to store the set of currently tracked matches. 
00314      **/
00315     typedef vector< result > result_list_type;
00316 
00317 public:
00318 
00319     /**\internal
00320      **\brief Object constructor.
00321      **
00322      **\param the_table the lookup table to search against
00323      **\param the_subject_id the id string for the subject sequence
00324      **/
00325     tracker( const dup_lookup_table & the_table, const string & the_subject_id  ) 
00326         : table( the_table ), subject_id( the_subject_id ) {}
00327 
00328     /**\internal \brief Object destructor. */
00329     ~tracker();
00330 
00331     /**\internal
00332      **\brief Process a set of matches to the lookup table.
00333      **
00334      ** The list of matches is given by the pair of iterators. For the
00335      ** current set of results determines which could be extended. The
00336      ** ones that can not be extended and whose subject offset is too
00337      ** far in the past are deleted. The location in [start, end) that
00338      ** do not extend existing matches start new matches.
00339      **
00340      **\param index the sample sequence at the current subject offset
00341      **\param seqnum the sequence number of the current sequence
00342      **\param subject_offset current position in the subject sequence
00343      **\param start start of the list of matches to the lookup table
00344      **\param end end of the list of matches to the lookup table
00345      **/
00346     void operator()( const string & index, Uint4 seqnum,
00347                      string::size_type subject_offset,
00348                      iterator start, iterator end );
00349 
00350 private:
00351 
00352     const dup_lookup_table & table; /**<\internal lookup table to use */
00353     const string & subject_id;      /**<\internal id string of the current subject sequence */
00354 
00355     /**\internal
00356      **\brief Report a candidate for duplicate sequence pair to the
00357      **       standard error.
00358      **
00359      **\param queryseq number of the query sequence
00360      **\param match_count number consequtive sample matches detected
00361      **\param s_off last position in the subject sequence
00362      **\param q_off last position in the query sequence
00363      **/
00364     void report_match( Uint4 queryseq, 
00365                        Uint4 match_count,
00366                        string::size_type s_off,
00367                        string::size_type q_off );
00368 
00369     result_list_type main_list;     /**<\internal current result list */
00370     result_list_type aux_list;      /**<\internal additional (helper) result list */
00371 };
00372 
00373 //------------------------------------------------------------------------------
00374 void tracker::report_match( Uint4 queryseq, Uint4 match_count,
00375                             string::size_type s_off,
00376                             string::size_type q_off )
00377 {
00378     string query_id( table.seqid( queryseq ) );
00379     LOG_POST( Warning << 
00380            "Possible duplication of sequences:\n"
00381         << "subject: " << subject_id << " and query: " << query_id << "\n"
00382         << "at intervals\n"
00383         << "subject: " << s_off - match_count*SAMPLE_SKIP
00384         << " --- " << s_off - SAMPLE_SKIP << "\n"
00385         << "query  : " << q_off - match_count*SAMPLE_SKIP
00386         << " --- " << q_off - SAMPLE_SKIP << "\n" );
00387 }
00388 
00389 //------------------------------------------------------------------------------
00390 tracker::~tracker()
00391 {
00392     typedef result_list_type::const_iterator r_iterator;
00393 
00394     r_iterator riter( main_list.begin() );
00395     r_iterator rend( main_list.end() );
00396 
00397     while( riter != rend )
00398     {
00399         if( riter->count >= MIN_MATCH_COUNT )
00400             report_match( riter->loc.seqnum, riter->count, 
00401                           riter->s_offset + SAMPLE_SKIP, riter->loc.offset );
00402 
00403         ++riter;
00404     }
00405 }
00406 
00407 //------------------------------------------------------------------------------
00408 void tracker::operator()( const string & index, 
00409                           Uint4 seqnum,
00410                           string::size_type subject_offset,
00411                           dup_lookup_table::iterator iter,
00412                           dup_lookup_table::iterator end )
00413 {
00414     typedef result_list_type::const_iterator r_iterator;
00415     typedef dup_lookup_table::sample_loc sample_loc;
00416 
00417     r_iterator riter( main_list.begin() );
00418     r_iterator rend( main_list.end() );
00419 
00420     bool do_swap( iter == end ? false : true );
00421 
00422     while( true )
00423         if( riter == rend )
00424             if( iter == end )
00425                 break;
00426             else
00427             {
00428                 aux_list.push_back( result( sample_loc( iter->seqnum, 
00429                                                         iter->offset + SAMPLE_SKIP ), 
00430                                             subject_offset ) );
00431                 ++iter;
00432             }
00433         else if( iter == end )
00434         {
00435             if( riter->s_offset + SAMPLE_SKIP + MAX_OFFSET_ERROR < subject_offset )
00436             {
00437                 if( riter->count >= MIN_MATCH_COUNT )
00438                     report_match( riter->loc.seqnum, riter->count,
00439                                   riter->s_offset + SAMPLE_SKIP, riter->loc.offset );
00440             }
00441             else aux_list.push_back( *riter );
00442 
00443             ++riter;
00444         }
00445         else // both iter and riter are valid
00446         {
00447             if( *iter < riter->loc )
00448             {
00449                 aux_list.push_back( result( sample_loc( iter->seqnum,
00450                                                         iter->offset + SAMPLE_SKIP ),
00451                                             subject_offset ) );
00452                 ++iter;
00453             }
00454             else if( *iter > riter->loc )
00455             {
00456                 if( riter->s_offset + SAMPLE_SKIP + MAX_OFFSET_ERROR < subject_offset )
00457                 {
00458                     if( riter->count >= MIN_MATCH_COUNT )
00459                         report_match( riter->loc.seqnum, riter->count,
00460                                       riter->s_offset + SAMPLE_SKIP, riter->loc.offset );
00461                 }
00462                 else aux_list.push_back( *riter );
00463 
00464                 ++riter;
00465             }
00466             else // *iter == riter->loc --- same sequence and corresponding offsets
00467             {
00468                 Uint4 count( 1 );
00469 
00470                 while( riter != rend && riter->loc == *iter )
00471                 {
00472                     if( subject_offset < riter->s_offset + SAMPLE_SKIP - MAX_OFFSET_ERROR )
00473                         aux_list.push_back( *riter );
00474                     else if( subject_offset > riter->s_offset + SAMPLE_SKIP 
00475                              + MAX_OFFSET_ERROR )
00476                     {
00477                         if( riter->count >= MIN_MATCH_COUNT )
00478                             report_match( riter->loc.seqnum, riter->count,
00479                                           riter->s_offset + SAMPLE_SKIP, riter->loc.offset );
00480                     }
00481                     else // MATCH!!! Extend it.
00482                         count = riter->count + 1;
00483 
00484                     ++riter;
00485                 }
00486 
00487                 aux_list.push_back( result( sample_loc( iter->seqnum,
00488                                                         iter->offset + SAMPLE_SKIP ),
00489                                             subject_offset, count ) );
00490                 ++iter;
00491             }
00492         }
00493 
00494     // Swap the lists.
00495     if( do_swap )
00496     {
00497         main_list.clear();
00498         main_list.swap( aux_list );
00499     }
00500 }
00501 
00502 #if 0
00503 //------------------------------------------------------------------------------
00504 /**\internal
00505  **\brief Get a FASTA formatted id string (the first available) from the 
00506  **       CSeq_entry structure.
00507  **
00508  **\param entry sequence description structure
00509  **\return the first id string corresponding to entry
00510  **/
00511 static const string GetIdString( const CSeq_entry & entry )
00512 {
00513     CRef<CObjectManager> om(CObjectManager::GetInstance());
00514     const CBioseq & seq = entry.GetSeq();
00515     CRef<CScope> scope(new CScope(*om));
00516     CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry( 
00517         const_cast< CSeq_entry & >( entry ) );
00518     return CWinMaskSeqTitle::GetId( seh, seq );
00519 /*
00520     list< CRef< CSeq_id > > idlist = seq.GetId();
00521 
00522     if( idlist.empty() ) 
00523         return "???";
00524     else
00525     {
00526         CNcbiOstrstream os;
00527         (*idlist.begin())->WriteAsFasta( os );
00528         return CNcbiOstrstreamToString(os);
00529     }
00530 */
00531 }
00532 #endif
00533 
00534 //------------------------------------------------------------------------------
00535 void CheckDuplicates( const vector< string > & input,
00536                       const string & infmt,
00537                       const CWinMaskUtil::CIdSet * ids,
00538                       const CWinMaskUtil::CIdSet * exclude_ids )
00539 {
00540     typedef vector< string >::const_iterator input_iterator;
00541 
00542     dup_lookup_table table;
00543     CRef<CObjectManager> om(CObjectManager::GetInstance());
00544 
00545     for( input_iterator i( input.begin() ); i != input.end(); ++i )
00546     {
00547         Uint4 seqnum( 0 );
00548 
00549         for(CWinMaskUtil::CInputBioseq_CI bs_iter(*i, infmt); bs_iter; ++bs_iter)
00550         {
00551             CBioseq_Handle bsh = *bs_iter;
00552 
00553             if( CWinMaskUtil::consider( bsh, ids, exclude_ids ) )
00554             {
00555                 TSeqPos data_len = bsh.GetBioseqLength();
00556                 if( data_len < MIN_SEQ_LENGTH )
00557                     continue;
00558 
00559                 string id;
00560                 sequence::GetId(bsh, sequence::eGetId_Best)
00561                     .GetSeqId()->GetLabel(&id);
00562                 data_len -= SAMPLE_SKIP;
00563                 tracker track( table, id );
00564 
00565                 string index;
00566                 CSeqVector data =
00567                     bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
00568                 for( TSeqPos i = 0;  i < data_len;  ++i )
00569                 {
00570                     index.erase();
00571                     data.GetSeqData(i, i + SAMPLE_LENGTH, index);
00572                     const dup_lookup_table::sample * sample( table[index] );
00573 
00574                     if( sample != 0 )
00575                         track( index, seqnum, i, sample->begin(), sample->end() );
00576                 }
00577 
00578                 table.add_seq_info( id, data );
00579                 ++seqnum;
00580             }
00581         }
00582     }
00583 }
00584 
00585 
00586 END_NCBI_SCOPE
Modified on Sat Dec 20 10:36:46 2014 by modify_doxy.py rev. 426318