src/objtools/readers/rm_reader.cpp

Go to the documentation of this file.
00001 /*  $Id: rm_reader.cpp 175260 2009-11-05 12:03:46Z ludwigf $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Author:  Frank Ludwig
00027  *
00028  * File Description:
00029  *   Repeat Masker file reader
00030  *
00031  */
00032 
00033 #include <ncbi_pch.hpp>
00034 #include <corelib/ncbistd.hpp>
00035 #include <corelib/ncbithr.hpp>
00036 #include <corelib/ncbiutil.hpp>
00037 #include <corelib/ncbiexpt.hpp>
00038 
00039 #include <util/static_map.hpp>
00040 
00041 #include <serial/iterator.hpp>
00042 #include <serial/objistrasn.hpp>
00043 
00044 // Objects includes
00045 #include <objects/general/Int_fuzz.hpp>
00046 #include <objects/general/Object_id.hpp>
00047 #include <objects/general/User_object.hpp>
00048 #include <objects/general/User_field.hpp>
00049 #include <objects/general/Dbtag.hpp>
00050 
00051 #include <objects/seqloc/Seq_id.hpp>
00052 #include <objects/seqloc/Seq_loc.hpp>
00053 #include <objects/seqloc/Seq_interval.hpp>
00054 #include <objects/seqloc/Seq_point.hpp>
00055 
00056 #include <objects/seq/Seq_annot.hpp>
00057 #include <objects/seq/Annotdesc.hpp>
00058 #include <objects/seq/Annot_descr.hpp>
00059 #include <objects/seqfeat/SeqFeatData.hpp>
00060 
00061 #include <objects/seqfeat/Seq_feat.hpp>
00062 #include <objects/seqfeat/BioSource.hpp>
00063 #include <objects/seqfeat/Org_ref.hpp>
00064 #include <objects/seqfeat/OrgName.hpp>
00065 #include <objects/seqfeat/SubSource.hpp>
00066 #include <objects/seqfeat/OrgMod.hpp>
00067 #include <objects/seqfeat/Gene_ref.hpp>
00068 #include <objects/seqfeat/Cdregion.hpp>
00069 #include <objects/seqfeat/Code_break.hpp>
00070 #include <objects/seqfeat/Genetic_code.hpp>
00071 #include <objects/seqfeat/Genetic_code_table.hpp>
00072 #include <objects/seqfeat/RNA_ref.hpp>
00073 #include <objects/seqfeat/Trna_ext.hpp>
00074 #include <objects/seqfeat/Imp_feat.hpp>
00075 #include <objects/seqfeat/Gb_qual.hpp>
00076 
00077 #include <objtools/readers/reader_exception.hpp>
00078 #include <objtools/readers/rm_reader.hpp>
00079 #include <objtools/error_codes.hpp>
00080 
00081 #include <algorithm>
00082 
00083 
00084 #define NCBI_USE_ERRCODE_X   Objtools_Rd_RepMask
00085 
00086 BEGIN_NCBI_SCOPE
00087 
00088 BEGIN_objects_SCOPE // namespace ncbi::objects::
00089 
00090 struct CMaskData;
00091 
00092 //-----------------------------------------------------------------------------
00093 class CRmOutReader: public CRmReader
00094 //-----------------------------------------------------------------------------
00095 {
00096     friend CRmReader* CRmReader::OpenReader( CNcbiIstream& );
00097     
00098     //
00099     //  object management:
00100     //
00101 protected:
00102     CRmOutReader( CNcbiIstream& );
00103 public:
00104     virtual ~CRmOutReader();
00105     
00106     //
00107     //  interface:
00108     //
00109 public:
00110     virtual void Read( CRef<CSeq_annot>, TFlags flags = fDefaults, 
00111         size_t = kMax_UInt );
00112 
00113     //
00114     //  internal helpers:
00115     //
00116 protected:
00117     virtual bool IsHeaderLine( const string& );
00118     virtual bool IsIgnoredLine( const string& ); 
00119     
00120     virtual bool ParseRecord( const string& record, CMaskData& );
00121     virtual bool VerifyData( const CMaskData& );
00122     virtual bool MakeFeature( const CMaskData&, CRef<CSeq_feat>&, TFlags flags);
00123     
00124     //
00125     //  data:
00126     //
00127 protected:
00128     static const unsigned long BUFFERSIZE = 256;
00129     char pReadBuffer[ BUFFERSIZE ];
00130 };
00131 
//-----------------------------------------------------------------------------
//  Values extracted from one data line of a repeat masker file.
//  ParseRecord() fills the fields, MakeFeature() consumes them.
//-----------------------------------------------------------------------------
struct CMaskData
{
    //  Start from a well defined state: the numeric members would otherwise
    //  hold indeterminate values until ParseRecord() has assigned every one
    //  of them (it stops early when a column fails to convert).
    CMaskData()
        : sw_score( 0 ),
          outer_pos_begin( 0 ),
          outer_pos_end( 0 ),
          perc_div( 0.0 ),
          perc_del( 0.0 ),
          perc_ins( 0.0 )
    {
    }

    unsigned long sw_score;         // column 1,  "SW score"
    unsigned long outer_pos_begin;  // column 6,  begin position in the query
    unsigned long outer_pos_end;    // column 7,  end position in the query
    double perc_div;                // column 2,  "perc div."
    double perc_del;                // column 3,  "perc del."
    double perc_ins;                // column 4,  "perc ins."
    string query_sequence;          // column 5,  "query sequence"
    string strand;                  // column 9,  "C" selects the minus strand
    string matching_repeat;         // column 10, "matching repeat"
    string repeat_class_family;     // column 11, "repeat class/family"
};
00147     
00148 
00149 CRmReader::CRmReader( CNcbiIstream& InStream )
00150     :
00151     m_InStream( InStream )
00152 {
00153 }
00154 
00155 
00156 CRmReader::~CRmReader()
00157 {
00158 }
00159 
00160 
00161 CRmOutReader::CRmOutReader( CNcbiIstream& InStream )
00162     :
00163     CRmReader( InStream )
00164 {
00165 }
00166 
00167 
00168 CRmOutReader::~CRmOutReader()
00169 {
00170 }
00171 
00172 
00173 void CRmOutReader::Read( CRef<CSeq_annot> entry, TFlags flags, size_t uMaxErrorCount )
00174 {
00175     string line;
00176     CSeq_annot::C_Data::TFtable& ftable = entry->SetData().SetFtable();
00177     CRef<CSeq_feat> feat;
00178     
00179     size_t line_counter = 0;
00180     size_t record_counter = 0;
00181     size_t error_counter = 0;
00182     
00183     while ( ! m_InStream.eof() ) {
00184 
00185         NcbiGetlineEOL( m_InStream, line );
00186         ++line_counter;
00187         
00188         if ( IsHeaderLine( line ) || IsIgnoredLine( line ) ) {
00189             continue;
00190         }
00191         ++record_counter;
00192         
00193         CMaskData mask_data;
00194         if ( ! ParseRecord( line, mask_data ) ) {
00195             ++error_counter;
00196             LOG_POST_X( 1, Error << "Rmo Reader: Parse error in record " 
00197                 << record_counter << " (line " << line_counter 
00198                 << "). Record skipped" );
00199             if ( error_counter < uMaxErrorCount ) {
00200                 continue;
00201             }
00202             else {
00203                 break;
00204             }
00205         }
00206         
00207         if ( ! VerifyData( mask_data ) ) {
00208             ++error_counter;
00209             LOG_POST_X( 2, Error << "Rmo Reader: Verification error in record " 
00210                 << record_counter << " (line " << line_counter 
00211                 << "). Record skipped." );
00212             if ( error_counter < uMaxErrorCount ) {
00213                 continue;
00214             }
00215             else {
00216                 break;
00217             }
00218         }
00219         
00220         if ( ! MakeFeature( mask_data, feat, flags ) ) {
00221             // we don't tolerate even a few errors here!
00222             error_counter = uMaxErrorCount;
00223             LOG_POST_X( 3, Error << "Rmo Reader: Unable to create feature table for record " 
00224                 << record_counter << " (line " << line_counter 
00225                 << "). Aborting file import." );
00226             break;
00227         }
00228         
00229         ftable.push_back( feat );
00230     }
00231     
00232     if ( error_counter == uMaxErrorCount ) {
00233         LOG_POST_X( 4, Error << "Rmo Reader: File import aborted due to error count or severity." );
00234         throw 0; // upper layer catches everything in sight and reports error to file_loader.
00235     }
00236 }
00237 
00238 
00239 bool CRmOutReader::IsHeaderLine( const string& line )
00240 {
00241     string labels_1st_line[] = { "SW", "perc", "query", "position", "matching", "" };
00242     string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" };
00243 
00244     // try to identify 1st line of column labels:
00245     size_t current_offset = 0;
00246     size_t i = 0;
00247     for ( ; labels_1st_line[i] != ""; ++i ) {
00248         current_offset = NStr::FindCase( line, labels_1st_line[i], current_offset );
00249         if ( NPOS == current_offset ) {
00250             break;
00251         }
00252     }
00253     if ( labels_1st_line[i] == "" ) {
00254         return true;
00255     }
00256     
00257     // try to identify 2nd line of column labels:
00258     current_offset = 0;
00259     i = 0;
00260     for ( ; labels_2nd_line[i] != ""; ++i ) {
00261         current_offset = NStr::FindCase( line, labels_2nd_line[i], current_offset );
00262         if ( NPOS == current_offset ) {
00263             return false;
00264         }
00265     }
00266     return true;
00267 }
00268 
00269 
00270 bool CRmOutReader::IsIgnoredLine( const string& line )
00271 {
00272     //
00273     //  Currently, only lines with only whitespace on them are ignored.
00274     //
00275     return ( NStr::TruncateSpaces( line ).length() == 0 );
00276 }
00277 
00278 
00279 bool CRmOutReader::ParseRecord( const string& record, CMaskData& mask_data )
00280 {
00281     const size_t MIN_VALUE_COUNT = 15;
00282     
00283     string line = NStr::TruncateSpaces( record );
00284     list< string > values;
00285     if ( NStr::Split( line, " \t", values ).size() < MIN_VALUE_COUNT ) {
00286         return false;
00287     }
00288     
00289     try {
00290         // 1: "SW score"
00291         list<string>::iterator it = values.begin();
00292         mask_data.sw_score = NStr::StringToUInt( *it );
00293         
00294         // 2: "perc div."
00295         ++it;
00296         mask_data.perc_div = NStr::StringToDouble( *it );
00297         
00298         // 3: "perc del."
00299         ++it;
00300         mask_data.perc_del = NStr::StringToDouble( *it );
00301         
00302         // 4: "perc ins."
00303         ++it;
00304         mask_data.perc_ins = NStr::StringToDouble( *it );
00305         
00306         // 5: "query sequence"
00307         ++it;
00308         mask_data.query_sequence = *it;
00309         
00310         // 6: "position begin"
00311         ++it;
00312         mask_data.outer_pos_begin = NStr::StringToUInt( *it );
00313         
00314         // 7: "in end"
00315         ++it;
00316         mask_data.outer_pos_end = NStr::StringToUInt( *it );
00317         
00318         // 8: "query (left)"
00319         ++it;
00320         /* not used */
00321         
00322         // 9: "" (meaning "strand")
00323         ++it;
00324         mask_data.strand = *it;
00325         
00326         // 10: "matching repeat"
00327         ++it;
00328         mask_data.matching_repeat = *it;
00329         
00330         // 11: "repeat class/family"
00331         ++it;
00332         mask_data.repeat_class_family = *it;
00333         
00334         // 12: "position in"
00335         ++it;
00336         /* not used */
00337         
00338         // 13: "in end"
00339         ++it;
00340         /* not used */
00341         
00342         // 14: "repeat left"
00343         ++it;
00344         /* not used */
00345         
00346         // 15: "ID"
00347         ++it;
00348         /* not used */
00349         
00350     }
00351     catch( ... ) {
00352         return false;
00353     }
00354     
00355     return true;
00356 }
00357 
00358 
00359 bool CRmOutReader::VerifyData( const CMaskData& mask_data )
00360 {
00361     //
00362     //  This would be the place for any higher level checks of the mask data
00363     //  collected from the record ...
00364     // 
00365     return true;
00366 }
00367 
00368 
00369 bool CRmOutReader::MakeFeature( const CMaskData& mask_data, CRef<CSeq_feat>& feat,
00370                                 TFlags flags )
00371 {
00372     feat.Reset( new CSeq_feat );
00373     feat->ResetLocation();
00374     
00375     //  data:
00376     CSeqFeatData& sfdata = feat->SetData();
00377     CImp_feat_Base& imp = sfdata.SetImp();
00378     imp.SetKey( "repeat_region" );
00379     
00380     //  location:
00381     CRef<CSeq_loc> location( new CSeq_loc );
00382     CSeq_interval& interval = location->SetInt();
00383     interval.SetFrom( min( mask_data.outer_pos_begin, mask_data.outer_pos_end ) -1 );
00384     interval.SetTo( max( mask_data.outer_pos_begin, mask_data.outer_pos_end ) -1 );
00385     interval.SetStrand( strcmp( mask_data.strand.c_str(), "C" ) ? 
00386         eNa_strand_plus : eNa_strand_minus );
00387 
00388     CBioseq::TId ids;
00389     CSeq_id::ParseFastaIds(ids, mask_data.query_sequence);
00390     location->SetId(*FindBestChoice(ids, CSeq_id::Score));
00391 
00392     feat->SetLocation( *location );
00393 
00394     //  qualifiers:
00395     if (flags) {
00396         CSeq_feat::TQual& qual_list = feat->SetQual();
00397         
00398         if (flags & fIncludeRepeatName) {
00399             CRef<CGb_qual> repeat( new CGb_qual );
00400             repeat->SetQual( "repeat_region" );
00401             repeat->SetVal( mask_data.matching_repeat );
00402             qual_list.push_back( repeat );
00403         }
00404 
00405         if (flags & fIncludeRepeatClass) {
00406             CRef<CGb_qual> rpt_family( new CGb_qual );
00407             rpt_family->SetQual( "rpt_family" );
00408             rpt_family->SetVal( mask_data.repeat_class_family );
00409             qual_list.push_back( rpt_family );
00410         }
00411 
00412         if (flags & fIncludeStatistics) {
00413             CRef<CGb_qual> sw_score( new CGb_qual );
00414             sw_score->SetQual( "sw_score" );
00415             sw_score->SetVal( NStr::IntToString( mask_data.sw_score ) );
00416             qual_list.push_back( sw_score );
00417             
00418             CRef<CGb_qual> perc_div( new CGb_qual );
00419             perc_div->SetQual( "perc_div" );
00420             perc_div->SetVal( NStr::DoubleToString( mask_data.perc_div ) );
00421             qual_list.push_back( perc_div );
00422             
00423             CRef<CGb_qual> perc_del( new CGb_qual );
00424             perc_del->SetQual( "perc_del" );
00425             perc_del->SetVal( NStr::DoubleToString( mask_data.perc_del ) );
00426             qual_list.push_back( perc_del );
00427             
00428             CRef<CGb_qual> perc_ins( new CGb_qual );
00429             perc_ins->SetQual( "perc_ins" );
00430             perc_ins->SetVal( NStr::DoubleToString( mask_data.perc_ins ) );
00431             qual_list.push_back( perc_ins );
00432         }
00433     }
00434     
00435     return true;
00436 }
00437 
00438 
00439 CRmReader* CRmReader::OpenReader( CNcbiIstream& InStream )
00440 {
00441     //
00442     //  This is the point to make sure we are dealing with the right file type and
00443     //  to allocate the specialist reader for any subtype (OUT, HTML) we encouter.
00444     //  When this function returns the file pointer should be past the file header
00445     //  and at the beginning of the actual mask data.
00446     //
00447     //  Note:
00448     //  If something goes wrong during header processing then the file pointer will
00449     //  still be modified. It's the caller's job to restore the file pointer if this
00450     //  is possible for this type of stream.
00451     //
00452     
00453     //
00454     //  2006-03-31: Only supported file type at this time: ReadMasker OUT.
00455     //
00456     return new CRmOutReader( InStream );
00457 }
00458 
00459 
00460 void CRmReader::CloseReader( CRmReader* pReader )
00461 {
00462     delete pReader;
00463 }
00464 
00465 
00466 END_objects_SCOPE
00467 END_NCBI_SCOPE
00468 
00469 

Generated on Wed Dec 9 05:20:39 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Wed Dec 09 08:18:12 2009 by modify_doxy.py rev. 173732