00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include <ncbi_pch.hpp>
00034 #include <corelib/ncbistd.hpp>
00035 #include <corelib/ncbithr.hpp>
00036 #include <corelib/ncbiutil.hpp>
00037 #include <corelib/ncbiexpt.hpp>
00038
00039 #include <util/static_map.hpp>
00040
00041 #include <serial/iterator.hpp>
00042 #include <serial/objistrasn.hpp>
00043
00044
00045 #include <objects/general/Int_fuzz.hpp>
00046 #include <objects/general/Object_id.hpp>
00047 #include <objects/general/User_object.hpp>
00048 #include <objects/general/User_field.hpp>
00049 #include <objects/general/Dbtag.hpp>
00050
00051 #include <objects/seqloc/Seq_id.hpp>
00052 #include <objects/seqloc/Seq_loc.hpp>
00053 #include <objects/seqloc/Seq_interval.hpp>
00054 #include <objects/seqloc/Seq_point.hpp>
00055
00056 #include <objects/seq/Seq_annot.hpp>
00057 #include <objects/seq/Annotdesc.hpp>
00058 #include <objects/seq/Annot_descr.hpp>
00059 #include <objects/seqfeat/SeqFeatData.hpp>
00060
00061 #include <objects/seqfeat/Seq_feat.hpp>
00062 #include <objects/seqfeat/BioSource.hpp>
00063 #include <objects/seqfeat/Org_ref.hpp>
00064 #include <objects/seqfeat/OrgName.hpp>
00065 #include <objects/seqfeat/SubSource.hpp>
00066 #include <objects/seqfeat/OrgMod.hpp>
00067 #include <objects/seqfeat/Gene_ref.hpp>
00068 #include <objects/seqfeat/Cdregion.hpp>
00069 #include <objects/seqfeat/Code_break.hpp>
00070 #include <objects/seqfeat/Genetic_code.hpp>
00071 #include <objects/seqfeat/Genetic_code_table.hpp>
00072 #include <objects/seqfeat/RNA_ref.hpp>
00073 #include <objects/seqfeat/Trna_ext.hpp>
00074 #include <objects/seqfeat/Imp_feat.hpp>
00075 #include <objects/seqfeat/Gb_qual.hpp>
00076
00077 #include <objtools/readers/reader_exception.hpp>
00078 #include <objtools/readers/rm_reader.hpp>
00079 #include <objtools/error_codes.hpp>
00080
00081 #include <algorithm>
00082
00083
00084 #define NCBI_USE_ERRCODE_X Objtools_Rd_RepMask
00085
00086 BEGIN_NCBI_SCOPE
00087
00088 BEGIN_objects_SCOPE
00089
00090 struct CMaskData;
00091
00092
00093 class CRmOutReader: public CRmReader
00094
00095 {
00096 friend CRmReader* CRmReader::OpenReader( CNcbiIstream& );
00097
00098
00099
00100
00101 protected:
00102 CRmOutReader( CNcbiIstream& );
00103 public:
00104 virtual ~CRmOutReader();
00105
00106
00107
00108
00109 public:
00110 virtual void Read( CRef<CSeq_annot>, TFlags flags = fDefaults,
00111 size_t = kMax_UInt );
00112
00113
00114
00115
00116 protected:
00117 virtual bool IsHeaderLine( const string& );
00118 virtual bool IsIgnoredLine( const string& );
00119
00120 virtual bool ParseRecord( const string& record, CMaskData& );
00121 virtual bool VerifyData( const CMaskData& );
00122 virtual bool MakeFeature( const CMaskData&, CRef<CSeq_feat>&, TFlags flags);
00123
00124
00125
00126
00127 protected:
00128 static const unsigned long BUFFERSIZE = 256;
00129 char pReadBuffer[ BUFFERSIZE ];
00130 };
00131
00132
/// Plain value holder for the fields extracted from one table record.
///
/// The numeric members are zero-initialized on construction so that a
/// record whose parsing stopped half-way never exposes indeterminate
/// values; the string members start out empty.
struct CMaskData
{
    CMaskData()
        : sw_score( 0 ),
          outer_pos_begin( 0 ),
          outer_pos_end( 0 ),
          perc_div( 0.0 ),
          perc_del( 0.0 ),
          perc_ins( 0.0 )
    {
    }

    unsigned long sw_score;         // column 1 ("SW score" in the header)
    unsigned long outer_pos_begin;  // column 6: first coordinate on the query
    unsigned long outer_pos_end;    // column 7: second coordinate on the query
    double perc_div;                // column 2 ("perc div." in the header)
    double perc_del;                // column 3 ("perc del." in the header)
    double perc_ins;                // column 4 ("perc ins." in the header)
    string query_sequence;          // column 5: identifier of the query sequence
    string strand;                  // column 9: "C" is mapped to the minus strand
    string matching_repeat;         // column 10
    string repeat_class_family;     // column 11
};
00147
00148
00149 CRmReader::CRmReader( CNcbiIstream& InStream )
00150 :
00151 m_InStream( InStream )
00152 {
00153 }
00154
00155
00156 CRmReader::~CRmReader()
00157 {
00158 }
00159
00160
00161 CRmOutReader::CRmOutReader( CNcbiIstream& InStream )
00162 :
00163 CRmReader( InStream )
00164 {
00165 }
00166
00167
00168 CRmOutReader::~CRmOutReader()
00169 {
00170 }
00171
00172
00173 void CRmOutReader::Read( CRef<CSeq_annot> entry, TFlags flags, size_t uMaxErrorCount )
00174 {
00175 string line;
00176 CSeq_annot::C_Data::TFtable& ftable = entry->SetData().SetFtable();
00177 CRef<CSeq_feat> feat;
00178
00179 size_t line_counter = 0;
00180 size_t record_counter = 0;
00181 size_t error_counter = 0;
00182
00183 while ( ! m_InStream.eof() ) {
00184
00185 NcbiGetlineEOL( m_InStream, line );
00186 ++line_counter;
00187
00188 if ( IsHeaderLine( line ) || IsIgnoredLine( line ) ) {
00189 continue;
00190 }
00191 ++record_counter;
00192
00193 CMaskData mask_data;
00194 if ( ! ParseRecord( line, mask_data ) ) {
00195 ++error_counter;
00196 LOG_POST_X( 1, Error << "Rmo Reader: Parse error in record "
00197 << record_counter << " (line " << line_counter
00198 << "). Record skipped" );
00199 if ( error_counter < uMaxErrorCount ) {
00200 continue;
00201 }
00202 else {
00203 break;
00204 }
00205 }
00206
00207 if ( ! VerifyData( mask_data ) ) {
00208 ++error_counter;
00209 LOG_POST_X( 2, Error << "Rmo Reader: Verification error in record "
00210 << record_counter << " (line " << line_counter
00211 << "). Record skipped." );
00212 if ( error_counter < uMaxErrorCount ) {
00213 continue;
00214 }
00215 else {
00216 break;
00217 }
00218 }
00219
00220 if ( ! MakeFeature( mask_data, feat, flags ) ) {
00221
00222 error_counter = uMaxErrorCount;
00223 LOG_POST_X( 3, Error << "Rmo Reader: Unable to create feature table for record "
00224 << record_counter << " (line " << line_counter
00225 << "). Aborting file import." );
00226 break;
00227 }
00228
00229 ftable.push_back( feat );
00230 }
00231
00232 if ( error_counter == uMaxErrorCount ) {
00233 LOG_POST_X( 4, Error << "Rmo Reader: File import aborted due to error count or severity." );
00234 throw 0;
00235 }
00236 }
00237
00238
00239 bool CRmOutReader::IsHeaderLine( const string& line )
00240 {
00241 string labels_1st_line[] = { "SW", "perc", "query", "position", "matching", "" };
00242 string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" };
00243
00244
00245 size_t current_offset = 0;
00246 size_t i = 0;
00247 for ( ; labels_1st_line[i] != ""; ++i ) {
00248 current_offset = NStr::FindCase( line, labels_1st_line[i], current_offset );
00249 if ( NPOS == current_offset ) {
00250 break;
00251 }
00252 }
00253 if ( labels_1st_line[i] == "" ) {
00254 return true;
00255 }
00256
00257
00258 current_offset = 0;
00259 i = 0;
00260 for ( ; labels_2nd_line[i] != ""; ++i ) {
00261 current_offset = NStr::FindCase( line, labels_2nd_line[i], current_offset );
00262 if ( NPOS == current_offset ) {
00263 return false;
00264 }
00265 }
00266 return true;
00267 }
00268
00269
00270 bool CRmOutReader::IsIgnoredLine( const string& line )
00271 {
00272
00273
00274
00275 return ( NStr::TruncateSpaces( line ).length() == 0 );
00276 }
00277
00278
00279 bool CRmOutReader::ParseRecord( const string& record, CMaskData& mask_data )
00280 {
00281 const size_t MIN_VALUE_COUNT = 15;
00282
00283 string line = NStr::TruncateSpaces( record );
00284 list< string > values;
00285 if ( NStr::Split( line, " \t", values ).size() < MIN_VALUE_COUNT ) {
00286 return false;
00287 }
00288
00289 try {
00290
00291 list<string>::iterator it = values.begin();
00292 mask_data.sw_score = NStr::StringToUInt( *it );
00293
00294
00295 ++it;
00296 mask_data.perc_div = NStr::StringToDouble( *it );
00297
00298
00299 ++it;
00300 mask_data.perc_del = NStr::StringToDouble( *it );
00301
00302
00303 ++it;
00304 mask_data.perc_ins = NStr::StringToDouble( *it );
00305
00306
00307 ++it;
00308 mask_data.query_sequence = *it;
00309
00310
00311 ++it;
00312 mask_data.outer_pos_begin = NStr::StringToUInt( *it );
00313
00314
00315 ++it;
00316 mask_data.outer_pos_end = NStr::StringToUInt( *it );
00317
00318
00319 ++it;
00320
00321
00322
00323 ++it;
00324 mask_data.strand = *it;
00325
00326
00327 ++it;
00328 mask_data.matching_repeat = *it;
00329
00330
00331 ++it;
00332 mask_data.repeat_class_family = *it;
00333
00334
00335 ++it;
00336
00337
00338
00339 ++it;
00340
00341
00342
00343 ++it;
00344
00345
00346
00347 ++it;
00348
00349
00350 }
00351 catch( ... ) {
00352 return false;
00353 }
00354
00355 return true;
00356 }
00357
00358
00359 bool CRmOutReader::VerifyData( const CMaskData& mask_data )
00360 {
00361
00362
00363
00364
00365 return true;
00366 }
00367
00368
00369 bool CRmOutReader::MakeFeature( const CMaskData& mask_data, CRef<CSeq_feat>& feat,
00370 TFlags flags )
00371 {
00372 feat.Reset( new CSeq_feat );
00373 feat->ResetLocation();
00374
00375
00376 CSeqFeatData& sfdata = feat->SetData();
00377 CImp_feat_Base& imp = sfdata.SetImp();
00378 imp.SetKey( "repeat_region" );
00379
00380
00381 CRef<CSeq_loc> location( new CSeq_loc );
00382 CSeq_interval& interval = location->SetInt();
00383 interval.SetFrom( min( mask_data.outer_pos_begin, mask_data.outer_pos_end ) -1 );
00384 interval.SetTo( max( mask_data.outer_pos_begin, mask_data.outer_pos_end ) -1 );
00385 interval.SetStrand( strcmp( mask_data.strand.c_str(), "C" ) ?
00386 eNa_strand_plus : eNa_strand_minus );
00387
00388 CBioseq::TId ids;
00389 CSeq_id::ParseFastaIds(ids, mask_data.query_sequence);
00390 location->SetId(*FindBestChoice(ids, CSeq_id::Score));
00391
00392 feat->SetLocation( *location );
00393
00394
00395 if (flags) {
00396 CSeq_feat::TQual& qual_list = feat->SetQual();
00397
00398 if (flags & fIncludeRepeatName) {
00399 CRef<CGb_qual> repeat( new CGb_qual );
00400 repeat->SetQual( "repeat_region" );
00401 repeat->SetVal( mask_data.matching_repeat );
00402 qual_list.push_back( repeat );
00403 }
00404
00405 if (flags & fIncludeRepeatClass) {
00406 CRef<CGb_qual> rpt_family( new CGb_qual );
00407 rpt_family->SetQual( "rpt_family" );
00408 rpt_family->SetVal( mask_data.repeat_class_family );
00409 qual_list.push_back( rpt_family );
00410 }
00411
00412 if (flags & fIncludeStatistics) {
00413 CRef<CGb_qual> sw_score( new CGb_qual );
00414 sw_score->SetQual( "sw_score" );
00415 sw_score->SetVal( NStr::IntToString( mask_data.sw_score ) );
00416 qual_list.push_back( sw_score );
00417
00418 CRef<CGb_qual> perc_div( new CGb_qual );
00419 perc_div->SetQual( "perc_div" );
00420 perc_div->SetVal( NStr::DoubleToString( mask_data.perc_div ) );
00421 qual_list.push_back( perc_div );
00422
00423 CRef<CGb_qual> perc_del( new CGb_qual );
00424 perc_del->SetQual( "perc_del" );
00425 perc_del->SetVal( NStr::DoubleToString( mask_data.perc_del ) );
00426 qual_list.push_back( perc_del );
00427
00428 CRef<CGb_qual> perc_ins( new CGb_qual );
00429 perc_ins->SetQual( "perc_ins" );
00430 perc_ins->SetVal( NStr::DoubleToString( mask_data.perc_ins ) );
00431 qual_list.push_back( perc_ins );
00432 }
00433 }
00434
00435 return true;
00436 }
00437
00438
00439 CRmReader* CRmReader::OpenReader( CNcbiIstream& InStream )
00440 {
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456 return new CRmOutReader( InStream );
00457 }
00458
00459
00460 void CRmReader::CloseReader( CRmReader* pReader )
00461 {
00462 delete pReader;
00463 }
00464
00465
00466 END_objects_SCOPE
00467 END_NCBI_SCOPE
00468
00469