NCBI C++ ToolKit
rm_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: rm_reader.cpp 74900 2016-10-05 12:32:29Z ivanov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig, Wratko Hlavina
27  *
28  * File Description:
29  * Repeat Masker file reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbithr.hpp>
36 #include <corelib/ncbiutil.hpp>
37 #include <corelib/ncbiexpt.hpp>
38 
39 #include <util/line_reader.hpp>
40 #include <util/value_convert.hpp>
41 #include <util/static_map.hpp>
42 
43 #include <serial/iterator.hpp>
44 #include <serial/objistrasn.hpp>
45 
51 
56 
61 
78 
83 #include <objtools/error_codes.hpp>
84 
85 #include <algorithm>
86 
87 
88 #define NCBI_USE_ERRCODE_X Objtools_Rd_RepMask
89 
91 BEGIN_objects_SCOPE
92 
94 {
96  CBioseq::TId ids;
97  CSeq_id::ParseFastaIds(ids, id);
99  if (best) result = CSeq_id_Handle::GetHandle(*best);
100  return result;
101 }
102 
104 {
105  string family(GetRptFamily());
106  if (family.empty()) {
107  return GetRptClass();
108  } else {
109  return GetRptClass() + '/' + family;
110  }
111 }
112 
113 /*
114 IRepeatRegion::TTaxId IRepeatRegion::GetRptSpecificity() const
115 {
116  return 0;
117 }
118 
119 string IRepeatRegion::GetRptSpecificityName() const
120 {
121  return kEmptyStr;
122 }
123 */
124 
126 {
128  if (location) {
129  result.Assign(*location);
130  } else {
131  result.Reset();
132  }
133 }
134 
136 {
137  return GetLocation()->GetId()->AsFastaString();
138 }
139 
141 {
143 }
144 
146 {
147  return GetLocation()->GetStop(eExtreme_Positional) + 1;
148 }
149 
151 {
152  return GetLocation()->IsReverseStrand();
153 }
154 
156 {
157  return query_location;
158 }
159 
161 {
163  result->SetLocal().SetId(GetRptId());
164  return result;
165 }
166 
167 /// Overridden version returns the original unparsed
168 /// sequence identifier, if it was set (non-empty).
169 ///
171 {
172  if (! query_sequence.empty()) return query_sequence;
173  return TParent::GetSeqIdString();
174 }
175 
176 // Implement IRepeatRegion interface. If it weren't for virtual
177 // methods, all of the following could be inlined.
178 
180 {
181  return matching_repeat;
182 }
183 
185 {
186  return rpt_family;
187 }
188 
190 {
191  return rpt_class;
192 }
193 
195  return 0;
196 }
197 
199  if (GetRptPosEnd() == kInvalidSeqPos ||
201  return GetRptPosEnd() + GetRptLeft();
202 }
203 
205  return kEmptyStr;
206 }
207 
209  return kEmptyStr;
210 }
211 
213 {
214  return rpt_id;
215 }
216 
218 {
219  return sw_score;
220 }
221 
223 {
224  return perc_div;
225 }
226 
228 {
229  return perc_del;
230 }
231 
233 {
234  return perc_ins;
235 }
236 
238 {
239  return rpt_pos_begin;
240 }
241 
243 {
244  return rpt_pos_end;
245 }
246 
248 {
249  return rpt_left;
250 }
251 
253 {
254  return query_left;
255 }
256 
258 {
259  return overlapped;
260 }
261 
262 bool CRepeatLibrary::Get(const string& name, TRepeat& dest) const
263 {
264  TMap::const_iterator it(m_Map.find(name));
265  if (it == m_Map.end()) return false;
266  dest = it->second;
267  return true;
268 }
269 
271 {
272  TRepeat repeat;
273  string line;
274  vector<string> tokens;
275 
276  while (! stream.eof()) {
277  NcbiGetlineEOL(stream, line);
278  if (NStr::StartsWith(line, "//")) {
279  // Perl equivalent from rpt_lib2repeat_q.pl:
280  // # Repeats with undefined specificity can be skipped because RepeatMasker
281  // # only uses repeats that have a "Species" set in searches. The
282  // # specificity will be undefined when the "Species:" field is empty.
283  // if ( ($class eq "Simple_repeat" || $class eq "Low_complexity")
284  // && $specificity eq "universal" && $family eq "") {
285  // $length = ""; # length of database sequence is arbitrary
286  // }
287  if ((repeat.m_RptClass == "Simple_repeat" ||
288  repeat.m_RptClass == "Low_complexity") &&
289  repeat.m_RptSpecificityName == "universal" &&
290  repeat.m_RptFamily == "") repeat.m_RptLength = kInvalidSeqPos;
291  m_Map[repeat.m_RptName] = repeat;
292  continue;
293  }
294 
295  // As per EMBL Release 3.4:
296  //
297  // Each line begins with a two-character line type code.
298  // This code is always followed by three blanks, so that the
299  // actual information in each line begins in character position 6.
300  if (line.length() < 6 || line.substr(2, 3) != " ") continue;
301  string code(line.substr(0, 2));
302  string value(line.substr(5));
304 
305  if (code == "ID") {
306  // NOTE: Violates specs as per EMBL Release 3.4.1.
307  // There should be 7 fields.
308  //
309  // Perl equivalent from rpt_lib2repeat_q.pl:
310  // if (m/^ID\s/) {
311  // die "Multiple ID lines found in one record.\nLine $.: $_\n" if defined $name or defined $length;
312  // ($name, $length) = m/^ID\s+(\S+).*\s([1-9][0-9]*) BP\.$/;
313  // die "Failed to extract a repeat name and length from line:\nLine $.: $_\n"
314  // unless defined $name and $name and defined $length and $length;
315  // }
316  repeat.m_RptName = value.substr(0, value.find(' '));
317  string bp(value.substr(value.rfind(';') + 1));
319  repeat.m_RptLength = Convert(bp.substr(0, bp.find(' ')));
320  } else if (code == "DE") {
321  // DE RepbaseID: ACROBAT1
322  if (NStr::StartsWith(value, "RepbaseID:")) {
323  repeat.m_RptRepbaseId = NStr::TruncateSpaces(value.substr(10));
324  }
325  } else if (code == "CC") {
326  if (NStr::MatchesMask(value, "RELEASE *;*")) {
327  m_Release = value.substr(8, value.find(';') - 8);
328  } else if (NStr::StartsWith(value, "Type:")) {
329  // Perl equivalent from rpt_lib2repeat_q.pl:
330  // if (m/^CC +Type:\s*((.*\S)*)\s*$/) {
331  // die "Multiple Type lines found in one record.\nLine $.: $_\n" if defined $class;
332  // $class = $1;
333  // die "Failed to extract a repeat class from line:\nLine $.: $_\n" unless defined $class and $class;
334  // }
335  repeat.m_RptClass = NStr::TruncateSpaces(value.substr(5));
336  } else if (NStr::StartsWith(value, "SubType:")) {
337  // Perl equivalent from rpt_lib2repeat_q.pl:
338  // if (m/^CC +SubType:\s*((.*\S)*)\s*$/) {
339  // die "Multiple SubType lines found in one record.\nLine $.: $_\n" if defined $family;
340  // $family = $1 || ""; # NULL indicates unknown
341  // }
342  repeat.m_RptFamily = NStr::TruncateSpaces(value.substr(8));
343  } else if (NStr::StartsWith(value, "Species:")) {
344  // Perl equivalent from rpt_lib2repeat_q.pl:
345  // if (m/^CC +Species:\s*((.*\S)*)\s*$/) {
346  // die "Multiple Species lines found in one record.\nLine $.: $_\n" if defined $specificity;
347  // $specificity = $1;
348  // $specificity =~ s/_/ /g;
349  // $specificity = "universal" if $specificity eq "root";
350  // }
351  repeat.m_RptSpecificityName = NStr::TruncateSpaces(value.substr(8));
352  if (m_Taxonomy && repeat.m_RptSpecificityName.size()) {
353  pair<TSpecificity2Taxid::iterator, bool> i_specificity =
355  if (i_specificity.second) {
356  i_specificity.first->second = m_Taxonomy->GetTaxId(repeat.m_RptSpecificityName);
357  if (! i_specificity.first->second) {
359  << "RepeatMasker library species failed lookup to taxonomy: "
360  << repeat.m_RptSpecificityName);
361  }
362  }
363  repeat.m_RptSpecificity = i_specificity.first->second;
364  }
365  }
366  }
367  }
368 
369  // Don't need specificity to taxonomy lookups anymore.
371  }
372 
374  const string& name) const
375 {
376  return m_Taxonomy && m_Taxonomy->GetName(taxid) == name;
377 }
378 
379 template <typename T>
380 static void s_SetQual(CSeq_feat::TQual& qual_list,
381  const string& qual, const T val)
382 {
384  result->SetQual(qual);
385  string s = Convert(val).operator string();
386  result->SetVal(s);
387  qual_list.push_back(result);
388 }
389 
390 /// Translate RepeatMasker output to INSDC standard
391 /// nomenclature for repeats. This includes remapping repeat
392 /// family to satellite and mobile element qualifiers, as
393 /// appropriate.
394 ///
395 /// Available INSDC qualifiers are:
396 /// rpt_family, rpt_type, rpt_unit_seq, satellite, standard_name
397 ///
398 static bool s_StandardizeNomenclature(const IRepeatRegion& repeat,
399  CSeq_feat::TQual& qual_list)
400 {
401  string val;
402 
403  string klass = repeat.GetRptClass();
404  string family = repeat.GetRptFamily();
405 
406  if (NStr::EqualNocase(klass, "Satellite")) {
407  val = "satellite:";
408  if (! family.empty()) val += family;
409  val += ' ';
410  val += repeat.GetRptName();
411  s_SetQual(qual_list, "satellite", val);
412  if (! family.empty()) s_SetQual(qual_list, "rpt_family", family);
413  return true;
414  }
415 
416  if (NStr::EqualNocase(klass, "Simple_repeat")) {
417  // Simple_repeat is the family in ASN.1, not the class, based on
418  // evidence of prior submissions to GenBank. For example:
419  // GI:45269107, although this is weak evidence (stuffing
420  // RepeatMasker into Genbank qualifiers without much
421  // effort at standardization).
422  //
423  // Do not expect Simple_repeat/xxx.
424  s_SetQual(qual_list, "rpt_family", klass);
425  s_SetQual(qual_list, "rpt_unit", repeat.GetRptName());
426  return true;
427  }
428 
429  if (NStr::EqualNocase(klass, "SINE") ||
430  NStr::EqualNocase(klass, "LINE") ||
431  NStr::EqualNocase(klass, "LTR")) {
432  // Other valid INSDC mobile elements:
433  // "transposon", "retrotransposon", "integron",
434  // "insertion sequence", "non-LTR retrotransposon",
435  // "MITE", "other"
436  val = klass;
437  val += ':';
438  val += repeat.GetRptName();
439  s_SetQual(qual_list, "mobile_element", val);
440  if (! family.empty()) s_SetQual(qual_list, "rpt_family", family);
441  return true;
442  }
443 
444  return false;
445 }
446 
449  TIdGenerator& ids)
450  : m_Flags(flags)
451  , m_Library(lib)
452  , m_Ids(&ids)
453 {
454 }
455 
457 {
458  m_Library.Reset();
459 }
460 
462 {
463  m_Library.Reset(&lib);
464 }
465 
467 {
468  m_Ids.Reset(new COrdinalFeatIdGenerator);
469 }
470 
472 {
473  m_Ids.Reset(&generator);
474 }
475 
477 {
478  // We can forget old IDs once references have been resolved.
479  m_IdMap.clear();
480 }
481 
483 {
484  CRef<CSeq_feat> feat(new CSeq_feat);
485 
486  // data:
487  CSeqFeatData& sfdata = feat->SetData();
488  CImp_feat_Base& imp = sfdata.SetImp();
489  imp.SetKey("repeat_region");
490 
491  CRef<CFeat_id> id(m_Ids->GenerateId());
492  feat->SetId(*id);
493  TIdMap::iterator id_it(m_IdMap.find(repeat.GetRptId()));
494  if (id_it == m_IdMap.end()) {
495  m_IdMap[repeat.GetRptId()] = id;
496  } else {
498  ref->SetId().Assign(*id_it->second);
499  feat->SetXref().push_back(ref);
500  }
501 
502  // location:
503  repeat.GetLocation(feat->SetLocation());
504 
505  // qualifiers & ext's.
506  if (m_Flags) {
507  // Record if attributes were modified to conform with INSDC standards.
508  bool standardized(false);
509 
511  if (m_Library) m_Library->Get(repeat.GetRptName(), extra);
512 
513  CSeq_feat::TQual& qual_list = feat->SetQual();
514 
517  standardized = s_StandardizeNomenclature(repeat, qual_list);
518  }
519 
520  if (! standardized) {
521  // Did not succeed in standardizing nomenclature
522  // from RepeatMasker to INSDC standards. Fall back to
523  // storing the class/family verbatim.
524  s_SetQual(qual_list, "rpt_family", repeat.GetRptClassFamily());
525  }
526  }
527 
528  if (m_Flags & fIncludeRepeatName && ! standardized) {
529  s_SetQual(qual_list, "standard_name", repeat.GetRptName());
530  }
531 
532  if (m_Flags & fIncludeRepeatPos) {
533  s_SetQual(qual_list, "rpt_unit_range",
535  ".." + NStr::IntToString(repeat.GetRptPosEnd()));
536  }
537 
538  // Get specificity and check it for redundancy (taxid vs name).
539  bool include_specificity_name(false);
541  const IRepeat::TTaxId specificity(extra.GetRptSpecificity());
542  const string specificity_name(extra.GetRptSpecificityName());
543  include_specificity_name = ! specificity_name.empty();
544  if (specificity) {
545  CRef<CDbtag> tag(new CDbtag);
546  // eDbtagType_taxon except the enum is almost useless,
547  // being available to only one function in the Dbtag API.
548  tag->SetDb("taxon");
549  tag->SetTag().SetId(specificity);
550  feat->SetDbxref().push_back(tag);
551  if (fRemoveRedundancy && m_Library &&
552  m_Library->TestSpecificityMatchesName(
553  specificity,
554  specificity_name)) {
555  // Name matches taxonomy exactly, so don't store both.
556  include_specificity_name=false;
557  }
558  }
559  }
560 
561  // Get repeat length and check it for redundancy with rpt_left.
562  TSeqPos rpt_length(extra.GetRptLength());
563  if (rpt_length == kInvalidSeqPos) {
564  rpt_length = repeat.GetRptPosEnd() +
565  repeat.GetRptLeft();
566  }
567  bool include_rpt_left(m_Flags & fIncludeCoreStatistics);
568  if ((m_Flags & fRemoveRedundancy) &&
570  (rpt_length == repeat.GetRptPosEnd() +
571  repeat.GetRptLeft())) {
572  // Do not store rpt_left if we know the repeat length,
573  // rpt_left matches it (so it's redundant), and we
574  // want to remove redundancy.
575  include_rpt_left = false;
576  }
577 
578  // Store anything beyond what is possible in INSDC-approved
579  // qualifiers using either non-standard qualifiers or user objects.
580  // There are two options.
581 
583  // Option 1: Use Genbank qualifiers beyond the INSDC-approved set.
584 
585  if (m_Flags & fIncludeCoreStatistics) {
586  s_SetQual(qual_list, "sw_score", repeat.GetSwScore());
587  s_SetQual(qual_list, "perc_div", repeat.GetPercDiv());
588  s_SetQual(qual_list, "perc_del", repeat.GetPercDel());
589  s_SetQual(qual_list, "perc_ins", repeat.GetPercIns());
590  if (include_rpt_left) {
591  s_SetQual(qual_list, "rpt_left", repeat.GetRptLeft());
592  }
593  }
594 
596  if (! (m_Flags & fRemoveRedundancy)) {
597  // Query length is always redundant, since sequences
598  // have a bioseq length, and we know the location.
599  s_SetQual(qual_list, "query_length",
600  repeat.GetSeqPosEnd() + repeat.GetSeqLeft());
601  }
602  if (repeat.IsOverlapped()) {
603  s_SetQual(qual_list, "overlapped", true);
604  }
605  }
606 
607  if (m_Flags & fIncludeRepeatId) {
608  s_SetQual(qual_list, "rpt_id", repeat.GetRptId());
609  }
610 
611  if (m_Flags & fIncludeRepeatLength) {
612  s_SetQual(qual_list, "rpt_length", rpt_length);
613  }
614 
615  if (include_specificity_name) {
616  s_SetQual(qual_list, "specificity",
617  extra.GetRptSpecificityName());
618  }
619 
620  } else {
621  // Option 2: Use user objects.
622 
624  feat->SetExts().push_back(uo);
625  uo->SetType().SetStr("RepeatMasker");
626 
627  if (m_Flags & fIncludeCoreStatistics) {
628  uo->AddField("sw_score", static_cast<double>(repeat.GetSwScore()));
629  uo->AddField("perc_div", repeat.GetPercDiv());
630  uo->AddField("perc_del", repeat.GetPercDel());
631  uo->AddField("perc_ins", repeat.GetPercIns());
632  if (include_rpt_left) {
633  uo->AddField("rpt_left", static_cast<int>(repeat.GetRptLeft()));
634  }
635  }
636 
638  if (! (m_Flags & fRemoveRedundancy)) {
639  // Query length is always redundant, since sequences
640  // have a bioseq length, and we know the location.
641  uo->AddField("query_length", static_cast<int>(
642  repeat.GetSeqPosEnd() + repeat.GetSeqLeft()));
643  }
644  if (repeat.IsOverlapped()) {
645  uo->AddField("overlapped", true);
646  }
647  }
648 
649  if (m_Flags & fIncludeRepeatId) {
650  uo->AddField("rpt_id", static_cast<int>(repeat.GetRptId()));
651  }
652 
653  if (m_Flags & fIncludeRepeatLength) {
654  uo->AddField("rpt_length", static_cast<int>(rpt_length));
655  }
656 
657  if (include_specificity_name) {
658  uo->AddField("specificity", extra.GetRptSpecificityName());
659  }
660 
661  // Clear out storage of empty user objects.
662  if (! uo->IsSetData()) feat->ResetExts();
663  }
664 
665  // Clear out storage if empty Genbank qualifier lists.
666  if (qual_list.empty()) feat->ResetQual();
667 
669  ! extra.GetRptRepbaseId().empty()) {
670  CRef<CDbtag> tag(new CDbtag);
671  tag->SetDb("REPBASE");
672  tag->SetTag().SetStr(extra.GetRptRepbaseId());
673  feat->SetDbxref().push_back(tag);
674  }
675 
676  if (m_Flags & fSetComment) {
677  // Redundantly, store comments with original information.
678  // The comment tries to stay close to RepeatMasker native
679  // nomenclature. For example, query_left is reported,
680  // rather than the normalized query_length as stored
681  // in user objects or Genbank qualifiers. To accommodate
682  // the possibility the annotation is remapped, the original
683  // query identifier is preserved.
684 
685  CNcbiOstrstream comment;
686  const char eq('='), sep(' ');
687 
688  comment << "source=RepeatMasker";
689  if (m_Flags & fIncludeRepeatName) {
690  comment << sep
691  << "rpt_name" << eq << repeat.GetRptName();
692  }
693  if (m_Flags & fIncludeCoreStatistics) {
694  comment << sep
695  << "sw_score" << eq << repeat.GetSwScore() << sep
696  << "perc_div" << eq << repeat.GetPercDiv() << sep
697  << "perc_del" << eq << repeat.GetPercDel() << sep
698  << "perc_ins" << eq << repeat.GetPercIns() << sep
699  << "rpt_left" << eq << repeat.GetRptLeft();
700  }
702  comment << sep
703  << "query" << eq << repeat.GetSeqIdString() << sep
704  << "query_range" << eq;
705  bool reverse(repeat.IsReverseStrand());
706  if (reverse) comment << "complement(";
707  comment << repeat.GetSeqPosBegin()
708  << ".." << repeat.GetSeqPosEnd();
709  if (reverse) comment << ")";
710  comment << sep
711  << "query_left" << eq << repeat.GetSeqLeft();
712  }
713  if (m_Flags & fIncludeRepeatId) {
714  comment << sep
715  << "ID" << eq << repeat.GetRptId();
716  }
717  if (m_Flags & fIncludeExtraStatistics && repeat.IsOverlapped()) {
718  comment << " *";
719  }
720  if (! extra.GetRptSpecificityName().empty()) {
721  comment << sep
722  << "specificity" << eq << extra.GetRptSpecificityName();
723  }
724  if (extra.GetRptLength() != kInvalidSeqPos) {
725  comment << sep
726  << "rpt_length" << eq << extra.GetRptLength();
727  }
728  feat->SetComment(CNcbiOstrstreamToString(comment));
729  }
730  }
731 
732  return feat;
733 }
734 
737  const ISeqIdResolver& seqid_resolver,
738  TIdGenerator& ids)
739  : m_SeqIdResolver(&seqid_resolver)
740  , m_ToFeat(flags, lib, ids)
741 {
742 }
743 
745 {
746 }
747 
749 {
751 }
752 
754 {
755  m_SeqIdResolver.Reset(&seqid_resolver);
756 }
757 
759 {
760  return m_ToFeat;
761 }
762 
765 {
766  CRef<CSerialObject> object(
767  ReadSeqAnnot(lr, pMessageListener).ReleaseOrNull());
768  return object;
769 }
770 
773 {
774  CRef<CSeq_annot> annot(new CSeq_annot);
775  // CRef<CAnnot_descr> desc(new CAnnot_descr);
776  // annot->SetDesc(*desc);
777  CSeq_annot::C_Data::TFtable& ftable = annot->SetData().SetFtable();
778 
779  string line;
780  size_t record_counter = 0;
781 
782  while ( ! lr.AtEOF() ) {
783  line = *++lr;
784 
785  if ( IsHeaderLine( line ) || IsIgnoredLine( line ) ) {
786  continue;
787  }
788  ++record_counter;
789 
790  SRepeatRegion mask_data;
791  if ( ! ParseRecord( line, mask_data ) ) {
794  eDiag_Error,
795  lr.GetLineNumber(),
796  "RepeatMasker Reader: Parse error in record = " + line) );
797  ProcessError(*pErr, pMessageListener);
798  continue;
799  }
800 
801  if ( ! VerifyData( mask_data ) ) {
804  eDiag_Error,
805  lr.GetLineNumber(),
806  "RepeatMasker Reader: Verification error in record = " + line) );
807  ProcessError(*pErr, pMessageListener);
808  continue;
809  }
810 
811  CRef<CSeq_feat> feat(m_ToFeat(mask_data));
812  if ( ! feat ) {
815  eDiag_Error,
816  lr.GetLineNumber(),
817  "RepeatMasker Reader: Aborting file import, "
818  "unable to create feature table for record = " + line) );
819  ProcessError(*pErr, pMessageListener);
820  // we don't tolerate even a few errors here!
821  break;
822  }
823 
824  ftable.push_back(feat);
825  }
826  // if (! record_counter) annot.Reset();
827  xAddConversionInfo(annot, pMessageListener);
828  return annot;
829 }
830 
831 
832 bool CRepeatMaskerReader::IsHeaderLine(const string& line)
833 {
834  string labels_1st_line[] = { "SW", "perc", "query", "position", "matching", "" };
835  string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" };
836 
837  // try to identify 1st line of column labels:
838  size_t current_offset = 0;
839  size_t i = 0;
840  for ( ; labels_1st_line[i] != ""; ++i ) {
841  current_offset = NStr::FindCase( line, labels_1st_line[i], current_offset );
842  if ( NPOS == current_offset ) {
843  break;
844  }
845  }
846  if ( labels_1st_line[i] == "" ) {
847  return true;
848  }
849 
850  // try to identify 2nd line of column labels:
851  current_offset = 0;
852  i = 0;
853  for ( ; labels_2nd_line[i] != ""; ++i ) {
854  current_offset = NStr::FindCase( line, labels_2nd_line[i], current_offset );
855  if ( NPOS == current_offset ) {
856  return false;
857  }
858  }
859  return true;
860 }
861 
862 
863 bool CRepeatMaskerReader::IsIgnoredLine(const string& line)
864 {
865  if ( NStr::StartsWith(line, "There were no repetitive sequences detected in "))
866  return true;
867  if ( NStr::FindCase(line, "only contains ambiguous bases") != NPOS)
868  return true;
869  return ( NStr::TruncateSpaces( line ).length() == 0 );
870 }
871 
872 
/// Remove an enclosing pair of parentheses from a string, in place.
///
/// Nothing happens unless the string starts with '('. When it does, the
/// leading '(' is removed, and a trailing ')' is removed too if present.
/// Only one level is stripped.
///
/// @param s  string to modify
static void StripParens(string& s)
{
    if (s.empty() || s[0] != '(') {
        return;  // not parenthesized: leave as is
    }
    // Drop the closing parenthesis first so the index of the last
    // character is still valid, then drop the opening one.
    if (s[s.size() - 1] == ')') {
        s.erase(s.size() - 1);
    }
    s.erase(0, 1);
}
886 
887 bool CRepeatMaskerReader::ParseRecord(const string& record, SRepeatRegion& mask_data)
888 {
889  const size_t MIN_VALUE_COUNT = 15;
890 
891  string line = NStr::TruncateSpaces( record );
892  list< string > values;
893  if ( NStr::Split( line, " \t", values, NStr::fSplit_Tokenize ).size() < MIN_VALUE_COUNT ) {
894  return false;
895  }
896 
897  try {
898  // 1: "SW score"
899  list<string>::iterator it = values.begin();
900  mask_data.sw_score = NStr::StringToUInt( *it );
901 
902  // 2: "perc div."
903  ++it;
904  mask_data.perc_div = NStr::StringToDouble( *it );
905 
906  // 3: "perc del."
907  ++it;
908  mask_data.perc_del = NStr::StringToDouble( *it );
909 
910  // 4: "perc ins."
911  ++it;
912  mask_data.perc_ins = NStr::StringToDouble( *it );
913 
914  // 5: "query sequence"
915  ++it;
916  mask_data.query_sequence = *it;
917  CSeq_id_Handle idh(m_SeqIdResolver->ResolveSeqId(mask_data.query_sequence));
918  CConstRef<CSeq_id> id(idh.GetSeqIdOrNull());
919  if (! id) return false;
920  mask_data.query_location.Reset(new CSeq_loc);
922  location.SetId().Assign(*id);
923 
924  // 6: "position begin"
925  ++it;
926  TSeqPos pos_begin = NStr::StringToUInt(*it);
927  if (pos_begin == 0) return false;
928  location.SetFrom(pos_begin - 1);
929 
930  // 7: "in end"
931  ++it;
932  TSeqPos pos_end = NStr::StringToUInt(*it);
933  if (pos_end == 0 || pos_end < pos_begin) return false;
934  location.SetTo(pos_end - 1);
935 
936  // 8: "query (left)"
937  ++it;
938  StripParens(*it);
939  mask_data.query_left = NStr::StringToUInt( *it );
940 
941  // 9: "" (meaning "strand")
942  ++it;
943  // Having the strand, we now have all fields to populate the location.
944  location.SetStrand(*it == "C" ? eNa_strand_minus : eNa_strand_plus);
945 
946  // 10: "matching repeat"
947  ++it;
948  mask_data.matching_repeat = *it;
949 
950  // 11: "repeat class/family"
951  ++it;
952  string class_family = *it;
953  NStr::SplitInTwo(class_family, "/",
954  mask_data.rpt_class, mask_data.rpt_family);
955 
956  // 12: "position in"
957  ++it;
958  string field12 = *it;
959 
960  // 13: "in end"
961  ++it;
962  mask_data.rpt_pos_end = NStr::StringToUInt( *it );
963 
964  // 14: "repeat left"
965  ++it;
966  string field14 = *it;
967 
968  // fields position 12 and 14 flip depending on the strand value.
969  string rpt_left;
970  if (mask_data.IsReverseStrand()) {
971  mask_data.rpt_pos_begin = NStr::StringToUInt( field14 );
972  rpt_left = field12;
973  } else {
974  mask_data.rpt_pos_begin = NStr::StringToUInt( field12 );
975  rpt_left = field14;
976  }
977 
978  StripParens(rpt_left);
979  mask_data.rpt_left = NStr::StringToUInt(rpt_left);
980 
981  // 15: "ID"
982  ++it;
983  mask_data.rpt_id = NStr::StringToUInt(*it);
984 
985  // 16: overlapped (higher score repeat overlaps)
986  ++it;
987  mask_data.overlapped = (it != values.end() && (*it) == "*");
988  }
989  catch( ... ) {
990  return false;
991  }
992 
993  return true;
994 }
995 
997 {
998  //
999  // This would be the place for any higher level checks of the mask data
1000  // collected from the record ...
1001  //
1002  return true;
1003 }
1004 
1005 
1006 CRmReader::CRmReader(CNcbiIstream& istr) : m_Istr(istr)
1007 {
1008 }
1009 
1011 {
1012  //
1013  // This is the point to make sure we are dealing with the right file type and
1014  // to allocate the specialist reader for any subtype (OUT, HTML) we encounter.
1015  // When this function returns the file pointer should be past the file header
1016  // and at the beginning of the actual mask data.
1017  //
1018  // Note:
1019  // If something goes wrong during header processing then the file pointer will
1020  // still be modified. It's the caller's job to restore the file pointer if this
1021  // is possible for this type of stream.
1022  //
1023 
1024  //
1025  // 2006-03-31: Only supported file type at this time: RepeatMasker OUT.
1026  //
1027  return new CRmReader(istr);
1028 }
1029 
1031 {
1032  delete reader;
1033 }
1034 
1036  TFlags flags, size_t errors)
1037 {
1038  annot->Reset();
1039  CRepeatMaskerReader impl(flags);
1040  CMessageListenerWithLog error_container(DIAG_COMPILE_INFO);
1041  CRef<CSeq_annot> result(impl.ReadSeqAnnot(m_Istr, &error_container));
1042  annot->Assign(*result, eShallow);
1043 }
1044 
1045 
1046 END_objects_SCOPE
Translate RepeatMasker output to INSDC standard nomenclature for repeats.
Definition: rm_reader.hpp:475
string m_RptSpecificityName
Definition: rm_reader.hpp:141
TSeqPos m_RptLength
Definition: rm_reader.hpp:139
TConverter & SetConverter()
Delegate for conversion from IRepeatRegion to ASN.1.
Definition: rm_reader.cpp:758
Class acting as an interface to a RepeatMasker library.
Definition: rm_reader.hpp:420
Default implementation of a Seq-id resolver, which knows about FASTA-formatted sequence identifiers...
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3081
Store the repeat position, that is, the interval on the repeat sequence.
Definition: rm_reader.hpp:541
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
TSeqPos query_left
Definition: rm_reader.hpp:401
virtual TPercent GetPercDel() const =0
TPercent perc_ins
Definition: rm_reader.hpp:404
string m_RptClass
Definition: rm_reader.hpp:138
TTaxId GetRptSpecificity() const
Gets specificity as a taxonomy ID, or 0 if not known.
Definition: rm_reader.hpp:130
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:220
Implements a concrete class for reading RepeatMasker output from tabular form and rendering it as ASN...
Definition: rm_reader.hpp:689
virtual void Reset(void)
Reset the whole object.
Definition: Seq_loc_.cpp:59
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
bool IsOverlapped() const
Flag that there is a higher-scoring match whose domain partly (<80%) includes the domain of this matc...
Definition: rm_reader.cpp:257
void ResetQual(void)
Reset Qual data member.
Definition: Seq_feat_.cpp:136
void SetQual(const TQual &value)
Assign a value to Qual data member.
Definition: Gb_qual_.hpp:211
#define T(s)
Definition: common.h:225
TSeqPos GetRptLength() const
Gets repeat length, or kInvalidSeqPos if not known.
Definition: rm_reader.hpp:129
TTaxId GetRptSpecificity() const
Returns 0, not known.
Definition: rm_reader.cpp:194
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:836
User-defined methods of the data storage class.
CRmReader(CNcbiIstream &istr)
Definition: rm_reader.cpp:1006
CRepeatMaskerReader(TFlags flags=fDefaults, CConstRef< TRepeatLibrary > lib=null, const ISeqIdResolver &seqid_resolver=*(CConstIRef< ISeqIdResolver >(new CFastaIdsResolver)), TIdGenerator &ids=*(CIRef< TIdGenerator >(new COrdinalFeatIdGenerator)))
Implement CReaderBase.
Definition: rm_reader.cpp:735
void clear()
Definition: map.hpp:169
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line) ...
CSeq_id_Handle ResolveSeqId(const string &id) const
Returns a normalized representation of a sequence identifier, as Seq-id handle.
Definition: rm_reader.cpp:93
CRepeatToFeat(TFlags flags=fDefaults, CConstRef< TRepeatLibrary > lib=null, TIdGenerator &ids=*(CIRef< TIdGenerator >(new COrdinalFeatIdGenerator)))
Definition: rm_reader.cpp:447
TConverter m_ToFeat
Definition: rm_reader.hpp:740
User-defined methods of the data storage class.
virtual bool VerifyData(const SRepeatRegion &mask_data)
Definition: rm_reader.cpp:996
string GetRptName() const
Gets repeat name.
Definition: rm_reader.cpp:179
CNcbiIstream & m_Istr
Definition: rm_reader.hpp:757
string GetRptSpecificityName() const
Gets specificity as a name, or empty string if not known.
Definition: rm_reader.hpp:131
virtual TSeqPos GetRptPosEnd() const =0
string rpt_class
Definition: rm_reader.hpp:408
static bool MatchesMask(CTempString str, CTempString mask, ECase use_case=eCase)
Match "str" against the "mask".
Definition: ncbistr.cpp:235
ITaxonomyResolver::TTaxId TTaxId
Definition: rm_reader.hpp:90
container_type::const_iterator const_iterator
Definition: map.hpp:53
User-defined methods of the data storage class.
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1008
void ResetExts(void)
Reset Exts data member.
Definition: Seq_feat_.cpp:206
bool TestSpecificityMatchesName(TRepeat::TTaxId taxid, const string &name) const
Check if a given taxid's scientific name matches the original specificity string. ...
Definition: rm_reader.cpp:373
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:53
CConstRef< CSeq_loc > GetLocation(void) const
Gets the location of this repeat.
Definition: rm_reader.cpp:155
virtual string GetRptClass() const =0
Gets repeat class, or empty string if not known.
string
Definition: cgiapp.hpp:498
void Read(CNcbiIstream &stream)
Reads a library from the RepeatMaskerLib.embl-style input.
Definition: rm_reader.cpp:270
virtual string GetSeqIdString() const
Gets the sequence from the location of the repeat, without dealing with a Seq-loc.
Definition: rm_reader.cpp:135
Removes redundant fields.
Definition: rm_reader.hpp:482
virtual string GetRptName() const =0
Gets repeat name.
Class which, given an input IRepeatRegion, can generate an appropriate and normalized NCBI ASN...
Definition: rm_reader.hpp:627
const value_slice::CValueConvert< value_slice::SRunTimeCP, FROM > Convert(const FROM &value)
Defines NCBI C++ exception handling.
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
Deprecated, old API for loading RepeatMasker output.
Definition: rm_reader.hpp:747
TPercent GetPercDel() const
Definition: rm_reader.cpp:227
Store extra statistics, which includes the length of the query (or query_left, equivalently), and the flag has_higher_score_overlapping_match.
Definition: rm_reader.hpp:517
virtual bool IsOverlapped() const =0
Flag that there is a higher-scoring match whose domain partly (<80%) includes the domain of this matc...
*** Import *********************************************** * * Features imported from other databases...
Definition: Imp_feat_.hpp:76
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1343
TPercent GetPercIns() const
Definition: rm_reader.cpp:232
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:262
#define kEmptyStr
Definition: ncbistr.hpp:120
virtual TPercent GetPercIns() const =0
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
unsigned int TRptId
Definition: rm_reader.hpp:176
TScore sw_score
Definition: rm_reader.hpp:400
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:839
#define NPOS
Definition: ncbistr.hpp:130
int i
string m_RptFamily
Definition: rm_reader.hpp:137
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2429
string GetRptSpecificityName() const
Returns an empty string, not known.
Definition: rm_reader.cpp:204
User-defined methods of the data storage class.
virtual TSeqPos GetSeqLeft() const =0
static SIZE_TYPE FindCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case sensitive search.
Definition: ncbistr.hpp:5487
virtual bool IsIgnoredLine(const string &line)
Definition: rm_reader.cpp:863
virtual TSeqPos GetSeqPosBegin() const
Convenience function that gets the position start on the sequence, without dealing with a Seq-loc...
Definition: rm_reader.cpp:140
CFeat_id –.
Definition: Feat_id.hpp:65
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5079
TSeqPos rpt_pos_begin
Definition: rm_reader.hpp:410
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
TXref & SetXref(void)
Assign a value to Xref data member.
Definition: Seq_feat_.hpp:1272
void SetId(TId &value)
Assign a value to Id data member.
Definition: Seq_feat_.cpp:73
void ResetSeqIdResolver()
Use default Seq-id resolution.
Definition: rm_reader.cpp:748
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:101
TScore GetSwScore() const
Definition: rm_reader.cpp:217
CRef< CSerialObject > ReadObject(ILineReader &lr, ILineErrorListener *pMessageListener=0)
Read an object from a given line reader, render it as the most appropriate Genbank object...
Definition: rm_reader.cpp:764
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:543
#define DIAG_COMPILE_INFO
Make compile time diagnostic information object to use in CNcbiDiag and CException.
Definition: ncbidiag.hpp:169
const_iterator end() const
Definition: map.hpp:152
string GetRptFamily() const
Gets repeat family, or empty string if not known.
Definition: rm_reader.cpp:184
TId & SetId(void)
Select the variant.
Definition: Object_id_.hpp:277
CSeqFeatXref –.
Definition: SeqFeatXref.hpp:65
TSeqPos rpt_pos_end
Definition: rm_reader.hpp:411
User-defined methods of the data storage class.
string query_sequence
Definition: rm_reader.hpp:405
const char * tag
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:898
const_iterator find(const key_type &key) const
Definition: map.hpp:153
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS. ...
Definition: Seq_id.cpp:1894
CConstRef< TRepeatLibrary > m_Library
Definition: rm_reader.hpp:680
void Read(CRef< CSeq_annot > annot, TFlags flags=fDefaults, size_t errors=kMax_UInt)
Definition: rm_reader.cpp:1035
string rpt_family
Definition: rm_reader.hpp:409
Error message.
Definition: ncbidiag.hpp:647
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3455
static void s_SetQual(CSeq_feat::TQual &qual_list, const string &qual, const T val)
Definition: rm_reader.cpp:380
Lightweight interface for getting lines of data with minimal memory copying.
TSeqPos GetRptPosEnd() const
Definition: rm_reader.cpp:242
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:946
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
virtual void xAddConversionInfo(CRef< CSeq_annot > &, ILineErrorListener *)
void SetType(TType &value)
Assign a value to Type data member.
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
Definition: ncbistr.cpp:1288
string matching_repeat
Definition: rm_reader.hpp:407
CIRef< TIdGenerator > m_Ids
Definition: rm_reader.hpp:681
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual ~CRepeatMaskerReader(void)
Definition: rm_reader.cpp:744
string GetRptClassFamily() const
Convenience function to get the class and family as one value, the way that RepeatMasker emits them...
Definition: rm_reader.cpp:103
User-defined methods of the data storage class.
TSeqPos GetRptPosBegin() const
Definition: rm_reader.cpp:237
void SetRepeatLibrary(const TRepeatLibrary &lib)
Set a repeat library which may be used to add additional attributes to repeats.
Definition: rm_reader.cpp:461
void SetVal(const TVal &value)
Assign a value to Val data member.
Definition: Gb_qual_.hpp:251
unsigned long TScore
Definition: rm_reader.hpp:177
string GetSeqIdString() const
Overridden version returns the original unparsed sequence identifier, if it was set (non-empty)...
Definition: rm_reader.cpp:170
Selected attributes beyond what is stored in GenBank standard qualifiers will be included as comments...
Definition: rm_reader.hpp:505
TTaxId m_RptSpecificity
Definition: rm_reader.hpp:140
virtual TSeqPos GetRptLeft() const =0
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
virtual TPercent GetPercDiv() const =0
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
Store core statistics, which include the scores of sw_score, perc_div, perc_del, perc_ins, and the length of the repeat (or rpt_left, equivalently).
Definition: rm_reader.hpp:511
numerical value
Definition: Na_strand.hpp:63
Interface for resolving a sequence identifier given a textual representation.
void ResetIdGenerator()
Reset the Feature-id generator, to use a default implementation which will generate unique integer lo...
Definition: rm_reader.cpp:466
TSeqPos GetRptLeft() const
Definition: rm_reader.cpp:247
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:198
void SetKey(const TKey &value)
Assign a value to Key data member.
Definition: Imp_feat_.hpp:256
int size
Store the specificity from the RepeatMasker library, if provided.
Definition: rm_reader.hpp:550
Store the repeat length as reported in the library.
Definition: rm_reader.hpp:554
TPercent perc_div
Definition: rm_reader.hpp:402
Store the RepbaseID from the RepeatMasker library, if provided.
Definition: rm_reader.hpp:559
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
.hpp User-defined methods of the data storage class.
Definition: Gb_qual.hpp:60
static CRmReader * OpenReader(CNcbiIstream &istr)
Definition: rm_reader.cpp:1010
TSeqPos GetSeqLeft() const
Definition: rm_reader.cpp:252
virtual TSeqPos GetSeqPosEnd() const
Convenience function that gets the position end on the sequence, without dealing with a Seq-loc...
Definition: rm_reader.cpp:145
virtual unsigned int GetLineNumber(void) const =0
Returns the current line number (counting from 1, not 0).
CConstIRef< ISeqIdResolver > m_SeqIdResolver
Definition: rm_reader.hpp:739
char value[7]
Definition: config.c:428
virtual void Reset(void)
Reset the whole object.
Definition: Seq_annot_.cpp:248
string m_RptName
Definition: rm_reader.hpp:136
TDbxref & SetDbxref(void)
Assign a value to Dbxref data member.
Definition: Seq_feat_.hpp:1297
Implementation of IRepeat backed by a simple structure.
Definition: rm_reader.hpp:121
string GetRptRepbaseId() const
Returns an empty string, not known.
Definition: rm_reader.cpp:208
Multi-threading – classes, functions, and features.
Definition: inftrees.h:24
string m_RptRepbaseId
Definition: rm_reader.hpp:142
Useful/utility classes and methods.
Assign/Compare pointers only.
Definition: serialdef.hpp:192
TPercent GetPercDiv() const
Definition: rm_reader.cpp:222
void AssertReferencesResolved()
Asserts that all forward/backward references between any objects visited have now been resolved...
Definition: rm_reader.cpp:476
string GetRptClass() const
Gets repeat class, or empty string if not known.
Definition: rm_reader.cpp:189
virtual CConstRef< CSeq_loc > GetLocation(void) const =0
Gets the location of this repeat.
void SetInt(TInt &v)
Definition: Seq_loc.hpp:967
string m_Release
Definition: rm_reader.hpp:450
AutoPtr –.
Definition: ncbimisc.hpp:384
Default implementation for a generator of identifiers, as integers, marshalled as CFeat_id objects...
CRef< CSeq_loc > query_location
Definition: rm_reader.hpp:399
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
Definition: line_error.cpp:103
TIdMap m_IdMap
Definition: rm_reader.hpp:682
TPercent perc_del
Definition: rm_reader.hpp:403
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Seq_feat_.hpp:1030
const string AsFastaString(void) const
Definition: Seq_id.cpp:1637
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:925
bool Get(const string &name, TRepeat &dest) const
Gets information about a given repeat, specified by name.
Definition: rm_reader.cpp:262
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3096
else result
Definition: token2.c:20
TSeqPos GetRptLength() const
Gets repeat length, or kInvalidSeqPos if not known.
Definition: rm_reader.cpp:198
virtual TRptId GetRptId() const =0
static void StripParens(string &s)
Definition: rm_reader.cpp:873
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
TExts & SetExts(void)
Assign a value to Exts data member.
Definition: Seq_feat_.hpp:1434
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:129
IO_PREFIX::ostrstream CNcbiOstrstream
Portable alias for ostrstream.
Definition: ncbistre.hpp:155
bool eq(T x_, T y_, T round_)
Definition: njn_approx.hpp:79
TLocal & SetLocal(void)
Select the variant.
Definition: Feat_id_.cpp:140
Structure implementing the IRepeatRegion API as a simple store of data members.
Definition: rm_reader.hpp:350
CRef< CSeq_annot > ReadSeqAnnot(ILineReader &lr, ILineErrorListener *pMessageListener=0)
Read an object from a given line reader, render it as a single Seq-annot, if possible.
Definition: rm_reader.cpp:772
TSpecificity2Taxid m_Specificity2TaxId
Definition: rm_reader.hpp:449
void ResetRepeatLibrary()
Clear out any repeat library which may be used to add additional attributes to repeats.
Definition: rm_reader.cpp:456
static bool EqualNocase(const CTempString str, SIZE_TYPE pos, SIZE_TYPE n, const char *pattern)
Case-insensitive equality of a substring with a pattern.
Definition: ncbistr.hpp:5358
virtual TScore GetSwScore() const =0
static const char location[]
Definition: config.c:97
void ProcessError(CObjReaderLineException &, ILineErrorListener *)
virtual TSeqPos GetRptPosBegin() const =0
namespace ncbi::objects::
Definition: Seq_feat.hpp:56
TSeqPos rpt_left
Definition: rm_reader.hpp:412
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:336
void SetImp(TImp &v)
static bool s_StandardizeNomenclature(const IRepeatRegion &repeat, CSeq_feat::TQual &qual_list)
Translate RepeatMasker output to INSDC standard nomenclature for repeats.
Definition: rm_reader.cpp:398
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3362
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:243
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
virtual bool IsReverseStrand() const
Convenience function that gets the strand on the sequence, without dealing with a Seq-loc...
Definition: rm_reader.cpp:150
vector< CRef< CGb_qual > > TQual
Definition: Seq_feat_.hpp:117
TFlags m_Flags
Definition: rm_reader.hpp:679
virtual bool IsHeaderLine(const string &line)
Definition: rm_reader.cpp:832
void SetId(TId &value)
Assign a value to Id data member.
TRptId GetRptId() const
Definition: rm_reader.cpp:212
virtual bool ParseRecord(const string &record, SRepeatRegion &mask_data)
Definition: rm_reader.cpp:887
TQual & SetQual(void)
Assign a value to Qual data member.
Definition: Seq_feat_.hpp:1118
Store original RepeatMasker repeat_id.
Definition: rm_reader.hpp:545
string GetRptRepbaseId() const
Gets the RepbaseID, or empty string if not known.
Definition: rm_reader.hpp:132
Definition: Dbtag.hpp:52
CConstRef< CFeat_id > GetId() const
Gets the more general feature ID for this repeat, which identifies a single repeat, which may be multi-segment, and allows linking the segments together.
Definition: rm_reader.cpp:160
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:139
bool IsReverseStrand(void) const
Return true if all ranges have reverse strand.
Definition: Seq_loc.hpp:979
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:98
Avoid user objects and instead, put selected information in non-standard and invalid GenBank qualifie...
Definition: rm_reader.hpp:497
static void CloseReader(CRmReader *reader)
Definition: rm_reader.cpp:1030
CConstIRef< ITaxonomyResolver > m_Taxonomy
Definition: rm_reader.hpp:447
CRef< CSeq_feat > operator()(const IRepeatRegion &repeat)
Transforms the input repeat into a repeat feature.
Definition: rm_reader.cpp:482
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:756
virtual string GetRptFamily() const =0
Gets repeat family, or empty string if not known.
void SetIdGenerator(TIdGenerator &generator)
Set the Feature-id generator which will be used to assign unique feature IDs.
Definition: rm_reader.cpp:471
Interface defining a read-only RepeatMasker repeat feature.
Definition: rm_reader.hpp:229
void SetSeqIdResolver(ISeqIdResolver &seqid_resolver)
Use specified delegate for Seq-id resolution.
Definition: rm_reader.cpp:753
static int Score(const CRef< CSeq_id > &id)
Wrappers for use with FindBestChoice from
Definition: Seq_id.hpp:561
Modified on Sun Jun 25 17:53:16 2017 by modify_doxy.py rev. 533848