NCBI C++ ToolKit
gff_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gff_reader.cpp 79860 2017-10-18 13:38:41Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Aaron Ucko, Wratko Hlavina
27 *
28 * File Description:
29 * Reader for GFF (including GTF) files.
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
36 
37 #include <corelib/ncbitime.hpp>
38 #include <corelib/ncbiutil.hpp>
39 #include <corelib/stream_utils.hpp>
40 #include <serial/iterator.hpp>
41 
42 #include <objects/general/Date.hpp>
46 #include <objects/seq/Seq_inst.hpp>
47 #include <objects/seq/Seqdesc.hpp>
58 
62 #include <objtools/error_codes.hpp>
63 
64 #include <algorithm>
65 #include <ctype.h>
66 
67 
68 #define NCBI_USE_ERRCODE_X Objtools_Rd_GFF
69 
72 
73 static CRef<CFeat_id>
75 {
76  CRef<CObject_id> objid( new CObject_id );
77  objid->SetStr( str );
78  CRef<CFeat_id> featid( new CFeat_id );
79  featid->SetLocal( *objid );
80  return featid;
81 }
82 
83 static string& s_URLDecode(const CTempString& s, string& out) {
84  SIZE_TYPE pos = 0;
85  out.erase();
86  out.reserve(s.size());
87  while (pos < s.size()) {
88  SIZE_TYPE pos2 = s.find_first_of("%" /* "+" */, pos);
89  out += s.substr(pos, pos2 - pos);
90  if (pos2 == NPOS) {
91  break;
92  } else if (s[pos2] == '+') { // disabled -- often used literally
93  out += ' ';
94  pos = pos2 + 1;
95  } else if (s[pos2] == '%') {
96  try {
97  out += (char)NStr::StringToInt(s.substr(pos2 + 1, 2), 0, 16);
98  pos = pos2 + 3;
99  } catch (CStringException&) {
100  // some sources neglect to encode % (!)
101  out += '%';
102  pos = pos2 + 1;
103  }
104  } else {
105  _TROUBLE;
106  }
107  }
108  return out;
109 }
110 
111 
113 {
114  CStreamLineReader lr(in);
115  return Read(lr, flags);
116 }
117 
119 {
120  x_Reset();
121  m_Flags = flags;
122  m_LineReader = &in;
123 
124  if (m_Flags & fSetVersion3) {
125  m_Version = 3;
126  }
127 
128  TStr line;
129  while ( !in.AtEOF() ) {
130  ++m_LineNumber;
131  char c = in.PeekChar();
132  if (c == '#') {
133  line = *++in;
134  if (line.size() > 2 && line[1] == '#') {
136  // ignore regular comments
137  }
138  } else if (c == '>') {
139  // implicit ##FASTA
141  } else {
142  line = *++in;
143  if ( x_IsLineUcscMetaInformation(line) ) {
144  // UCSC browser or track line. For now, we ignore those.
145  continue;
146  }
147  if ( line.empty() ) {
148  // too commonly used for file formatting to even warn about
149  continue;
150  }
151  CRef<SRecord> record = x_ParseFeatureInterval(line);
152  if (record) {
153 
154  if (record->id.empty()) {
155  x_ParseAndPlace(*record);
156  } else {
157  CRef<SRecord>& match = m_DelayedRecords[ record->id ];
158  // _TRACE(id << " -> " << match.GetPointer());
159  if (match) {
160  x_MergeRecords(*match, *record);
161  } else {
162  match.Reset(record);
163  }
164  }
165  }
166  }
167  }
168 
170  SRecord& rec = *it->second;
171  /// merge mergeable ranges
172  NON_CONST_ITERATE (SRecord::TLoc, loc_iter, rec.loc) {
173  ITERATE (set<TSeqRange>, src_iter, loc_iter->merge_ranges) {
174  TSeqRange range(*src_iter);
175  set<TSeqRange>::iterator dst_iter =
176  loc_iter->ranges.begin();
177  for ( ; dst_iter != loc_iter->ranges.end(); ) {
178  TSeqRange r(range);
179  r += *dst_iter;
180  if (r.GetLength() <=
181  range.GetLength() + dst_iter->GetLength()) {
182  range += *dst_iter;
183  _TRACE("merging overlapping ranges: "
184  << range.GetFrom() << " - "
185  << range.GetTo() << " <-> "
186  << dst_iter->GetFrom() << " - "
187  << dst_iter->GetTo());
188  loc_iter->ranges.erase(dst_iter++);
189  break;
190  } else {
191  ++dst_iter;
192  }
193  }
194  loc_iter->ranges.insert(range);
195  }
196  }
197 
198  if (rec.key == "exon") {
199  rec.key = "mRNA";
200  }
201  x_ParseAndPlace(rec);
202  }
203 
204  ///
205  /// remap gene refs
206  /// we have built a set of gene-id -> gene-ref pairs
207  ///
208  if (m_TSE && m_GeneRefs.size()) {
210  if ( !iter->second->IsSetLocus() &&
211  !iter->second->IsSetLocus_tag()) {
212  iter->second->SetLocus(iter->first);
213  } else if ( !iter->second->IsSetLocus() ||
214  iter->second->GetLocus() != iter->first) {
215  iter->second->SetSyn().push_back(iter->first);
216  }
217  }
218 
219  CTypeIterator<CSeq_feat> feat_iter(*m_TSE);
220  for ( ; feat_iter; ++feat_iter) {
221  const CGene_ref* ref = NULL;
222  if (feat_iter->GetData().IsGene()) {
223  ref = &feat_iter->GetData().GetGene();
224  } else {
225  ref = feat_iter->GetGeneXref();
226  }
227  if (ref && ref->IsSetLocus()) {
229  m_GeneRefs.find(ref->GetLocus());
230  if (iter != m_GeneRefs.end()) {
231  const_cast<CGene_ref*>(ref)->Assign(*iter->second);
232  }
233  }
234  }
235  }
236 
237  CRef<CSeq_entry> tse(m_TSE); // need to save before resetting.
238  x_Reset();
239 
240  // promote transcript_id and protein_id to products
241  if (flags & fSetProducts) {
242  CTypeIterator<CSeq_feat> feat_iter(*tse);
243  for ( ; feat_iter; ++feat_iter) {
244  CSeq_feat& feat = *feat_iter;
245 
246  string qual_name;
247  switch (feat.GetData().GetSubtype()) {
249  qual_name = "protein_id";
250  break;
251 
253  qual_name = "transcript_id";
254  break;
255 
256  default:
257  continue;
258  break;
259  }
260 
261  string id_str = feat.GetNamedQual(qual_name);
262  if ( !id_str.empty() ) {
263  CRef<CSeq_id> id = x_ResolveSeqName(id_str);
264  feat.SetProduct().SetWhole(*id);
265  }
266  }
267  }
268 
269  if (flags & fCreateGeneFeats) {
270  CTypeIterator<CSeq_annot> annot_iter(*tse);
271  for ( ; annot_iter; ++annot_iter) {
272  CSeq_annot& annot = *annot_iter;
273  if (annot.GetData().Which() != CSeq_annot::TData::e_Ftable) {
274  continue;
275  }
276 
277  // we work within the scope of one annotation
278  CSeq_annot::TData::TFtable::iterator feat_iter =
279  annot.SetData().SetFtable().begin();
280  CSeq_annot::TData::TFtable::iterator feat_end =
281  annot.SetData().SetFtable().end();
282 
283  /// we plan to create a series of gene features, one for each gene
284  /// identified above
285  /// genes are identified via a 'gene_id' marker
286  typedef map<string, CRef<CSeq_feat> > TGeneMap;
287  TGeneMap genes;
288  for (bool has_genes = false;
289  feat_iter != feat_end && !has_genes; ++feat_iter) {
290  CSeq_feat& feat = **feat_iter;
291 
292  switch (feat.GetData().GetSubtype()) {
294  /// we already have genes, so don't add any more
295  has_genes = true;
296  genes.clear();
297  break;
298 
301  /// for mRNA and CDS features, create a gene
302  /// this is only done if the gene_id parameter was set
303  /// in parsing, we promote gene_id to a gene xref
304  if ( !feat.GetGeneXref() ) {
305  continue;
306  }
307  {{
308  string gene_id;
309  feat.GetGeneXref()->GetLabel(&gene_id);
310  _ASSERT( !gene_id.empty() );
311  TSeqRange range = feat.GetLocation().GetTotalRange();
312 
313  ENa_strand strand = feat.GetLocation().GetStrand();
314  const CSeq_id* id = feat.GetLocation().GetId();
315  if ( !id ) {
316  x_Error("No consistent ID found; gene feature skipped");
317  continue;
318  }
319 
320  TGeneMap::iterator iter = genes.find(gene_id);
321  if (iter == genes.end()) {
322  /// new gene feature
323  CRef<CSeq_feat> gene(new CSeq_feat());
324  gene->SetData().SetGene().Assign(*feat.GetGeneXref());
325 
326  gene->SetLocation().SetInt().SetFrom(range.GetFrom());
327  gene->SetLocation().SetInt().SetTo (range.GetTo());
328  gene->SetLocation().SetId(*id);
329  gene->SetLocation().SetInt().SetStrand(strand);
330  genes[gene_id] = gene;
331  } else {
332  /// we agglomerate the old location
333  CRef<CSeq_feat> gene = iter->second;
334 
335  TSeqRange r2 = gene->GetLocation().GetTotalRange();
336  range += r2;
337  gene->SetLocation().SetInt().SetFrom(range.GetFrom());
338  gene->SetLocation().SetInt().SetTo (range.GetTo());
339  gene->SetLocation().InvalidateTotalRangeCache();
340  }
341  }}
342  break;
343 
344  default:
345  break;
346  }
347  }
348 
349  ITERATE (TGeneMap, iter, genes) {
350  annot.SetData().SetFtable().push_back(iter->second);
351  }
352  }
353  }
354 
355  return tse;
356 }
357 
358 
359 void CGFFReader::x_Warn(const string& message, unsigned int line)
360 {
361  if (line) {
362  ERR_POST_X(2, Warning << message << " [GFF input, line " << line << ']');
363  } else {
364  ERR_POST_X(3, Warning << message << " [GFF input]");
365  }
366 }
367 
368 
369 void CGFFReader::x_Error(const string& message, unsigned int line)
370 {
371  if (line) {
372  ERR_POST_X(1, Error << message << " [GFF input, line " << line << ']');
373  } else {
374  ERR_POST_X(1, Error << message << " [GFF input]");
375  }
376 }
377 
378 
379 void CGFFReader::x_Info(const string& message, unsigned int line)
380 {
381  if (line) {
382  ERR_POST_X(1, Info << message << " [GFF input, line " << line << ']');
383  } else {
384  ERR_POST_X(1, Info << message << " [GFF input]");
385  }
386 }
387 
388 
390 {
391  m_TSE.Reset(new CSeq_entry);
393  m_SeqCache.clear();
395  m_GeneRefs.clear();
396  m_DefMol.erase();
397  m_LineNumber = 0;
398  m_Version = 2;
399 }
400 
401 
403 {
404  if ( line.empty() || line[0] != '#' || line[1] != '#' ) {
405  return false;
406  }
407  TStrVec v;
408  NStr::Split(line, "# \t", v, NStr::fSplit_Tokenize);
409  if (v.empty()) {
410  return true;
411  }
412  if (v[0] == "date" && v.size() > 1) {
413  x_ParseDateComment(v[1]);
414  } else if (v[0] == "Type" && v.size() > 1) {
415  x_ParseTypeComment(v[1], v.size() > 2 ? v[2] : TStr());
416  } else if (v[0] == "gff-version" && v.size() > 1) {
418  } else if (v[0] == "FASTA") {
420  }
421  // etc.
422  return true;
423 }
424 
425 
427 {
428  try {
429  CRef<CSeqdesc> desc(new CSeqdesc);
430  desc->SetUpdate_date().SetToTime(CTime(date, "Y-M-D"),
432  m_TSE->SetSet().SetDescr().Set().push_back(desc);
433  } catch (exception& e) {
434  x_Error(string("Bad ISO date: ") + e.what(), x_GetLineNumber());
435  }
436 }
437 
438 
439 void CGFFReader::x_ParseTypeComment(const TStr& moltype, const TStr& seqname)
440 {
441  if (seqname.empty()) {
442  m_DefMol = moltype;
443  } else {
444  // automatically adds to m_TSE if new
445  x_ResolveID(*x_ResolveSeqName(seqname), moltype);
446  }
447 }
448 
449 
451 {
453  CRef<CSeq_entry> seqs = reader.ReadSet();
454  for (CTypeIterator<CBioseq> it(*seqs); it; ++it) {
455  if (it->GetId().empty()) { // can this happen?
456  CRef<CSeq_entry> parent(new CSeq_entry);
457  parent->SetSeq(*it);
458  m_TSE->SetSet().SetSeq_set().push_back(parent);
459  continue;
460  }
461  CRef<CBioseq> our_bs = x_ResolveID(*it->GetId().front(), kEmptyStr);
462  // keep our annotations, but replace everything else.
463  // (XXX - should also keep mol)
464  our_bs->SetId() = it->GetId();
465  if (it->IsSetDescr()) {
466  our_bs->SetDescr(it->SetDescr());
467  }
468  our_bs->SetInst(it->SetInst());
469  }
470 }
471 
472 
475 {
476  TStrVec v;
477  bool misdelimited = false;
478 
479  NStr::Split(line, "\t", v);
480  if (v.size() < 8) {
481  v.clear();
482  NStr::Split(line, " \t", v, NStr::fSplit_Tokenize);
483  if (v.size() < 8) {
484  x_Error("Skipping line due to insufficient fields",
485  x_GetLineNumber());
486  return null;
487  } else if (m_Version < 3) {
488  x_Info("(Recovered) Bad delimiters (should use tabs)", x_GetLineNumber());
489  misdelimited = true;
490  }
491  } else {
492  // XXX - warn about extra fields (if any), but only if they're
493  // not comments
494  // v.resize(9);
495  }
496 
497  CRef<SRecord> record(x_NewRecord());
498  string accession;
499  TSeqPos from = 0, to = numeric_limits<TSeqPos>::max();
500  ENa_strand strand = eNa_strand_unknown;
501  s_URLDecode(v[0], accession);
502  record->source = v[1];
503  record->key = v[2];
504 
505  try {
506  from = NStr::StringToUInt(v[3]) - 1;
507  } catch (std::exception& e) {
508  x_Error(string("Bad FROM position: ") + e.what(), x_GetLineNumber());
509  }
510 
511  try {
512  to = NStr::StringToUInt(v[4]) - 1;
513  } catch (std::exception& e) {
514  x_Error(string("Bad TO position: ") + e.what(), x_GetLineNumber());
515  }
516 
517  record->score = v[5];
518 
519  if (v[6] == "+") {
520  strand = eNa_strand_plus;
521  } else if (v[6] == "-") {
522  strand = eNa_strand_minus;
523  } else if ( !(v[6] == ".") ) {
524  x_Warn("Bad strand " + string(v[6]) + " (should be [+-.])",
525  x_GetLineNumber());
526  }
527 
528  if (v[7] == "0" || v[7] == "1" || v[7] == "2") {
529  record->frame = v[7][0] - '0';
530  } else if (v[7] == ".") {
531  record->frame = -1;
532  } else {
533  x_Warn("Bad frame " + string(v[7]) + " (should be [012.])",
534  x_GetLineNumber());
535  record->frame = -1;
536  }
537 
538  {{
539  SRecord::SSubLoc subloc;
540  subloc.accession = accession;
541  subloc.strand = strand;
542  subloc.ranges.insert(TSeqRange(from, to));
543 
544  record->loc.push_back(subloc);
545  }}
546 
547  SIZE_TYPE i = 8;
548  if (m_Version >= 3) {
549  x_ParseV3Attributes(*record, v, i);
550  } else {
551  x_ParseV2Attributes(*record, v, i);
552  }
553 
554  if ( !misdelimited && (i > 9 || (i == 9 && v.size() > 9
555  && !NStr::StartsWith(v[9], "#") ))) {
556  x_Warn("Extra non-comment fields", x_GetLineNumber());
557  }
558 
559  if (record->FindAttribute("Target") != record->attrs.end()) {
560  record->type = SRecord::eAlign;
561  } else {
562  record->type = SRecord::eFeat;
563  }
564 
565  // extracting additional gff3 attributes
566  if (m_Version == 3) {
567  SRecord::TAttrs::const_iterator id_it = record->FindAttribute("ID");
568  if (id_it != record->attrs.end()) {
569  record->id = (*id_it)[1];
570  }
571 
572  SRecord::TAttrs::const_iterator parent_it = record->FindAttribute("Parent");
573  if (parent_it != record->attrs.end()) {
574  record->parent = (*parent_it)[1];
575  }
576 
577  SRecord::TAttrs::const_iterator name_it = record->FindAttribute("Name");
578  if (name_it != record->attrs.end()) {
579  record->name = (*name_it)[1];
580  }
581  }
582 
583  record->line_no = m_LineNumber;
584  record->id = x_FeatureID(*record);
585  return record;
586 }
587 
588 
590 {
592  (record.key, *x_ResolveLoc(record.loc),
594  if (record.frame >= 0 && feat->GetData().IsCdregion()) {
595  feat->SetData().SetCdregion().SetFrame
596  (static_cast<CCdregion::EFrame>(record.frame + 1));
597  }
598  if ( m_Version == 3 ) {
599  ITERATE (SRecord::TAttrs, it, record.attrs) {
600  string tag = it->front();
601  if (tag == "ID") {
602  feat->SetId( *s_StringToFeatId( (*it)[1] ) );
603  }
604  if (tag == "Parent") {
605  CRef<CSeqFeatXref> xref( new CSeqFeatXref );
606  xref->SetId( *s_StringToFeatId( (*it)[1] ) );
607  feat->SetXref().push_back( xref );
608  }
609  }
610  }
611 
612  if ( record.source != "." ) {
614  source->SetQual( "source" );
615  source->SetVal( record.source );
616  feat->SetQual().push_back( source );
617  }
618 
619  string gene_id;
620  string gene;
621  string locus_tag;
622  ITERATE (SRecord::TAttrs, it, record.attrs) {
623  string tag = it->front();
624  string value;
625  switch (it->size()) {
626  case 1:
627  break;
628  case 2:
629  value = (*it)[1];
630  break;
631  default:
632  x_Warn("Ignoring extra fields in value of " + tag, record.line_no);
633  value = (*it)[1];
634  break;
635  }
636  if (x_GetFlags() & fGBQuals) {
637  if (tag == "transcript_id") {
638  //continue;
639  } else if (tag == "gene_id") {
640  gene_id = value;
641  continue;
642  } else if (tag == "gene") {
643  gene = value;
644  continue;
645  } else if (tag == "locus_tag") {
646  locus_tag = value;
647  continue;
648  } else if (tag == "exon_number") {
649  tag = "number";
650  } else if (NStr::StartsWith(tag, "insd_")) {
651  tag.erase(0, 5);
652  }
653 
655  (feat, kEmptyStr, tag, value, CFeature_table_reader::fKeepBadKey);
656  } else { // don't attempt to parse, just treat as imported
657  CRef<CGb_qual> qual(new CGb_qual);
658  qual->SetQual(tag);
659  qual->SetVal(value);
660  feat->SetQual().push_back(qual);
661  }
662  }
663 
664  if ( !gene_id.empty() ) {
665  SIZE_TYPE colon = gene_id.find(':');
666  if (colon != NPOS) {
667  gene_id.erase(0, colon + 1);
668  }
669 
670  TGeneRefs::value_type val(gene_id, CRef<CGene_ref>());
671  TGeneRefs::iterator iter = m_GeneRefs.insert(val).first;
672  if ( !iter->second ) {
673  iter->second.Reset(new CGene_ref);
674  }
675  if ( !gene.empty() ) {
676  if (iter->second->IsSetLocus() &&
677  iter->second->GetLocus() != gene) {
678  LOG_POST_X(4, Warning << "CGFFReader::x_ParseFeatRecord(): "
679  << "inconsistent gene name: "
680  << gene << " != " << iter->second->GetLocus()
681  << ", ignoring second");
682  } else if ( !iter->second->IsSetLocus() ) {
683  iter->second->SetLocus(gene);
684  }
685  }
686  if ( !locus_tag.empty() ) {
687  if (iter->second->IsSetLocus_tag() &&
688  iter->second->GetLocus_tag() != locus_tag) {
689  LOG_POST_X(5, Warning << "CGFFReader::x_ParseFeatRecord(): "
690  << "inconsistent locus tag: "
691  << locus_tag << " != " << iter->second->GetLocus_tag()
692  << ", ignoring second");
693  } else if ( !iter->second->IsSetLocus_tag() ) {
694  iter->second->SetLocus_tag(locus_tag);
695  }
696  }
697 
698  // translate
700  (feat, kEmptyStr, "gene_id", gene_id,
702  if (x_GetFlags() & fGBQuals) {
704  (feat, kEmptyStr, "gene", gene_id,
706  }
707  }
708 
709  return feat;
710 }
711 
712 
714 {
715  CRef<CSeq_align> align(new CSeq_align);
717  align->SetDim(2);
718  SRecord::TAttrs::const_iterator tgit = record.FindAttribute("Target");
719  vector<string> target;
720  if (tgit != record.attrs.end()) {
721  NStr::Split((*tgit)[1], " +-", target, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
722  }
723  if (target.size() != 3) {
724  x_Warn("Bad Target attribute", record.line_no);
725  return align;
726  }
727  CRef<CSeq_id> tgid = x_ResolveSeqName(target[0]);
728  TSeqPos tgstart = NStr::StringToUInt(target[1]) - 1;
729  TSeqPos tgstop = NStr::StringToUInt(target[2]) - 1;
730  TSeqPos tglen = tgstop - tgstart + 1;
731 
732  CRef<CSeq_loc> refloc = x_ResolveLoc(record.loc);
733  CRef<CSeq_id> refid(&refloc->SetInt().SetId());
734  TSeqPos reflen = 0;
735  for (CSeq_loc_CI it(*refloc); it; ++it) {
736  reflen += it.GetRange().GetLength();
737  }
738 
739  CRef<CSeq_loc> tgloc(new CSeq_loc);
740  tgloc->SetInt().SetId(*tgid);
741  tgloc->SetInt().SetFrom(tgstart);
742  tgloc->SetInt().SetTo(tgstop);
743 
744  SRecord::TAttrs::const_iterator gap_it = record.FindAttribute("Gap");
745  if (gap_it == record.attrs.end()) {
746  // single ungapped alignment
747  if (reflen == tglen && refloc->IsInt()) {
748  CDense_seg& ds = align->SetSegs().SetDenseg();
749  ds.SetNumseg(1);
750  ds.SetIds().push_back(refid);
751  ds.SetIds().push_back(tgid);
752  ds.SetStarts().push_back(refloc->GetInt().GetFrom());
753  ds.SetStarts().push_back(tgstart);
754  ds.SetLens().push_back(reflen);
755  if (refloc->GetInt().IsSetStrand()) {
756  ds.SetStrands().push_back(refloc->GetInt().GetStrand());
757  ds.SetStrands().push_back(eNa_strand_plus);
758  }
759  } else {
760  if (reflen != tglen && reflen != 3 * tglen) {
761  x_Warn("Reference and target locations have an irregular"
762  " ratio.", record.line_no);
763  }
765  ss->SetLoc().push_back(refloc);
766  ss->SetLoc().push_back(tgloc);
767  align->SetSegs().SetStd().push_back(ss);
768  }
769  } else {
770  SCigarAlignment cigar
771  ((*gap_it)[1], SCigarAlignment::eOpFirstIfAmbiguous);
772  align = cigar(refloc->GetInt(), tgloc->GetInt());
773  }
774 
775  try {
776  CRef<CScore> score(new CScore);
777  score->SetValue().SetReal(NStr::StringToDouble(record.score));
778  align->SetScore().push_back(score);
779  } catch (...) {
780  }
781 
782  return align;
783 }
784 
785 
787 {
788  CRef<CSeq_loc> seqloc(new CSeq_loc);
789  ITERATE (SRecord::TLoc, it, loc) {
790  CRef<CSeq_id> id = x_ResolveSeqName(it->accession);
791  ITERATE (set<TSeqRange>, range, it->ranges) {
792  CRef<CSeq_loc> segment(new CSeq_loc);
793  if (range->GetLength() == 1) {
794  CSeq_point& pnt = segment->SetPnt();
795  pnt.SetId (*id);
796  pnt.SetPoint(range->GetFrom());
797  if (it->strand != eNa_strand_unknown) {
798  pnt.SetStrand(it->strand);
799  }
800  } else {
801  CSeq_interval& si = segment->SetInt();
802  si.SetId (*id);
803  si.SetFrom(range->GetFrom());
804  si.SetTo (range->GetTo());
805  if (it->strand != eNa_strand_unknown) {
806  si.SetStrand(it->strand);
807  }
808  }
809  if (IsReverse(it->strand)) {
810  seqloc->SetMix().Set().push_front(segment);
811  } else {
812  seqloc->SetMix().Set().push_back(segment);
813  }
814  }
815  }
816 
817  if (seqloc->GetMix().Get().size() == 1) {
818  return seqloc->SetMix().Set().front();
819  } else {
820  return seqloc;
821  }
822 }
823 
824 
826  SIZE_TYPE& i)
827 {
828  string attr_last_value;
829  vector<string> attr_values;
830  char quote_char = 0;
831 
832  for (; i < v.size(); ++i) {
833  string s = string(v[i]) + ' ';
834  SIZE_TYPE pos = 0;
835  while (pos < s.size()) {
836  SIZE_TYPE pos2;
837  if (quote_char) { // must be inside a value
838  pos2 = s.find_first_of(" \'\"\\", pos);
839  _ASSERT(pos2 != NPOS); // due to trailing space
840  if (s[pos2] == quote_char) {
841  if (attr_values.empty()) {
842  x_Warn("quoted attribute tag " + attr_last_value,
843  x_GetLineNumber());
844  }
845  quote_char = 0;
846  attr_last_value += s.substr(pos, pos2 - pos);
847  try {
848  attr_values.push_back(NStr::ParseEscapes
849  (attr_last_value));
850  } catch (CStringException& e) {
851  attr_values.push_back(attr_last_value);
852  x_Warn(e.what() + (" in value of " + attr_values[0]),
853  x_GetLineNumber());
854  }
855  attr_last_value.erase();
856  } else if (s[pos2] == '\\') {
857  _VERIFY(++pos2 != s.size());
858  attr_last_value += s.substr(pos, pos2 + 1 - pos);
859  } else {
860  attr_last_value += s.substr(pos, pos2 + 1 - pos);
861  }
862  } else {
863  pos2 = s.find_first_of(" #;\"", pos); // also look for \'?
864  _ASSERT(pos2 != NPOS); // due to trailing space
865  if (pos != pos2) {
866  // grab and place the preceding token
867  attr_last_value += s.substr(pos, pos2 - pos);
868  attr_values.push_back(attr_last_value);
869  attr_last_value.erase();
870  }
871 
872  switch (s[pos2]) {
873  case ' ':
874  if (pos2 == s.size() - 1) {
875  x_AddAttribute(record, attr_values);
876  attr_values.clear();
877  }
878  break;
879 
880  case '#':
881  return;
882 
883  case ';':
884  if (attr_values.empty()) {
885  x_Warn("null attribute", x_GetLineNumber());
886  } else {
887  x_AddAttribute(record, attr_values);
888  attr_values.clear();
889  }
890  break;
891 
892  // NB: we don't currently search for single quotes.
893  case '\"':
894  case '\'':
895  quote_char = s[pos2];
896  break;
897 
898  default:
899  _TROUBLE;
900  }
901  }
902  pos = pos2 + 1;
903  }
904  }
905 
906  if ( !attr_values.empty() ) {
907  x_Warn("unterminated attribute " + attr_values[0], x_GetLineNumber());
908  x_AddAttribute(record, attr_values);
909  }
910 }
911 
912 bool CGFFReader::x_SplitKeyValuePair( const string& pair, string& key, string& value )
913 {
914  if ( NStr::SplitInTwo( pair, "=", key, value ) ) {
915  return true;
916  }
917  if ( NStr::SplitInTwo( pair, " ", key, value ) ) {
918  x_Info("(recovered) missdelimited attribute/value pair: " + key, x_GetLineNumber());
919  return true;
920  }
921  x_Warn("attribute without value: " + key, x_GetLineNumber());
922  return false;
923 }
924 
925 
927  SIZE_TYPE& i)
928 {
929  vector<string> v2, attr;
930  NStr::Split(v[i], ";", v2, NStr::fSplit_Tokenize);
931  ITERATE (vector<string>, it, v2) {
932  attr.clear();
933  string key, values;
934  if (x_SplitKeyValuePair( *it, key, values )) {
935  vector<string> vals;
936  attr.resize(2);
937  s_URLDecode(key, attr[0]);
938  NStr::Split(values, ",", vals);
939  ITERATE (vector<string>, it2, vals) {
940  string value( *it2 );
941  if ( NStr::MatchesMask(value, "\"*\"") ) {
942  //
943  // Note: The GFF3 spec is ambiguous on whether quoting is
944  // required for free text values.
945  //
946  value = value.substr(1, value.length()-2);
947  }
948  s_URLDecode(value, attr[1]);
949  x_AddAttribute(record, attr);
950  }
951  } else {
952  x_Warn("attribute without value: " + key, x_GetLineNumber());
953  attr.resize(1);
954  s_URLDecode(*it, attr[0]);
955  x_AddAttribute(record, attr);
956  continue;
957  }
958  }
959 }
960 
961 
962 void CGFFReader::x_AddAttribute(SRecord& record, vector<string>& attr)
963 {
964  if (attr.size() == 0) {
965  return;
966  }
967 
968  if (x_GetFlags() & fGBQuals) {
969  if (attr[0] == "gbkey" && attr.size() == 2) {
970  record.key = attr[1];
971  return;
972  }
973  }
974  record.attrs.insert(attr);
975 }
976 
977 
978 string CGFFReader::x_FeatureID(const SRecord& record)
979 {
980  if (record.type != SRecord::eFeat || x_GetFlags() & fNoGTF) {
981  return kEmptyStr;
982  }
983 
984  // has been retrieved in initial interval parsing
985  if (m_Version == 3) {
986  if (!record.id.empty()) {
987  return record.id;
988  }
989  else if (!record.parent.empty()) {
990  return record.source + record.key + record.parent;
991  }
992  else {
993  return "";
994  }
995  }
996 
997  SRecord::TAttrs::const_iterator gene_it = record.FindAttribute("gene_id");
998  SRecord::TAttrs::const_iterator transcript_it
999  = record.FindAttribute("transcript_id");
1000 
1001  // concatenate our IDs from above, if found
1002  string id;
1003  if (gene_it != record.attrs.end()) {
1004  id += (*gene_it)[1];
1005  }
1006 
1007  if (transcript_it != record.attrs.end()) {
1008  if ( !id.empty() ) {
1009  id += ' ';
1010  }
1011  id += (*transcript_it)[1];
1012  }
1013 
1014  // look for db xrefs
1016  = record.FindAttribute("db_xref");
1017  for ( ; dbxref_it != record.attrs.end() &&
1018  dbxref_it->front() == "db_xref"; ++dbxref_it) {
1019  if ( !id.empty() ) {
1020  id += ' ';
1021  }
1022  id += (*dbxref_it)[1];
1023  }
1024 
1025  if ( id.empty() ) {
1026  return id;
1027  }
1028 
1029  if (record.key == "start_codon" || record.key == "stop_codon") {
1030  //id += " " + record.key;
1031  id += "CDS";
1032  } else if (record.key == "CDS"
1033  || NStr::FindNoCase(record.key, "rna") != NPOS) {
1034  //id += " " + record.key;
1035  id += record.key;
1036  } else if (record.key == "exon") {
1037  // normally separate intervals, but may want to merge.
1038  if (x_GetFlags() & fMergeExons) {
1039  id += record.key;
1040  } else {
1042  = record.FindAttribute("exon_number");
1043  if (it == record.attrs.end()) {
1044  return kEmptyStr;
1045  } else {
1046  id += record.key + ' ' + (*it)[1];
1047  }
1048  }
1049  } else if (x_GetFlags() & fMergeOnyCdsMrna) {
1050  return kEmptyStr;
1051  }
1052  return id;
1053 }
1054 
1055 
1057 {
1058  // XXX - perform sanity checks and warn on mismatch
1059 
1060  bool merge_overlaps = false;
1061  if (dest.key == "CDS" &&
1062  (src.key == "start_codon" || src.key == "stop_codon")) {
1063  // start_codon and stop_codon features should be merged into
1064  // existing CDS locations
1065  merge_overlaps = true;
1066  }
1067 
1068  if ((dest.key == "start_codon" || dest.key == "stop_codon") &&
1069  src.key == "CDS") {
1070  // start_codon and stop_codon features should be merged into
1071  // existing CDS locations
1072  merge_overlaps = true;
1073  dest.key = "CDS";
1074  }
1075 
1076  // adjust the frame as needed
1077  int best_frame = dest.frame;
1078 
1079  ITERATE (SRecord::TLoc, slit, src.loc) {
1080  bool merged = false;
1081  NON_CONST_ITERATE (SRecord::TLoc, dlit, dest.loc) {
1082  if (slit->accession != dlit->accession) {
1083  if (dest.loc.size() == 1) {
1084  x_Warn("Multi-accession feature", src.line_no);
1085  }
1086  continue;
1087  } else if (slit->strand != dlit->strand) {
1088  if (dest.loc.size() == 1) {
1089  x_Warn("Multi-orientation feature", src.line_no);
1090  }
1091  continue;
1092  } else {
1093  if (slit->strand == eNa_strand_plus) {
1094  if (slit->ranges.begin()->GetFrom() <
1095  dlit->ranges.begin()->GetFrom()) {
1096  best_frame = src.frame;
1097  }
1098  } else {
1099  if (slit->ranges.begin()->GetTo() >
1100  dlit->ranges.begin()->GetTo()) {
1101  best_frame = src.frame;
1102  }
1103  }
1104  if (merge_overlaps) {
1105  ITERATE (set<TSeqRange>, set_iter, slit->ranges) {
1106  dlit->merge_ranges.insert(*set_iter);
1107  }
1108  } else {
1109  ITERATE (set<TSeqRange>, set_iter, slit->ranges) {
1110  dlit->ranges.insert(*set_iter);
1111  }
1112  }
1113  merged = true;
1114  break;
1115  }
1116  }
1117  if ( !merged ) {
1118  dest.loc.push_back(*slit);
1119  }
1120  }
1121 
1122  dest.frame = best_frame;
1123  if (src.key != dest.key) {
1124  if (dest.key == "CDS" && NStr::EndsWith(src.key, "_codon")
1125  && !(x_GetFlags() & fNoGTF) ) {
1126  // ok
1127  } else if (src.key == "CDS" && NStr::EndsWith(dest.key, "_codon")
1128  && !(x_GetFlags() & fNoGTF) ) {
1129  dest.key = "CDS";
1130  } else {
1131  x_Warn("Merging features with different keys: " + dest.key
1132  + " != " + src.key, src.line_no);
1133  }
1134  }
1135 
1136  x_MergeAttributes(dest, src);
1137 }
1138 
1139 
1141 {
1142  SRecord::TAttrs::iterator dait = dest.attrs.begin();
1143  SRecord::TAttrs::iterator dait_end = dest.attrs.end();
1144  SRecord::TAttrs::iterator dait_tag = dait_end;
1145  ITERATE (SRecord::TAttrs, sait, src.attrs) {
1146  const string& tag = sait->front();
1147  while (dait != dait_end && dait->front() < tag) {
1148  ++dait;
1149  }
1150 
1151  if (dait_tag == dait_end || dait_tag->front() != tag) {
1152  dait_tag = dait;
1153  }
1154  if (dait != dait_end && dait->front() == tag) {
1155  while (dait != dait_end && *dait < *sait) {
1156  ++dait;
1157  }
1158  }
1159  if (dait != dait_end && *dait == *sait) {
1160  continue; // identical
1161  } else if ( !(x_GetFlags() & fNoGTF) && tag == "exon_number") {
1162  if (dait_tag != dait_end) {
1163  while (dait != dait_end && dait->front() == tag) {
1164  ++dait;
1165  }
1166  dest.attrs.erase(dait_tag, dait);
1167  dait_tag = dait_end;
1168  }
1169  } else {
1170  dest.attrs.insert(dait, *sait);
1171  }
1172  }
1173 }
1174 
1175 
/// Attach feat to a feature table, either on a single Bioseq or on the
/// top-level set.
/// NOTE(review): the signature line is absent from this listing; per the
/// declaration list this appears to be
/// x_PlaceFeature(CSeq_feat& feat, const SRecord& record) -- confirm.
1177 {
1178  CRef<CBioseq> seq;
      // Only features without a product are considered for placement on an
      // individual Bioseq.
1179  if ( !feat.IsSetProduct() ) {
      // Resolve every Seq-id found in the location.  seq ends up non-null
      // only if all resolved IDs lead to one and the same Bioseq; as soon
      // as a second, different Bioseq shows up, seq is cleared and the
      // loop is left.
1180  for (CTypeConstIterator<CSeq_id> it(feat.GetLocation()); it; ++it) {
1181  CRef<CBioseq> seq2 = x_ResolveID(*it, kEmptyStr);
1182  if ( !seq ) {
1183  seq.Reset(seq2);
1184  } else if ( seq2.NotEmpty() && seq != seq2) {
1185  seq.Reset();
1186  BREAK(it);
1187  }
1188  }
1189  }
1190 
      // No single owning Bioseq: fall back to the annotations of the
      // top-level set in m_TSE.
1191  CBioseq::TAnnot& annots
1192  = seq ? seq->SetAnnot() : m_TSE->SetSet().SetAnnot();
      // Reuse the first existing feature-table annotation, if there is one.
1193  NON_CONST_ITERATE (CBioseq::TAnnot, it, annots) {
1194  if ((*it)->GetData().IsFtable()) {
1195  (*it)->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat));
1196  return;
1197  }
1198  }
      // Otherwise start a new feature-table annotation holding just feat.
1199  CRef<CSeq_annot> annot(new CSeq_annot);
1200  annot->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat));
1201  annots.push_back(annot);
1202 }
1203 
1204 
/// Attach align to an alignment annotation, either on the Bioseq of its
/// first row or on the top-level set.
/// NOTE(review): the signature line is absent from this listing; per the
/// declaration list this appears to be
/// x_PlaceAlignment(CSeq_align& align, const SRecord& record) -- confirm.
1206 {
1207  CRef<CBioseq> seq;
      // Best effort: any exception while fetching or resolving the first
      // row's ID is swallowed, leaving seq null so that the alignment is
      // placed at the set level below.
1208  try {
1209  seq = x_ResolveID(align.GetSeq_id(0), kEmptyStr);
1210  } catch (...) {
1211  }
1212  CBioseq::TAnnot& annots
1213  = seq ? seq->SetAnnot() : m_TSE->SetSet().SetAnnot();
      // Reuse the first existing alignment annotation, if there is one.
1214  NON_CONST_ITERATE (CBioseq::TAnnot, it, annots) {
1215  if ((*it)->GetData().IsAlign()) {
1216  (*it)->SetData().SetAlign().push_back(CRef<CSeq_align>(&align));
1217  return;
1218  }
1219  }
      // Otherwise start a new alignment annotation holding just align.
1220  CRef<CSeq_annot> annot(new CSeq_annot);
1221  annot->SetData().SetAlign().push_back(CRef<CSeq_align>(&align));
1222  annots.push_back(annot);
1223 }
1224 
1225 
/// Convert a parsed record into the matching object and place it.
/// NOTE(review): the signature line is absent from this listing; per the
/// declaration list this appears to be
/// x_ParseAndPlace(const SRecord& record) -- confirm.
1227 {
      // Dispatch on the record kind: features and alignments each have
      // their own parse + place pair; anything else only yields a warning
      // tagged with the record's line number.
1228  switch (record.type) {
1229  case SRecord::eFeat:
1230  x_PlaceFeature(*x_ParseFeatRecord(record), record);
1231  break;
1232  case SRecord::eAlign:
1233  x_PlaceAlignment(*x_ParseAlignRecord(record), record);
1234  break;
1235  default:
1236  x_Warn("Unknown record type " + NStr::IntToString(record.type),
1237  record.line_no);
1238  }
1239 }
1240 
1241 
1243 {
1244  CRef<CSeq_id>& id = m_SeqNameCache[name];
1245  if (id.NotEmpty()
1246  && (id->Which() == CSeq_id::e_not_set
1247  || static_cast<int>(id->Which()) >= CSeq_id::e_MaxChoice)) {
1248  x_Warn("x_ResolveSeqName: invalid cache entry for " + name);
1249  id.Reset();
1250  }
1251  if ( !id ) {
1252  id.Reset(x_ResolveNewSeqName(name));
1253  }
1254  if ( !id || id->Which() == CSeq_id::e_not_set
1255  || static_cast<int>(id->Which()) >= CSeq_id::e_MaxChoice) {
1256  x_Warn("x_ResolveNewSeqName returned null or invalid ID for " + name);
1257  id.Reset(new CSeq_id(CSeq_id::e_Local, name, name));
1258  }
1259  return id;
1260 }
1261 
1262 
/// Build a Seq-id for a sequence name that is not cached yet.
/// NOTE(review): the signature line is absent from this listing; per the
/// declaration list this appears to be
/// CRef<CSeq_id> x_ResolveNewSeqName(const string& name) -- confirm.
1264 {
      // fAllIdsAsLocal: everything becomes a local ID.  A name that already
      // starts with "lcl|" is handed to the string-parsing constructor
      // as-is; any other name is wrapped as a local ID directly.
1265  if (m_Flags & fAllIdsAsLocal) {
1266  if (NStr::StartsWith(name, "lcl|")) {
1267  return CRef<CSeq_id>(new CSeq_id(name));
1268  } else {
1269  return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
1270  }
1271  }
1272 
      // fNumericIdsAsLocal: names made up of digits only become local IDs.
      // (An empty name also satisfies this test.)
1273  if (m_Flags & fNumericIdsAsLocal) {
1274  if (name.find_first_not_of("0123456789") == string::npos) {
1275  return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
1276  }
1277  }
      // Default: let CSeq_id parse the name.  A GI below 500 is replaced by
      // a local ID, and so is any name that makes the parser throw
      // CSeqIdException.
1278  try {
1279  CRef<CSeq_id> pId(new CSeq_id(name));
1280  if (!pId || (pId->IsGi() && pId->GetGi() < GI_CONST(500)) ) {
1281  pId = new CSeq_id(CSeq_id::e_Local, name);
1282  }
1283  return pId;
1284  }
1285  catch (CSeqIdException&) {
1286  return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
1287  }
1288 }
1289 
1290 
/// Return the Bioseq for id, creating and registering it when necessary.
/// NOTE(review): the signature line and the line declaring seq (listing
/// line 1293) are absent from this listing.  Per the declaration list this
/// appears to be CRef<CBioseq> x_ResolveID(const CSeq_id& id,
/// const TStr& mol); seq is presumably the m_SeqCache entry for id --
/// confirm both against the real source.
1292 {
1294  if ( !seq ) {
1295  seq.Reset(x_ResolveNewID(id, mol));
1296  // Derived versions of x_ResolveNewID may legitimately return null
1297  // results....
1298  if (seq) {
      // A newly created Bioseq is placed via x_PlaceSeq and then cached
      // under each of the IDs it carries.
1299  x_PlaceSeq(*seq);
1300  ITERATE (CBioseq::TId, it, seq->GetId()) {
1301  m_SeqCache.insert(make_pair(CConstRef<CSeq_id>(*it), seq));
1302  }
1303  }
1304  }
1305  return seq;
1306 }
1307 
1308 
1309 CRef<CBioseq> CGFFReader::x_ResolveNewID(const CSeq_id& id, const string& mol0)
1310 {
1311  CRef<CBioseq> seq(new CBioseq);
1312  CRef<CSeq_id> id_copy(new CSeq_id);
1313 
1314  id_copy->Assign(id);
1315  seq->SetId().push_back(id_copy);
1316  seq->SetInst().SetRepr(CSeq_inst::eRepr_virtual);
1317 
1318  const string& mol = mol0.empty() ? m_DefMol : mol0;
1319  if (mol.empty() || mol == "dna") {
1320  seq->SetInst().SetMol(CSeq_inst::eMol_dna);
1321  } else if (mol == "rna") {
1322  seq->SetInst().SetMol(CSeq_inst::eMol_rna);
1323  } else if (mol == "protein") {
1324  seq->SetInst().SetMol(CSeq_inst::eMol_aa);
1325  } else {
1326  x_Warn("unrecognized sequence type " + mol + "; assuming DNA");
1327  seq->SetInst().SetMol(CSeq_inst::eMol_dna);
1328  }
1329 
1330  return seq;
1331 }
1332 
/// For every feature under tse, turn an ID-bearing qualifier into the
/// feature's product.
/// NOTE(review): the signature line and both case labels (listing lines
/// 1341 and 1345) are absent from this listing.  Per the declaration list
/// this appears to be x_SetProducts(CRef<CSeq_entry>& tse); the labels
/// presumably select CDS (protein_id) and mRNA (transcript_id) features --
/// confirm against the real source.
1334 {
1335  CTypeIterator<CSeq_feat> feat_iter(*tse);
1336  for ( ; feat_iter; ++feat_iter) {
1337  CSeq_feat& feat = *feat_iter;
1338 
      // Pick the qualifier to look up from the feature subtype; all other
      // subtypes are skipped.
1339  string qual_name;
1340  switch (feat.GetData().GetSubtype()) {
1342  qual_name = "protein_id";
1343  break;
1344 
1346  qual_name = "transcript_id";
1347  break;
1348 
1349  default:
1350  continue;
1351  break;
1352  }
1353 
      // If the qualifier is present and non-empty, resolve its value to a
      // Seq-id and make the whole of that sequence the product.
1354  string id_str = feat.GetNamedQual(qual_name);
1355  if ( !id_str.empty() ) {
1356  CRef<CSeq_id> id = x_ResolveSeqName(id_str);
1357  feat.SetProduct().SetWhole(*id);
1358  }
1359  }
1360 }
1361 
/// Synthesize gene features for the feature tables under tse that do not
/// contain any gene feature yet.
/// NOTE(review): the signature line and the case labels (listing lines
/// 1387, 1393 and 1394) are absent from this listing.  Per the declaration
/// list this appears to be x_CreateGeneFeatures(CRef<CSeq_entry>& tse);
/// judging by the comments kept below, the labels select gene features
/// first and mRNA / CDS features second -- confirm against the real source.
1363 {
1364  CTypeIterator<CSeq_annot> annot_iter(*tse);
1365  for ( ; annot_iter; ++annot_iter) {
1366  CSeq_annot& annot = *annot_iter;
      // Only feature tables are of interest.
1367  if (annot.GetData().Which() != CSeq_annot::TData::e_Ftable) {
1368  continue;
1369  }
1370 
1371  // we work within the scope of one annotation
1372  CSeq_annot::TData::TFtable::iterator feat_iter =
1373  annot.SetData().SetFtable().begin();
1374  CSeq_annot::TData::TFtable::iterator feat_end =
1375  annot.SetData().SetFtable().end();
1376 
1377  /// we plan to create a series of gene features, one for each gene
1378  /// identified above
1379  /// genes are identified via a 'gene_id' marker
1380  typedef map<string, CRef<CSeq_feat> > TGeneMap;
1381  TGeneMap genes;
      // The scan stops as soon as has_genes is set; since genes is cleared
      // at that point, nothing gets appended to this annotation.
1382  for (bool has_genes = false;
1383  feat_iter != feat_end && !has_genes; ++feat_iter) {
1384  CSeq_feat& feat = **feat_iter;
1385 
1386  switch (feat.GetData().GetSubtype()) {
1388  /// we already have genes, so don't add any more
1389  has_genes = true;
1390  genes.clear();
1391  break;
1392 
1395  /// for mRNA and CDS features, create a gene
1396  /// this is only done if the gene_id parameter was set
1397  /// in parsing, we promote gene_id to a gene xref
      // Note: each continue below moves on to the next feature of the
      // enclosing for loop.
1398  if ( !feat.GetGeneXref() ) {
1399  continue;
1400  }
1401  {{
      // The label of the gene xref serves as the key identifying the gene.
1402  string gene_id;
1403  feat.GetGeneXref()->GetLabel(&gene_id);
1404  _ASSERT( !gene_id.empty() );
1405  TSeqRange range = feat.GetLocation().GetTotalRange();
1406 
1407  ENa_strand strand = feat.GetLocation().GetStrand();
      // GetId() yields null when the location does not refer to exactly
      // one ID; such features cannot seed or extend a gene.
1408  const CSeq_id* id = feat.GetLocation().GetId();
1409  if ( !id ) {
1410  x_Error("No consistent ID found; gene feature skipped");
1411  continue;
1412  }
1413 
1414  TGeneMap::iterator iter = genes.find(gene_id);
1415  if (iter == genes.end()) {
1416  /// new gene feature
      // First sighting of this gene: copy the xref into the new feature's
      // data and take this feature's total range, ID and strand as the
      // initial location.
1417  CRef<CSeq_feat> gene(new CSeq_feat());
1418  gene->SetData().SetGene().Assign(*feat.GetGeneXref());
1419 
1420  gene->SetLocation().SetInt().SetFrom(range.GetFrom());
1421  gene->SetLocation().SetInt().SetTo (range.GetTo());
1422  gene->SetLocation().SetId(*id);
1423  gene->SetLocation().SetInt().SetStrand(strand);
1424  genes[gene_id] = gene;
1425  } else {
1426  /// we agglomerate the old location
      // Gene seen before: combine its current extent with this feature's
      // range (via +=) and store the result.  The ID and strand recorded
      // for the first feature are left unchanged.
1427  CRef<CSeq_feat> gene = iter->second;
1428 
1429  TSeqRange r2 = gene->GetLocation().GetTotalRange();
1430  range += r2;
1431  gene->SetLocation().SetInt().SetFrom(range.GetFrom());
1432  gene->SetLocation().SetInt().SetTo (range.GetTo());
1433  gene->SetLocation().InvalidateTotalRangeCache();
1434  }
1435  }}
1436  break;
1437 
1438  default:
1439  break;
1440  }
1441  }
1442 
      // Append the collected gene features only now, after the scan of the
      // table has finished.
1443  ITERATE (TGeneMap, iter, genes) {
1444  annot.SetData().SetFtable().push_back(iter->second);
1445  }
1446  }
1447 }
1448 
/// Normalize the collected gene references and copy them onto every
/// matching gene feature / gene xref under tse.
/// NOTE(review): the signature line and the line declaring iter (listing
/// line 1473) are absent from this listing.  Per the declaration list this
/// appears to be x_RemapGeneRefs(CRef<CSeq_entry>& tse,
/// TGeneRefs& gene_refs) -- confirm against the real source.
1450 {
1451  if ( !tse || gene_refs.empty() ) {
1452  return;
1453  }
      // Pass 1: make sure the key of each map entry is reflected in its
      // gene ref -- as the locus when neither locus nor locus tag is set,
      // otherwise as a synonym when the locus is unset or differs from it.
1454  NON_CONST_ITERATE (TGeneRefs, iter, gene_refs) {
1455  if ( !iter->second->IsSetLocus() &&
1456  !iter->second->IsSetLocus_tag()) {
1457  iter->second->SetLocus(iter->first);
1458  } else if ( !iter->second->IsSetLocus() ||
1459  iter->second->GetLocus() != iter->first) {
1460  iter->second->SetSyn().push_back(iter->first);
1461  }
1462  }
1463 
      // Pass 2: for every feature, take its own gene data (gene features)
      // or its gene xref (all others) and, if its locus is a key in
      // gene_refs, overwrite it with the map's entry.
1464  CTypeIterator<CSeq_feat> feat_iter(*tse);
1465  for ( ; feat_iter; ++feat_iter) {
1466  const CGene_ref* ref = NULL;
1467  if (feat_iter->GetData().IsGene()) {
1468  ref = &feat_iter->GetData().GetGene();
1469  } else {
1470  ref = feat_iter->GetGeneXref();
1471  }
1472  if (ref && ref->IsSetLocus()) {
1474  gene_refs.find(ref->GetLocus());
1475  if (iter != gene_refs.end()) {
      // ref was obtained through const accessors; constness is cast away to
      // update the gene ref in place.
1476  const_cast<CGene_ref*>(ref)->Assign(*iter->second);
1477  }
1478  }
1479  }
1480 }
1481 
/// Make sure seq is part of m_TSE.
/// NOTE(review): the signature line is absent from this listing; per the
/// declaration list this appears to be x_PlaceSeq(CBioseq& seq) -- confirm.
1483 {
      // Look for this very object (address comparison, not content) among
      // all Bioseqs already reachable from m_TSE.
1484  bool found = false;
1485  for (CTypeConstIterator<CBioseq> it(*m_TSE); it; ++it) {
1486  if (&*it == &seq) {
1487  found = true;
1488  BREAK(it);
1489  }
1490  }
      // Not there yet: wrap it in a Seq-entry of its own and append that to
      // the top-level set.
1491  if ( !found ) {
1492  CRef<CSeq_entry> se(new CSeq_entry);
1493  se->SetSeq(seq);
1494  m_TSE->SetSet().SetSeq_set().push_back(se);
1495  }
1496 }
1497 
1498 
/// Find the first attribute named att_name that has more than min_values
/// elements in total (each attribute's first element being its name).
/// Returns attrs.end() when there is no such attribute.
/// NOTE(review): the return-type line (listing line 1499) and the line
/// declaring it (listing line 1503) are absent from this listing; per the
/// declaration list the return type is TAttrs::const_iterator -- confirm.
1500 CGFFReader::SRecord::FindAttribute(const string& att_name, size_t min_values)
1501 const
1502 {
      // Start at the first attribute that does not sort before the
      // one-element vector {att_name}.
1504  = attrs.lower_bound(vector<string>(1, att_name));
      // Skip same-named attributes that are too short.
1505  while (it != attrs.end() && it->front() == att_name
1506  && it->size() <= min_values) {
1507  ++it;
1508  }
      // The scan may have stopped on an attribute with a different name;
      // report that case as "not found" too.
1509  return (it == attrs.end() || it->front() == att_name) ? it : attrs.end();
1510 }
1511 
1512 
/// Tell whether line begins with the keyword "browser" or "track".
/// NOTE(review): the declarator line (listing line 1514) is absent from
/// this listing; per the declaration list this appears to be
/// x_IsLineUcscMetaInformation(const TStr& line) -- confirm.
1513 bool
1515 {
1516  // line starts with keyword "browser" or "track"
1517  return (NStr::StartsWith(line, "browser") || NStr::StartsWith(line, "track") );
1518 }
1519 
1520 
void GetLabel(string *label) const
Definition: Gene_ref.cpp:57
virtual void x_Error(const string &message, unsigned int line=0)
Definition: gff_reader.cpp:369
virtual void x_ParseDateComment(const TStr &date)
Definition: gff_reader.cpp:426
TScore & SetScore(void)
Assign a value to Score data member.
Definition: Seq_align_.hpp:902
static void AddFeatQual(CRef< CSeq_feat > sfp, const string &feat_name, const string &qual, const string &val, const TFlags flags=0, ILineErrorListener *pMessageListener=0, int line=0, const string &seq_id=std::string())
Definition: readfeat.cpp:3589
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:802
size_type size() const
Definition: map.hpp:148
void SetDescr(TDescr &value)
Assign a value to Descr data member.
void SetTo(TTo value)
Assign a value to To data member.
void SetFrom(TFrom value)
Assign a value to From data member.
TFlags m_Flags
Definition: gff_reader.hpp:250
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:866
void SetQual(const TQual &value)
Assign a value to Qual data member.
Definition: Gb_qual_.hpp:219
bool IsGi(void) const
Check if variant Gi is selected.
Definition: Seq_id_.hpp:883
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:62
TStrands & SetStrands(void)
Assign a value to Strands data member.
Definition: Dense_seg_.hpp:586
all identifiers are local IDs
Definition: gff_reader.hpp:95
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:73
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:374
const struct ncbi::grid::netcache::search::fields::KEY key
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:855
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Seq_align_.hpp:865
#define _TROUBLE
void clear()
Definition: map.hpp:169
Template class for iteration on objects of class C.
Definition: iterator.hpp:672
std::ofstream out("events_result.xml")
main entry point for tests
User-defined methods of the data storage class.
virtual char PeekChar(void) const =0
Returns the first character of the next string without consuming it.
const Tdata & Get(void) const
Get the member data.
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:762
position_type GetLength(void) const
Definition: range.hpp:158
TStrand GetStrand(void) const
Get the Strand member data.
unsigned int m_LineNumber
Definition: gff_reader.hpp:249
static string & s_URLDecode(const CTempString &s, string &out)
Definition: gff_reader.cpp:83
const string & GetNamedQual(const CTempString &qual_name) const
Return a named qualifier.
Definition: Seq_feat.cpp:417
static bool MatchesMask(CTempString str, CTempString mask, ECase use_case=eCase)
Match "str" against the "mask".
Definition: ncbistr.cpp:235
void SetId(TId &value)
Assign a value to Id data member.
Definition: Seq_point_.cpp:61
TLoc loc
from accession, start, stop, strand
Definition: gff_reader.hpp:137
CStringException –.
Definition: ncbistr.hpp:4287
User-defined methods of the data storage class.
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1035
move protein_id and transcript_id to products for mRNA and CDS features
Definition: gff_reader.hpp:82
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
all identifiers are local IDs
Definition: gff_reader.hpp:92
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:53
mapping pieces together
Definition: Seq_align_.hpp:103
size_type find_first_of(const CTempString match, size_type pos=0) const
Find the first occurrence of any character in the matching string within the current string...
Definition: tempstr.hpp:569
virtual void x_Info(const string &message, unsigned int line=0)
Definition: gff_reader.cpp:379
virtual CRef< CBioseq > x_ResolveID(const CSeq_id &id, const TStr &mol)
Falls back to x_ResolveNewID on cache misses.
string
Definition: cgiapp.hpp:514
virtual void x_MergeRecords(SRecord &dest, const SRecord &src)
Definition: Score.hpp:56
TFlags x_GetFlags(void) const
Definition: gff_reader.hpp:179
virtual void x_CreateGeneFeatures(CRef< CSeq_entry > &)
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2888
bool IsSetLocus(void) const
Official gene symbol Check if a value has been assigned to Locus data member.
Definition: Gene_ref_.hpp:488
virtual CRef< CSeq_loc > x_ResolveLoc(const SRecord::TLoc &loc)
Definition: gff_reader.cpp:786
bool IsCdregion(void) const
Check if variant Cdregion is selected.
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:898
const NCBI_NS_NCBI::CEnumeratedTypeValues *ENUM_METHOD_NAME() ENa_strand(void)
Access to ENa_strand's attributes (values, names) as defined in spec.
void SetStrand(TStrand value)
Assign a value to Strand data member.
#define NULL
Definition: ncbistd.hpp:225
bool IsSetStrand(void) const
Check if a value has been assigned to Strand data member.
CTempString TStr
Definition: gff_reader.hpp:164
const_iterator end() const
Definition: set.hpp:136
#define kEmptyStr
Definition: ncbistr.hpp:121
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
virtual void x_MergeAttributes(SRecord &dest, const SRecord &src)
Merge adjacent delimiters.
Definition: ncbistr.hpp:2434
== e_Named_annot_track+1
Definition: Seq_id_.hpp:118
ILineReader * m_LineReader
Definition: gff_reader.hpp:251
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_annot_.hpp:585
virtual void x_Warn(const string &message, unsigned int line=0)
Definition: gff_reader.cpp:359
TUpdate_date & SetUpdate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:500
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:795
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
#define _VERIFY(expr)
Definition: ncbidbg.hpp:159
#define NPOS
Definition: ncbistr.hpp:131
int i
const TMix & GetMix(void) const
Get the variant data.
Definition: Seq_loc_.cpp:282
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1107
virtual CRef< CSeq_align > x_ParseAlignRecord(const SRecord &record)
Definition: gff_reader.cpp:713
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:544
CRef< CSeq_entry > ReadSet(int max_seqs=kMax_Int, ILineErrorListener *pMessageListener=0)
Read multiple sequences (by default, as many as are available.)
Definition: fasta.cpp:482
virtual void x_Reset(void)
Reset all state, since we're between streams.
Definition: gff_reader.cpp:389
virtual CRef< SRecord > x_NewRecord(void)
Definition: gff_reader.hpp:189
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2444
#define GI_CONST(gi)
Definition: ncbimisc.hpp:1020
virtual void x_PlaceSeq(CBioseq &seq)
Defines: CTimeFormat - storage class for time format.
CFeat_id –.
Definition: Feat_id.hpp:65
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:4808
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
bool IsReverse(ENa_strand s)
Definition: Na_strand.hpp:75
virtual void x_ParseAndPlace(const SRecord &record)
void SetToTime(const CTime &time, EPrecision prec=ePrecision_second)
Definition: Date.cpp:57
TXref & SetXref(void)
Assign a value to Xref data member.
Definition: Seq_feat_.hpp:1297
void SetId(TId &value)
Assign a value to Id data member.
Definition: Seq_feat_.cpp:73
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:101
virtual void x_PlaceAlignment(CSeq_align &align, const SRecord &record)
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:543
= 0x04 (yields misc_feature /standard_name="...")
Definition: readfeat.hpp:67
= 0x02 (As much as possible, try to use bad keys as if they were acceptable)
Definition: readfeat.hpp:66
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const_iterator end() const
Definition: map.hpp:152
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
static const char si[8][64]
Definition: des.c:146
CSeqFeatXref –.
Definition: SeqFeatXref.hpp:65
const_iterator begin() const
Definition: set.hpp:135
TValue & SetValue(void)
Assign a value to Value data member.
Definition: Score_.hpp:474
const char * tag
const_iterator find(const key_type &key) const
Definition: map.hpp:153
create gene features for mRNAs and CDSs if none exist already
Definition: gff_reader.hpp:86
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:440
restrict merging to just CDS and mRNA features
Definition: gff_reader.hpp:79
Code to handle Concise Idiosyncratic Gapped Alignment Report notation.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3455
unsigned int line_no
Definition: gff_reader.hpp:143
TFrom GetFrom(void) const
Get the From member data.
void SetPnt(TPnt &v)
Definition: Seq_loc.hpp:970
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
Definition: ncbistr.cpp:1288
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
const_iterator lower_bound(const key_type &key) const
Definition: set.hpp:138
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
void SetVal(const TVal &value)
Assign a value to Val data member.
Definition: Gb_qual_.hpp:259
Operators to edit gaps in sequences.
CRef< CSeq_entry > m_TSE
Definition: gff_reader.hpp:243
CRef< CSeq_entry > Read(CNcbiIstream &in, TFlags flags=fDefaults)
Definition: gff_reader.cpp:112
User-defined methods of the data storage class.
parent_type::iterator iterator
Definition: set.hpp:80
unsigned int x_GetLineNumber(void)
Definition: gff_reader.hpp:180
TStarts & SetStarts(void)
Assign a value to Starts data member.
Definition: Dense_seg_.hpp:536
#define LOG_POST_X(err_subcode, message)
Definition: ncbidiag.hpp:547
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5141
virtual void x_ParseV3Attributes(SRecord &record, const TStrVec &v, SIZE_TYPE &i)
Definition: gff_reader.cpp:926
static int match(register const unsigned char *eptr, register const uschar *ecode, const unsigned char *mstart, int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, int flags, unsigned int rdepth)
Definition: pcre_exec.c:431
bool empty() const
Definition: map.hpp:149
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void SetId(TId &value)
Assign a value to Id data member.
const CharType(& source)[N]
Definition: pointer.h:1107
TLoc & SetLoc(void)
Assign a value to Loc data member.
Definition: Std_seg_.hpp:363
void SetStrand(TStrand value)
Assign a value to Strand data member.
Definition: Seq_point_.hpp:359
const TGene & GetGene(void) const
Get the variant data.
TLens & SetLens(void)
Assign a value to Lens data member.
Definition: Dense_seg_.hpp:561
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
virtual string x_FeatureID(const SRecord &record)
Returning the empty string indicates that record constitutes an entire feature.
Definition: gff_reader.cpp:978
virtual bool x_IsLineUcscMetaInformation(const TStr &)
virtual void x_ParseV2Attributes(SRecord &record, const TStrVec &v, SIZE_TYPE &i)
Definition: gff_reader.cpp:825
bool IsSetProduct(void) const
product of process Check if a value has been assigned to Product data member.
Definition: Seq_feat_.hpp:1074
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
virtual void x_SetProducts(CRef< CSeq_entry > &)
parent_type::const_iterator const_iterator
Definition: set.hpp:79
T max(T x_, T y_)
.hpp User-defined methods of the data storage class.
Definition: Gb_qual.hpp:60
virtual CRef< CSeq_id > x_ResolveNewSeqName(const string &name)
virtual bool x_ParseStructuredComment(const TStr &line)
Definition: gff_reader.cpp:402
position_type GetTo(void) const
Definition: range.hpp:142
CTime –.
Definition: ncbitime.hpp:290
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
virtual CRef< CSeq_id > x_ResolveSeqName(const string &name)
Falls back to x_ResolveNewSeqName on cache misses.
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
void erase(iterator pos)
Definition: set.hpp:151
map< string, string > ss
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:807
TIds & SetIds(void)
Assign a value to Ids data member.
Definition: Dense_seg_.hpp:511
#define BREAK(it)
Definition: ncbistl.hpp:173
const CVect2< U > & v2
Definition: globals.hpp:440
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
Useful/utility classes and methods.
position_type GetFrom(void) const
Definition: range.hpp:134
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1041
void SetPoint(TPoint value)
Assign a value to Point data member.
Definition: Seq_point_.hpp:312
virtual void x_RemapGeneRefs(CRef< CSeq_entry > &, TGeneRefs &)
Base class for reading FASTA sequences.
Definition: fasta.hpp:78
CRef –.
Definition: ncbiobj.hpp:616
const TLocus & GetLocus(void) const
Get the Locus member data.
Definition: Gene_ref_.hpp:500
merge exons with the same transcript_id
Definition: gff_reader.hpp:76
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
void SetInt(TInt &v)
Definition: Seq_loc.hpp:968
Definition: Seq_entry.hpp:55
TSeqNameCache m_SeqNameCache
Definition: gff_reader.hpp:244
virtual CRef< SRecord > x_ParseFeatureInterval(const TStr &line)
Definition: gff_reader.cpp:474
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1029
static uch flags
string m_DefMol
Definition: gff_reader.hpp:248
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
TSeqCache m_SeqCache
Definition: gff_reader.hpp:245
static CRef< CSeq_feat > CreateSeqFeat(const string &feat, CSeq_loc &location, const TFlags flags=0, ILineErrorListener *pMessageListener=0, unsigned int line=0, std::string *seq_id=0, ITableFilter *filter=0)
Definition: readfeat.cpp:3573
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:926
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1185
TDelayedRecords m_DelayedRecords
Definition: gff_reader.hpp:246
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
TGeneRefs m_GeneRefs
Definition: gff_reader.hpp:247
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:130
static string ParseEscapes(const CTempString str, EEscSeqRange mode=eEscSeqRange_Standard, char user_char= '?')
Parse C-style escape sequences in the specified string.
Definition: ncbistr.cpp:4643
CSeqIdException –.
Definition: Seq_id.hpp:743
virtual void x_PlaceFeature(CSeq_feat &feat, const SRecord &record)
virtual CRef< CBioseq > x_ResolveNewID(const CSeq_id &id, const string &mol)
The base version just constructs a shell so as not to depend on the object manager, but derived versions may consult it.
const CGene_ref * GetGeneXref(void) const
See related function in util/feature.hpp.
Definition: Seq_feat.cpp:169
TLocal & SetLocal(void)
Select the variant.
Definition: Feat_id_.cpp:140
#define _ASSERT
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:278
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
No variant selected.
Definition: Seq_id_.hpp:94
namespace ncbi::objects::
Definition: Seq_feat.hpp:56
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:367
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:70
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5157
bool IsGene(void) const
Check if variant Gene is selected.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3362
User-defined methods of the data storage class.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
TAttrs::const_iterator FindAttribute(const string &att_name, size_t min_values=1) const
void SetMix(TMix &v)
Definition: Seq_loc.hpp:972
#define const
Definition: zconf.h:217
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
virtual bool x_SplitKeyValuePair(const string &, string &, string &)
Definition: gff_reader.cpp:912
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:721
static CRef< CFeat_id > s_StringToFeatId(const string &str)
Definition: gff_reader.cpp:74
virtual void x_ParseTypeComment(const TStr &moltype, const TStr &seqname)
Definition: gff_reader.cpp:439
Simple implementation of ILineReader for i(o)streams.
void SetId(TId &value)
Assign a value to Id data member.
TQual & SetQual(void)
Assign a value to Qual data member.
Definition: Seq_feat_.hpp:1143
TReal & SetReal(void)
Select the variant.
Definition: Score_.hpp:391
Reader for GFF (including GTF) files.
std::istream & in(std::istream &in_, double &x_)
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_annot_.hpp:865
don't honor/recognize GTF conventions
Definition: gff_reader.hpp:73
virtual void x_AddAttribute(SRecord &record, vector< string > &attr)
Definition: gff_reader.cpp:962
virtual CRef< CSeq_feat > x_ParseFeatRecord(const SRecord &record)
Definition: gff_reader.cpp:589
numeric identifiers are local IDs
Definition: gff_reader.hpp:89
static const char * str(char *buf, int n)
Definition: stats.c:84
Definition: set.hpp:44
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:141
void SetProduct(TProduct &value)
Assign a value to Product data member.
Definition: Seq_feat_.cpp:110
set< TSeqRange > ranges
the set of ranges that make up this location this allows us to separately assign frame even if the ra...
Definition: gff_reader.hpp:120
vector< SSubLoc > TLoc
Definition: gff_reader.hpp:130
#define _TRACE(message)
Definition: ncbidbg.hpp:120
vector< TStr > TStrVec
Definition: gff_reader.hpp:165
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:98
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
Definition: Dense_seg_.hpp:474
virtual void x_ReadFastaSequences(ILineReader &in)
Definition: gff_reader.cpp:450
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:768
ESubtype GetSubtype(void) const
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:922
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:531
Modified on Sun May 27 14:51:14 2018 by modify_doxy.py rev. 546573