00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034 #include <ncbi_pch.hpp>
00035 #include <objtools/readers/gff_reader.hpp>
00036
00037 #include <corelib/ncbistr_util.hpp>
00038 #include <corelib/ncbitime.hpp>
00039 #include <corelib/ncbiutil.hpp>
00040 #include <corelib/stream_utils.hpp>
00041 #include <serial/iterator.hpp>
00042
00043 #include <objects/general/Date.hpp>
00044 #include <objects/general/Object_id.hpp>
00045 #include <objects/seq/Seq_annot.hpp>
00046 #include <objects/seq/Seq_descr.hpp>
00047 #include <objects/seq/Seq_inst.hpp>
00048 #include <objects/seq/Seqdesc.hpp>
00049 #include <objects/seqalign/Dense_seg.hpp>
00050 #include <objects/seqalign/Score.hpp>
00051 #include <objects/seqalign/Std_seg.hpp>
00052 #include <objects/seqfeat/Feat_id.hpp>
00053 #include <objects/seqfeat/Cdregion.hpp>
00054 #include <objects/seqfeat/SeqFeatXref.hpp>
00055 #include <objects/seqfeat/Gb_qual.hpp>
00056 #include <objects/seqloc/Seq_interval.hpp>
00057 #include <objects/seqloc/Seq_point.hpp>
00058 #include <objects/seqset/Bioseq_set.hpp>
00059
00060 #include <objtools/readers/cigar.hpp>
00061 #include <objtools/readers/fasta.hpp>
00062 #include <objtools/readers/readfeat.hpp>
00063 #include <objtools/error_codes.hpp>
00064
00065 #include <algorithm>
00066 #include <ctype.h>
00067
00068
00069 #define NCBI_USE_ERRCODE_X Objtools_Rd_GFF
00070
00071 BEGIN_NCBI_SCOPE
00072 BEGIN_SCOPE(objects)
00073
00074 static CRef<CFeat_id>
00075 s_StringToFeatId( const string& str )
00076 {
00077 CRef<CObject_id> objid( new CObject_id );
00078 objid->SetStr( str );
00079 CRef<CFeat_id> featid( new CFeat_id );
00080 featid->SetLocal( *objid );
00081 return featid;
00082 }
00083
00084 static string& s_URLDecode(const CTempString& s, string& out) {
00085 SIZE_TYPE pos = 0;
00086 out.erase();
00087 out.reserve(s.size());
00088 while (pos < s.size()) {
00089 SIZE_TYPE pos2 = s.find_first_of("%" , pos);
00090 out += s.substr(pos, pos2 - pos);
00091 if (pos2 == NPOS) {
00092 break;
00093 } else if (s[pos2] == '+') {
00094 out += ' ';
00095 pos = pos2 + 1;
00096 } else if (s[pos2] == '%') {
00097 try {
00098 out += (char)NStr::StringToInt(s.substr(pos2 + 1, 2), 0, 16);
00099 pos = pos2 + 3;
00100 } catch (CStringException&) {
00101
00102 out += '%';
00103 pos = pos2 + 1;
00104 }
00105 } else {
00106 _TROUBLE;
00107 }
00108 }
00109 return out;
00110 }
00111
00112
00113 CRef<CSeq_entry> CGFFReader::Read(CNcbiIstream& in, TFlags flags)
00114 {
00115 CStreamLineReader lr(in);
00116 return Read(lr, flags);
00117 }
00118
00119 CRef<CSeq_entry> CGFFReader::Read(ILineReader& in, TFlags flags)
00120 {
00121 x_Reset();
00122 m_Flags = flags;
00123 m_LineReader = ∈
00124
00125 if (m_Flags & fSetVersion3) {
00126 m_Version = 3;
00127 }
00128
00129 TStr line;
00130 while ( !in.AtEOF() ) {
00131 ++m_LineNumber;
00132 char c = in.PeekChar();
00133 if (c == '#') {
00134 line = *++in;
00135 if (line.size() > 2 && line[1] == '#') {
00136 x_ParseStructuredComment(line);
00137
00138 }
00139 } else if (c == '>') {
00140
00141 x_ReadFastaSequences(in);
00142 } else {
00143 line = *++in;
00144 if ( x_IsLineUcscMetaInformation(line) ) {
00145
00146 continue;
00147 }
00148 if ( line.empty() ) {
00149
00150 continue;
00151 }
00152 CRef<SRecord> record = x_ParseFeatureInterval(line);
00153 if (record) {
00154
00155 if (record->id.empty()) {
00156 x_ParseAndPlace(*record);
00157 } else {
00158 CRef<SRecord>& match = m_DelayedRecords[ record->id ];
00159
00160 if (match) {
00161 x_MergeRecords(*match, *record);
00162 } else {
00163 match.Reset(record);
00164 }
00165 }
00166 }
00167 }
00168 }
00169
00170 NON_CONST_ITERATE (TDelayedRecords, it, m_DelayedRecords) {
00171 SRecord& rec = *it->second;
00172
00173 NON_CONST_ITERATE (SRecord::TLoc, loc_iter, rec.loc) {
00174 ITERATE (set<TSeqRange>, src_iter, loc_iter->merge_ranges) {
00175 TSeqRange range(*src_iter);
00176 set<TSeqRange>::iterator dst_iter =
00177 loc_iter->ranges.begin();
00178 for ( ; dst_iter != loc_iter->ranges.end(); ) {
00179 TSeqRange r(range);
00180 r += *dst_iter;
00181 if (r.GetLength() <=
00182 range.GetLength() + dst_iter->GetLength()) {
00183 range += *dst_iter;
00184 _TRACE("merging overlapping ranges: "
00185 << range.GetFrom() << " - "
00186 << range.GetTo() << " <-> "
00187 << dst_iter->GetFrom() << " - "
00188 << dst_iter->GetTo());
00189 loc_iter->ranges.erase(dst_iter++);
00190 break;
00191 } else {
00192 ++dst_iter;
00193 }
00194 }
00195 loc_iter->ranges.insert(range);
00196 }
00197 }
00198
00199 if (rec.key == "exon") {
00200 rec.key = "mRNA";
00201 }
00202 x_ParseAndPlace(rec);
00203 }
00204
00205
00206
00207
00208
00209 if (m_TSE && m_GeneRefs.size()) {
00210 NON_CONST_ITERATE (TGeneRefs, iter, m_GeneRefs) {
00211 if ( !iter->second->IsSetLocus() &&
00212 !iter->second->IsSetLocus_tag()) {
00213 iter->second->SetLocus(iter->first);
00214 } else if ( !iter->second->IsSetLocus() ||
00215 iter->second->GetLocus() != iter->first) {
00216 iter->second->SetSyn().push_back(iter->first);
00217 }
00218 }
00219
00220 CTypeIterator<CSeq_feat> feat_iter(*m_TSE);
00221 for ( ; feat_iter; ++feat_iter) {
00222 const CGene_ref* ref = NULL;
00223 if (feat_iter->GetData().IsGene()) {
00224 ref = &feat_iter->GetData().GetGene();
00225 } else {
00226 ref = feat_iter->GetGeneXref();
00227 }
00228 if (ref && ref->IsSetLocus()) {
00229 TGeneRefs::const_iterator iter =
00230 m_GeneRefs.find(ref->GetLocus());
00231 if (iter != m_GeneRefs.end()) {
00232 const_cast<CGene_ref*>(ref)->Assign(*iter->second);
00233 }
00234 }
00235 }
00236 }
00237
00238 CRef<CSeq_entry> tse(m_TSE);
00239 x_Reset();
00240
00241
00242 if (flags & fSetProducts) {
00243 CTypeIterator<CSeq_feat> feat_iter(*tse);
00244 for ( ; feat_iter; ++feat_iter) {
00245 CSeq_feat& feat = *feat_iter;
00246
00247 string qual_name;
00248 switch (feat.GetData().GetSubtype()) {
00249 case CSeqFeatData::eSubtype_cdregion:
00250 qual_name = "protein_id";
00251 break;
00252
00253 case CSeqFeatData::eSubtype_mRNA:
00254 qual_name = "transcript_id";
00255 break;
00256
00257 default:
00258 continue;
00259 break;
00260 }
00261
00262 string id_str = feat.GetNamedQual(qual_name);
00263 if ( !id_str.empty() ) {
00264 CRef<CSeq_id> id = x_ResolveSeqName(id_str);
00265 feat.SetProduct().SetWhole(*id);
00266 }
00267 }
00268 }
00269
00270 if (flags & fCreateGeneFeats) {
00271 CTypeIterator<CSeq_annot> annot_iter(*tse);
00272 for ( ; annot_iter; ++annot_iter) {
00273 CSeq_annot& annot = *annot_iter;
00274 if (annot.GetData().Which() != CSeq_annot::TData::e_Ftable) {
00275 continue;
00276 }
00277
00278
00279 CSeq_annot::TData::TFtable::iterator feat_iter =
00280 annot.SetData().SetFtable().begin();
00281 CSeq_annot::TData::TFtable::iterator feat_end =
00282 annot.SetData().SetFtable().end();
00283
00284
00285
00286
00287 typedef map<string, CRef<CSeq_feat> > TGeneMap;
00288 TGeneMap genes;
00289 for (bool has_genes = false;
00290 feat_iter != feat_end && !has_genes; ++feat_iter) {
00291 CSeq_feat& feat = **feat_iter;
00292
00293 switch (feat.GetData().GetSubtype()) {
00294 case CSeqFeatData::eSubtype_gene:
00295
00296 has_genes = true;
00297 genes.clear();
00298 break;
00299
00300 case CSeqFeatData::eSubtype_mRNA:
00301 case CSeqFeatData::eSubtype_cdregion:
00302
00303
00304
00305 if ( !feat.GetGeneXref() ) {
00306 continue;
00307 }
00308 {{
00309 string gene_id;
00310 feat.GetGeneXref()->GetLabel(&gene_id);
00311 _ASSERT( !gene_id.empty() );
00312 TSeqRange range = feat.GetLocation().GetTotalRange();
00313
00314 ENa_strand strand = feat.GetLocation().GetStrand();
00315 const CSeq_id* id = feat.GetLocation().GetId();
00316 if ( !id ) {
00317 x_Error("No consistent ID found; gene feature skipped");
00318 continue;
00319 }
00320
00321 TGeneMap::iterator iter = genes.find(gene_id);
00322 if (iter == genes.end()) {
00323
00324 CRef<CSeq_feat> gene(new CSeq_feat());
00325 gene->SetData().SetGene().Assign(*feat.GetGeneXref());
00326
00327 gene->SetLocation().SetInt().SetFrom(range.GetFrom());
00328 gene->SetLocation().SetInt().SetTo (range.GetTo());
00329 gene->SetLocation().SetId(*id);
00330 gene->SetLocation().SetInt().SetStrand(strand);
00331 genes[gene_id] = gene;
00332 } else {
00333
00334 CRef<CSeq_feat> gene = iter->second;
00335
00336 TSeqRange r2 = gene->GetLocation().GetTotalRange();
00337 range += r2;
00338 gene->SetLocation().SetInt().SetFrom(range.GetFrom());
00339 gene->SetLocation().SetInt().SetTo (range.GetTo());
00340 gene->SetLocation().InvalidateTotalRangeCache();
00341 }
00342 }}
00343 break;
00344
00345 default:
00346 break;
00347 }
00348 }
00349
00350 ITERATE (TGeneMap, iter, genes) {
00351 annot.SetData().SetFtable().push_back(iter->second);
00352 }
00353 }
00354 }
00355
00356 return tse;
00357 }
00358
00359
00360 void CGFFReader::x_Warn(const string& message, unsigned int line)
00361 {
00362 if (line) {
00363 ERR_POST_X(2, Warning << message << " [GFF input, line " << line << ']');
00364 } else {
00365 ERR_POST_X(3, Warning << message << " [GFF input]");
00366 }
00367 }
00368
00369
00370 void CGFFReader::x_Error(const string& message, unsigned int line)
00371 {
00372 if (line) {
00373 ERR_POST_X(1, Error << message << " [GFF input, line " << line << ']');
00374 } else {
00375 ERR_POST_X(1, Error << message << " [GFF input]");
00376 }
00377 }
00378
00379
00380 void CGFFReader::x_Info(const string& message, unsigned int line)
00381 {
00382 if (line) {
00383 ERR_POST_X(1, Info << message << " [GFF input, line " << line << ']');
00384 } else {
00385 ERR_POST_X(1, Info << message << " [GFF input]");
00386 }
00387 }
00388
00389
00390 void CGFFReader::x_Reset(void)
00391 {
00392 m_TSE.Reset(new CSeq_entry);
00393 m_SeqNameCache.clear();
00394 m_SeqCache.clear();
00395 m_DelayedRecords.clear();
00396 m_GeneRefs.clear();
00397 m_DefMol.erase();
00398 m_LineNumber = 0;
00399 m_Version = 2;
00400 }
00401
00402
00403 bool CGFFReader::x_ParseStructuredComment(const TStr& line)
00404 {
00405 if ( line.empty() || line[0] != '#' || line[1] != '#' ) {
00406 return false;
00407 }
00408 TStrVec v;
00409
00410 typedef CStrTokenize<TStr, TStrVec> TTokenizer;
00411 TTokenizer::TPosContainer pos_container;
00412 TTokenizer::Do(line, "# \t", v, TTokenizer::eMergeDelims, pos_container);
00413 if (v.empty()) {
00414 return true;
00415 }
00416 if (v[0] == "date" && v.size() > 1) {
00417 x_ParseDateComment(v[1]);
00418 } else if (v[0] == "Type" && v.size() > 1) {
00419 x_ParseTypeComment(v[1], v.size() > 2 ? v[2] : TStr());
00420 } else if (v[0] == "gff-version" && v.size() > 1) {
00421 m_Version = NStr::StringToInt(v[1]);
00422 } else if (v[0] == "FASTA") {
00423 x_ReadFastaSequences(*m_LineReader);
00424 }
00425
00426 return true;
00427 }
00428
00429
00430 void CGFFReader::x_ParseDateComment(const TStr& date)
00431 {
00432 try {
00433 CRef<CSeqdesc> desc(new CSeqdesc);
00434 desc->SetUpdate_date().SetToTime(CTime(date, "Y-M-D"),
00435 CDate::ePrecision_day);
00436 m_TSE->SetSet().SetDescr().Set().push_back(desc);
00437 } catch (exception& e) {
00438 x_Error(string("Bad ISO date: ") + e.what(), x_GetLineNumber());
00439 }
00440 }
00441
00442
00443 void CGFFReader::x_ParseTypeComment(const TStr& moltype, const TStr& seqname)
00444 {
00445 if (seqname.empty()) {
00446 m_DefMol = moltype;
00447 } else {
00448
00449 x_ResolveID(*x_ResolveSeqName(seqname), moltype);
00450 }
00451 }
00452
00453
00454 void CGFFReader::x_ReadFastaSequences(ILineReader& in)
00455 {
00456 CFastaReader reader(in, fReadFasta_AssumeNuc);
00457 CRef<CSeq_entry> seqs = reader.ReadSet();
00458 for (CTypeIterator<CBioseq> it(*seqs); it; ++it) {
00459 if (it->GetId().empty()) {
00460 CRef<CSeq_entry> parent(new CSeq_entry);
00461 parent->SetSeq(*it);
00462 m_TSE->SetSet().SetSeq_set().push_back(parent);
00463 continue;
00464 }
00465 CRef<CBioseq> our_bs = x_ResolveID(*it->GetId().front(), kEmptyStr);
00466
00467
00468 our_bs->SetId() = it->GetId();
00469 if (it->IsSetDescr()) {
00470 our_bs->SetDescr(it->SetDescr());
00471 }
00472 our_bs->SetInst(it->SetInst());
00473 }
00474 }
00475
00476
00477 CRef<CGFFReader::SRecord>
00478 CGFFReader::x_ParseFeatureInterval(const TStr& line)
00479 {
00480 typedef CStrTokenize<TStr, TStrVec> TTokenizer;
00481 TTokenizer::TPosContainer pos_container;
00482 TStrVec v;
00483 bool misdelimited = false;
00484
00485 TTokenizer::Do(line, "\t", v, TTokenizer::eNoMergeDelims, pos_container);
00486 if (v.size() < 8) {
00487 v.clear();
00488 TTokenizer::Do(line, " \t", v, TTokenizer::eMergeDelims, pos_container);
00489 if (v.size() < 8) {
00490 x_Error("Skipping line due to insufficient fields",
00491 x_GetLineNumber());
00492 return null;
00493 } else if (m_Version < 3) {
00494 x_Info("(Recovered) Bad delimiters (should use tabs)", x_GetLineNumber());
00495 misdelimited = true;
00496 }
00497 } else {
00498
00499
00500
00501 }
00502
00503 CRef<SRecord> record(x_NewRecord());
00504 string accession;
00505 TSeqPos from = 0, to = numeric_limits<TSeqPos>::max();
00506 ENa_strand strand = eNa_strand_unknown;
00507 s_URLDecode(v[0], accession);
00508 record->source = v[1];
00509 record->key = v[2];
00510
00511 try {
00512 from = NStr::StringToUInt(v[3]) - 1;
00513 } catch (std::exception& e) {
00514 x_Error(string("Bad FROM position: ") + e.what(), x_GetLineNumber());
00515 }
00516
00517 try {
00518 to = NStr::StringToUInt(v[4]) - 1;
00519 } catch (std::exception& e) {
00520 x_Error(string("Bad TO position: ") + e.what(), x_GetLineNumber());
00521 }
00522
00523 record->score = v[5];
00524
00525 if (v[6] == "+") {
00526 strand = eNa_strand_plus;
00527 } else if (v[6] == "-") {
00528 strand = eNa_strand_minus;
00529 } else if ( !(v[6] == ".") ) {
00530 x_Warn("Bad strand " + string(v[6]) + " (should be [+-.])",
00531 x_GetLineNumber());
00532 }
00533
00534 if (v[7] == "0" || v[7] == "1" || v[7] == "2") {
00535 record->frame = v[7][0] - '0';
00536 } else if (v[7] == ".") {
00537 record->frame = -1;
00538 } else {
00539 x_Warn("Bad frame " + string(v[7]) + " (should be [012.])",
00540 x_GetLineNumber());
00541 record->frame = -1;
00542 }
00543
00544 {{
00545 SRecord::SSubLoc subloc;
00546 subloc.accession = accession;
00547 subloc.strand = strand;
00548 subloc.ranges.insert(TSeqRange(from, to));
00549
00550 record->loc.push_back(subloc);
00551 }}
00552
00553 SIZE_TYPE i = 8;
00554 if (m_Version >= 3) {
00555 x_ParseV3Attributes(*record, v, i);
00556 } else {
00557 x_ParseV2Attributes(*record, v, i);
00558 }
00559
00560 if ( !misdelimited && (i > 9 || (i == 9 && v.size() > 9
00561 && !NStr::StartsWith(v[9], "#") ))) {
00562 x_Warn("Extra non-comment fields", x_GetLineNumber());
00563 }
00564
00565 if (record->FindAttribute("Target") != record->attrs.end()) {
00566 record->type = SRecord::eAlign;
00567 } else {
00568 record->type = SRecord::eFeat;
00569 }
00570
00571
00572 if (m_Version == 3) {
00573 SRecord::TAttrs::const_iterator id_it = record->FindAttribute("ID");
00574 if (id_it != record->attrs.end()) {
00575 record->id = (*id_it)[1];
00576 }
00577
00578 SRecord::TAttrs::const_iterator parent_it = record->FindAttribute("Parent");
00579 if (parent_it != record->attrs.end()) {
00580 record->parent = (*parent_it)[1];
00581 }
00582
00583 SRecord::TAttrs::const_iterator name_it = record->FindAttribute("Name");
00584 if (name_it != record->attrs.end()) {
00585 record->name = (*name_it)[1];
00586 }
00587 }
00588
00589 record->line_no = m_LineNumber;
00590 record->id = x_FeatureID(*record);
00591 return record;
00592 }
00593
00594
00595 CRef<CSeq_feat> CGFFReader::x_ParseFeatRecord(const SRecord& record)
00596 {
00597 CRef<CSeq_feat> feat(CFeature_table_reader::CreateSeqFeat
00598 (record.key, *x_ResolveLoc(record.loc),
00599 CFeature_table_reader::fTranslateBadKey));
00600 if (record.frame >= 0 && feat->GetData().IsCdregion()) {
00601 feat->SetData().SetCdregion().SetFrame
00602 (static_cast<CCdregion::EFrame>(record.frame + 1));
00603 }
00604 if ( m_Version == 3 ) {
00605 ITERATE (SRecord::TAttrs, it, record.attrs) {
00606 string tag = it->front();
00607 if (tag == "ID") {
00608 feat->SetId( *s_StringToFeatId( (*it)[1] ) );
00609 }
00610 if (tag == "Parent") {
00611 CRef<CSeqFeatXref> xref( new CSeqFeatXref );
00612 xref->SetId( *s_StringToFeatId( (*it)[1] ) );
00613 feat->SetXref().push_back( xref );
00614 }
00615 }
00616 }
00617
00618 string gene_id;
00619 string gene;
00620 string locus_tag;
00621 ITERATE (SRecord::TAttrs, it, record.attrs) {
00622 string tag = it->front();
00623 string value;
00624 switch (it->size()) {
00625 case 1:
00626 break;
00627 case 2:
00628 value = (*it)[1];
00629 break;
00630 default:
00631 x_Warn("Ignoring extra fields in value of " + tag, record.line_no);
00632 value = (*it)[1];
00633 break;
00634 }
00635 if (x_GetFlags() & fGBQuals) {
00636 if (tag == "transcript_id") {
00637
00638 } else if (tag == "gene_id") {
00639 gene_id = value;
00640 continue;
00641 } else if (tag == "gene") {
00642 gene = value;
00643 continue;
00644 } else if (tag == "locus_tag") {
00645 locus_tag = value;
00646 continue;
00647 } else if (tag == "exon_number") {
00648 tag = "number";
00649 } else if (NStr::StartsWith(tag, "insd_")) {
00650 tag.erase(0, 5);
00651 }
00652
00653 CFeature_table_reader::AddFeatQual
00654 (feat, tag, value, CFeature_table_reader::fKeepBadKey);
00655 } else {
00656 CRef<CGb_qual> qual(new CGb_qual);
00657 qual->SetQual(tag);
00658 qual->SetVal(value);
00659 feat->SetQual().push_back(qual);
00660 }
00661 }
00662
00663 if ( !gene_id.empty() ) {
00664 SIZE_TYPE colon = gene_id.find(':');
00665 if (colon != NPOS) {
00666 gene_id.erase(0, colon + 1);
00667 }
00668
00669 TGeneRefs::value_type val(gene_id, CRef<CGene_ref>());
00670 TGeneRefs::iterator iter = m_GeneRefs.insert(val).first;
00671 if ( !iter->second ) {
00672 iter->second.Reset(new CGene_ref);
00673 }
00674 if ( !gene.empty() ) {
00675 if (iter->second->IsSetLocus() &&
00676 iter->second->GetLocus() != gene) {
00677 LOG_POST_X(4, Warning << "CGFFReader::x_ParseFeatRecord(): "
00678 << "inconsistent gene name: "
00679 << gene << " != " << iter->second->GetLocus()
00680 << ", ignoring second");
00681 } else if ( !iter->second->IsSetLocus() ) {
00682 iter->second->SetLocus(gene);
00683 }
00684 }
00685 if ( !locus_tag.empty() ) {
00686 if (iter->second->IsSetLocus_tag() &&
00687 iter->second->GetLocus_tag() != locus_tag) {
00688 LOG_POST_X(5, Warning << "CGFFReader::x_ParseFeatRecord(): "
00689 << "inconsistent locus tag: "
00690 << locus_tag << " != " << iter->second->GetLocus_tag()
00691 << ", ignoring second");
00692 } else if ( !iter->second->IsSetLocus_tag() ) {
00693 iter->second->SetLocus_tag(locus_tag);
00694 }
00695 }
00696
00697
00698 CFeature_table_reader::AddFeatQual
00699 (feat, "gene_id", gene_id,
00700 CFeature_table_reader::fKeepBadKey);
00701 if (x_GetFlags() & fGBQuals) {
00702 CFeature_table_reader::AddFeatQual
00703 (feat, "gene", gene_id,
00704 CFeature_table_reader::fKeepBadKey);
00705 }
00706 }
00707
00708 return feat;
00709 }
00710
00711
00712 CRef<CSeq_align> CGFFReader::x_ParseAlignRecord(const SRecord& record)
00713 {
00714 CRef<CSeq_align> align(new CSeq_align);
00715 align->SetType(CSeq_align::eType_partial);
00716 align->SetDim(2);
00717 SRecord::TAttrs::const_iterator tgit = record.FindAttribute("Target");
00718 vector<string> target;
00719 if (tgit != record.attrs.end()) {
00720 NStr::Tokenize((*tgit)[1], " +-", target, NStr::eMergeDelims);
00721 }
00722 if (target.size() != 3) {
00723 x_Warn("Bad Target attribute", record.line_no);
00724 return align;
00725 }
00726 CRef<CSeq_id> tgid = x_ResolveSeqName(target[0]);
00727 TSeqPos tgstart = NStr::StringToUInt(target[1]) - 1;
00728 TSeqPos tgstop = NStr::StringToUInt(target[2]) - 1;
00729 TSeqPos tglen = tgstop - tgstart + 1;
00730
00731 CRef<CSeq_loc> refloc = x_ResolveLoc(record.loc);
00732 CRef<CSeq_id> refid(&refloc->SetInt().SetId());
00733 TSeqPos reflen = 0;
00734 for (CSeq_loc_CI it(*refloc); it; ++it) {
00735 reflen += it.GetRange().GetLength();
00736 }
00737
00738 CRef<CSeq_loc> tgloc(new CSeq_loc);
00739 tgloc->SetInt().SetId(*tgid);
00740 tgloc->SetInt().SetFrom(tgstart);
00741 tgloc->SetInt().SetTo(tgstop);
00742
00743 SRecord::TAttrs::const_iterator gap_it = record.FindAttribute("Gap");
00744 if (gap_it == record.attrs.end()) {
00745
00746 if (reflen == tglen && refloc->IsInt()) {
00747 CDense_seg& ds = align->SetSegs().SetDenseg();
00748 ds.SetNumseg(1);
00749 ds.SetIds().push_back(refid);
00750 ds.SetIds().push_back(tgid);
00751 ds.SetStarts().push_back(refloc->GetInt().GetFrom());
00752 ds.SetStarts().push_back(tgstart);
00753 ds.SetLens().push_back(reflen);
00754 if (refloc->GetInt().IsSetStrand()) {
00755 ds.SetStrands().push_back(refloc->GetInt().GetStrand());
00756 ds.SetStrands().push_back(eNa_strand_plus);
00757 }
00758 } else {
00759 if (reflen != tglen && reflen != 3 * tglen) {
00760 x_Warn("Reference and target locations have an irregular"
00761 " ratio.", record.line_no);
00762 }
00763 CRef<CStd_seg> ss(new CStd_seg);
00764 ss->SetLoc().push_back(refloc);
00765 ss->SetLoc().push_back(tgloc);
00766 align->SetSegs().SetStd().push_back(ss);
00767 }
00768 } else {
00769 SCigarAlignment cigar
00770 ((*gap_it)[1], SCigarAlignment::eOpFirstIfAmbiguous);
00771 align = cigar(refloc->GetInt(), tgloc->GetInt());
00772 }
00773
00774 try {
00775 CRef<CScore> score(new CScore);
00776 score->SetValue().SetReal(NStr::StringToDouble(record.score));
00777 align->SetScore().push_back(score);
00778 } catch (...) {
00779 }
00780
00781 return align;
00782 }
00783
00784
00785 CRef<CSeq_loc> CGFFReader::x_ResolveLoc(const SRecord::TLoc& loc)
00786 {
00787 CRef<CSeq_loc> seqloc(new CSeq_loc);
00788 ITERATE (SRecord::TLoc, it, loc) {
00789 CRef<CSeq_id> id = x_ResolveSeqName(it->accession);
00790 ITERATE (set<TSeqRange>, range, it->ranges) {
00791 CRef<CSeq_loc> segment(new CSeq_loc);
00792 if (range->GetLength() == 1) {
00793 CSeq_point& pnt = segment->SetPnt();
00794 pnt.SetId (*id);
00795 pnt.SetPoint(range->GetFrom());
00796 if (it->strand != eNa_strand_unknown) {
00797 pnt.SetStrand(it->strand);
00798 }
00799 } else {
00800 CSeq_interval& si = segment->SetInt();
00801 si.SetId (*id);
00802 si.SetFrom(range->GetFrom());
00803 si.SetTo (range->GetTo());
00804 if (it->strand != eNa_strand_unknown) {
00805 si.SetStrand(it->strand);
00806 }
00807 }
00808 if (IsReverse(it->strand)) {
00809 seqloc->SetMix().Set().push_front(segment);
00810 } else {
00811 seqloc->SetMix().Set().push_back(segment);
00812 }
00813 }
00814 }
00815
00816 if (seqloc->GetMix().Get().size() == 1) {
00817 return seqloc->SetMix().Set().front();
00818 } else {
00819 return seqloc;
00820 }
00821 }
00822
00823
00824 void CGFFReader::x_ParseV2Attributes(SRecord& record, const TStrVec& v,
00825 SIZE_TYPE& i)
00826 {
00827 string attr_last_value;
00828 vector<string> attr_values;
00829 char quote_char = 0;
00830
00831 for (; i < v.size(); ++i) {
00832 string s = string(v[i]) + ' ';
00833 SIZE_TYPE pos = 0;
00834 while (pos < s.size()) {
00835 SIZE_TYPE pos2;
00836 if (quote_char) {
00837 pos2 = s.find_first_of(" \'\"\\", pos);
00838 _ASSERT(pos2 != NPOS);
00839 if (s[pos2] == quote_char) {
00840 if (attr_values.empty()) {
00841 x_Warn("quoted attribute tag " + attr_last_value,
00842 x_GetLineNumber());
00843 }
00844 quote_char = 0;
00845 attr_last_value += s.substr(pos, pos2 - pos);
00846 try {
00847 attr_values.push_back(NStr::ParseEscapes
00848 (attr_last_value));
00849 } catch (CStringException& e) {
00850 attr_values.push_back(attr_last_value);
00851 x_Warn(e.what() + (" in value of " + attr_values[0]),
00852 x_GetLineNumber());
00853 }
00854 attr_last_value.erase();
00855 } else if (s[pos2] == '\\') {
00856 _VERIFY(++pos2 != s.size());
00857 attr_last_value += s.substr(pos, pos2 + 1 - pos);
00858 } else {
00859 attr_last_value += s.substr(pos, pos2 + 1 - pos);
00860 }
00861 } else {
00862 pos2 = s.find_first_of(" #;\"", pos);
00863 _ASSERT(pos2 != NPOS);
00864 if (pos != pos2) {
00865
00866 attr_last_value += s.substr(pos, pos2 - pos);
00867 attr_values.push_back(attr_last_value);
00868 attr_last_value.erase();
00869 }
00870
00871 switch (s[pos2]) {
00872 case ' ':
00873 if (pos2 == s.size() - 1) {
00874 x_AddAttribute(record, attr_values);
00875 attr_values.clear();
00876 }
00877 break;
00878
00879 case '#':
00880 return;
00881
00882 case ';':
00883 if (attr_values.empty()) {
00884 x_Warn("null attribute", x_GetLineNumber());
00885 } else {
00886 x_AddAttribute(record, attr_values);
00887 attr_values.clear();
00888 }
00889 break;
00890
00891
00892 case '\"':
00893 case '\'':
00894 quote_char = s[pos2];
00895 break;
00896
00897 default:
00898 _TROUBLE;
00899 }
00900 }
00901 pos = pos2 + 1;
00902 }
00903 }
00904
00905 if ( !attr_values.empty() ) {
00906 x_Warn("unterminated attribute " + attr_values[0], x_GetLineNumber());
00907 x_AddAttribute(record, attr_values);
00908 }
00909 }
00910
00911 bool CGFFReader::x_SplitKeyValuePair( const string& pair, string& key, string& value )
00912 {
00913 if ( NStr::SplitInTwo( pair, "=", key, value ) ) {
00914 return true;
00915 }
00916 if ( NStr::SplitInTwo( pair, " ", key, value ) ) {
00917 x_Info("(recovered) missdelimited attribute/value pair: " + key, x_GetLineNumber());
00918 return true;
00919 }
00920 x_Warn("attribute without value: " + key, x_GetLineNumber());
00921 return false;
00922 }
00923
00924
00925 void CGFFReader::x_ParseV3Attributes(SRecord& record, const TStrVec& v,
00926 SIZE_TYPE& i)
00927 {
00928 vector<string> v2, attr;
00929 NStr::Tokenize(v[i], ";", v2, NStr::eMergeDelims);
00930 ITERATE (vector<string>, it, v2) {
00931 attr.clear();
00932 string key, values;
00933 if (x_SplitKeyValuePair( *it, key, values )) {
00934 vector<string> vals;
00935 attr.resize(2);
00936 s_URLDecode(key, attr[0]);
00937 NStr::Tokenize(values, ",", vals);
00938 ITERATE (vector<string>, it2, vals) {
00939 string value( *it2 );
00940 if ( NStr::MatchesMask(value, "\"*\"") ) {
00941
00942
00943
00944
00945 value = value.substr(1, value.length()-2);
00946 }
00947 s_URLDecode(value, attr[1]);
00948 x_AddAttribute(record, attr);
00949 }
00950 } else {
00951 x_Warn("attribute without value: " + key, x_GetLineNumber());
00952 attr.resize(1);
00953 s_URLDecode(*it, attr[0]);
00954 x_AddAttribute(record, attr);
00955 continue;
00956 }
00957 }
00958 }
00959
00960
00961 void CGFFReader::x_AddAttribute(SRecord& record, vector<string>& attr)
00962 {
00963 if (attr.size() == 0) {
00964 return;
00965 }
00966
00967 if (x_GetFlags() & fGBQuals) {
00968 if (attr[0] == "gbkey" && attr.size() == 2) {
00969 record.key = attr[1];
00970 return;
00971 }
00972 }
00973 record.attrs.insert(attr);
00974 }
00975
00976
00977 string CGFFReader::x_FeatureID(const SRecord& record)
00978 {
00979 if (record.type != SRecord::eFeat || x_GetFlags() & fNoGTF) {
00980 return kEmptyStr;
00981 }
00982
00983
00984 if (m_Version == 3) {
00985 if (!record.id.empty()) {
00986 return record.id;
00987 }
00988 else {
00989 return record.source + record.key + record.parent;
00990 }
00991 }
00992
00993 SRecord::TAttrs::const_iterator gene_it = record.FindAttribute("gene_id");
00994 SRecord::TAttrs::const_iterator transcript_it
00995 = record.FindAttribute("transcript_id");
00996
00997
00998 string id;
00999 if (gene_it != record.attrs.end()) {
01000 id += (*gene_it)[1];
01001 }
01002
01003 if (transcript_it != record.attrs.end()) {
01004 if ( !id.empty() ) {
01005 id += ' ';
01006 }
01007 id += (*transcript_it)[1];
01008 }
01009
01010
01011 SRecord::TAttrs::const_iterator dbxref_it
01012 = record.FindAttribute("db_xref");
01013 for ( ; dbxref_it != record.attrs.end() &&
01014 dbxref_it->front() == "db_xref"; ++dbxref_it) {
01015 if ( !id.empty() ) {
01016 id += ' ';
01017 }
01018 id += (*dbxref_it)[1];
01019 }
01020
01021 if ( id.empty() ) {
01022 return id;
01023 }
01024
01025 if (record.key == "start_codon" || record.key == "stop_codon") {
01026
01027 id += "CDS";
01028 } else if (record.key == "CDS"
01029 || NStr::FindNoCase(record.key, "rna") != NPOS) {
01030
01031 id += record.key;
01032 } else if (record.key == "exon") {
01033
01034 if (x_GetFlags() & fMergeExons) {
01035 id += record.key;
01036 } else {
01037 SRecord::TAttrs::const_iterator it
01038 = record.FindAttribute("exon_number");
01039 if (it == record.attrs.end()) {
01040 return kEmptyStr;
01041 } else {
01042 id += record.key + ' ' + (*it)[1];
01043 }
01044 }
01045 } else if (x_GetFlags() & fMergeOnyCdsMrna) {
01046 return kEmptyStr;
01047 }
01048 return id;
01049 }
01050
01051
01052 void CGFFReader::x_MergeRecords(SRecord& dest, const SRecord& src)
01053 {
01054
01055
01056 bool merge_overlaps = false;
01057 if (dest.key == "CDS" &&
01058 (src.key == "start_codon" || src.key == "stop_codon")) {
01059
01060
01061 merge_overlaps = true;
01062 }
01063
01064 if ((dest.key == "start_codon" || dest.key == "stop_codon") &&
01065 src.key == "CDS") {
01066
01067
01068 merge_overlaps = true;
01069 dest.key = "CDS";
01070 }
01071
01072
01073 int best_frame = dest.frame;
01074
01075 ITERATE (SRecord::TLoc, slit, src.loc) {
01076 bool merged = false;
01077 NON_CONST_ITERATE (SRecord::TLoc, dlit, dest.loc) {
01078 if (slit->accession != dlit->accession) {
01079 if (dest.loc.size() == 1) {
01080 x_Warn("Multi-accession feature", src.line_no);
01081 }
01082 continue;
01083 } else if (slit->strand != dlit->strand) {
01084 if (dest.loc.size() == 1) {
01085 x_Warn("Multi-orientation feature", src.line_no);
01086 }
01087 continue;
01088 } else {
01089 if (slit->strand == eNa_strand_plus) {
01090 if (slit->ranges.begin()->GetFrom() <
01091 dlit->ranges.begin()->GetFrom()) {
01092 best_frame = src.frame;
01093 }
01094 } else {
01095 if (slit->ranges.begin()->GetTo() >
01096 dlit->ranges.begin()->GetTo()) {
01097 best_frame = src.frame;
01098 }
01099 }
01100 if (merge_overlaps) {
01101 ITERATE (set<TSeqRange>, set_iter, slit->ranges) {
01102 dlit->merge_ranges.insert(*set_iter);
01103 }
01104 } else {
01105 ITERATE (set<TSeqRange>, set_iter, slit->ranges) {
01106 dlit->ranges.insert(*set_iter);
01107 }
01108 }
01109 merged = true;
01110 break;
01111 }
01112 }
01113 if ( !merged ) {
01114 dest.loc.push_back(*slit);
01115 }
01116 }
01117
01118 dest.frame = best_frame;
01119 if (src.key != dest.key) {
01120 if (dest.key == "CDS" && NStr::EndsWith(src.key, "_codon")
01121 && !(x_GetFlags() & fNoGTF) ) {
01122
01123 } else if (src.key == "CDS" && NStr::EndsWith(dest.key, "_codon")
01124 && !(x_GetFlags() & fNoGTF) ) {
01125 dest.key = "CDS";
01126 } else {
01127 x_Warn("Merging features with different keys: " + dest.key
01128 + " != " + src.key, src.line_no);
01129 }
01130 }
01131
01132 x_MergeAttributes(dest, src);
01133 }
01134
01135
01136 void CGFFReader::x_MergeAttributes(SRecord& dest, const SRecord& src)
01137 {
01138 SRecord::TAttrs::iterator dait = dest.attrs.begin();
01139 SRecord::TAttrs::iterator dait_end = dest.attrs.end();
01140 SRecord::TAttrs::iterator dait_tag = dait_end;
01141 ITERATE (SRecord::TAttrs, sait, src.attrs) {
01142 const string& tag = sait->front();
01143 while (dait != dait_end && dait->front() < tag) {
01144 ++dait;
01145 }
01146
01147 if (dait_tag == dait_end || dait_tag->front() != tag) {
01148 dait_tag = dait;
01149 }
01150 if (dait != dait_end && dait->front() == tag) {
01151 while (dait != dait_end && *dait < *sait) {
01152 ++dait;
01153 }
01154 }
01155 if (dait != dait_end && *dait == *sait) {
01156 continue;
01157 } else if ( !(x_GetFlags() & fNoGTF) && tag == "exon_number") {
01158 if (dait_tag != dait_end) {
01159 while (dait != dait_end && dait->front() == tag) {
01160 ++dait;
01161 }
01162 dest.attrs.erase(dait_tag, dait);
01163 dait_tag = dait_end;
01164 }
01165 } else {
01166 dest.attrs.insert(dait, *sait);
01167 }
01168 }
01169 }
01170
01171
01172 void CGFFReader::x_PlaceFeature(CSeq_feat& feat, const SRecord&)
01173 {
01174 CRef<CBioseq> seq;
01175 if ( !feat.IsSetProduct() ) {
01176 for (CTypeConstIterator<CSeq_id> it(feat.GetLocation()); it; ++it) {
01177 CRef<CBioseq> seq2 = x_ResolveID(*it, kEmptyStr);
01178 if ( !seq ) {
01179 seq.Reset(seq2);
01180 } else if ( seq2.NotEmpty() && seq != seq2) {
01181 seq.Reset();
01182 BREAK(it);
01183 }
01184 }
01185 }
01186
01187 CBioseq::TAnnot& annots
01188 = seq ? seq->SetAnnot() : m_TSE->SetSet().SetAnnot();
01189 NON_CONST_ITERATE (CBioseq::TAnnot, it, annots) {
01190 if ((*it)->GetData().IsFtable()) {
01191 (*it)->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat));
01192 return;
01193 }
01194 }
01195 CRef<CSeq_annot> annot(new CSeq_annot);
01196 annot->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat));
01197 annots.push_back(annot);
01198 }
01199
01200
01201 void CGFFReader::x_PlaceAlignment(CSeq_align& align, const SRecord& record)
01202 {
01203 CRef<CBioseq> seq;
01204 try {
01205 seq = x_ResolveID(align.GetSeq_id(0), kEmptyStr);
01206 } catch (...) {
01207 }
01208 CBioseq::TAnnot& annots
01209 = seq ? seq->SetAnnot() : m_TSE->SetSet().SetAnnot();
01210 NON_CONST_ITERATE (CBioseq::TAnnot, it, annots) {
01211 if ((*it)->GetData().IsAlign()) {
01212 (*it)->SetData().SetAlign().push_back(CRef<CSeq_align>(&align));
01213 return;
01214 }
01215 }
01216 CRef<CSeq_annot> annot(new CSeq_annot);
01217 annot->SetData().SetAlign().push_back(CRef<CSeq_align>(&align));
01218 annots.push_back(annot);
01219 }
01220
01221
01222 void CGFFReader::x_ParseAndPlace(const SRecord& record)
01223 {
01224 switch (record.type) {
01225 case SRecord::eFeat:
01226 x_PlaceFeature(*x_ParseFeatRecord(record), record);
01227 break;
01228 case SRecord::eAlign:
01229 x_PlaceAlignment(*x_ParseAlignRecord(record), record);
01230 break;
01231 default:
01232 x_Warn("Unknown record type " + NStr::IntToString(record.type),
01233 record.line_no);
01234 }
01235 }
01236
01237
01238 CRef<CSeq_id> CGFFReader::x_ResolveSeqName(const string& name)
01239 {
01240 CRef<CSeq_id>& id = m_SeqNameCache[name];
01241 if (id.NotEmpty()
01242 && (id->Which() == CSeq_id::e_not_set
01243 || static_cast<int>(id->Which()) >= CSeq_id::e_MaxChoice)) {
01244 x_Warn("x_ResolveSeqName: invalid cache entry for " + name);
01245 id.Reset();
01246 }
01247 if ( !id ) {
01248 id.Reset(x_ResolveNewSeqName(name));
01249 }
01250 if ( !id || id->Which() == CSeq_id::e_not_set
01251 || static_cast<int>(id->Which()) >= CSeq_id::e_MaxChoice) {
01252 x_Warn("x_ResolveNewSeqName returned null or invalid ID for " + name);
01253 id.Reset(new CSeq_id(CSeq_id::e_Local, name, name));
01254 }
01255 return id;
01256 }
01257
01258
01259 CRef<CSeq_id> CGFFReader::x_ResolveNewSeqName(const string& name)
01260 {
01261 if (m_Flags & fAllIdsAsLocal) {
01262 if (NStr::StartsWith(name, "lcl|")) {
01263 return CRef<CSeq_id>(new CSeq_id(name));
01264 } else {
01265 return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
01266 }
01267 }
01268
01269 if (m_Flags & fNumericIdsAsLocal) {
01270 if (name.find_first_not_of("0123456789") == string::npos) {
01271 return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
01272 }
01273 }
01274 try {
01275 return CRef<CSeq_id>(new CSeq_id(name));
01276 }
01277 catch (CSeqIdException&) {
01278 return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
01279 }
01280 }
01281
01282
01283 CRef<CBioseq> CGFFReader::x_ResolveID(const CSeq_id& id, const TStr& mol)
01284 {
01285 CRef<CBioseq>& seq = m_SeqCache[CConstRef<CSeq_id>(&id)];
01286 if ( !seq ) {
01287 seq.Reset(x_ResolveNewID(id, mol));
01288
01289
01290 if (seq) {
01291 x_PlaceSeq(*seq);
01292 ITERATE (CBioseq::TId, it, seq->GetId()) {
01293 m_SeqCache.insert(make_pair(CConstRef<CSeq_id>(*it), seq));
01294 }
01295 }
01296 }
01297 return seq;
01298 }
01299
01300
01301 CRef<CBioseq> CGFFReader::x_ResolveNewID(const CSeq_id& id, const string& mol0)
01302 {
01303 CRef<CBioseq> seq(new CBioseq);
01304 CRef<CSeq_id> id_copy(new CSeq_id);
01305
01306 id_copy->Assign(id);
01307 seq->SetId().push_back(id_copy);
01308 seq->SetInst().SetRepr(CSeq_inst::eRepr_virtual);
01309
01310 const string& mol = mol0.empty() ? m_DefMol : mol0;
01311 if (mol.empty() || mol == "dna") {
01312 seq->SetInst().SetMol(CSeq_inst::eMol_dna);
01313 } else if (mol == "rna") {
01314 seq->SetInst().SetMol(CSeq_inst::eMol_rna);
01315 } else if (mol == "protein") {
01316 seq->SetInst().SetMol(CSeq_inst::eMol_aa);
01317 } else {
01318 x_Warn("unrecognized sequence type " + mol + "; assuming DNA");
01319 seq->SetInst().SetMol(CSeq_inst::eMol_dna);
01320 }
01321
01322 return seq;
01323 }
01324
01325 void CGFFReader::x_SetProducts( CRef<CSeq_entry>& tse )
01326 {
01327 CTypeIterator<CSeq_feat> feat_iter(*tse);
01328 for ( ; feat_iter; ++feat_iter) {
01329 CSeq_feat& feat = *feat_iter;
01330
01331 string qual_name;
01332 switch (feat.GetData().GetSubtype()) {
01333 case CSeqFeatData::eSubtype_cdregion:
01334 qual_name = "protein_id";
01335 break;
01336
01337 case CSeqFeatData::eSubtype_mRNA:
01338 qual_name = "transcript_id";
01339 break;
01340
01341 default:
01342 continue;
01343 break;
01344 }
01345
01346 string id_str = feat.GetNamedQual(qual_name);
01347 if ( !id_str.empty() ) {
01348 CRef<CSeq_id> id = x_ResolveSeqName(id_str);
01349 feat.SetProduct().SetWhole(*id);
01350 }
01351 }
01352 }
01353
01354 void CGFFReader::x_CreateGeneFeatures( CRef<CSeq_entry>& tse )
01355 {
01356 CTypeIterator<CSeq_annot> annot_iter(*tse);
01357 for ( ; annot_iter; ++annot_iter) {
01358 CSeq_annot& annot = *annot_iter;
01359 if (annot.GetData().Which() != CSeq_annot::TData::e_Ftable) {
01360 continue;
01361 }
01362
01363
01364 CSeq_annot::TData::TFtable::iterator feat_iter =
01365 annot.SetData().SetFtable().begin();
01366 CSeq_annot::TData::TFtable::iterator feat_end =
01367 annot.SetData().SetFtable().end();
01368
01369
01370
01371
01372 typedef map<string, CRef<CSeq_feat> > TGeneMap;
01373 TGeneMap genes;
01374 for (bool has_genes = false;
01375 feat_iter != feat_end && !has_genes; ++feat_iter) {
01376 CSeq_feat& feat = **feat_iter;
01377
01378 switch (feat.GetData().GetSubtype()) {
01379 case CSeqFeatData::eSubtype_gene:
01380
01381 has_genes = true;
01382 genes.clear();
01383 break;
01384
01385 case CSeqFeatData::eSubtype_mRNA:
01386 case CSeqFeatData::eSubtype_cdregion:
01387
01388
01389
01390 if ( !feat.GetGeneXref() ) {
01391 continue;
01392 }
01393 {{
01394 string gene_id;
01395 feat.GetGeneXref()->GetLabel(&gene_id);
01396 _ASSERT( !gene_id.empty() );
01397 TSeqRange range = feat.GetLocation().GetTotalRange();
01398
01399 ENa_strand strand = feat.GetLocation().GetStrand();
01400 const CSeq_id* id = feat.GetLocation().GetId();
01401 if ( !id ) {
01402 x_Error("No consistent ID found; gene feature skipped");
01403 continue;
01404 }
01405
01406 TGeneMap::iterator iter = genes.find(gene_id);
01407 if (iter == genes.end()) {
01408
01409 CRef<CSeq_feat> gene(new CSeq_feat());
01410 gene->SetData().SetGene().Assign(*feat.GetGeneXref());
01411
01412 gene->SetLocation().SetInt().SetFrom(range.GetFrom());
01413 gene->SetLocation().SetInt().SetTo (range.GetTo());
01414 gene->SetLocation().SetId(*id);
01415 gene->SetLocation().SetInt().SetStrand(strand);
01416 genes[gene_id] = gene;
01417 } else {
01418
01419 CRef<CSeq_feat> gene = iter->second;
01420
01421 TSeqRange r2 = gene->GetLocation().GetTotalRange();
01422 range += r2;
01423 gene->SetLocation().SetInt().SetFrom(range.GetFrom());
01424 gene->SetLocation().SetInt().SetTo (range.GetTo());
01425 gene->SetLocation().InvalidateTotalRangeCache();
01426 }
01427 }}
01428 break;
01429
01430 default:
01431 break;
01432 }
01433 }
01434
01435 ITERATE (TGeneMap, iter, genes) {
01436 annot.SetData().SetFtable().push_back(iter->second);
01437 }
01438 }
01439 }
01440
01441 void CGFFReader::x_RemapGeneRefs( CRef<CSeq_entry>& tse, TGeneRefs& gene_refs )
01442 {
01443 if ( !tse || gene_refs.empty() ) {
01444 return;
01445 }
01446 NON_CONST_ITERATE (TGeneRefs, iter, gene_refs) {
01447 if ( !iter->second->IsSetLocus() &&
01448 !iter->second->IsSetLocus_tag()) {
01449 iter->second->SetLocus(iter->first);
01450 } else if ( !iter->second->IsSetLocus() ||
01451 iter->second->GetLocus() != iter->first) {
01452 iter->second->SetSyn().push_back(iter->first);
01453 }
01454 }
01455
01456 CTypeIterator<CSeq_feat> feat_iter(*tse);
01457 for ( ; feat_iter; ++feat_iter) {
01458 const CGene_ref* ref = NULL;
01459 if (feat_iter->GetData().IsGene()) {
01460 ref = &feat_iter->GetData().GetGene();
01461 } else {
01462 ref = feat_iter->GetGeneXref();
01463 }
01464 if (ref && ref->IsSetLocus()) {
01465 TGeneRefs::const_iterator iter =
01466 gene_refs.find(ref->GetLocus());
01467 if (iter != gene_refs.end()) {
01468 const_cast<CGene_ref*>(ref)->Assign(*iter->second);
01469 }
01470 }
01471 }
01472 }
01473
01474 void CGFFReader::x_PlaceSeq(CBioseq& seq)
01475 {
01476 bool found = false;
01477 for (CTypeConstIterator<CBioseq> it(*m_TSE); it; ++it) {
01478 if (&*it == &seq) {
01479 found = true;
01480 BREAK(it);
01481 }
01482 }
01483 if ( !found ) {
01484 CRef<CSeq_entry> se(new CSeq_entry);
01485 se->SetSeq(seq);
01486 m_TSE->SetSet().SetSeq_set().push_back(se);
01487 }
01488 }
01489
01490
01491 CGFFReader::SRecord::TAttrs::const_iterator
01492 CGFFReader::SRecord::FindAttribute(const string& att_name, size_t min_values)
01493 const
01494 {
01495 SRecord::TAttrs::const_iterator it
01496 = attrs.lower_bound(vector<string>(1, att_name));
01497 while (it != attrs.end() && it->front() == att_name
01498 && it->size() <= min_values) {
01499 ++it;
01500 }
01501 return (it == attrs.end() || it->front() == att_name) ? it : attrs.end();
01502 }
01503
01504
01505 bool
01506 CGFFReader::x_IsLineUcscMetaInformation(const TStr& line)
01507 {
01508
01509 return (NStr::StartsWith(line, "browser ") || NStr::StartsWith(line, "track ") );
01510 }
01511
01512
01513 END_SCOPE(objects)
01514 END_NCBI_SCOPE
01515
01516