src/objtools/readers/gff_reader.cpp

Go to the documentation of this file.
00001 /*  $Id: gff_reader.cpp 173623 2009-10-20 01:42:01Z dicuccio $
00002 * ===========================================================================
00003 *
00004 *                            PUBLIC DOMAIN NOTICE
00005 *               National Center for Biotechnology Information
00006 *
00007 *  This software/database is a "United States Government Work" under the
00008 *  terms of the United States Copyright Act.  It was written as part of
00009 *  the author's official duties as a United States Government employee and
00010 *  thus cannot be copyrighted.  This software/database is freely available
00011 *  to the public for use. The National Library of Medicine and the U.S.
00012 *  Government have not placed any restriction on its use or reproduction.
00013 *
00014 *  Although all reasonable efforts have been taken to ensure the accuracy
00015 *  and reliability of the software and data, the NLM and the U.S.
00016 *  Government do not and cannot warrant the performance or results that
00017 *  may be obtained by using this software or data. The NLM and the U.S.
00018 *  Government disclaim all warranties, express or implied, including
00019 *  warranties of performance, merchantability or fitness for any particular
00020 *  purpose.
00021 *
00022 *  Please cite the author in any work or product based on this material.
00023 *
00024 * ===========================================================================
00025 *
00026 * Authors:  Aaron Ucko, Wratko Hlavina
00027 *
00028 * File Description:
00029 *   Reader for GFF (including GTF) files.
00030 *
00031 * ===========================================================================
00032 */
00033 
00034 #include <ncbi_pch.hpp>
00035 #include <objtools/readers/gff_reader.hpp>
00036 
00037 #include <corelib/ncbistr_util.hpp>
00038 #include <corelib/ncbitime.hpp>
00039 #include <corelib/ncbiutil.hpp>
00040 #include <corelib/stream_utils.hpp>
00041 #include <serial/iterator.hpp>
00042 
00043 #include <objects/general/Date.hpp>
00044 #include <objects/general/Object_id.hpp>
00045 #include <objects/seq/Seq_annot.hpp>
00046 #include <objects/seq/Seq_descr.hpp>
00047 #include <objects/seq/Seq_inst.hpp>
00048 #include <objects/seq/Seqdesc.hpp>
00049 #include <objects/seqalign/Dense_seg.hpp>
00050 #include <objects/seqalign/Score.hpp>
00051 #include <objects/seqalign/Std_seg.hpp>
00052 #include <objects/seqfeat/Feat_id.hpp>
00053 #include <objects/seqfeat/Cdregion.hpp>
00054 #include <objects/seqfeat/SeqFeatXref.hpp>
00055 #include <objects/seqfeat/Gb_qual.hpp>
00056 #include <objects/seqloc/Seq_interval.hpp>
00057 #include <objects/seqloc/Seq_point.hpp>
00058 #include <objects/seqset/Bioseq_set.hpp>
00059 
00060 #include <objtools/readers/cigar.hpp>
00061 #include <objtools/readers/fasta.hpp>
00062 #include <objtools/readers/readfeat.hpp>
00063 #include <objtools/error_codes.hpp>
00064 
00065 #include <algorithm>
00066 #include <ctype.h>
00067 
00068 
00069 #define NCBI_USE_ERRCODE_X   Objtools_Rd_GFF
00070 
00071 BEGIN_NCBI_SCOPE
00072 BEGIN_SCOPE(objects)
00073 
00074 static CRef<CFeat_id>
00075 s_StringToFeatId( const string& str )
00076 {
00077     CRef<CObject_id> objid( new CObject_id );
00078     objid->SetStr( str );
00079     CRef<CFeat_id> featid( new CFeat_id );
00080     featid->SetLocal( *objid );
00081     return featid;
00082 }    
00083 
00084 static string& s_URLDecode(const CTempString& s, string& out) {
00085     SIZE_TYPE pos = 0;
00086     out.erase();
00087     out.reserve(s.size());
00088     while (pos < s.size()) {
00089         SIZE_TYPE pos2 = s.find_first_of("%" /* "+" */, pos);
00090         out += s.substr(pos, pos2 - pos);
00091         if (pos2 == NPOS) {
00092             break;
00093         } else if (s[pos2] == '+') { // disabled -- often used literally
00094             out += ' ';
00095             pos = pos2 + 1;
00096         } else if (s[pos2] == '%') {
00097             try {
00098                 out += (char)NStr::StringToInt(s.substr(pos2 + 1, 2), 0, 16);
00099                 pos = pos2 + 3;
00100             } catch (CStringException&) {
00101                 // some sources neglect to encode % (!)
00102                 out += '%';
00103                 pos = pos2 + 1;
00104             }
00105         } else {
00106             _TROUBLE;
00107         }
00108     }
00109     return out;
00110 }
00111 
00112 
00113 CRef<CSeq_entry> CGFFReader::Read(CNcbiIstream& in, TFlags flags)
00114 {
00115     CStreamLineReader lr(in);
00116     return Read(lr, flags);
00117 }
00118 
00119 CRef<CSeq_entry> CGFFReader::Read(ILineReader& in, TFlags flags)
00120 {
          // Reads GFF input from IN until end of input and returns the Seq-entry
          // that was built.  Phases, in order:
          //   1. line-by-line parsing; records that carry an ID are held back in
          //      m_DelayedRecords and merged with later records of the same ID;
          //   2. range merging and placement of the delayed records;
          //   3. reconciliation of the gene refs collected in m_GeneRefs;
          //   4. optional post-processing selected by FLAGS (fSetProducts,
          //      fCreateGeneFeats).
          // The reader's state is cleared with x_Reset() both before parsing and
          // before the post-processing phase.
00121     x_Reset();
00122     m_Flags  = flags;
00123     m_LineReader = &in; // kept so structured comments (##FASTA) can read on
00124 
00125     if (m_Flags & fSetVersion3) { // x_Reset() defaulted m_Version to 2
00126         m_Version = 3;
00127     }
00128 
00129     TStr line;
00130     while ( !in.AtEOF() ) {
00131         ++m_LineNumber; // NOTE(review): bumped once per pass, so lines consumed inside x_ReadFastaSequences() do not appear to be counted
00132         char c = in.PeekChar(); // dispatch on the first character of the upcoming line
00133         if (c == '#') {
00134             line = *++in;
00135             if (line.size() > 2  &&  line[1] == '#') {
00136                 x_ParseStructuredComment(line);
00137                 // ignore regular comments
00138             }
00139         } else if (c == '>') {
00140             // implicit ##FASTA
00141             x_ReadFastaSequences(in);
00142         } else {
00143             line = *++in;            
00144             if ( x_IsLineUcscMetaInformation(line) ) {
00145                 // UCSC browser or track line. For now, we ignore those.
00146                 continue;
00147             }
00148             if ( line.empty() ) {
00149                 // too commonly used for file formatting to even warn about
00150                 continue;
00151             }
00152             CRef<SRecord> record = x_ParseFeatureInterval(line);
00153             if (record) { // null means the line was rejected by the parser
00154                 
00155                 if (record->id.empty()) {
                          // no ID: nothing to merge with, so place it right away
00156                     x_ParseAndPlace(*record);
00157                 } else {
                          // the first record seen for an ID is stored; later ones
                          // with the same ID are merged into it
00158                     CRef<SRecord>& match = m_DelayedRecords[ record->id ];
00159                     // _TRACE(id << " -> " << match.GetPointer());
00160                     if (match) {
00161                         x_MergeRecords(*match, *record);
00162                     } else {
00163                         match.Reset(record);
00164                     }
00165                 }
00166             }
00167         }
00168     }
00169 
          // End of input: finalize and place every delayed record.
00170     NON_CONST_ITERATE (TDelayedRecords, it, m_DelayedRecords) {
00171         SRecord& rec = *it->second;
00172         /// merge mergeable ranges
00173         NON_CONST_ITERATE (SRecord::TLoc, loc_iter, rec.loc) {
00174             ITERATE (set<TSeqRange>, src_iter, loc_iter->merge_ranges) {
00175                 TSeqRange range(*src_iter);
00176                 set<TSeqRange>::iterator dst_iter =
00177                     loc_iter->ranges.begin();
00178                 for ( ;  dst_iter != loc_iter->ranges.end();  ) {
                          // r is RANGE combined with the candidate (operator+=).
                          // If that is no longer than the two lengths added up,
                          // the candidate is absorbed into RANGE.
00179                     TSeqRange r(range);
00180                     r += *dst_iter;
00181                     if (r.GetLength() <=
00182                         range.GetLength() + dst_iter->GetLength()) {
00183                         range += *dst_iter;
00184                         _TRACE("merging overlapping ranges: "
00185                                << range.GetFrom() << " - "
00186                                << range.GetTo() << " <-> "
00187                                << dst_iter->GetFrom() << " - "
00188                                << dst_iter->GetTo());
00189                         loc_iter->ranges.erase(dst_iter++); // post-increment: step off the element before it is erased
00190                         break; // only the first matching range is absorbed
00191                     } else {
00192                         ++dst_iter;
00193                     }
00194                 }
00195                 loc_iter->ranges.insert(range); // merged or not, RANGE ends up in the set
00196             }
00197         }
00198 
00199         if (rec.key == "exon") { // delayed "exon" records are placed under the key "mRNA"
00200             rec.key = "mRNA";
00201         }
00202         x_ParseAndPlace(rec);
00203     }
00204 
00205     ///
00206     /// remap gene refs
00207     /// we have built a set of gene-id -> gene-ref pairs
00208     ///
00209     if (m_TSE  &&  m_GeneRefs.size()) {
              // First pass: make sure every collected gene ref mentions its gene
              // id -- as the locus if it has neither locus nor locus_tag,
              // otherwise as a synonym when the locus is unset or differs.
00210         NON_CONST_ITERATE (TGeneRefs, iter, m_GeneRefs) {
00211             if ( !iter->second->IsSetLocus()  &&
00212                  !iter->second->IsSetLocus_tag()) {
00213                 iter->second->SetLocus(iter->first);
00214             } else if ( !iter->second->IsSetLocus()  ||
00215                         iter->second->GetLocus() != iter->first) {
00216                 iter->second->SetSyn().push_back(iter->first);
00217             }
00218         }
00219 
              // Second pass: every gene feature / gene xref whose locus is a key
              // of m_GeneRefs is overwritten with the collected gene ref.
00220         CTypeIterator<CSeq_feat> feat_iter(*m_TSE);
00221         for ( ;  feat_iter;  ++feat_iter) {
00222             const CGene_ref* ref = NULL;
00223             if (feat_iter->GetData().IsGene()) {
00224                 ref = &feat_iter->GetData().GetGene();
00225             } else {
00226                 ref = feat_iter->GetGeneXref();
00227             }
00228             if (ref  &&  ref->IsSetLocus()) {
00229                 TGeneRefs::const_iterator iter =
00230                     m_GeneRefs.find(ref->GetLocus());
00231                 if (iter != m_GeneRefs.end()) {
                          // REF came from const accessors; const is cast away to
                          // overwrite the object in place
00232                     const_cast<CGene_ref*>(ref)->Assign(*iter->second);
00233                 }
00234             }
00235         }
00236     }
00237 
00238     CRef<CSeq_entry> tse(m_TSE); // need to save before resetting.
00239     x_Reset(); // replaces m_TSE with a fresh entry and clears the caches
00240 
00241     // promote transcript_id and protein_id to products
00242     if (flags & fSetProducts) {
00243         CTypeIterator<CSeq_feat> feat_iter(*tse);
00244         for ( ;  feat_iter;  ++feat_iter) {
00245             CSeq_feat& feat = *feat_iter;
00246 
                  // CDS features take their product from "protein_id", mRNA
                  // features from "transcript_id"; other features are skipped
00247             string qual_name;
00248             switch (feat.GetData().GetSubtype()) {
00249             case CSeqFeatData::eSubtype_cdregion:
00250                 qual_name = "protein_id";
00251                 break;
00252 
00253             case CSeqFeatData::eSubtype_mRNA:
00254                 qual_name = "transcript_id";
00255                 break;
00256 
00257             default:
00258                 continue; // next feature
00259                 break;
00260             }
00261 
00262             string id_str = feat.GetNamedQual(qual_name);
00263             if ( !id_str.empty() ) {
00264                 CRef<CSeq_id> id = x_ResolveSeqName(id_str);
00265                 feat.SetProduct().SetWhole(*id);
00266             }
00267         }
00268     }
00269 
00270     if (flags & fCreateGeneFeats) {
00271         CTypeIterator<CSeq_annot> annot_iter(*tse);
00272         for ( ;  annot_iter;  ++annot_iter) {
00273             CSeq_annot& annot = *annot_iter;
00274             if (annot.GetData().Which() != CSeq_annot::TData::e_Ftable) {
00275                 continue; // only feature tables are of interest
00276             }
00277 
00278             // we work within the scope of one annotation
00279             CSeq_annot::TData::TFtable::iterator feat_iter = 
00280                 annot.SetData().SetFtable().begin();
00281             CSeq_annot::TData::TFtable::iterator feat_end = 
00282                 annot.SetData().SetFtable().end();
00283 
00284             /// we plan to create a series of gene features, one for each gene
00285             /// identified above
00286             /// genes are identified via a 'gene_id' marker
00287             typedef map<string, CRef<CSeq_feat> > TGeneMap;
00288             TGeneMap genes;
                  // the scan stops as soon as an existing gene feature is seen
00289             for (bool has_genes = false;
00290                  feat_iter != feat_end  &&  !has_genes;  ++feat_iter) {
00291                 CSeq_feat& feat = **feat_iter;
00292 
00293                 switch (feat.GetData().GetSubtype()) {
00294                 case CSeqFeatData::eSubtype_gene:
00295                     /// we already have genes, so don't add any more
00296                     has_genes = true;
00297                     genes.clear(); // drop any genes synthesized so far for this annotation
00298                     break;
00299 
00300                 case CSeqFeatData::eSubtype_mRNA:
00301                 case CSeqFeatData::eSubtype_cdregion:
00302                     /// for mRNA and CDS features, create a gene
00303                     /// this is only done if the gene_id parameter was set
00304                     /// in parsing, we promote gene_id to a gene xref
00305                     if ( !feat.GetGeneXref() ) {
00306                         continue; // continues the enclosing for loop, not just the switch
00307                     }
00308                     {{
00309                         string gene_id;
00310                         feat.GetGeneXref()->GetLabel(&gene_id);
00311                         _ASSERT( !gene_id.empty() );
00312                         TSeqRange range = feat.GetLocation().GetTotalRange();
00313 
00314                         ENa_strand strand = feat.GetLocation().GetStrand();
00315                         const CSeq_id* id = feat.GetLocation().GetId();
00316                         if ( !id ) {
00317                             x_Error("No consistent ID found; gene feature skipped");
00318                             continue;
00319                         }
00320 
00321                         TGeneMap::iterator iter = genes.find(gene_id);
00322                         if (iter == genes.end()) {
00323                             /// new gene feature
00324                             CRef<CSeq_feat> gene(new CSeq_feat());
00325                             gene->SetData().SetGene().Assign(*feat.GetGeneXref());
00326 
00327                             gene->SetLocation().SetInt().SetFrom(range.GetFrom());
00328                             gene->SetLocation().SetInt().SetTo  (range.GetTo());
00329                             gene->SetLocation().SetId(*id);
00330                             gene->SetLocation().SetInt().SetStrand(strand);
00331                             genes[gene_id] = gene;
00332                         } else {
00333                             /// we agglomerate the old location
                                  // (only from/to are widened; the strand set when
                                  // the gene was created is left as is)
00334                             CRef<CSeq_feat> gene = iter->second;
00335 
00336                             TSeqRange r2 = gene->GetLocation().GetTotalRange();
00337                             range += r2;
00338                             gene->SetLocation().SetInt().SetFrom(range.GetFrom());
00339                             gene->SetLocation().SetInt().SetTo  (range.GetTo());
00340                             gene->SetLocation().InvalidateTotalRangeCache();
00341                         }
00342                     }}
00343                     break;
00344 
00345                 default:
00346                     break;
00347                 }
00348             }
00349 
                  // append the synthesized genes (none if has_genes cleared the map)
00350             ITERATE (TGeneMap, iter, genes) {
00351                 annot.SetData().SetFtable().push_back(iter->second);
00352             }
00353         }
00354     }
00355 
00356     return tse;
00357 }
00358 
00359 
00360 void CGFFReader::x_Warn(const string& message, unsigned int line)
00361 {
00362     if (line) {
00363         ERR_POST_X(2, Warning << message << " [GFF input, line " << line << ']');
00364     } else {
00365         ERR_POST_X(3, Warning << message << " [GFF input]");
00366     }
00367 }
00368 
00369 
00370 void CGFFReader::x_Error(const string& message, unsigned int line)
00371 {
00372     if (line) {
00373         ERR_POST_X(1, Error << message << " [GFF input, line " << line << ']');
00374     } else {
00375         ERR_POST_X(1, Error << message << " [GFF input]");
00376     }
00377 }
00378 
00379 
00380 void CGFFReader::x_Info(const string& message, unsigned int line)
00381 {
00382     if (line) {
00383         ERR_POST_X(1, Info << message << " [GFF input, line " << line << ']');
00384     } else {
00385         ERR_POST_X(1, Info << message << " [GFF input]");
00386     }
00387 }
00388 
00389 
00390 void CGFFReader::x_Reset(void)
00391 {
00392     m_TSE.Reset(new CSeq_entry);
00393     m_SeqNameCache.clear();
00394     m_SeqCache.clear();
00395     m_DelayedRecords.clear();
00396     m_GeneRefs.clear();
00397     m_DefMol.erase();
00398     m_LineNumber = 0;
00399     m_Version = 2;
00400 }
00401 
00402 
00403 bool CGFFReader::x_ParseStructuredComment(const TStr& line)
00404 {
00405     if ( line.empty() || line[0] != '#' || line[1] != '#' ) {
00406         return false;
00407     }
00408     TStrVec v;
00409     // NStr::Tokenize(line, "# \t", v, NStr::eMergeDelims);
00410     typedef CStrTokenize<TStr, TStrVec> TTokenizer;
00411     TTokenizer::TPosContainer pos_container;
00412     TTokenizer::Do(line, "# \t", v, TTokenizer::eMergeDelims, pos_container);
00413     if (v.empty()) {
00414         return true;
00415     }
00416     if (v[0] == "date"  &&  v.size() > 1) {
00417         x_ParseDateComment(v[1]);
00418     } else if (v[0] == "Type"  &&  v.size() > 1) {
00419         x_ParseTypeComment(v[1], v.size() > 2 ? v[2] : TStr());
00420     } else if (v[0] == "gff-version"  &&  v.size() > 1) {
00421         m_Version = NStr::StringToInt(v[1]);
00422     } else if (v[0] == "FASTA") {
00423         x_ReadFastaSequences(*m_LineReader);
00424     }
00425     // etc.
00426     return true;
00427 }
00428 
00429 
00430 void CGFFReader::x_ParseDateComment(const TStr& date)
00431 {
00432     try {
00433         CRef<CSeqdesc> desc(new CSeqdesc);
00434         desc->SetUpdate_date().SetToTime(CTime(date, "Y-M-D"),
00435                                          CDate::ePrecision_day);
00436         m_TSE->SetSet().SetDescr().Set().push_back(desc);
00437     } catch (exception& e) {
00438         x_Error(string("Bad ISO date: ") + e.what(), x_GetLineNumber());
00439     }
00440 }
00441 
00442 
00443 void CGFFReader::x_ParseTypeComment(const TStr& moltype, const TStr& seqname)
00444 {
00445     if (seqname.empty()) {
00446         m_DefMol = moltype;
00447     } else {
00448         // automatically adds to m_TSE if new
00449         x_ResolveID(*x_ResolveSeqName(seqname), moltype);
00450     }
00451 }
00452 
00453 
00454 void CGFFReader::x_ReadFastaSequences(ILineReader& in)
00455 {
00456     CFastaReader reader(in, fReadFasta_AssumeNuc);
00457     CRef<CSeq_entry> seqs = reader.ReadSet();
00458     for (CTypeIterator<CBioseq> it(*seqs);  it;  ++it) {
00459         if (it->GetId().empty()) { // can this happen?
00460             CRef<CSeq_entry> parent(new CSeq_entry);
00461             parent->SetSeq(*it);
00462             m_TSE->SetSet().SetSeq_set().push_back(parent);
00463             continue;
00464         }
00465         CRef<CBioseq> our_bs = x_ResolveID(*it->GetId().front(), kEmptyStr);
00466         // keep our annotations, but replace everything else.
00467         // (XXX - should also keep mol)
00468         our_bs->SetId() = it->GetId();
00469         if (it->IsSetDescr()) {
00470             our_bs->SetDescr(it->SetDescr());
00471         }
00472         our_bs->SetInst(it->SetInst());
00473     }
00474 }
00475 
00476 
00477 CRef<CGFFReader::SRecord>
00478 CGFFReader::x_ParseFeatureInterval(const TStr& line)
00479 {
00480     typedef CStrTokenize<TStr, TStrVec> TTokenizer;
00481     TTokenizer::TPosContainer           pos_container;
00482     TStrVec                             v;
00483     bool                                misdelimited = false;
00484 
00485     TTokenizer::Do(line, "\t", v, TTokenizer::eNoMergeDelims, pos_container);
00486     if (v.size() < 8) {
00487         v.clear();
00488         TTokenizer::Do(line, " \t", v, TTokenizer::eMergeDelims, pos_container);
00489         if (v.size() < 8) {
00490             x_Error("Skipping line due to insufficient fields",
00491                    x_GetLineNumber());
00492             return null;
00493         } else if (m_Version < 3) {
00494             x_Info("(Recovered) Bad delimiters (should use tabs)", x_GetLineNumber());
00495             misdelimited = true;
00496         }
00497     } else {
00498         // XXX - warn about extra fields (if any), but only if they're
00499         // not comments
00500         // v.resize(9);
00501     }
00502 
00503     CRef<SRecord> record(x_NewRecord());
00504     string        accession;
00505     TSeqPos       from = 0, to = numeric_limits<TSeqPos>::max();
00506     ENa_strand    strand = eNa_strand_unknown;
00507     s_URLDecode(v[0], accession);
00508     record->source = v[1];
00509     record->key = v[2];
00510 
00511     try {
00512         from = NStr::StringToUInt(v[3]) - 1;
00513     } catch (std::exception& e) {
00514         x_Error(string("Bad FROM position: ") + e.what(), x_GetLineNumber());
00515     }
00516 
00517     try {
00518         to = NStr::StringToUInt(v[4]) - 1;
00519     } catch (std::exception& e) {
00520         x_Error(string("Bad TO position: ") + e.what(), x_GetLineNumber());
00521     }
00522 
00523     record->score = v[5];
00524 
00525     if (v[6] == "+") {
00526         strand = eNa_strand_plus;
00527     } else if (v[6] == "-") {
00528         strand = eNa_strand_minus;
00529     } else if ( !(v[6] == ".") ) {
00530         x_Warn("Bad strand " + string(v[6]) + " (should be [+-.])",
00531                x_GetLineNumber());
00532     }
00533 
00534     if (v[7] == "0"  ||  v[7] == "1"  ||  v[7] == "2") {
00535         record->frame = v[7][0] - '0';
00536     } else if (v[7] == ".") {
00537         record->frame = -1;
00538     } else {
00539         x_Warn("Bad frame " + string(v[7]) + " (should be [012.])",
00540                x_GetLineNumber());
00541         record->frame = -1;
00542     }
00543 
00544     {{
00545         SRecord::SSubLoc subloc;
00546         subloc.accession = accession;
00547         subloc.strand    = strand;
00548         subloc.ranges.insert(TSeqRange(from, to));
00549 
00550         record->loc.push_back(subloc);
00551     }}
00552 
00553     SIZE_TYPE i = 8;
00554     if (m_Version >= 3) {
00555         x_ParseV3Attributes(*record, v, i);
00556     } else {
00557         x_ParseV2Attributes(*record, v, i);
00558     }
00559 
00560     if ( !misdelimited  &&  (i > 9  ||  (i == 9  &&  v.size() > 9
00561                                          &&  !NStr::StartsWith(v[9], "#") ))) {
00562         x_Warn("Extra non-comment fields", x_GetLineNumber());
00563     }
00564 
00565     if (record->FindAttribute("Target") != record->attrs.end()) {
00566         record->type = SRecord::eAlign;
00567     } else {
00568         record->type = SRecord::eFeat;
00569     } 
00570 
00571     // extracting additional gff3 attributes
00572     if (m_Version == 3) {
00573         SRecord::TAttrs::const_iterator id_it = record->FindAttribute("ID");
00574         if (id_it != record->attrs.end()) {
00575             record->id = (*id_it)[1];
00576         }
00577     
00578         SRecord::TAttrs::const_iterator parent_it = record->FindAttribute("Parent");
00579         if (parent_it != record->attrs.end()) {
00580             record->parent = (*parent_it)[1];
00581         }
00582 
00583         SRecord::TAttrs::const_iterator name_it = record->FindAttribute("Name");
00584         if (name_it != record->attrs.end()) {
00585             record->name = (*name_it)[1];
00586         }        
00587     }
00588 
00589     record->line_no = m_LineNumber;
00590     record->id = x_FeatureID(*record);
00591     return record;
00592 }
00593 
00594 
00595 CRef<CSeq_feat> CGFFReader::x_ParseFeatRecord(const SRecord& record)
00596 {
00597     CRef<CSeq_feat> feat(CFeature_table_reader::CreateSeqFeat
00598                          (record.key, *x_ResolveLoc(record.loc),
00599                           CFeature_table_reader::fTranslateBadKey));
00600     if (record.frame >= 0  &&  feat->GetData().IsCdregion()) {
00601         feat->SetData().SetCdregion().SetFrame
00602             (static_cast<CCdregion::EFrame>(record.frame + 1));
00603     }
00604     if ( m_Version == 3 ) {
00605         ITERATE (SRecord::TAttrs, it, record.attrs) {
00606             string tag = it->front();
00607             if (tag == "ID") {
00608                 feat->SetId( *s_StringToFeatId( (*it)[1] ) );
00609             }
00610             if (tag == "Parent") {
00611                 CRef<CSeqFeatXref> xref( new CSeqFeatXref );
00612                 xref->SetId( *s_StringToFeatId( (*it)[1] ) );
00613                 feat->SetXref().push_back( xref );
00614             }
00615         }
00616     }
00617 
00618     string gene_id;
00619     string gene;
00620     string locus_tag;
00621     ITERATE (SRecord::TAttrs, it, record.attrs) {
00622         string tag = it->front();
00623         string value;
00624         switch (it->size()) {
00625         case 1:
00626             break;
00627         case 2:
00628             value = (*it)[1];
00629             break;
00630         default:
00631             x_Warn("Ignoring extra fields in value of " + tag, record.line_no);
00632             value = (*it)[1];
00633             break;
00634         }
00635         if (x_GetFlags() & fGBQuals) {
00636             if (tag == "transcript_id") {
00637                 //continue;
00638             } else if (tag == "gene_id") {
00639                 gene_id = value;
00640                 continue;
00641             } else if (tag == "gene") {
00642                 gene = value;
00643                 continue;
00644             } else if (tag == "locus_tag") {
00645                 locus_tag = value;
00646                 continue;
00647             } else if (tag == "exon_number") {
00648                 tag = "number";
00649             } else if (NStr::StartsWith(tag, "insd_")) {
00650                 tag.erase(0, 5);
00651             }
00652 
00653             CFeature_table_reader::AddFeatQual
00654                 (feat, tag, value, CFeature_table_reader::fKeepBadKey);
00655         } else { // don't attempt to parse, just treat as imported
00656             CRef<CGb_qual> qual(new CGb_qual);
00657             qual->SetQual(tag);
00658             qual->SetVal(value);
00659             feat->SetQual().push_back(qual);
00660         }
00661     }
00662 
00663     if ( !gene_id.empty() ) {
00664         SIZE_TYPE colon = gene_id.find(':');
00665         if (colon != NPOS) {
00666             gene_id.erase(0, colon + 1);
00667         }
00668 
00669         TGeneRefs::value_type val(gene_id, CRef<CGene_ref>());
00670         TGeneRefs::iterator iter = m_GeneRefs.insert(val).first;
00671         if ( !iter->second ) {
00672             iter->second.Reset(new CGene_ref);
00673         }
00674         if ( !gene.empty() ) {
00675             if (iter->second->IsSetLocus()  &&
00676                 iter->second->GetLocus() != gene) {
00677                 LOG_POST_X(4, Warning << "CGFFReader::x_ParseFeatRecord(): "
00678                            << "inconsistent gene name: "
00679                            << gene << " != " << iter->second->GetLocus()
00680                            << ", ignoring second");
00681             } else if ( !iter->second->IsSetLocus() ) {
00682                 iter->second->SetLocus(gene);
00683             }
00684         }
00685         if ( !locus_tag.empty() ) {
00686             if (iter->second->IsSetLocus_tag()  &&
00687                 iter->second->GetLocus_tag() != locus_tag) {
00688                 LOG_POST_X(5, Warning << "CGFFReader::x_ParseFeatRecord(): "
00689                            << "inconsistent locus tag: "
00690                            << locus_tag << " != " << iter->second->GetLocus_tag()
00691                            << ", ignoring second");
00692             } else if ( !iter->second->IsSetLocus_tag() ) {
00693                 iter->second->SetLocus_tag(locus_tag);
00694             }
00695         }
00696 
00697         // translate
00698         CFeature_table_reader::AddFeatQual
00699             (feat, "gene_id", gene_id,
00700              CFeature_table_reader::fKeepBadKey);
00701         if (x_GetFlags() & fGBQuals) {
00702             CFeature_table_reader::AddFeatQual
00703                 (feat, "gene", gene_id,
00704                  CFeature_table_reader::fKeepBadKey);
00705         }
00706     }
00707 
00708     return feat;
00709 }
00710 
00711 
00712 CRef<CSeq_align> CGFFReader::x_ParseAlignRecord(const SRecord& record)
00713 {
00714     CRef<CSeq_align> align(new CSeq_align);
00715     align->SetType(CSeq_align::eType_partial);
00716     align->SetDim(2);
00717     SRecord::TAttrs::const_iterator tgit = record.FindAttribute("Target");
00718     vector<string> target;
00719     if (tgit != record.attrs.end()) {
00720         NStr::Tokenize((*tgit)[1], " +-", target, NStr::eMergeDelims);
00721     }
00722     if (target.size() != 3) {
00723         x_Warn("Bad Target attribute", record.line_no);
00724         return align;
00725     }
00726     CRef<CSeq_id> tgid    = x_ResolveSeqName(target[0]);
00727     TSeqPos       tgstart = NStr::StringToUInt(target[1]) - 1;
00728     TSeqPos       tgstop  = NStr::StringToUInt(target[2]) - 1;
00729     TSeqPos       tglen   = tgstop - tgstart + 1;
00730 
00731     CRef<CSeq_loc> refloc = x_ResolveLoc(record.loc);
00732     CRef<CSeq_id>  refid(&refloc->SetInt().SetId());
00733     TSeqPos        reflen = 0;
00734     for (CSeq_loc_CI it(*refloc);  it;  ++it) {
00735         reflen += it.GetRange().GetLength();
00736     }
00737 
00738     CRef<CSeq_loc> tgloc(new CSeq_loc);
00739     tgloc->SetInt().SetId(*tgid);
00740     tgloc->SetInt().SetFrom(tgstart);
00741     tgloc->SetInt().SetTo(tgstop);
00742 
00743     SRecord::TAttrs::const_iterator gap_it = record.FindAttribute("Gap");
00744     if (gap_it == record.attrs.end()) {
00745         // single ungapped alignment
00746         if (reflen == tglen  &&  refloc->IsInt()) {
00747             CDense_seg& ds = align->SetSegs().SetDenseg();
00748             ds.SetNumseg(1);
00749             ds.SetIds().push_back(refid);
00750             ds.SetIds().push_back(tgid);
00751             ds.SetStarts().push_back(refloc->GetInt().GetFrom());
00752             ds.SetStarts().push_back(tgstart);
00753             ds.SetLens().push_back(reflen);
00754             if (refloc->GetInt().IsSetStrand()) {
00755                 ds.SetStrands().push_back(refloc->GetInt().GetStrand());
00756                 ds.SetStrands().push_back(eNa_strand_plus);
00757             }
00758         } else {
00759             if (reflen != tglen  &&  reflen != 3 * tglen) {
00760                 x_Warn("Reference and target locations have an irregular"
00761                        " ratio.", record.line_no);
00762             }
00763             CRef<CStd_seg> ss(new CStd_seg);
00764             ss->SetLoc().push_back(refloc);
00765             ss->SetLoc().push_back(tgloc);
00766             align->SetSegs().SetStd().push_back(ss);
00767         }
00768     } else {
00769         SCigarAlignment cigar
00770             ((*gap_it)[1], SCigarAlignment::eOpFirstIfAmbiguous);
00771         align = cigar(refloc->GetInt(), tgloc->GetInt());
00772     }
00773 
00774     try {
00775         CRef<CScore> score(new CScore);
00776         score->SetValue().SetReal(NStr::StringToDouble(record.score));
00777         align->SetScore().push_back(score);
00778     } catch (...) {
00779     }
00780 
00781     return align;
00782 }
00783 
00784 
00785 CRef<CSeq_loc> CGFFReader::x_ResolveLoc(const SRecord::TLoc& loc)
00786 {
00787     CRef<CSeq_loc> seqloc(new CSeq_loc);
00788     ITERATE (SRecord::TLoc, it, loc) {
00789         CRef<CSeq_id> id = x_ResolveSeqName(it->accession);
00790         ITERATE (set<TSeqRange>, range, it->ranges) {
00791             CRef<CSeq_loc> segment(new CSeq_loc);
00792             if (range->GetLength() == 1) {
00793                 CSeq_point& pnt = segment->SetPnt();
00794                 pnt.SetId   (*id);
00795                 pnt.SetPoint(range->GetFrom());
00796                 if (it->strand != eNa_strand_unknown) {
00797                     pnt.SetStrand(it->strand);
00798                 }
00799             } else {
00800                 CSeq_interval& si = segment->SetInt();
00801                 si.SetId  (*id);
00802                 si.SetFrom(range->GetFrom());
00803                 si.SetTo  (range->GetTo());
00804                 if (it->strand != eNa_strand_unknown) {
00805                     si.SetStrand(it->strand);
00806                 }
00807             }
00808             if (IsReverse(it->strand)) {
00809                 seqloc->SetMix().Set().push_front(segment);
00810             } else {
00811                 seqloc->SetMix().Set().push_back(segment);
00812             }
00813         }
00814     }
00815 
00816     if (seqloc->GetMix().Get().size() == 1) {
00817         return seqloc->SetMix().Set().front();
00818     } else {
00819         return seqloc;
00820     }
00821 }
00822 
00823 
/// Parse GFF2/GTF style attributes starting at field v[i] and add them to
/// record via x_AddAttribute().
///
/// All remaining fields are consumed; i is advanced in place.  An attribute
/// is a tag followed by zero or more values.  Values may be quoted, and a
/// quoted value may extend over several fields.  Attributes are terminated
/// by ';' or by the end of a field, and '#' ends parsing.
///
/// Parser state:
///  - attr_values:     tokens (tag first) collected for the current attribute
///  - attr_last_value: text of the token currently being assembled
///  - quote_char:      the open quote character, or 0 when outside quotes
00824 void CGFFReader::x_ParseV2Attributes(SRecord& record, const TStrVec& v,
00825                                      SIZE_TYPE& i)
00826 {
00827     string         attr_last_value;
00828     vector<string> attr_values;
00829     char           quote_char = 0;
00830 
00831     for (;  i < v.size();  ++i) {
          // The appended blank guarantees that every find_first_of() below
          // finds a delimiter.
00832         string s = string(v[i]) + ' ';
00833         SIZE_TYPE pos = 0;
00834         while (pos < s.size()) {
00835             SIZE_TYPE pos2;
00836             if (quote_char) { // must be inside a value
00837                 pos2 = s.find_first_of(" \'\"\\", pos);
00838                 _ASSERT(pos2 != NPOS); // due to trailing space
00839                 if (s[pos2] == quote_char) {
                      // Closing quote: the value is complete.
00840                     if (attr_values.empty()) {
00841                         x_Warn("quoted attribute tag " + attr_last_value,
00842                                x_GetLineNumber());
00843                     }
00844                     quote_char = 0;
00845                     attr_last_value += s.substr(pos, pos2 - pos);
                      // Resolve escapes; if that fails, keep the raw text
                      // and warn.
00846                     try {
00847                         attr_values.push_back(NStr::ParseEscapes
00848                                               (attr_last_value));
00849                     } catch (CStringException& e) {
00850                         attr_values.push_back(attr_last_value);
00851                         x_Warn(e.what() + (" in value of " + attr_values[0]),
00852                                x_GetLineNumber());
00853                     }
00854                     attr_last_value.erase();
00855                 } else if (s[pos2] == '\\') {
                      // Keep the backslash and the character after it
                      // verbatim so that the escaped character cannot be
                      // taken for a closing quote; ParseEscapes() above
                      // interprets the sequence later.
00856                     _VERIFY(++pos2 != s.size());
00857                     attr_last_value += s.substr(pos, pos2 + 1 - pos);
00858                 } else {
                      // Blank or the other kind of quote: part of the value.
00859                     attr_last_value += s.substr(pos, pos2 + 1 - pos);
00860                 }
00861             } else {
00862                 pos2 = s.find_first_of(" #;\"", pos); // also look for \'?
00863                 _ASSERT(pos2 != NPOS); // due to trailing space
00864                 if (pos != pos2) {
00865                     // grab and place the preceding token
00866                     attr_last_value += s.substr(pos, pos2 - pos);
00867                     attr_values.push_back(attr_last_value);
00868                     attr_last_value.erase();
00869                 }
00870 
00871                 switch (s[pos2]) {
00872                 case ' ':
                      // Only the blank appended at the end of the field
                      // terminates the attribute; interior blanks merely
                      // separate tokens.
00873                     if (pos2 == s.size() - 1) {
00874                         x_AddAttribute(record, attr_values);
00875                         attr_values.clear();
00876                     }
00877                     break;
00878 
00879                 case '#':
                      // NOTE(review): returns without adding the tokens
                      // already collected in attr_values -- confirm that
                      // dropping an attribute followed directly by a
                      // comment is intended.
00880                     return;
00881 
00882                 case ';':
00883                     if (attr_values.empty()) {
00884                         x_Warn("null attribute", x_GetLineNumber());
00885                     } else {
00886                         x_AddAttribute(record, attr_values);
00887                         attr_values.clear();
00888                     }
00889                     break;
00890 
00891                 // NB: we don't currently search for single quotes.
00892                 case '\"':
00893                 case '\'':
00894                     quote_char = s[pos2];
00895                     break;
00896 
00897                 default:
00898                     _TROUBLE;
00899                 }
00900             }
00901             pos = pos2 + 1;
00902         }
00903     }
00904 
      // Tokens left over here belong to an attribute that was never
      // terminated; it is still added, with a warning.
00905     if ( !attr_values.empty() ) {
00906         x_Warn("unterminated attribute " + attr_values[0], x_GetLineNumber());
00907         x_AddAttribute(record, attr_values);
00908     }
00909 }
00910 
00911 bool CGFFReader::x_SplitKeyValuePair( const string& pair, string& key, string& value )
00912 {
00913     if ( NStr::SplitInTwo( pair, "=", key, value ) ) {
00914         return true;
00915     }
00916     if ( NStr::SplitInTwo( pair, " ", key, value ) ) {
00917         x_Info("(recovered) missdelimited attribute/value pair: " + key, x_GetLineNumber());
00918         return true;
00919     }
00920     x_Warn("attribute without value: " + key, x_GetLineNumber());
00921     return false;
00922 }
00923             
00924 
00925 void CGFFReader::x_ParseV3Attributes(SRecord& record, const TStrVec& v,
00926                                      SIZE_TYPE& i)
00927 {
00928     vector<string> v2, attr;
00929     NStr::Tokenize(v[i], ";", v2, NStr::eMergeDelims);
00930     ITERATE (vector<string>, it, v2) {
00931         attr.clear();
00932         string key, values;
00933         if (x_SplitKeyValuePair( *it, key, values )) {
00934             vector<string> vals;
00935             attr.resize(2);
00936             s_URLDecode(key, attr[0]);
00937             NStr::Tokenize(values, ",", vals);
00938             ITERATE (vector<string>, it2, vals) {
00939                 string value( *it2 );
00940                 if ( NStr::MatchesMask(value, "\"*\"") ) {
00941                     //
00942                     //  Note: The GFF3 spec is ambiguous on whether quoting is
00943                     //  required for free text values.
00944                     //
00945                     value = value.substr(1, value.length()-2);
00946                 }
00947                 s_URLDecode(value, attr[1]);
00948                 x_AddAttribute(record, attr);
00949             }
00950         } else {
00951             x_Warn("attribute without value: " + key, x_GetLineNumber());
00952             attr.resize(1);
00953             s_URLDecode(*it, attr[0]);
00954             x_AddAttribute(record, attr);
00955             continue;
00956         }
00957     }
00958 }
00959 
00960 
00961 void CGFFReader::x_AddAttribute(SRecord& record, vector<string>& attr)
00962 {
00963     if (attr.size() == 0) {
00964         return;
00965     }
00966 
00967     if (x_GetFlags() & fGBQuals) {
00968         if (attr[0] == "gbkey"  &&  attr.size() == 2) {
00969             record.key = attr[1];
00970             return;
00971         }
00972     }
00973     record.attrs.insert(attr);
00974 }
00975 
00976 
00977 string CGFFReader::x_FeatureID(const SRecord& record)
00978 {
00979     if (record.type != SRecord::eFeat  ||  x_GetFlags() & fNoGTF) {
00980         return kEmptyStr;
00981     }
00982 
00983     // has been retrieved in initial interval parsing
00984     if (m_Version == 3) {
00985         if (!record.id.empty()) {
00986             return  record.id;   
00987         }        
00988         else { // mergeable record
00989             return record.source + record.key + record.parent;
00990         }
00991     }    
00992     
00993     SRecord::TAttrs::const_iterator gene_it = record.FindAttribute("gene_id");
00994     SRecord::TAttrs::const_iterator transcript_it
00995         = record.FindAttribute("transcript_id");
00996 
00997     // concatenate our IDs from above, if found
00998     string id;
00999     if (gene_it != record.attrs.end()) {
01000         id += (*gene_it)[1];
01001     }
01002 
01003     if (transcript_it != record.attrs.end()) {
01004         if ( !id.empty() ) {
01005             id += ' ';
01006         }
01007         id += (*transcript_it)[1];
01008     }
01009 
01010     // look for db xrefs
01011     SRecord::TAttrs::const_iterator dbxref_it
01012         = record.FindAttribute("db_xref");
01013     for ( ; dbxref_it != record.attrs.end()  &&
01014             dbxref_it->front() == "db_xref";  ++dbxref_it) {
01015         if ( !id.empty() ) {
01016             id += ' ';
01017         }
01018         id += (*dbxref_it)[1];
01019     }
01020 
01021     if ( id.empty() ) {
01022         return id;
01023     }
01024 
01025     if (record.key == "start_codon" ||  record.key == "stop_codon") {
01026         //id += " " + record.key;
01027         id += "CDS";
01028     } else if (record.key == "CDS"
01029                ||  NStr::FindNoCase(record.key, "rna") != NPOS) {
01030         //id += " " + record.key;
01031         id += record.key;
01032     } else if (record.key == "exon") {
01033         // normally separate intervals, but may want to merge.
01034         if (x_GetFlags() & fMergeExons) {
01035             id += record.key;
01036         } else {
01037             SRecord::TAttrs::const_iterator it
01038                 = record.FindAttribute("exon_number");
01039             if (it == record.attrs.end()) {
01040                 return kEmptyStr;
01041             } else {
01042                 id += record.key + ' ' + (*it)[1];
01043             }
01044         }
01045     } else if (x_GetFlags() & fMergeOnyCdsMrna) {
01046         return kEmptyStr;
01047     }
01048     return id;
01049 }
01050 
01051 
01052 void CGFFReader::x_MergeRecords(SRecord& dest, const SRecord& src)
01053 {
01054     // XXX - perform sanity checks and warn on mismatch
01055 
01056     bool merge_overlaps = false;
01057     if (dest.key == "CDS"  &&
01058         (src.key == "start_codon"  ||  src.key == "stop_codon")) {
01059         // start_codon and stop_codon features should be merged into
01060         // existing CDS locations
01061         merge_overlaps = true;
01062     }
01063 
01064     if ((dest.key == "start_codon"  ||  dest.key == "stop_codon") &&
01065         src.key == "CDS") {
01066         // start_codon and stop_codon features should be merged into
01067         // existing CDS locations
01068         merge_overlaps = true;
01069         dest.key = "CDS";
01070     }
01071 
01072     // adjust the frame as needed
01073     int best_frame = dest.frame;
01074 
01075     ITERATE (SRecord::TLoc, slit, src.loc) {
01076         bool merged = false;
01077         NON_CONST_ITERATE (SRecord::TLoc, dlit, dest.loc) {
01078             if (slit->accession != dlit->accession) {
01079                 if (dest.loc.size() == 1) {
01080                     x_Warn("Multi-accession feature", src.line_no);
01081                 }
01082                 continue;
01083             } else if (slit->strand != dlit->strand) {
01084                 if (dest.loc.size() == 1) {
01085                     x_Warn("Multi-orientation feature", src.line_no);
01086                 }
01087                 continue;
01088             } else {
01089                 if (slit->strand == eNa_strand_plus) {
01090                     if (slit->ranges.begin()->GetFrom() <
01091                         dlit->ranges.begin()->GetFrom()) {
01092                         best_frame = src.frame;
01093                     }
01094                 } else {
01095                     if (slit->ranges.begin()->GetTo() >
01096                         dlit->ranges.begin()->GetTo()) {
01097                         best_frame = src.frame;
01098                     }
01099                 }
01100                 if (merge_overlaps) {
01101                     ITERATE (set<TSeqRange>, set_iter, slit->ranges) {
01102                         dlit->merge_ranges.insert(*set_iter);
01103                     }
01104                 } else {
01105                     ITERATE (set<TSeqRange>, set_iter, slit->ranges) {
01106                         dlit->ranges.insert(*set_iter);
01107                     }
01108                 }
01109                 merged = true;
01110                 break;
01111             }
01112         }
01113         if ( !merged ) {
01114             dest.loc.push_back(*slit);
01115         }
01116     }
01117 
01118     dest.frame = best_frame;
01119     if (src.key != dest.key) {
01120         if (dest.key == "CDS"  &&  NStr::EndsWith(src.key, "_codon")
01121             &&  !(x_GetFlags() & fNoGTF) ) {
01122             // ok
01123         } else if (src.key == "CDS" &&  NStr::EndsWith(dest.key, "_codon")
01124             &&  !(x_GetFlags() & fNoGTF) ) {
01125             dest.key = "CDS";
01126         } else {
01127             x_Warn("Merging features with different keys: " + dest.key
01128                    + " != " + src.key, src.line_no);
01129         }
01130     }
01131 
01132     x_MergeAttributes(dest, src);
01133 }
01134 
01135 
/// Merge the attributes of src into those of dest.
///
/// The source attributes are visited in container order while a cursor
/// (dait) moves forward through dest.attrs, so attributes already present
/// in dest are skipped and new ones are inserted with the cursor as hint.
/// Unless fNoGTF is set, a differing "exon_number" is not added; instead
/// the exon_number attributes found in dest are removed.
///
/// NOTE(review): the forward-only cursor presumably relies on both attribute
/// containers being sorted the same way -- confirm against the declaration
/// of SRecord::TAttrs.
01136 void CGFFReader::x_MergeAttributes(SRecord& dest, const SRecord& src)
01137 {
01138     SRecord::TAttrs::iterator dait     = dest.attrs.begin();
01139     SRecord::TAttrs::iterator dait_end = dest.attrs.end();
      // First destination position reached for the tag being processed;
      // dait_end means "none recorded".
01140     SRecord::TAttrs::iterator dait_tag = dait_end;
01141     ITERATE (SRecord::TAttrs, sait, src.attrs) {
01142         const string& tag = sait->front();
          // Skip destination attributes whose tag sorts before this one.
01143         while (dait != dait_end  &&  dait->front() < tag) {
01144             ++dait;
01145         }
01146 
          // Record where this tag starts in dest, unless that was already
          // done for a previous source attribute with the same tag.
01147         if (dait_tag == dait_end  ||  dait_tag->front() != tag) {
01148             dait_tag = dait;
01149         }
          // Within the same tag, advance to the first destination attribute
          // that is not less than the source attribute.
01150         if (dait != dait_end  &&  dait->front() == tag) {
01151             while (dait != dait_end  &&  *dait < *sait) {
01152                 ++dait;
01153             }
01154         }
01155         if (dait != dait_end  &&  *dait == *sait) {
01156             continue; // identical
01157         } else if ( !(x_GetFlags() & fNoGTF)  &&  tag == "exon_number") {
              // Differing exon numbers: erase dest's exon_number attributes
              // (from dait_tag up to the end of the tag's run) and do not
              // insert the source attribute either.
01158             if (dait_tag != dait_end) {
01159                 while (dait != dait_end  &&  dait->front() == tag) {
01160                     ++dait;
01161                 }
01162                 dest.attrs.erase(dait_tag, dait);
                  // dait_tag referred to an erased element; reset it.
01163                 dait_tag = dait_end;
01164             }
01165         } else {
              // New attribute: insert it, passing the cursor as hint.
01166             dest.attrs.insert(dait, *sait);
01167         }
01168     }
01169 }
01170 
01171 
01172 void CGFFReader::x_PlaceFeature(CSeq_feat& feat, const SRecord&)
01173 {
01174     CRef<CBioseq> seq;
01175     if ( !feat.IsSetProduct() ) {
01176         for (CTypeConstIterator<CSeq_id> it(feat.GetLocation());  it;  ++it) {
01177             CRef<CBioseq> seq2 = x_ResolveID(*it, kEmptyStr);
01178             if ( !seq ) {
01179                 seq.Reset(seq2);
01180             } else if ( seq2.NotEmpty()  &&  seq != seq2) {
01181                 seq.Reset();
01182                 BREAK(it);
01183             }
01184         }
01185     }
01186 
01187     CBioseq::TAnnot& annots
01188         = seq ? seq->SetAnnot() : m_TSE->SetSet().SetAnnot();
01189     NON_CONST_ITERATE (CBioseq::TAnnot, it, annots) {
01190         if ((*it)->GetData().IsFtable()) {
01191             (*it)->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat));
01192             return;
01193         }
01194     }
01195     CRef<CSeq_annot> annot(new CSeq_annot);
01196     annot->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat));
01197     annots.push_back(annot);
01198 }
01199 
01200 
01201 void CGFFReader::x_PlaceAlignment(CSeq_align& align, const SRecord& record)
01202 {
01203     CRef<CBioseq> seq;
01204     try {
01205         seq = x_ResolveID(align.GetSeq_id(0), kEmptyStr);
01206     } catch (...) {
01207     }
01208     CBioseq::TAnnot& annots
01209         = seq ? seq->SetAnnot() : m_TSE->SetSet().SetAnnot();
01210     NON_CONST_ITERATE (CBioseq::TAnnot, it, annots) {
01211         if ((*it)->GetData().IsAlign()) {
01212             (*it)->SetData().SetAlign().push_back(CRef<CSeq_align>(&align));
01213             return;
01214         }
01215     }
01216     CRef<CSeq_annot> annot(new CSeq_annot);
01217     annot->SetData().SetAlign().push_back(CRef<CSeq_align>(&align));
01218     annots.push_back(annot);
01219 }
01220 
01221 
01222 void CGFFReader::x_ParseAndPlace(const SRecord& record)
01223 {
01224     switch (record.type) {
01225     case SRecord::eFeat:
01226         x_PlaceFeature(*x_ParseFeatRecord(record), record);
01227         break;
01228     case SRecord::eAlign:
01229         x_PlaceAlignment(*x_ParseAlignRecord(record), record);
01230         break;
01231     default:
01232         x_Warn("Unknown record type " + NStr::IntToString(record.type),
01233                record.line_no);
01234     }
01235 }
01236 
01237 
01238 CRef<CSeq_id> CGFFReader::x_ResolveSeqName(const string& name)
01239 {
01240     CRef<CSeq_id>& id = m_SeqNameCache[name];
01241     if (id.NotEmpty()
01242         &&  (id->Which() == CSeq_id::e_not_set
01243              ||  static_cast<int>(id->Which()) >= CSeq_id::e_MaxChoice)) {
01244         x_Warn("x_ResolveSeqName: invalid cache entry for " + name);
01245         id.Reset();
01246     }
01247     if ( !id ) {
01248         id.Reset(x_ResolveNewSeqName(name));
01249     }
01250     if ( !id ||  id->Which() == CSeq_id::e_not_set
01251         ||  static_cast<int>(id->Which()) >= CSeq_id::e_MaxChoice) {
01252         x_Warn("x_ResolveNewSeqName returned null or invalid ID for " + name);
01253         id.Reset(new CSeq_id(CSeq_id::e_Local, name, name));
01254     }
01255     return id;
01256 }
01257 
01258 
01259 CRef<CSeq_id> CGFFReader::x_ResolveNewSeqName(const string& name)
01260 {
01261     if (m_Flags & fAllIdsAsLocal) {
01262         if (NStr::StartsWith(name, "lcl|")) {
01263             return CRef<CSeq_id>(new CSeq_id(name));
01264         } else {
01265             return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
01266         }
01267     }
01268 
01269     if (m_Flags & fNumericIdsAsLocal) {
01270         if (name.find_first_not_of("0123456789") == string::npos) {
01271             return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
01272         }
01273     }
01274     try {
01275         return CRef<CSeq_id>(new CSeq_id(name));
01276     }
01277     catch (CSeqIdException&) {
01278         return CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, name));
01279     }
01280 }
01281 
01282 
01283 CRef<CBioseq> CGFFReader::x_ResolveID(const CSeq_id& id, const TStr& mol)
01284 {
01285     CRef<CBioseq>& seq = m_SeqCache[CConstRef<CSeq_id>(&id)];
01286     if ( !seq ) {
01287         seq.Reset(x_ResolveNewID(id, mol));
01288         // Derived versions of x_ResolveNewID may legimately return null
01289         // results....
01290         if (seq) {
01291             x_PlaceSeq(*seq);
01292             ITERATE (CBioseq::TId, it, seq->GetId()) {
01293                 m_SeqCache.insert(make_pair(CConstRef<CSeq_id>(*it), seq));
01294             }
01295         }
01296     }
01297     return seq;
01298 }
01299 
01300 
01301 CRef<CBioseq> CGFFReader::x_ResolveNewID(const CSeq_id& id, const string& mol0)
01302 {
01303     CRef<CBioseq> seq(new CBioseq);
01304     CRef<CSeq_id> id_copy(new CSeq_id);
01305 
01306     id_copy->Assign(id);
01307     seq->SetId().push_back(id_copy);
01308     seq->SetInst().SetRepr(CSeq_inst::eRepr_virtual);
01309 
01310     const string& mol = mol0.empty() ? m_DefMol : mol0;
01311     if (mol.empty()  ||  mol == "dna") {
01312         seq->SetInst().SetMol(CSeq_inst::eMol_dna);
01313     } else if (mol == "rna")  {
01314         seq->SetInst().SetMol(CSeq_inst::eMol_rna);
01315     } else if (mol == "protein")  {
01316         seq->SetInst().SetMol(CSeq_inst::eMol_aa);
01317     } else {
01318         x_Warn("unrecognized sequence type " + mol + "; assuming DNA");
01319         seq->SetInst().SetMol(CSeq_inst::eMol_dna);
01320     }
01321 
01322     return seq;
01323 }
01324 
01325 void CGFFReader::x_SetProducts( CRef<CSeq_entry>& tse )
01326 {
01327     CTypeIterator<CSeq_feat> feat_iter(*tse);
01328     for ( ;  feat_iter;  ++feat_iter) {
01329         CSeq_feat& feat = *feat_iter;
01330 
01331         string qual_name;
01332         switch (feat.GetData().GetSubtype()) {
01333         case CSeqFeatData::eSubtype_cdregion:
01334             qual_name = "protein_id";
01335             break;
01336 
01337         case CSeqFeatData::eSubtype_mRNA:
01338             qual_name = "transcript_id";
01339             break;
01340 
01341         default:
01342             continue;
01343             break;
01344         }
01345 
01346         string id_str = feat.GetNamedQual(qual_name);
01347         if ( !id_str.empty() ) {
01348             CRef<CSeq_id> id = x_ResolveSeqName(id_str);
01349             feat.SetProduct().SetWhole(*id);
01350         }
01351     }
01352 }
01353 
01354 void CGFFReader::x_CreateGeneFeatures( CRef<CSeq_entry>& tse )
01355 {
01356     CTypeIterator<CSeq_annot> annot_iter(*tse);
01357     for ( ;  annot_iter;  ++annot_iter) {
01358         CSeq_annot& annot = *annot_iter;
01359         if (annot.GetData().Which() != CSeq_annot::TData::e_Ftable) {
01360             continue;
01361         }
01362 
01363         // we work within the scope of one annotation
01364         CSeq_annot::TData::TFtable::iterator feat_iter = 
01365             annot.SetData().SetFtable().begin();
01366         CSeq_annot::TData::TFtable::iterator feat_end = 
01367             annot.SetData().SetFtable().end();
01368 
01369         /// we plan to create a series of gene features, one for each gene
01370         /// identified above
01371         /// genes are identified via a 'gene_id' marker
01372         typedef map<string, CRef<CSeq_feat> > TGeneMap;
01373         TGeneMap genes;
01374         for (bool has_genes = false;
01375              feat_iter != feat_end  &&  !has_genes;  ++feat_iter) {
01376             CSeq_feat& feat = **feat_iter;
01377 
01378             switch (feat.GetData().GetSubtype()) {
01379             case CSeqFeatData::eSubtype_gene:
01380                 /// we already have genes, so don't add any more
01381                 has_genes = true;
01382                 genes.clear();
01383                 break;
01384 
01385             case CSeqFeatData::eSubtype_mRNA:
01386             case CSeqFeatData::eSubtype_cdregion:
01387                 /// for mRNA and CDS features, create a gene
01388                 /// this is only done if the gene_id parameter was set
01389                 /// in parsing, we promote gene_id to a gene xref
01390                 if ( !feat.GetGeneXref() ) {
01391                     continue;
01392                 }
01393                 {{
01394                     string gene_id;
01395                     feat.GetGeneXref()->GetLabel(&gene_id);
01396                     _ASSERT( !gene_id.empty() );
01397                     TSeqRange range = feat.GetLocation().GetTotalRange();
01398 
01399                     ENa_strand strand = feat.GetLocation().GetStrand();
01400                     const CSeq_id* id = feat.GetLocation().GetId();
01401                     if ( !id ) {
01402                         x_Error("No consistent ID found; gene feature skipped");
01403                         continue;
01404                     }
01405 
01406                     TGeneMap::iterator iter = genes.find(gene_id);
01407                     if (iter == genes.end()) {
01408                         /// new gene feature
01409                         CRef<CSeq_feat> gene(new CSeq_feat());
01410                         gene->SetData().SetGene().Assign(*feat.GetGeneXref());
01411 
01412                         gene->SetLocation().SetInt().SetFrom(range.GetFrom());
01413                         gene->SetLocation().SetInt().SetTo  (range.GetTo());
01414                         gene->SetLocation().SetId(*id);
01415                         gene->SetLocation().SetInt().SetStrand(strand);
01416                         genes[gene_id] = gene;
01417                     } else {
01418                         /// we agglomerate the old location
01419                         CRef<CSeq_feat> gene = iter->second;
01420 
01421                         TSeqRange r2 = gene->GetLocation().GetTotalRange();
01422                         range += r2;
01423                         gene->SetLocation().SetInt().SetFrom(range.GetFrom());
01424                         gene->SetLocation().SetInt().SetTo  (range.GetTo());
01425                         gene->SetLocation().InvalidateTotalRangeCache();
01426                     }
01427                 }}
01428                 break;
01429 
01430             default:
01431                 break;
01432             }
01433         }
01434 
01435         ITERATE (TGeneMap, iter, genes) {
01436             annot.SetData().SetFtable().push_back(iter->second);
01437         }
01438     }
01439 }
01440 
01441 void CGFFReader::x_RemapGeneRefs( CRef<CSeq_entry>& tse, TGeneRefs& gene_refs )
01442 {
01443     if ( !tse  ||  gene_refs.empty() ) {
01444         return;
01445     }
01446     NON_CONST_ITERATE (TGeneRefs, iter, gene_refs) {
01447         if ( !iter->second->IsSetLocus()  &&
01448              !iter->second->IsSetLocus_tag()) {
01449             iter->second->SetLocus(iter->first);
01450         } else if ( !iter->second->IsSetLocus()  ||
01451                     iter->second->GetLocus() != iter->first) {
01452             iter->second->SetSyn().push_back(iter->first);
01453         }
01454     }
01455 
01456     CTypeIterator<CSeq_feat> feat_iter(*tse);
01457     for ( ;  feat_iter;  ++feat_iter) {
01458         const CGene_ref* ref = NULL;
01459         if (feat_iter->GetData().IsGene()) {
01460             ref = &feat_iter->GetData().GetGene();
01461         } else {
01462             ref = feat_iter->GetGeneXref();
01463         }
01464         if (ref  &&  ref->IsSetLocus()) {
01465             TGeneRefs::const_iterator iter =
01466                 gene_refs.find(ref->GetLocus());
01467             if (iter != gene_refs.end()) {
01468                 const_cast<CGene_ref*>(ref)->Assign(*iter->second);
01469             }
01470         }
01471     }
01472 }
01473 
01474 void CGFFReader::x_PlaceSeq(CBioseq& seq)
01475 {
01476     bool found = false;
01477     for (CTypeConstIterator<CBioseq> it(*m_TSE);  it;  ++it) {
01478         if (&*it == &seq) {
01479             found = true;
01480             BREAK(it);
01481         }
01482     }
01483     if ( !found ) {
01484         CRef<CSeq_entry> se(new CSeq_entry);
01485         se->SetSeq(seq);
01486         m_TSE->SetSet().SetSeq_set().push_back(se);
01487     }
01488 }
01489 
01490 
01491 CGFFReader::SRecord::TAttrs::const_iterator
01492 CGFFReader::SRecord::FindAttribute(const string& att_name, size_t min_values)
01493 const
01494 {
01495     SRecord::TAttrs::const_iterator it
01496         = attrs.lower_bound(vector<string>(1, att_name));
01497     while (it != attrs.end()  &&  it->front() == att_name
01498            &&  it->size() <= min_values) {
01499         ++it;
01500     }
01501     return (it == attrs.end() || it->front() == att_name) ? it : attrs.end();
01502 }
01503 
01504 
01505 bool
01506 CGFFReader::x_IsLineUcscMetaInformation(const TStr& line)
01507 {
01508     // line starts with keyword "browser" or "track"
01509     return (NStr::StartsWith(line, "browser ") || NStr::StartsWith(line, "track ") );
01510 }
01511     
01512 
01513 END_SCOPE(objects)
01514 END_NCBI_SCOPE
01515 
01516 

Generated on Sun Dec 6 22:42:29 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Mon Dec 07 16:21:13 2009 by modify_doxy.py rev. 173732