NCBI C++ ToolKit
hgvs_parser2.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

00001 /*  $Id: hgvs_parser2.cpp 66445 2015-03-03 19:17:55Z astashya $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * File Description:
00027  *   Sample library
00028  *
00029  */
00030 
00031 #include <ncbi_pch.hpp>
00032 
00033 
00034 #include <serial/iterator.hpp>
00035 
00036 #include <objects/seqalign/Seq_align.hpp>
00037 #include <objects/seqalign/Spliced_seg.hpp>
00038 #include <objects/seqalign/Spliced_exon.hpp>
00039 #include <objects/seqalign/Product_pos.hpp>
00040 #include <objects/seqalign/Prot_pos.hpp>
00041 
00042 #include <objects/variation/Variation.hpp>
00043 #include <objects/variation/VariantPlacement.hpp>
00044 #include <objects/variation/VariationMethod.hpp>
00045 #include <objects/variation/VariationException.hpp>
00046 
00047 #include <objects/seqfeat/Seq_feat.hpp>
00048 #include <objects/seqfeat/Variation_inst.hpp>
00049 #include <objects/seqfeat/Delta_item.hpp>
00050 #include <objects/seqfeat/Ext_loc.hpp>
00051 #include <objects/seqfeat/BioSource.hpp>
00052 #include <objects/seqfeat/SeqFeatXref.hpp>
00053 
00054 #include <objects/seq/seqport_util.hpp>
00055 #include <objects/seq/Seq_literal.hpp>
00056 #include <objects/seq/Seq_data.hpp>
00057 #include <objects/seq/Numbering.hpp>
00058 #include <objects/seq/Num_ref.hpp>
00059 #include <objects/seq/Annot_descr.hpp>
00060 #include <objects/seq/Annotdesc.hpp>
00061 #include <objects/seq/Seq_descr.hpp>
00062 
00063 #include <objects/general/Object_id.hpp>
00064 #include <objects/general/User_object.hpp>
00065 #include <objects/general/Dbtag.hpp>
00066 
00067 #include <objects/seqloc/Seq_point.hpp>
00068 #include <objects/seqloc/Seq_loc_equiv.hpp>
00069 
00070 #include <objmgr/util/sequence.hpp>
00071 #include <objmgr/seq_vector.hpp>
00072 #include <objmgr/align_ci.hpp>
00073 #include <objmgr/feat_ci.hpp>
00074 #include <objmgr/seq_loc_mapper.hpp>
00075 
00076 #include <misc/hgvs/hgvs_parser2.hpp>
00077 #include <misc/hgvs/variation_util2.hpp>
00078 
00079 
00080 BEGIN_NCBI_SCOPE
00081 
00082 namespace variation {
00083 
// Throw a CHgvsParser::CHgvsParserException with the given error code and message.
#define HGVS_THROW(err_code, message) \
    NCBI_THROW(CHgvsParser::CHgvsParserException, err_code, message)

// Check that the parse-tree node referenced by iterator i carries the id of grammar
// rule SGrammar::rule_id; if not, throw an eGrammatic exception naming the rule found.
// NOTE: this expands to a bare if-statement, so never follow an invocation with "else".
#define HGVS_ASSERT_RULE(i, rule_id)                                          \
    if((i->value.id()) != (SGrammar::rule_id)) {                              \
        HGVS_THROW(eGrammatic,                                                \
                   "Unexpected rule "                                         \
                   + CHgvsParser::SGrammar::s_GetRuleName(i->value.id()) );   \
    }
00089 
00090 
00091 CSafeStatic<CHgvsParser::SGrammar> CHgvsParser::s_grammar;
00092 
00093 const char* CHgvsParser::SGrammar::s_rule_names[CHgvsParser::SGrammar::eNodeIds_SIZE] = 
00094 {
00095     "NONE",
00096     "root",
00097     "list_delimiter",
00098     "list1a",
00099     "list2a",
00100     "list3a",
00101     "list1b",
00102     "list2b",
00103     "list3b",
00104     "expr1",
00105     "expr2",
00106     "expr3",
00107     "translocation",
00108     "header",
00109     "seq_id",
00110     "mut_list",
00111     "mut_ref",
00112     "mol",
00113     "int_fuzz",
00114     "abs_pos",
00115     "general_pos",
00116     "fuzzy_pos",
00117     "pos_spec",
00118     "location",
00119     "nuc_range",
00120     "prot_range",
00121     "mut_inst",
00122     "raw_seq",
00123     "raw_seq_or_len",
00124     "aminoacid1",
00125     "aminoacid2",
00126     "aminoacid3",
00127     "nuc_subst",
00128     "deletion",
00129     "insertion",
00130     "delins",
00131     "duplication",
00132     "nuc_inv",
00133     "ssr",
00134     "conversion",
00135     "seq_loc",
00136     "seq_ref",
00137     "prot_pos",
00138     "prot_fs",
00139     "prot_missense",
00140     "prot_ext",
00141     "no_change"
00142 };
00143 
00144 
00145 CVariantPlacement& SetFirstPlacement(CVariation& v)
00146 {
00147     if(v.SetPlacements().size() == 0) {
00148         CRef<CVariantPlacement> p(new CVariantPlacement);
00149         v.SetPlacements().push_back(p);
00150     }
00151     return *v.SetPlacements().front();
00152 }
00153 
00154 void SetComputational(CVariation& variation)
00155 {
00156     CVariationMethod& m = variation.SetMethod();
00157     m.SetMethod();
00158 
00159     if(find(m.GetMethod().begin(),
00160             m.GetMethod().end(),
00161             CVariationMethod::eMethod_E_computational) == m.GetMethod().end())
00162     {
00163         m.SetMethod().push_back(CVariationMethod::eMethod_E_computational);
00164     }
00165 }
00166 
00167 
// Test whether query matches the first query.size() characters of text, honoring
// IUPAC nucleotide ambiguity codes (two letters match when they share at least one
// of the bases A, C, G, T).
//
// query - sequence to look for; an empty query matches any text
// text  - NUL-terminated sequence to compare against
//
// Returns false if text ends before query does, or if either sequence contains a
// character that is not an upper-case IUPAC nucleotide letter. ('.' is in the table
// with an empty base mask, so it never matches anything.)
bool SeqsMatch(const string& query, const char* text)
{
    //position of the iupac literal = 4-bit mask for A|C|G|T  (T=1, G=2, C=4, A=8)
    static const char iupac_bases[] = ".TGKCYSBAWRDMHVN";
    static const char* const iupac_end = iupac_bases + sizeof(iupac_bases) - 1;

    for(size_t i = 0; i < query.size(); i++) {
        if(text[i] == '\0') {
            // text is shorter than query; do not read past its terminator
            return false;
        }

        const char* a = std::find(iupac_bases, iupac_end, query[i]);
        const char* b = std::find(iupac_bases, iupac_end, text[i]);
        if(a == iupac_end || b == iupac_end) {
            // A character outside of the alphabet has no base mask. It must not be
            // treated as a wildcard (which is what a "not found" index of all-ones
            // would amount to when AND-ed with the other mask).
            return false;
        }

        if(!((a - iupac_bases) & (b - iupac_bases))) {
            return false;
        }
    }
    return true;
}
00180 
00181 
00182 CRef<CSeq_loc> FindSSRLoc(const CSeq_loc& loc, const string& seq, CScope& scope)
00183 {
00184     //Extend the loc 10kb up and down; Find all occurences of seq in the resulting
00185     //interval, create locs for individual repeat units; then merge them, and keep the interval that
00186     //overlaps the original.
00187 
00188     const TSeqPos ext_interval = 10000;
00189 
00190     CRef<CSeq_loc> loc1 = sequence::Seq_loc_Merge(loc, CSeq_loc::fMerge_SingleRange, NULL);
00191     CBioseq_Handle bsh = scope.GetBioseqHandle(sequence::GetId(loc, NULL));
00192     TSeqPos seq_len = bsh.GetInst_Length();
00193     loc1->SetInt().SetFrom() -= min(ext_interval, loc1->GetInt().GetFrom());
00194     loc1->SetInt().SetTo() += min(ext_interval, seq_len - 1 - loc1->GetInt().GetTo());
00195 
00196     CSeqVector v(*loc1, scope, CBioseq_Handle::eCoding_Iupac);
00197     string str1;
00198     v.GetSeqData(v.begin(), v.end(), str1);
00199 
00200     CRef<CSeq_loc> container(new CSeq_loc(CSeq_loc::e_Mix));
00201 
00202     for(size_t i = 0; i < str1.size() - seq.size(); i++) {
00203         if(SeqsMatch(seq, &str1[i])) {
00204             CRef<CSeq_loc> repeat_unit_loc(new CSeq_loc);
00205             repeat_unit_loc->Assign(*loc1);
00206 
00207             if(sequence::GetStrand(loc, NULL) == eNa_strand_minus) {
00208                 repeat_unit_loc->SetInt().SetTo() -= i;
00209                 repeat_unit_loc->SetInt().SetFrom(repeat_unit_loc->GetInt().GetTo() - (seq.size() - 1));
00210             } else {
00211                 repeat_unit_loc->SetInt().SetFrom() += i;
00212                 repeat_unit_loc->SetInt().SetTo(repeat_unit_loc->GetInt().GetFrom() + (seq.size() - 1));
00213             }
00214             container->SetMix().Set().push_back(repeat_unit_loc);
00215         }
00216     }
00217 
00218     CRef<CSeq_loc> merged_repeats = sequence::Seq_loc_Merge(*container, CSeq_loc::fSortAndMerge_All, NULL);
00219     merged_repeats->ChangeToMix();
00220     CRef<CSeq_loc> result(new CSeq_loc(CSeq_loc::e_Null));
00221     result->Assign(loc);
00222 
00223     for(CSeq_loc_CI ci(*merged_repeats); ci; ++ci) {
00224         const CSeq_loc& loc2 = ci.GetEmbeddingSeq_loc();
00225         if(sequence::Compare(loc, loc2, NULL, sequence::fCompareOverlapping) != sequence::eNoOverlap) {
00226             result->Add(loc2);
00227         }
00228     }
00229 
00230     return sequence::Seq_loc_Merge(*result, CSeq_loc::fSortAndMerge_All, NULL);
00231 }
00232 
00233 
00234 
00235 
00236 void CHgvsParser::s_SetStartOffset(CVariantPlacement& p, const CHgvsParser::SFuzzyInt& fint)
00237 {
00238     p.ResetStart_offset();
00239     p.ResetStart_offset_fuzz();
00240     if(fint.value || fint.fuzz) {
00241         p.SetStart_offset(fint.value);
00242     }
00243 
00244     if(fint.fuzz) {
00245         p.SetStart_offset_fuzz().Assign(*fint.fuzz);
00246     }
00247 
00248 #if 0
00249     if(!fint.value 
00250        && fint.fuzz 
00251        && fint.fuzz->IsLim() 
00252        && (   fint.fuzz->GetLim() == CInt_fuzz::eLim_lt
00253            || fint.fuzz->GetLim() == CInt_fuzz::eLim_tl))
00254     {
00255         // VAR-832
00256         // interpret c.x-? as c.x-(?_1), not as c.x+(?_0)
00257         p.SetStart_offset(-1);
00258     }
00259 #endif
00260 
00261 }
00262 
00263 void CHgvsParser::s_SetStopOffset(CVariantPlacement& p, const CHgvsParser::SFuzzyInt& fint)
00264 {
00265     p.ResetStop_offset();
00266     p.ResetStop_offset_fuzz();
00267     if(fint.value || fint.fuzz) {
00268         p.SetStop_offset(fint.value);
00269     }
00270 
00271     if(fint.fuzz) {
00272         p.SetStop_offset_fuzz().Assign(*fint.fuzz);
00273     }
00274 
00275 #if 0
00276     if(!fint.value 
00277        && fint.fuzz 
00278        && fint.fuzz->IsLim() 
00279        && (   fint.fuzz->GetLim() == CInt_fuzz::eLim_gt 
00280            || fint.fuzz->GetLim() == CInt_fuzz::eLim_tr))
00281     {
00282         // VAR-832
00283         // interpret c.x+? as c.x+(1_?), not as c.x+(0_?)
00284         p.SetStop_offset(1);
00285     }
00286 #endif
00287 }
00288 
00289 
00290 
00291 //if a variation has an asserted sequence, stored in placement.seq, repackage it as a set having
00292 //the original variation and a synthetic one representing the asserted sequence. The placement.seq
00293 //is cleared, as it is a placeholder for the actual reference sequence.
00294 void RepackageAssertedSequence(CVariation& vr)
00295 {
00296     if(vr.IsSetPlacements() && SetFirstPlacement(vr).IsSetSeq()) {
00297         CRef<CVariation> container(new CVariation);
00298         container->SetPlacements() = vr.SetPlacements();
00299 
00300         CRef<CVariation> orig(new CVariation);
00301         orig->Assign(vr);
00302         orig->ResetPlacements(); //location will be set on the package, as it is the same for both members
00303 
00304         container->SetData().SetSet().SetType(CVariation::TData::TSet::eData_set_type_package);
00305         container->SetData().SetSet().SetVariations().push_back(orig);
00306 
00307         CRef<CVariation> asserted_vr(new CVariation);
00308         asserted_vr->SetData().SetInstance().SetObservation(CVariation_inst::eObservation_asserted);
00309         asserted_vr->SetData().SetInstance().SetType(CVariation_inst::eType_identity);
00310 
00311         CRef<CDelta_item> delta(new CDelta_item);
00312         delta->SetSeq().SetLiteral().Assign(SetFirstPlacement(vr).GetSeq());
00313         asserted_vr->SetData().SetInstance().SetDelta().push_back(delta);
00314 
00315         SetFirstPlacement(*container).ResetSeq();
00316         container->SetData().SetSet().SetVariations().push_back(asserted_vr);
00317 
00318         vr.Assign(*container);
00319 
00320     } else if(vr.GetData().IsSet()) {
00321         NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it, vr.SetData().SetSet().SetVariations()) {
00322             RepackageAssertedSequence(**it);
00323         }
00324     }
00325 }
00326 
00327 
00328 //HGVS distinguished between c. n. and r. molecules, while in 
00329 //VariantPlacement we have moltype cdna (which maps onto c.) and rna (which maps onto n.)
00330 //
00331 //If we have an HGVS expression like NM_123456.7:r. the VariantPlacement will have moltype rna
00332 //that would map-back onto NM_123456.7:n., which is what we don't want, as "n.' reserved
00333 //for non-coding RNAs, so we'll convert rna moltype to cdna based on accession here.
00334 //Note that this is a post-processing step, as in the midst of parsing we want to treat
00335 //NM_123456.7:r. as non-coding rna, since these have absolute coordinates rather than cds-relative.
00336 void AdjustMoltype(CVariation& vr, CScope& scope)
00337 {
00338     CVariationUtil util(scope);
00339 
00340     for(CTypeIterator<CVariantPlacement> it(Begin(vr)); it; ++it) {
00341         CVariantPlacement& p = *it;
00342 
00343         if(p.IsSetMol() 
00344            && p.GetMol() == CVariantPlacement::eMol_rna
00345            && p.GetLoc().GetId())
00346         {
00347             p.SetMol(util.GetMolType(*p.GetLoc().GetId()));
00348         }
00349     }
00350 }
00351 
00352 
00353 CHgvsParser::CContext::CContext(const CContext& other)
00354   : m_hgvs(other.m_hgvs)
00355 {
00356     this->m_bsh = other.m_bsh;
00357     this->m_cds = other.m_cds;
00358     this->m_scope = other.m_scope;
00359     this->m_seq_id_resolvers = other.m_seq_id_resolvers;
00360     this->m_placement.Reset();
00361     if(other.m_placement) {
00362         /*
00363          * Note: need to make a copy of the placement, such that if preceding subvariation
00364          * is location-specific and the other one isn't, first's placement is not
00365          * given to the other one, e.g. "NM_004004.2:c.[35_36dup;40del]+[=]" - if we
00366          * made a shallow copy, then the location context for "[=]" would be erroneously
00367          * inherited from the last sibling: "NM_004004.2:c.40=" instead of whole "NM_004004.2:c.="
00368          */
00369          this->m_placement.Reset(new CVariantPlacement);
00370          this->m_placement->Assign(*other.m_placement);
00371     }
00372 }
00373 
00374 const CSeq_feat& CHgvsParser::CContext::GetCDS() const
00375 {
00376     if(m_cds.IsNull()) {
00377         HGVS_THROW(eContext, "No CDS feature in context");
00378     }
00379     return *m_cds;
00380 }
00381 
00382 const CSeq_id& CHgvsParser::CContext::GetId() const
00383 {
00384     return sequence::GetId(GetPlacement().GetLoc(), NULL);
00385 }
00386 
00387 
00388 void CHgvsParser::CContext::SetId(const CSeq_id& id, CVariantPlacement::TMol mol)
00389 {
00390     Clear();
00391 
00392     SetPlacement().SetMol(mol);
00393     SetPlacement().SetLoc().SetWhole().Assign(id);
00394 
00395     m_bsh = m_scope->GetBioseqHandle(id);
00396 
00397     if(!m_bsh) {
00398         HGVS_THROW(eContext, "Cannnot get bioseq for seq-id " + id.AsFastaString());
00399     }
00400 
00401     if(mol == CVariantPlacement::eMol_cdna) {
00402         SAnnotSelector sel;
00403         sel.SetResolveTSE();
00404         for(CFeat_CI ci(m_bsh, sel); ci; ++ci) {
00405             const CMappedFeat& mf = *ci;
00406             if(mf.GetData().IsCdregion()) {
00407                 if(m_cds.IsNull()) {
00408                     m_cds.Reset(new CSeq_feat());
00409                     m_cds->Assign(mf.GetMappedFeature());
00410                 } else {
00411                     HGVS_THROW(eContext, "Multiple CDS features on the sequence");
00412                 }
00413             }
00414         }
00415         if(m_cds.IsNull()) {
00416             HGVS_THROW(eContext, "Could not find CDS feat");
00417         }
00418     }
00419 }
00420 
00421 
00422 const string CHgvsParser::SGrammar::s_GetRuleName(parser_id id)
00423 {
00424     if(id.to_long() >= CHgvsParser::SGrammar::eNodeIds_SIZE) {
00425         HGVS_THROW(eLogic, "Rule name not hardcoded");
00426     } else {
00427         return s_rule_names[id.to_long()];
00428     }
00429 }
00430 
00431 
00432 CHgvsParser::SFuzzyInt CHgvsParser::x_int_fuzz(TIterator const& i, const CContext& context)
00433 {
00434     HGVS_ASSERT_RULE(i, eID_int_fuzz);
00435     TIterator it = i->children.begin();
00436 
00437     CHgvsParser::SFuzzyInt fint;
00438     fint.fuzz.Reset(new CInt_fuzz);
00439 
00440     if(i->children.size() == 1) { //e.g. '5' or '?'
00441         string s(it->value.begin(), it->value.end());
00442         if(s == "?") {
00443             fint.SetPureFuzz();
00444         } else {
00445             fint.value = NStr::StringToInt(s);
00446             fint.fuzz.Reset();
00447         }
00448     } else if(i->children.size() == 3) { //e.g. '(5)' or '(?)'
00449         ++it;
00450         string s(it->value.begin(), it->value.end());
00451         if(s == "?") {
00452             fint.SetPureFuzz();
00453         } else {
00454             fint.value = NStr::StringToInt(s);
00455             fint.fuzz->SetLim(CInt_fuzz::eLim_unk);
00456         }
00457     } else if(i->children.size() == 5) { //e.g. '(5_7)' or '(?_10)'
00458         ++it;
00459         string s1(it->value.begin(), it->value.end());
00460         ++it;
00461         ++it;
00462         string s2(it->value.begin(), it->value.end());
00463 
00464         if(s1 == "?" && s2 == "?") {
00465             fint.SetPureFuzz();
00466         } else if(s1 != "?" && s2 != "?") {
00467             fint.value = NStr::StringToInt(s1);
00468             fint.fuzz->SetRange().SetMin(NStr::StringToInt(s1));
00469             fint.fuzz->SetRange().SetMax(NStr::StringToInt(s2));
00470         } else if(s2 == "?") {
00471             fint.value = NStr::StringToInt(s1);
00472             fint.fuzz->SetLim(CInt_fuzz::eLim_gt);
00473         } else if(s1 == "?") {
00474             fint.value = NStr::StringToInt(s2);
00475             fint.fuzz->SetLim(CInt_fuzz::eLim_lt);
00476         } else {
00477             HGVS_THROW(eLogic, "Unreachable code");
00478         }
00479     }
00480 
00481     return fint;
00482 }
00483 
00484 
00485 
00486 /* In HGVS:
00487  * the nucleotide 3' of the translation stop codon is *1, the next *2, etc.
00488  * # there is no nucleotide 0
00489  * # nucleotide 1 is the A of the ATG-translation initiation codon
00490  * # the nucleotide 5' of the ATG-translation initiation codon is -1, the previous -2, etc.
00491  *
00492  * I.e. need to adjust if dealing with positive coordinates, except for *-relative ones.
00493  */
00494 template<typename T>
00495 T AdjustHgvsCoord(T val, TSeqPos offset, bool adjust)
00496 {
00497     val += offset;
00498     if(adjust && val > (T)offset) { 
00499         // note: val may be unsigned, so check for (val+offset > offset)
00500         // instead of (val > 0)
00501         val -= 1;
00502     }
00503     return val;
00504 }
00505 
00506 CRef<CSeq_point> CHgvsParser::x_abs_pos(TIterator const& i, const CContext& context)
00507 {
00508     HGVS_ASSERT_RULE(i, eID_abs_pos);
00509     TIterator it = i->children.begin();
00510 
00511     TSeqPos offset(0);
00512     bool adjust = true; //see AdjustHgvsCoord comments above
00513     if(i->children.size() == 2) {
00514         adjust = false;
00515         string s(it->value.begin(), it->value.end());
00516         if(s != "*") {
00517             HGVS_THROW(eGrammatic, "Expected literal '*'");
00518         }
00519         if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_cdna) {
00520             HGVS_THROW(eContext, "Expected 'c.' context for stop-codon-relative coordinate");
00521         }
00522 
00523         offset = context.GetCDS().GetLocation().GetStop(eExtreme_Biological);
00524         ++it;
00525     } else {
00526         if (context.GetPlacement().GetMol() == CVariantPlacement::eMol_cdna) {
00527             //Note: in RNA coordinates (r.) the coordinates are absolute, like in genomic sequences,
00528             //  "The RNA sequence type uses only GenBank mRNA records. The value 1 is assigned to the first
00529             //  base in the record and from there all bases are counted normally."
00530             //so the cds-start offset applies only to "c." coordinates
00531             offset = context.GetCDS().GetLocation().GetStart(eExtreme_Biological);
00532         }
00533     }
00534 
00535     CRef<CSeq_point> pnt(new CSeq_point);
00536     {{
00537         SFuzzyInt int_fuzz = x_int_fuzz(it, context);
00538 
00539         pnt->SetId().Assign(context.GetId());
00540         pnt->SetStrand(eNa_strand_plus);
00541         if(int_fuzz.IsPureFuzz()) {
00542             pnt->SetPoint(kInvalidSeqPos);
00543         } else {
00544             pnt->SetPoint(AdjustHgvsCoord(int_fuzz.value, offset, adjust));
00545         }
00546 
00547         if(!int_fuzz.fuzz.IsNull()) {
00548             pnt->SetFuzz(*int_fuzz.fuzz);
00549             if(pnt->GetFuzz().IsRange()) {
00550                 CInt_fuzz_Base::TRange& r = pnt->SetFuzz().SetRange();
00551                 r.SetMin(AdjustHgvsCoord(r.GetMin(), offset, adjust));
00552                 r.SetMax(AdjustHgvsCoord(r.GetMax(), offset, adjust));
00553             }
00554         }
00555     }}
00556 
00557     return pnt;
00558 }
00559 
00560 bool IsPureFuzzPoint(const CSeq_point& p)
00561 {
00562     return p.GetPoint() == kInvalidSeqPos 
00563         && p.IsSetFuzz() 
00564         && p.GetFuzz().IsLim() 
00565         && p.GetFuzz().GetLim() == CInt_fuzz::eLim_other;
00566 }
00567 
00568 
00569 /*
00570  * general_pos is either simple abs-pos that is passed down to x_abs_pos,
00571  * or an intronic location that is specified by a mapping point in the
00572  * local coordinates and the -upstream / +downstream offset after remapping.
00573  *
00574  * The mapping point can either be an abs-pos in local coordinates, or
00575  * specified as offset in intron-specific coordinate system where IVS# specifies
00576  * the intron number
00577  */
00578 CHgvsParser::SOffsetPoint CHgvsParser::x_general_pos(TIterator const& i, const CContext& context)
00579 {
00580     HGVS_ASSERT_RULE(i, eID_general_pos);
00581 
00582     SOffsetPoint ofpnt;
00583 
00584     if(i->children.size() == 1) {
00585         //local coordinates
00586         ofpnt.pnt = x_abs_pos(i->children.begin(), context);
00587     } else {
00588         //(str_p("IVS") >> int_p | abs_pos) >> sign_p >> int_fuzz
00589 
00590         TIterator it = i->children.end() - 1;
00591         ofpnt.offset = x_int_fuzz(it, context);
00592         --it;
00593 
00594         //adjust for sign; convert +? or -? offsets to 
00595         //0> or 0<
00596         string s_sign(it->value.begin(), it->value.end());
00597         int sign1 = s_sign == "-" ? -1 : 1;
00598         ofpnt.offset.value *= sign1;
00599         if(ofpnt.offset.fuzz && ofpnt.offset.fuzz->IsRange()) {
00600             ofpnt.offset.fuzz->SetRange().SetMin() *= sign1;
00601             ofpnt.offset.fuzz->SetRange().SetMax() *= sign1;
00602         } else if(ofpnt.offset.IsPureFuzz()) {
00603             ofpnt.offset.fuzz->SetLim(sign1 < 0 ? CInt_fuzz::eLim_lt : CInt_fuzz::eLim_gt);
00604         }
00605 
00606         --it;
00607         if(it->value.id() == SGrammar::eID_abs_pos) {
00608             //base-loc is an abs-pos
00609             ofpnt.pnt = x_abs_pos(i->children.begin(), context);
00610         } else {
00611             //base-loc is IVS-relative.
00612             ofpnt.pnt.Reset(new CSeq_point);
00613             ofpnt.pnt->SetId().Assign(context.GetId());
00614             ofpnt.pnt->SetStrand(eNa_strand_plus);
00615 
00616             TIterator it = i->children.begin();
00617             string s_ivs(it->value.begin(), it->value.end());
00618             ++it;
00619             string s_ivs_num(it->value.begin(), it->value.end());
00620             int ivs_num = NStr::StringToInt(s_ivs_num);
00621 
00622             //If IVS3+50, the mapping point is the last base of third exon
00623             //if IVS3-50, the mapping point is the first base of the fourth exon
00624             size_t target_exon_num = sign1 < 0 ? ivs_num + 1 : ivs_num;
00625 
00626             SAnnotSelector sel;
00627             sel.IncludeFeatSubtype(CSeqFeatData::eSubtype_exon);
00628             CBioseq_Handle bsh = context.GetScope().GetBioseqHandle(context.GetId());
00629             size_t exon_num = 1;
00630             //Note: IVS is cDNA-centric, so we'll have to use ordinals of the exons instead of /number qual
00631             for(CFeat_CI ci(bsh, sel); ci; ++ci) {
00632                 const CMappedFeat& mf = *ci;
00633                 if(exon_num == target_exon_num) {
00634                     ofpnt.pnt->SetPoint(sign1 > 0 ? mf.GetLocation().GetStop(eExtreme_Biological)
00635                                                   : mf.GetLocation().GetStart(eExtreme_Biological));
00636                     break;
00637                 }
00638                 exon_num++;
00639             }
00640         }
00641     }
00642 
00643     //We could be dealing with improper HGVS expression (that we need to support anyway)
00644     //where the coordinate extends beyond the range of sequence
00645     //e.g. NM_000518:c.-78A>G, where codon-start is at 51. In this case the resulting
00646     //coordinate will be negative; we'll convert it to offset-format (27 bases upstream of pos 0)
00647 
00648     if(!IsPureFuzzPoint(*ofpnt.pnt) 
00649        && (   context.GetPlacement().GetMol() == CVariantPlacement::eMol_cdna 
00650            || context.GetPlacement().GetMol() == CVariantPlacement::eMol_rna))
00651     {
00652         if(static_cast<TSignedSeqPos>(ofpnt.pnt->GetPoint()) < 0) {
00653             ofpnt.offset.value += static_cast<TSignedSeqPos>(ofpnt.pnt->GetPoint());
00654             ofpnt.pnt->SetPoint(0);
00655         } else {
00656             CVariationUtil vu(context.GetScope());
00657             TSeqPos transcribed_len = vu.GetEffectiveTranscriptLength(context.GetBioseqHandle());
00658             // Note: positions past the last exon are interpreted as specifying near-gene/intronic
00659             // target rather than polyA. JIRA:SNP-7341
00660 
00661             if(ofpnt.pnt->GetPoint() >= transcribed_len) {
00662                 TSeqPos anchor_pos = transcribed_len - 1;
00663                 TSeqPos overrun = ofpnt.pnt->GetPoint() - anchor_pos;
00664                 ofpnt.offset.value += overrun;
00665                 ofpnt.pnt->SetPoint(anchor_pos);
00666             }
00667         }
00668     }
00669 
00670     return ofpnt;
00671 }
00672 
00673 
00674 CHgvsParser::SOffsetPoint CHgvsParser::x_fuzzy_pos(TIterator const& i, const CContext& context)
00675 {
00676     HGVS_ASSERT_RULE(i, eID_fuzzy_pos);
00677 
00678     SOffsetPoint pnt1 = x_general_pos(i->children.begin(), context);
00679     SOffsetPoint pnt2 = x_general_pos(i->children.begin() + 1, context);
00680 
00681     //Verify that on the same seq-id.
00682     if(!pnt1.pnt->GetId().Equals(pnt2.pnt->GetId())) {
00683         HGVS_THROW(eSemantic, "Points in a fuzzy pos are on different sequences");
00684     }
00685     if(pnt1.pnt->GetStrand() != pnt2.pnt->GetStrand()) {
00686         HGVS_THROW(eSemantic, "Range-loc start/stop are on different strands.");
00687     }
00688 
00689     if(IsPureFuzzPoint(*pnt1.pnt) || IsPureFuzzPoint(*pnt2.pnt)) {
00690         if(IsPureFuzzPoint(*pnt2.pnt)) {
00691             pnt1.pnt->SetFuzz().SetLim(CInt_fuzz::eLim_tr);
00692             return pnt1;
00693         } else {
00694             pnt2.pnt->SetFuzz().SetLim(CInt_fuzz::eLim_tl);
00695             return pnt2;
00696         }
00697     }
00698 
00699     if((pnt1.offset.value != 0 || pnt2.offset.value != 0) && !pnt1.pnt->Equals(*pnt2.pnt)) {
00700         HGVS_THROW(eSemantic, "Base-points in an intronic fuzzy position must be equal");
00701     }
00702 
00703     SOffsetPoint pnt = pnt1;
00704     if(pnt1.offset.value != pnt2.offset.value) {
00705         pnt.offset.fuzz.Reset(new CInt_fuzz);
00706         pnt.offset.fuzz->SetRange().SetMin(pnt1.offset.value);
00707         pnt.offset.fuzz->SetRange().SetMax(pnt2.offset.value);
00708     }
00709 
00710     return pnt;
00711 
00712 #if 0
00713     todo: reconcile
00714     //If Both are Empty - the result is empty, otherwise reconciliate
00715     if(pnt1.pnt->GetPoint() == kInvalidSeqPos && pnt2.pnt->GetPoint() == kInvalidSeqPos) {
00716         pnt.pnt = pnt1.pnt;
00717         pnt.offset = pnt1.offset;
00718     } else {
00719         pnt.pnt.Reset(new CSeq_point);
00720         pnt.pnt.Assign(*pnt1.pnt);
00721 
00722         TSeqPos min_pos = min(pnt1.pnt->GetPoint(), pnt2.pnt->GetPoint());
00723         TSeqPos max_pos = max(pnt1.pnt->GetPoint(), pnt2.pnt->GetPoint());
00724 
00725         if(!pnt1->IsSetFuzz() && !pnt2->IsSetFuzz()) {
00726             //Both are non-fuzzy - create the min-max fuzz.
00727             //(10+50_10+60)
00728             pnt->SetFuzz().SetRange().SetMin(min_pos);
00729             pnt->SetFuzz().SetRange().SetMax(max_pos);
00730 
00731         } else if(pnt1->IsSetFuzz() && pnt2->IsSetFuzz()) {
00732             //Both are fuzzy - reconcile the fuzz.
00733 
00734             if(pnt1->GetFuzz().GetLim() == CInt_fuzz::eLim_tr
00735             && pnt2->GetFuzz().GetLim() == CInt_fuzz::eLim_tl)
00736             {
00737                 //fuzz points inwards - create min-max fuzz
00738                 //(10+?_11-?)
00739                 pnt->SetFuzz().SetRange().SetMin(min_pos);
00740                 pnt->SetFuzz().SetRange().SetMax(max_pos);
00741 
00742             } else if (pnt1->GetFuzz().GetLim() == CInt_fuzz::eLim_tl
00743                     && pnt2->GetFuzz().GetLim() == CInt_fuzz::eLim_tr)
00744             {
00745                 //fuzz points outwards - set fuzz to unk
00746                 //(10-?_10+?)
00747                 //(?_10+?)
00748                 //(10-?_?)
00749                 pnt->SetFuzz().SetLim(CInt_fuzz::eLim_unk);
00750 
00751             }  else if (pnt1->GetFuzz().GetLim() == CInt_fuzz::eLim_tl
00752                      && pnt2->GetFuzz().GetLim() == CInt_fuzz::eLim_tl)
00753             {
00754                 //fuzz is to the left - use 5'-most
00755                 //(?_10-?)
00756                 //(10-?_11-?)
00757                 pnt->SetPoint(pnt->GetStrand() == eNa_strand_minus ? max_pos : min_pos);
00758 
00759             }  else if (pnt1->GetFuzz().GetLim() == CInt_fuzz::eLim_tr
00760                      && pnt2->GetFuzz().GetLim() == CInt_fuzz::eLim_tr)
00761             {
00762                 //fuzz is to the right - use 3'-most
00763                 //(10+?_?)
00764                 //(10+?_11+?)
00765                 pnt->SetPoint(pnt->GetStrand() == eNa_strand_minus ? min_pos : max_pos);
00766 
00767             } else {
00768                 pnt->SetFuzz().SetLim(CInt_fuzz::eLim_unk);
00769             }
00770         } else {
00771             // One of the two is non-fuzzy:
00772             // use it to specify position, and the fuzz of the other to specify the fuzz
00773             // e.g.  (10+5_10+?)  -> loc1=100005; loc2=100000tr  -> 100005tr
00774 
00775             pnt->Assign(pnt1->IsSetFuzz() ? *pnt2 : *pnt1);
00776             pnt->SetFuzz().Assign(pnt1->IsSetFuzz() ? pnt1->GetFuzz()
00777                                                     : pnt2->GetFuzz());
00778 
00779         }
00780     }
00781 #endif
00782 
00783 
00784 
00785 }
00786 
00787 CSeq_id_Handle GetUniquePrimaryTranscriptId(CBioseq_Handle& bsh)
00788 {
00789     set<CSeq_id_Handle> annotated_transcript_feats;
00790     set<CSeq_id_Handle> annotated_transcript_aligns;
00791 
00792     SAnnotSelector sel;
00793     sel.SetResolveTSE();
00794     sel.IncludeFeatType(CSeqFeatData::e_Rna);
00795     for(CFeat_CI ci(bsh, sel); ci; ++ci) {
00796         const CMappedFeat& mf = *ci;
00797         if(mf.IsSetProduct() && mf.GetProduct().GetId()) {
00798             annotated_transcript_feats.insert(
00799                     CSeq_id_Handle::GetHandle(*mf.GetProduct().GetId()));
00800         }
00801     }
00802 
00803     for(CAlign_CI ci(bsh, sel); ci; ++ci) {
00804         const CSeq_align& aln = *ci;
00805         if(aln.GetSegs().IsSpliced()) {
00806             annotated_transcript_aligns.insert(
00807                     CSeq_id_Handle::GetHandle(aln.GetSeq_id(0)));
00808         }
00809     }
00810 
00811     vector<CSeq_id_Handle> v;
00812     set_intersection(annotated_transcript_feats.begin(),
00813                      annotated_transcript_feats.end(),
00814                      annotated_transcript_aligns.begin(),
00815                      annotated_transcript_aligns.end(),
00816                      back_inserter(v));
00817 
00818     return v.size() != 1 ? CSeq_id_Handle() 
00819         : sequence::GetId(v.front(), 
00820                           bsh.GetScope(), 
00821                           sequence::eGetId_ForceAcc);
00822 }
00823 
00824 bool IsLRG(CBioseq_Handle& bsh) 
00825 {
00826     ITERATE(CBioseq_Handle::TId, it, bsh.GetId()) {
00827         const CSeq_id& id = *it->GetSeqId();
00828         if(   id.IsGeneral()
00829            && id.GetGeneral().GetDb() == "LRG") 
00830         {
00831             return true;
00832         }
00833     }
00834     return false;
00835 }
00836 
00837 
00838 CHgvsParser::CContext CHgvsParser::x_header(TIterator const& i, const CContext& context)
00839 {
00840     HGVS_ASSERT_RULE(i, eID_header);
00841 
00842     CContext ctx(context);
00843 
00844     TIterator it = i->children.rbegin()->children.begin();
00845     string mol(it->value.begin(), it->value.end());
00846     CVariantPlacement::TMol mol_type =
00847                        mol == "c" ? CVariantPlacement::eMol_cdna
00848                      : mol == "g" ? CVariantPlacement::eMol_genomic
00849                      : mol == "r" ? CVariantPlacement::eMol_rna
00850                      : mol == "n" ? CVariantPlacement::eMol_rna
00851                      : mol == "p" ? CVariantPlacement::eMol_protein
00852                      : mol == "m" ? CVariantPlacement::eMol_mitochondrion
00853                      : mol == "mt" ? CVariantPlacement::eMol_mitochondrion
00854                      : CVariantPlacement::eMol_unknown;
00855 
00856     it  = (i->children.rbegin() + 1)->children.begin();
00857     string id_str(it->value.begin(), it->value.end());
00858 
00859     CSeq_id_Handle idh = context.ResolevSeqId(id_str);
00860     CBioseq_Handle bsh = context.GetScope().GetBioseqHandle(idh);
00861     if(!bsh) {
00862         HGVS_THROW(eSemantic, "Could not resolve seq-id-str='" + id_str + "'; idh=" + idh.AsString());
00863     }
00864         
00865 
00866     if(bsh.IsNucleotide() 
00867        && mol_type == CVariantPlacement::eMol_protein 
00868        && NStr::Find(id_str, "CCDS") == 0) 
00869     {
00870         //If we have something like CCDS2.1:p., the CCDS2.1 will resolve
00871         //to an NM, but we need to resolve it to corresponding NP.
00872         //
00873         //We could do this for all seq-ids, as long as there's unique CDS,
00874         //but as per SNP-4536 the seq-id and moltype correspondence must be enforced,
00875         //so we do it for CCDS only
00876 
00877         SAnnotSelector sel;
00878         sel.SetResolveTSE();
00879         sel.IncludeFeatType(CSeqFeatData::e_Cdregion);
00880         bool already_found = false;
00881         for(CFeat_CI ci(bsh, sel); ci; ++ci) {
00882             const CMappedFeat& mf = *ci;
00883             if(mf.IsSetProduct() && mf.GetProduct().GetId()) {
00884                 if(already_found) {
00885                     HGVS_THROW(eSemantic, "Can't resolve to prot - multiple CDSes on " + idh.AsString());
00886                 } else {
00887                     idh = sequence::GetId(
00888                             *mf.GetProduct().GetId(), 
00889                             context.GetScope(), 
00890                             sequence::eGetId_ForceAcc);
00891                     already_found = true;
00892                 }
00893             }
00894         }
00895         if(!already_found) {
00896             HGVS_THROW(eSemantic, "Can't resolve to prot - can't find CDS on " + idh.AsString());
00897         }
00898     } else if(   (   mol_type == CVariantPlacement::eMol_cdna 
00899                   || mol_type == CVariantPlacement::eMol_rna) 
00900               && idh.IdentifyAccession() == CSeq_id::eAcc_refseq_genomic)  //e.g. NG_009822.1:c.1437+1G>A
00901     {
00902         //VAR-861
00903         if(!IsLRG(bsh)) {
00904             HGVS_THROW(eSemantic, "Specifying c. expression in NG coordinates is only supported for LRG subset where NM/NG associations are stable");
00905         }
00906         
00907         CSeq_id_Handle idh2 = GetUniquePrimaryTranscriptId(bsh);
00908         if(!idh2) {
00909             HGVS_THROW(eSemantic, "Can't resolve to a unique transcript on NG: " + idh.AsString());
00910         } else {
00911             idh = idh2;
00912         }
00913     }
00914 
00915     ctx.SetId(*idh.GetSeqId(), mol_type);
00916 
00917     if(i->children.size() == 3) {
00918         it  = (i->children.rbegin() + 2)->children.begin();
00919         string tag_str(it->value.begin(), it->value.end());
00920         //record tag in context, if it is necessary in the future
00921     }
00922 
00923     return ctx;
00924 }
00925 
00926 
00927 CHgvsParser::SOffsetPoint CHgvsParser::x_pos_spec(TIterator const& i, const CContext& context)
00928 {
00929     HGVS_ASSERT_RULE(i, eID_pos_spec);
00930 
00931     SOffsetPoint pnt;
00932     TIterator it = i->children.begin();
00933     if(it->value.id() == SGrammar::eID_general_pos) {
00934         pnt = x_general_pos(it, context);
00935     } else if(it->value.id() == SGrammar::eID_fuzzy_pos) {
00936         pnt = x_fuzzy_pos(it, context);
00937     } else {
00938         bool flip_strand = false;
00939         if(i->children.size() == 3) {
00940             //first child is 'o' - opposite
00941             flip_strand = true;
00942             ++it;
00943         }
00944 
00945         CContext local_ctx = x_header(it, context);
00946         ++it;
00947         pnt = x_pos_spec(it, local_ctx);
00948 
00949         if(flip_strand) {
00950             pnt.pnt->FlipStrand();
00951         }
00952     }
00953 
00954     return pnt;
00955 }
00956 
00957 
00958 CHgvsParser::SOffsetPoint CHgvsParser::x_prot_pos(TIterator const& i, const CContext& context)
00959 {
00960     HGVS_ASSERT_RULE(i, eID_prot_pos);
00961     TIterator it = i->children.begin();
00962 
00963     CRef<CSeq_literal> prot_literal = x_raw_seq(it, context);
00964 
00965     if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_protein) {
00966         HGVS_THROW(eSemantic, "Expected protein context");
00967     }
00968 
00969     if(prot_literal->GetLength() != 1) {
00970         HGVS_THROW(eSemantic, "Expected single aa literal in prot-pos");
00971     }
00972 
00973     ++it;
00974     SOffsetPoint pnt = x_pos_spec(it, context);
00975 
00976     pnt.asserted_sequence = prot_literal->GetSeq_data().GetNcbieaa();
00977 
00978     return pnt;
00979 }
00980 
00981 
00982 CRef<CVariantPlacement> CHgvsParser::x_range(TIterator const& i, const CContext& context)
00983 {
00984     SOffsetPoint pnt1, pnt2;
00985 
00986     CRef<CVariantPlacement> p(new CVariantPlacement);
00987     p->Assign(context.GetPlacement());
00988 
00989     if(i->value.id() == SGrammar::eID_prot_range) {
00990         pnt1 = x_prot_pos(i->children.begin(), context);
00991         pnt2 = x_prot_pos(i->children.begin() + 1, context);
00992     } else if(i->value.id() == SGrammar::eID_nuc_range) {
00993         pnt1 = x_pos_spec(i->children.begin(), context);
00994         pnt2 = x_pos_spec(i->children.begin() + 1, context);
00995     } else {
00996         HGVS_ASSERT_RULE(i, eID_NONE);
00997     }
00998 
00999     if(!pnt1.pnt->GetId().Equals(pnt2.pnt->GetId())) {
01000         HGVS_THROW(eSemantic, "Range-loc start/stop are on different seq-ids.");
01001     }
01002     if(pnt1.pnt->GetStrand() != pnt2.pnt->GetStrand()) {
01003         HGVS_THROW(eSemantic, "Range-loc start/stop are on different strands.");
01004     }
01005 
01006     p->SetLoc().SetInt().SetId(pnt1.pnt->SetId());
01007     p->SetLoc().SetInt().SetFrom(pnt1.pnt->GetPoint());
01008     p->SetLoc().SetInt().SetTo(pnt2.pnt->GetPoint());
01009     p->SetLoc().SetInt().SetStrand(pnt1.pnt->GetStrand());
01010     if(pnt1.pnt->IsSetFuzz()) {
01011         p->SetLoc().SetInt().SetFuzz_from(pnt1.pnt->SetFuzz());
01012     }
01013 
01014     if(pnt2.pnt->IsSetFuzz()) {
01015         p->SetLoc().SetInt().SetFuzz_to(pnt2.pnt->SetFuzz());
01016     }
01017 
01018     s_SetStartOffset(*p, pnt1.offset);
01019     s_SetStopOffset(*p, pnt2.offset);
01020 
01021     if(pnt1.asserted_sequence != "" || pnt2.asserted_sequence != "") {
01022         //for proteins, the asserted sequence is specified as part of location, rather than variation
01023         p->SetSeq().SetLength(CVariationUtil::s_GetLength(*p, NULL));
01024         string& seq_str = (context.GetPlacement().GetMol() == CVariantPlacement::eMol_protein)
01025                 ? p->SetSeq().SetSeq_data().SetNcbieaa().Set()
01026                 : p->SetSeq().SetSeq_data().SetIupacna().Set();
01027         seq_str = pnt1.asserted_sequence + ".." + pnt2.asserted_sequence;
01028     }
01029 
01030     return p;
01031 }
01032 
01033 CRef<CVariantPlacement> CHgvsParser::x_location(TIterator const& i, const CContext& context)
01034 {
01035     HGVS_ASSERT_RULE(i, eID_location);
01036 
01037     CRef<CVariantPlacement> placement(new CVariantPlacement);
01038     placement->Assign(context.GetPlacement());
01039 
01040     TIterator it = i->children.begin();
01041     CRef<CSeq_loc> loc(new CSeq_loc);
01042     if(it->value.id() == SGrammar::eID_prot_pos || it->value.id() == SGrammar::eID_pos_spec) {
01043         SOffsetPoint pnt = it->value.id() == SGrammar::eID_prot_pos
01044                 ? x_prot_pos(it, context)
01045                 : x_pos_spec(it, context);
01046         placement->SetLoc().SetPnt(*pnt.pnt);
01047         s_SetStartOffset(*placement, pnt.offset);
01048         if(pnt.asserted_sequence != "") {
01049             placement->SetSeq().SetLength(CVariationUtil::s_GetLength(*placement, NULL));
01050             string& seq_str = (context.GetPlacement().GetMol() == CVariantPlacement::eMol_protein)
01051                     ? placement->SetSeq().SetSeq_data().SetNcbieaa().Set()
01052                     : placement->SetSeq().SetSeq_data().SetIupacna().Set();
01053             seq_str = pnt.asserted_sequence;
01054         }
01055         
01056         //todo point with pos=0 and fuzz=unk -> unknown pos ->set loc to empty
01057 
01058     } else if(it->value.id() == SGrammar::eID_nuc_range || it->value.id() == SGrammar::eID_prot_range) {
01059         placement = x_range(it, context);
01060     } else {
01061         HGVS_ASSERT_RULE(it, eID_NONE);
01062     }
01063 
01064     if(placement->GetLoc().IsPnt() && placement->GetLoc().GetPnt().GetPoint() == kInvalidSeqPos) {
01065         placement->SetLoc().SetEmpty().Assign(context.GetId());
01066     }
01067 
01068     CVariationUtil util(context.GetScope());
01069     if(CVariationUtil::eFail == util.CheckExonBoundary(*placement)) {
01070         CRef<CVariationException> exception(new CVariationException);
01071         exception->SetCode(CVariationException::eCode_hgvs_exon_boundary);
01072         exception->SetMessage("HGVS exon-boundary position not represented in the transcript annotation");
01073         placement->SetExceptions().push_back(exception);    
01074     }
01075     util.CheckPlacement(*placement);
01076 
01077     return placement;
01078 }
01079 
01080 
01081 CRef<CSeq_loc> CHgvsParser::x_seq_loc(TIterator const& i, const CContext& context)
01082 {
01083     HGVS_ASSERT_RULE(i, eID_seq_loc);
01084     TIterator it = i->children.begin();
01085 
01086     bool flip_strand = false;
01087     if(i->children.size() == 3) {
01088         //first child is 'o' - opposite
01089         flip_strand = true;
01090         ++it;
01091     }
01092 
01093     CContext local_context = x_header(it, context);
01094     ++it;
01095     CRef<CVariantPlacement> p = x_location(it, local_context);
01096 
01097     if(flip_strand) {
01098         p->SetLoc().FlipStrand();
01099     }
01100 
01101     if(p->IsSetStop_offset() || p->IsSetStart_offset()) {
01102         HGVS_THROW(eSemantic, "Intronic seq-locs are not supported in this context");
01103     }
01104 
01105     CRef<CSeq_loc> loc(new CSeq_loc);
01106     loc->Assign(p->GetLoc());
01107     return loc;
01108 }
01109 
01110 CRef<CSeq_literal> CHgvsParser::x_raw_seq_or_len(TIterator const& i, const CContext& context)
01111 {
01112     HGVS_ASSERT_RULE(i, eID_raw_seq_or_len);
01113 
01114     CRef<CSeq_literal> literal;
01115     TIterator it = i->children.begin();
01116 
01117     if(it == i->children.end()) {
01118         HGVS_THROW(eLogic, "Unexpected parse-tree state when parsing " + context.GetHgvs());
01119     }
01120 
01121     if(it->value.id() == SGrammar::eID_raw_seq) {
01122         literal = x_raw_seq(it, context);
01123     } else if(it->value.id() == SGrammar::eID_int_fuzz) {
01124         SFuzzyInt int_fuzz = x_int_fuzz(it, context);
01125         literal.Reset(new CSeq_literal);
01126         literal->SetLength(int_fuzz.value);
01127         if(int_fuzz.fuzz.IsNull()) {
01128             ;//no-fuzz;
01129         } else if(int_fuzz.IsPureFuzz()) {
01130             //unknown length (no value) - will represent as length=0 with gt fuzz
01131             literal->SetFuzz().SetLim(CInt_fuzz::eLim_gt);
01132         } else {
01133             literal->SetFuzz(*int_fuzz.fuzz);
01134         }
01135     } else {
01136         HGVS_ASSERT_RULE(it, eID_NONE);
01137     }
01138     return literal;
01139 }
01140 
01141 CHgvsParser::TDelta CHgvsParser::x_seq_ref(TIterator const& i, const CContext& context)
01142 {
01143     HGVS_ASSERT_RULE(i, eID_seq_ref);
01144     CHgvsParser::TDelta delta(new TDelta::TObjectType);
01145     TIterator it = i->children.begin();
01146 
01147     if(it->value.id() == SGrammar::eID_seq_loc) {
01148         CRef<CSeq_loc> loc = x_seq_loc(it, context);
01149         delta->SetSeq().SetLoc(*loc);
01150     } else if(it->value.id() == SGrammar::eID_nuc_range || it->value.id() == SGrammar::eID_prot_range) {
01151         CRef<CVariantPlacement> p = x_range(it, context);
01152         if(p->IsSetStart_offset() || p->IsSetStop_offset()) {
01153             HGVS_THROW(eSemantic, "Intronic loc is not supported in this context");
01154         }
01155         delta->SetSeq().SetLoc().Assign(p->GetLoc());
01156     } else if(it->value.id() == SGrammar::eID_raw_seq_or_len) {
01157         CRef<CSeq_literal> literal = x_raw_seq_or_len(it, context);
01158         delta->SetSeq().SetLiteral(*literal);
01159     } else {
01160         HGVS_ASSERT_RULE(it, eID_NONE);
01161     }
01162 
01163     return delta;
01164 }
01165 
01166 
01167 bool CHgvsParser::s_hgvsaa2ncbieaa(const string& hgvsaa, string& out)
01168 {
01169     //try to interpret sequence that was matched by either of aminoacid1, aminoacid, or aminoacid3
01170     string tmp_out(""); //so that the caller may pass same variable for in and out
01171     bool ret = s_hgvsaa2ncbieaa(hgvsaa, true, tmp_out);
01172     if(!ret) {
01173         ret = s_hgvsaa2ncbieaa(hgvsaa, false, tmp_out);
01174     }
01175     if(!ret) {
01176         ret = s_hgvs_iupacaa2ncbieaa(hgvsaa, tmp_out);
01177     }
01178 
01179     out = tmp_out;
01180     return ret;
01181 }
01182 
01183 bool CHgvsParser::s_hgvs_iupacaa2ncbieaa(const string& hgvsaa, string& out)
01184 {
01185     out = hgvsaa;
01186 
01187     //"X" used to mean "Ter" in HGVS; now it means "unknown aminoacid"
01188     //Still, we'll interpret it as Ter, simply beacuse it is more likely
01189     //that the submitter is using legacy representation.
01190     NStr::ReplaceInPlace(out, "X", "*");
01191     NStr::ReplaceInPlace(out, "?", "X");
01192     return true;
01193 }
01194 
01195 bool CHgvsParser::s_hgvsaa2ncbieaa(const string& hgvsaa, bool uplow, string& out)
01196 {
01197     string in = hgvsaa;
01198     out = "";
01199     while(in != "") {
01200         bool found = false;
01201         for(size_t i_ncbistdaa = 0; i_ncbistdaa < 28; i_ncbistdaa++) {
01202             string iupac3 = CSeqportUtil::GetIupacaa3(i_ncbistdaa);
01203             if(NStr::StartsWith(in, uplow ? iupac3 : NStr::ToUpper(iupac3))) {
01204                 size_t i_ncbieaa = CSeqportUtil::GetMapToIndex(CSeq_data::e_Ncbistdaa,
01205                                                                CSeq_data::e_Ncbieaa,
01206                                                                i_ncbistdaa);
01207                 out += CSeqportUtil::GetCode(CSeq_data::e_Ncbieaa, i_ncbieaa);
01208                 found = true;
01209                 break;
01210             }
01211         }
01212         if(found) {
01213             in = in.substr(3);
01214         } else if(NStr::StartsWith(in, "*")) { out.push_back('*'); in = in.substr(1);
01215         } else if(NStr::StartsWith(in, "X")) { out.push_back('*'); in = in.substr(1);
01216         } else if(NStr::StartsWith(in, "?")) { out.push_back('X'); in = in.substr(1);
01217         //} else if(NStr::StartsWith(in, "STOP", NStr::eNocase)) { out.push_back('X'); in = in.substr(4); //VAR-283
01218         } else {
01219             out = hgvsaa;
01220             return false;
01221         }
01222     }
01223     return true;
01224 }
01225 
01226 
01227 
01228 CRef<CSeq_literal> CHgvsParser::x_raw_seq(TIterator const& i, const CContext& context)
01229 {
01230     HGVS_ASSERT_RULE(i, eID_raw_seq);
01231     TIterator it = i->children.begin();
01232 
01233     string seq_str(it->value.begin(), it->value.end());
01234 
01235     CRef<CSeq_literal>literal(new CSeq_literal);
01236     if(context.GetPlacement().GetMol() == CVariantPlacement::eMol_protein) {
01237         s_hgvsaa2ncbieaa(seq_str, seq_str);
01238         literal->SetSeq_data().SetNcbieaa().Set(seq_str);
01239     } else {
01240         seq_str = NStr::ToUpper(seq_str);
01241         NStr::ReplaceInPlace(seq_str, "U", "T");
01242         literal->SetSeq_data().SetIupacna().Set(seq_str);
01243     }
01244 
01245     literal->SetLength(seq_str.size());
01246 
01247     vector<TSeqPos> bad;
01248     CSeqportUtil::Validate(literal->GetSeq_data(), &bad);
01249 
01250     if(bad.size() > 0) {
01251         HGVS_THROW(eSemantic, "Invalid sequence at pos " +  NStr::IntToString(bad[0]) + " in " + seq_str);
01252     }
01253 
01254     return literal;
01255 }
01256 
01257 
01258 int GetDeltaLength(const CDelta_item& delta, int loc_len)
01259 {
01260     int len = !delta.IsSetSeq()          ? 0 
01261             : delta.GetSeq().IsLiteral() ? delta.GetSeq().GetLiteral().GetLength()
01262             : delta.GetSeq().IsLoc()     ? sequence::GetLength(delta.GetSeq().GetLoc(), NULL)
01263             : delta.GetSeq().IsThis()    ? loc_len
01264             : 0;
01265    if(delta.IsSetMultiplier()) {
01266         len *= delta.GetMultiplier();
01267    }
01268    return len; 
01269 }
01270 
01271 CVariation_inst::EType GetDelInsSubtype(int del_len, int ins_len)
01272 {
01273     return del_len > 0 && ins_len == 0        ? CVariation_inst::eType_del
01274          : del_len == 0 && ins_len > 0        ? CVariation_inst::eType_ins
01275          : del_len == ins_len && del_len != 1 ? CVariation_inst::eType_mnp
01276          : del_len == ins_len && del_len == 1 ? CVariation_inst::eType_snv
01277          :                                      CVariation_inst::eType_delins;
01278 }
01279 
01280 CRef<CVariation> CHgvsParser::x_delins(TIterator const& i, const CContext& context)
01281 {
01282     HGVS_ASSERT_RULE(i, eID_delins);
01283     TIterator it = i->children.begin();
01284     CRef<CVariation> del_vr = x_deletion(it, context);
01285     ++it;
01286     CRef<CVariation> ins_vr = x_insertion(it, context, false);
01287         //note: don't verify location, as it must be len=2 for pure-insertion only
01288 
01289     //The resulting delins variation has deletion's placement (with asserted seq, if any),
01290     //and insertion's inst, except action type is "replace" (default) rather than "ins-before",
01291     //so we reset action
01292 
01293     int placement_len = CVariationUtil::s_GetLength(SetFirstPlacement(*del_vr), NULL);
01294     int del_len = GetDeltaLength(*del_vr->GetData().GetInstance().GetDelta().front(), placement_len);
01295     int ins_len = GetDeltaLength(*ins_vr->GetData().GetInstance().GetDelta().front(), placement_len);
01296     del_vr->SetData().SetInstance().SetType(GetDelInsSubtype(del_len, ins_len));
01297 
01298     del_vr->SetData().SetInstance().SetDelta() = ins_vr->SetData().SetInstance().SetDelta();
01299     del_vr->SetData().SetInstance().SetDelta().front()->ResetAction();
01300 
01301     if(ins_len == 1 && del_len == 1) {
01302         CRef<CVariationException> ex(new CVariationException);
01303         ex->SetCode(CVariationException::eCode_hgvs_parsing);
01304         ex->SetMessage("delins used for single-nt substitution");
01305         SetFirstPlacement(*del_vr).SetExceptions().push_back(ex);
01306     }
01307 
01308     return del_vr;
01309 }
01310 
01311 CRef<CVariation> CHgvsParser::x_deletion(TIterator const& i, const CContext& context)
01312 {
01313     HGVS_ASSERT_RULE(i, eID_deletion);
01314     TIterator it = i->children.begin();
01315     CRef<CVariation> vr(new CVariation);
01316     CVariation_inst& var_inst = vr->SetData().SetInstance();
01317 
01318     var_inst.SetType(CVariation_inst::eType_del);
01319     CVariantPlacement& p = SetFirstPlacement(*vr);
01320     p.Assign(context.GetPlacement());
01321 
01322     CRef<CDelta_item> di(new CDelta_item);
01323     di->SetAction(CDelta_item::eAction_del_at);
01324     di->SetSeq().SetThis();
01325     var_inst.SetDelta().push_back(di);
01326 
01327     ++it;
01328 
01329     if(it != i->children.end() && it->value.id() == SGrammar::eID_raw_seq_or_len) {
01330         CRef<CSeq_literal> literal = x_raw_seq_or_len(it, context);
01331         ++it;
01332         SetFirstPlacement(*vr).SetSeq(*literal);
01333 
01334         if(literal->GetLength() != CVariationUtil::s_GetLength(p, NULL)) {
01335             CRef<CVariationException> ex(new CVariationException);
01336             ex->SetCode(CVariationException::eCode_hgvs_parsing);
01337             ex->SetMessage("Sequence length is inconsistent with location length");
01338             p.SetExceptions().push_back(ex);
01339         }
01340     }
01341 
01342     var_inst.SetDelta();
01343     return vr;
01344 }
01345 
01346 
01347 CRef<CVariation> CHgvsParser::x_insertion(TIterator const& i, const CContext& context, bool check_loc)
01348 {
01349     HGVS_ASSERT_RULE(i, eID_insertion);
01350     TIterator it = i->children.begin();
01351     ++it; //skip ins
01352     CRef<CVariation> vr(new CVariation);
01353     CVariation_inst& var_inst = vr->SetData().SetInstance();
01354 
01355     var_inst.SetType(CVariation_inst::eType_ins);
01356 
01357     SetFirstPlacement(*vr).Assign(context.GetPlacement());
01358 
01359     if(check_loc && CVariationUtil::s_GetLength(*vr->GetPlacements().front(), NULL) != 2) {
01360         HGVS_THROW(eSemantic, "Location must be a dinucleotide");
01361     }
01362 
01363     TDelta delta_ins = x_seq_ref(it, context);
01364 
01365     //todo:
01366     //alternative representation: if delta is literal, might use action=morph and prefix/suffix the insertion with the flanking nucleotides.
01367     delta_ins->SetAction(CDelta_item::eAction_ins_before);
01368 
01369     var_inst.SetDelta().push_back(delta_ins);
01370 
01371     return vr;
01372 }
01373 
01374 
01375 CRef<CVariation> CHgvsParser::x_duplication(TIterator const& i, const CContext& context)
01376 {
01377     HGVS_ASSERT_RULE(i, eID_duplication);
01378     TIterator it = i->children.begin();
01379     CRef<CVariation> vr(new CVariation);
01380     CVariation_inst& var_inst = vr->SetData().SetInstance();
01381     var_inst.SetType(CVariation_inst::eType_ins); //replace seq @ location with this*2
01382 
01383     SetFirstPlacement(*vr).Assign(context.GetPlacement());
01384 
01385     TDelta delta(new TDelta::TObjectType);
01386     delta->SetSeq().SetThis(); //delta->SetSeq().SetLoc(vr->SetLocation());
01387     delta->SetMultiplier(2);
01388     var_inst.SetDelta().push_back(delta);
01389 
01390     ++it; //skip dup
01391 
01392     //the next node is either expected length or expected sequence
01393     if(it != i->children.end() && it->value.id() == SGrammar::eID_seq_ref) {
01394         TDelta dup_seq = x_seq_ref(it, context);
01395         if(dup_seq->GetSeq().IsLiteral()) {
01396             SetFirstPlacement(*vr).SetSeq(dup_seq->SetSeq().SetLiteral());
01397 
01398             if(CVariationUtil::s_GetLength(*vr->GetPlacements().front(), NULL) != dup_seq->GetSeq().GetLiteral().GetLength()) {
01399                 HGVS_THROW(eSemantic, "Location length and asserted sequence length differ");
01400             }
01401         }
01402     }
01403 
01404     return vr;
01405 }
01406 
01407 
01408 CRef<CVariation> CHgvsParser::x_no_change(TIterator const& i, const CContext& context)
01409 {
01410     HGVS_ASSERT_RULE(i, eID_no_change);
01411     TIterator it = i->children.begin();
01412     CRef<CVariation> vr(new CVariation);
01413     CVariation_inst& var_inst = vr->SetData().SetInstance();
01414 
01415     CVariantPlacement& p = SetFirstPlacement(*vr);
01416     p.Assign(context.GetPlacement());
01417 
01418     //VAR-574: The no-change variation is interpreted as X>X
01419 
01420     if(it->value.id() == SGrammar::eID_raw_seq) {
01421         CRef<CSeq_literal> seq_from = x_raw_seq(it, context);
01422         p.SetSeq(*seq_from);
01423         ++it;
01424     } else if(p.GetLoc().IsWhole()) {
01425         // will fall-back 'identity' inst-type instead of whole-seq mnp
01426     } else {
01427         CVariationUtil util(context.GetScope());
01428         util.AttachSeq(p);
01429         p.ResetExceptions();
01430             // if could not attach seq (e.g. too-large or intronic context)
01431             // will fall-back on 'identity' inst-type below, so ignoring
01432             // the exceptions.
01433     }
01434 
01435     var_inst.SetType(
01436             p.GetMol() == CVariantPlacement::eMol_protein ? CVariation_inst::eType_prot_silent
01437           : !p.IsSetSeq()                                 ? CVariation_inst::eType_identity
01438           : p.GetSeq().GetLength() == 1                   ? CVariation_inst::eType_snv 
01439           :                                                 CVariation_inst::eType_mnp);
01440 
01441     TDelta delta(new TDelta::TObjectType);
01442     if(p.IsSetSeq()) {
01443         delta->SetSeq().SetLiteral().Assign(p.GetSeq());
01444     } else {
01445         delta->SetSeq().SetThis();
01446     }
01447     var_inst.SetDelta().push_back(delta);
01448 
01449     return vr;
01450 }
01451 
01452 
01453 CRef<CVariation> CHgvsParser::x_nuc_subst(TIterator const& i, const CContext& context)
01454 {
01455     HGVS_ASSERT_RULE(i, eID_nuc_subst);
01456     TIterator it = i->children.begin();
01457     CRef<CVariation> vr(new CVariation);
01458     CVariation_inst& var_inst = vr->SetData().SetInstance();
01459 
01460     SetFirstPlacement(*vr).Assign(context.GetPlacement());
01461 
01462     if(it->value.id() == SGrammar::eID_raw_seq) {
01463         CRef<CSeq_literal> seq_from = x_raw_seq(it, context);
01464         SetFirstPlacement(*vr).SetSeq(*seq_from);
01465         ++it;
01466     }
01467 
01468     ++it;//skip ">"
01469 
01470     CRef<CSeq_literal> seq_to = x_raw_seq(it, context);
01471     TDelta delta(new TDelta::TObjectType);
01472     delta->SetSeq().SetLiteral(*seq_to);
01473     var_inst.SetDelta().push_back(delta);
01474 
01475     var_inst.SetType(
01476             GetDelInsSubtype(
01477                 CVariationUtil::s_GetLength(SetFirstPlacement(*vr), NULL), 
01478                 seq_to->GetLength()));
01479 
01480     return vr;
01481 }
01482 
01483 
01484 CRef<CVariation> CHgvsParser::x_nuc_inv(TIterator const& i, const CContext& context)
01485 {
01486     HGVS_ASSERT_RULE(i, eID_nuc_inv);
01487 
01488     TIterator it = i->children.begin();
01489     CRef<CVariation> vr(new CVariation);
01490     CVariation_inst& var_inst = vr->SetData().SetInstance();
01491     var_inst.SetType(CVariation_inst::eType_inv);
01492 
01493     SetFirstPlacement(*vr).Assign(context.GetPlacement());
01494 
01495 #if 0
01496     TDelta delta(new TDelta::TObjectType);
01497     delta->SetSeq().SetLoc().Assign(*loc);
01498     delta->SetSeq().SetLoc().FlipStrand();
01499     var_inst.SetDelta().push_back(delta);
01500 #else
01501     //don't put anything in the delta, as the inversion sequence is placement-specific, not variation-specific
01502     var_inst.SetDelta(); 
01503 #endif
01504 
01505     ++it;
01506 
01507      //capture asserted seq
01508      if(it != i->children.end() && it->value.id() == SGrammar::eID_seq_ref) {
01509          TDelta dup_seq = x_seq_ref(it, context);
01510          if(dup_seq->GetSeq().IsLiteral()) {
01511              SetFirstPlacement(*vr).SetSeq(dup_seq->SetSeq().SetLiteral());
01512          }
01513      }
01514 
01515     return vr;
01516 }
01517 
01518 
01519 CRef<CVariation> CHgvsParser::x_ssr(TIterator const& i, const CContext& context)
01520 {
01521     HGVS_ASSERT_RULE(i, eID_ssr);
01522     TIterator it = i->children.begin();
01523     CRef<CVariation> vr(new CVariation);
01524     vr->SetData().SetInstance().SetType(CVariation_inst::eType_microsatellite);
01525 
01526 
01527     CRef<CSeq_literal> literal;
01528     if(it->value.id() == SGrammar::eID_raw_seq) {
01529         literal = x_raw_seq(it, context);
01530         ++it;
01531     }
01532 
01533     CVariantPlacement& p = SetFirstPlacement(*vr);
01534     p.Assign(context.GetPlacement());
01535 
01536     // The location may either specify a repeat unit, 
01537     // or point to the first base of a repeat unit.
01538     // We normalize it so it is alwas the repeat unit.
01539 #if 1
01540     if(   !literal.IsNull() 
01541        && literal->IsSetSeq_data() 
01542        && literal->GetSeq_data().IsIupacna()
01543        && !p.IsSetStart_offset()
01544        && !p.IsSetStop_offset())
01545     {
01546         CRef<CSeq_loc> ssr_loc = FindSSRLoc(
01547                 p.GetLoc(), 
01548                 literal->GetSeq_data().GetIupacna(), 
01549                 context.GetScope());
01550         p.SetLoc().Assign(*ssr_loc);
01551     } else if(p.IsSetStart_offset() && !p.IsSetStop_offset()) {
01552         p.SetStop_offset(p.GetStart_offset() 
01553                        + (literal.IsNull() ? 0 : literal->GetLength() - 1));
01554     }
01555 #else
01556     if(SetFirstPlacement(*vr).GetLoc().IsPnt() && !literal.IsNull()) {
01557         ExtendDownstream(SetFirstPlacement(*vr), literal->GetLength() - 1);
01558     }
01559 #endif
01560 
01561 
01562     if(it->value.id() == SGrammar::eID_ssr) { // list('['>>int_p>>']', '+') with '[',']','+' nodes discarded;
01563         //Note: see ssr grammar in the header for reasons why we have to match all alleles here
01564         //rather than match them separately as mut_insts
01565 
01566         vr->SetData().SetSet().SetType(CVariation::TData::TSet::eData_set_type_genotype);
01567         for(; it != i->children.end(); ++it) {
01568             string s1(it->value.begin(), it->value.end());
01569             CRef<CVariation> vr2(new CVariation);
01570             vr2->SetData().SetInstance().SetType(CVariation_inst::eType_microsatellite);
01571 
01572             TDelta delta(new TDelta::TObjectType);
01573             if(!literal.IsNull()) {
01574                 delta->SetSeq().SetLiteral().Assign(*literal);
01575             } else {
01576                 delta->SetSeq().SetThis();
01577             }
01578             delta->SetMultiplier(NStr::StringToInt(s1));
01579 
01580             vr2->SetData().SetInstance().SetDelta().push_back(delta);
01581             vr->SetData().SetSet().SetVariations().push_back(vr2);
01582         }
01583         vr = x_unwrap_iff_singleton(*vr);
01584     } else {
01585         TDelta delta(new TDelta::TObjectType);
01586         if(!literal.IsNull()) {
01587             delta->SetSeq().SetLiteral().Assign(*literal);
01588         } else {
01589             delta->SetSeq().SetThis();
01590         }
01591 
01592         SFuzzyInt int_fuzz = x_int_fuzz(it, context);
01593         delta->SetMultiplier(int_fuzz.value);
01594         if(int_fuzz.fuzz.IsNull()) {
01595             ;
01596         } else {
01597             delta->SetMultiplier_fuzz(*int_fuzz.fuzz);
01598         }
01599         vr->SetData().SetInstance().SetDelta().push_back(delta);
01600     }
01601 
01602     return vr;
01603 }
01604 
01605 
01606 CRef<CVariation> CHgvsParser::x_translocation(TIterator const& i, const CContext& context)
01607 {
01608     HGVS_ASSERT_RULE(i, eID_translocation);
01609     TIterator it = i->children.end() - 1; //note: seq-loc follows iscn expression, i.e. last child
01610     CRef<CVariation> vr(new CVariation);
01611     CVariation_inst& var_inst = vr->SetData().SetInstance();
01612     var_inst.SetType(CVariation_inst::eType_translocation);
01613 
01614     CRef<CSeq_loc> loc = x_seq_loc(it, context);
01615     SetFirstPlacement(*vr).SetLoc().Assign(*loc);
01616     CVariationUtil util(context.GetScope());
01617     SetFirstPlacement(*vr).SetMol(util.GetMolType(sequence::GetId(*loc, NULL)));
01618 
01619     it = i->children.begin();
01620     string iscn_expr(it->value.begin(), it->value.end());
01621     vr->SetSynonyms().push_back("ISCN:" + iscn_expr);
01622     var_inst.SetDelta(); //no delta contents
01623 
01624     return vr;
01625 }
01626 
01627 
01628 CRef<CVariation> CHgvsParser::x_conversion(TIterator const& i, const CContext& context)
01629 {
01630     HGVS_ASSERT_RULE(i, eID_conversion);
01631     TIterator it = i->children.begin();
01632     CRef<CVariation> vr(new CVariation);
01633     CVariation_inst& var_inst = vr->SetData().SetInstance();
01634     var_inst.SetType(CVariation_inst::eType_transposon);
01635 
01636     SetFirstPlacement(*vr).Assign(context.GetPlacement());
01637 
01638     ++it;
01639     CRef<CSeq_loc> loc_other = x_seq_loc(it, context);
01640 
01641     TDelta delta(new TDelta::TObjectType);
01642     delta->SetSeq().SetLoc().Assign(*loc_other);
01643     var_inst.SetDelta().push_back(delta);
01644 
01645     return vr;
01646 }
01647 
01648 
01649 CRef<CVariation> CHgvsParser::x_prot_fs(TIterator const& i, const CContext& context)
01650 {
01651     HGVS_ASSERT_RULE(i, eID_prot_fs);
01652     TIterator it = i->children.begin();
01653     CRef<CVariation> vr(new CVariation);
01654 
01655     if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_protein) {
01656         HGVS_THROW(eContext, "Frameshift can only be specified in protein context");
01657     }
01658 
01659     vr->SetData().SetNote("Frameshift");
01660     vr->SetFrameshift();
01661 
01662     SetFirstPlacement(*vr).Assign(context.GetPlacement());
01663 
01664     ++it; //skip 'fs'
01665     if(it != i->children.end()) {
01666         //fsX# description: the remaining tokens are 'X' and integer
01667         ++it; //skip 'X'
01668         if(it != i->children.end()) {
01669             string s(it->value.begin(), it->value.end());
01670             int x_length = NStr::StringToInt(s);
01671             vr->SetFrameshift().SetX_length(x_length);
01672         }
01673     }
01674 
01675     return vr;
01676 }
01677 
01678 
01679 CRef<CVariation> CHgvsParser::x_prot_ext(TIterator const& i, const CContext& context)
01680 {
01681     HGVS_ASSERT_RULE(i, eID_prot_ext);
01682     TIterator it = i->children.begin();
01683 
01684     if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_protein) {
01685         HGVS_THROW(eContext, "Expected protein context");
01686     }
01687 
01688     CRef<CVariation> vr(new CVariation);
01689     CVariation_inst& var_inst = vr->SetData().SetInstance();
01690     var_inst.SetType(CVariation_inst::eType_prot_other);
01691     string ext_type_str(it->value.begin(), it->value.end());
01692     ++it;
01693     string ext_len_str(it->value.begin(), it->value.end());
01694     int ext_len = NStr::StringToInt(ext_len_str);
01695 
01696     SetFirstPlacement(*vr).Assign(context.GetPlacement());
01697     SetFirstPlacement(*vr).SetLoc().SetPnt().SetId().Assign(context.GetId());
01698     SetFirstPlacement(*vr).SetLoc().SetPnt().SetStrand(eNa_strand_plus);
01699 
01700     TDelta delta(new TDelta::TObjectType);
01701     delta->SetSeq().SetLiteral().SetLength(abs(ext_len) + 1);
01702         //extension of Met or X by N bases = replacing first or last AA with (N+1) AAs
01703 
01704     if(ext_type_str == "extMet") {
01705         if(ext_len > 0) {
01706             HGVS_THROW(eSemantic, "extMet must be followed by a negative integer");
01707         }
01708         SetFirstPlacement(*vr).SetLoc().SetPnt().SetPoint(0);
01709         //extension precedes first AA
01710         var_inst.SetDelta().push_back(delta);
01711     } else if(ext_type_str == "extX" || ext_type_str == "ext*") {
01712         if(ext_len < 0) {
01713             HGVS_THROW(eSemantic, "exX must be followed by a non-negative integer");
01714         }
01715 
01716         SetFirstPlacement(*vr).SetLoc().SetPnt().SetPoint(context.GetBioseqHandle().GetInst_Length() - 1);
01717         //extension follows last AA
01718         var_inst.SetDelta().push_back(delta);
01719     } else {
01720         HGVS_THROW(eGrammatic, "Unexpected ext_type: " + ext_type_str);
01721     }
01722 
01723     return vr;
01724 }
01725 
01726 
01727 CRef<CVariation> CHgvsParser::x_prot_missense(TIterator const& i, const CContext& context)
01728 {
01729     HGVS_ASSERT_RULE(i, eID_prot_missense);
01730     TIterator it = i->children.begin();
01731 
01732     CRef<CSeq_literal> prot_literal = x_raw_seq(it, context);
01733 
01734     if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_protein) {
01735         HGVS_THROW(eContext, "Expected protein context");
01736     }
01737 
01738     CRef<CVariation> vr(new CVariation);
01739     CVariation_inst& var_inst = vr->SetData().SetInstance();
01740     var_inst.SetType(prot_literal->GetLength() == 1 ? 
01741             CVariation_inst::eType_prot_missense 
01742           : CVariation_inst::eType_prot_other);
01743 
01744     SetFirstPlacement(*vr).Assign(context.GetPlacement());
01745 
01746     TDelta delta(new TDelta::TObjectType);
01747     delta->SetSeq().SetLiteral(*prot_literal);
01748     var_inst.SetDelta().push_back(delta);
01749 
01750     return vr;
01751 }
01752 
01753 
01754 CRef<CVariation>  CHgvsParser::x_string_content(TIterator const& i, const CContext& context)
01755 {
01756     CRef<CVariation> vr(new CVariation);
01757     string s(i->value.begin(), i->value.end());
01758     s = s.substr(1); //truncate the leading pipe
01759     SetFirstPlacement(*vr).Assign(context.GetPlacement());
01760     vr->SetData().SetNote(s);
01761     return vr;
01762 }
01763 
01764 
01765 CRef<CVariation> CHgvsParser::x_mut_inst(TIterator const& i, const CContext& context)
01766 {
01767     HGVS_ASSERT_RULE(i, eID_mut_inst);
01768 
01769     TIterator it = i->children.begin();
01770 
01771     CRef<CVariation> vr(new CVariation);
01772     if(it->value.id() == SGrammar::eID_mut_inst) {
01773         string s(it->value.begin(), it->value.end());
01774         if(s == "?") {
01775             vr->SetData().SetUnknown();
01776             SetFirstPlacement(*vr).Assign(context.GetPlacement());
01777         } else {
01778             vr = x_string_content(it, context);
01779         }
01780     } else {
01781         vr =
01782             it->value.id() == SGrammar::eID_no_change     ? x_no_change(it, context)
01783           : it->value.id() == SGrammar::eID_delins        ? x_delins(it, context)
01784           : it->value.id() == SGrammar::eID_deletion      ? x_deletion(it, context)
01785           : it->value.id() == SGrammar::eID_insertion     ? x_insertion(it, context, true)
01786           : it->value.id() == SGrammar::eID_duplication   ? x_duplication(it, context)
01787           : it->value.id() == SGrammar::eID_nuc_subst     ? x_nuc_subst(it, context)
01788           : it->value.id() == SGrammar::eID_nuc_inv       ? x_nuc_inv(it, context)
01789           : it->value.id() == SGrammar::eID_ssr           ? x_ssr(it, context)
01790           : it->value.id() == SGrammar::eID_conversion    ? x_conversion(it, context)
01791           : it->value.id() == SGrammar::eID_prot_ext      ? x_prot_ext(it, context)
01792           : it->value.id() == SGrammar::eID_prot_fs       ? x_prot_fs(it, context)
01793           : it->value.id() == SGrammar::eID_prot_missense ? x_prot_missense(it, context)
01794           : it->value.id() == SGrammar::eID_translocation ? x_translocation(it, context)
01795           : CRef<CVariation>(NULL);
01796 
01797         if(vr.IsNull()) {
01798             HGVS_ASSERT_RULE(it, eID_NONE);
01799         }
01800     }
01801 
01802     return vr;
01803 }
01804 
01805 CRef<CVariation> CHgvsParser::x_expr1(TIterator const& i, const CContext& context)
01806 {
01807     HGVS_ASSERT_RULE(i, eID_expr1);
01808     TIterator it = i->children.begin();
01809     CRef<CVariation> vr;
01810 
01811     string s(it->value.begin(), it->value.end());
01812     if(it->value.id() == i->value.id() && s == "(") {
01813         ++it;
01814         vr = x_expr1(it, context);
01815         SetComputational(*vr);
01816     } else if(it->value.id() == SGrammar::eID_list1a) {
01817         vr = x_list(it, context);
01818     } else if(it->value.id() == SGrammar::eID_header) {
01819         CContext local_ctx = x_header(it, context);
01820         ++it;
01821         vr = x_expr2(it, local_ctx);
01822     } else if(it->value.id() == SGrammar::eID_translocation) {
01823         vr = x_translocation(it, context);
01824     } else {
01825         HGVS_ASSERT_RULE(it, eID_NONE);
01826     }
01827 
01828     return vr;
01829 }
01830 
01831 CRef<CVariation> CHgvsParser::x_expr2(TIterator const& i, const CContext& context)
01832 {
01833     HGVS_ASSERT_RULE(i, eID_expr2);
01834     TIterator it = i->children.begin();
01835     CRef<CVariation> vr;
01836 
01837     string s(it->value.begin(), it->value.end());
01838     if(it->value.id() == i->value.id() && s == "(") {
01839         ++it;
01840         vr = x_expr2(it, context);
01841         SetComputational(*vr);
01842     } else if(it->value.id() == SGrammar::eID_list2a) {
01843         vr = x_list(it, context);
01844     } else if(it->value.id() == SGrammar::eID_location) {
01845         CContext local_context(context);
01846         CRef<CVariantPlacement> placement = x_location(it, local_context);
01847         local_context.SetPlacement().Assign(*placement);
01848         ++it;
01849         vr = x_expr3(it, local_context);
01850     } else if(it->value.id() == SGrammar::eID_prot_ext) {
01851         vr = x_prot_ext(it, context);
01852     } else if(it->value.id() == SGrammar::eID_no_change) {
01853         vr = x_no_change(it, context);
01854     } else if(it->value.id() == i->value.id()) {
01855         vr.Reset(new CVariation);
01856         SetFirstPlacement(*vr).Assign(context.GetPlacement());
01857 
01858         if(s == "?") {
01859             vr->SetData().SetUnknown();
01860             //SetFirstPlacement(*vr).SetLoc().SetEmpty().Assign(context.GetId());
01861         } else if(s == "0?" || s == "0") {
01862             //loss of product: represent as deletion of the whole product sequence.
01863             SetFirstPlacement(*vr).SetLoc().SetWhole().Assign(context.GetId());
01864             CVariation_inst& var_inst = vr->SetData().SetInstance();
01865             var_inst.SetType(CVariation_inst::eType_del);
01866             CRef<CDelta_item> di(new CDelta_item);
01867             di->SetAction(CDelta_item::eAction_del_at);
01868             di->SetSeq().SetThis();
01869             var_inst.SetDelta().push_back(di);
01870 
01871             if(s == "0?") {
01872                 SetComputational(*vr);
01873             }
01874         } else {
01875             HGVS_THROW(eGrammatic, "Unexpected expr terminal: " + s);
01876         }
01877     } else {
01878         HGVS_ASSERT_RULE(it, eID_NONE);
01879     }
01880 
01881     return vr;
01882 }
01883 
01884 
01885 CRef<CVariation> CHgvsParser::x_expr3(TIterator const& i, const CContext& context)
01886 {
01887     HGVS_ASSERT_RULE(i, eID_expr3);
01888     TIterator it = i->children.begin();
01889     CRef<CVariation> vr;
01890 
01891     string s(it->value.begin(), it->value.end());
01892     if(it->value.id() == i->value.id() && s == "(") {
01893         ++it;
01894         vr = x_expr3(it, context);
01895         SetComputational(*vr);
01896     } else if(it->value.id() == SGrammar::eID_list3a) {
01897         vr = x_list(it, context);
01898     } else if(it->value.id() == SGrammar::eID_mut_inst) {
01899         vr.Reset(new CVariation);
01900         vr->SetData().SetSet().SetType(CVariation::TData::TSet::eData_set_type_compound);
01901         for(; it != i->children.end(); ++it) {
01902             CRef<CVariation> inst_ref = x_mut_inst(it, context);
01903 
01904             if(inst_ref->GetData().IsNote()
01905               && inst_ref->GetData().GetNote() == "Frameshift"
01906               && vr->SetData().SetSet().SetVariations().size() > 0)
01907             {
01908                 //if inst_ref is a frameshift subexpression, we need to attach it as attribute of the
01909                 //previous variation in a compound inst-list, since frameshift is not a subtype of
01910                 //Variation.data, and thus not represented as a separate subvariation.
01911 
01912                 vr->SetData().SetSet().SetVariations().back()->SetFrameshift().Assign(inst_ref->GetFrameshift());
01913             } else {
01914                 vr->SetData().SetSet().SetVariations().push_back(inst_ref);
01915             }
01916         }
01917         vr = x_unwrap_iff_singleton(*vr);
01918     } else {
01919         HGVS_ASSERT_RULE(it, eID_NONE);
01920     }
01921 
01922     return vr;
01923 }
01924 
01925 
01926 CVariation::TData::TSet::EData_set_type CHgvsParser::x_list_delimiter(TIterator const& i, const CContext& context)
01927 {
01928     HGVS_ASSERT_RULE(i, eID_list_delimiter);
01929     TIterator it = i->children.begin();
01930     string s(it->value.begin(), it->value.end());
01931 
01932     return s == "//" ? CVariation::TData::TSet::eData_set_type_chimeric
01933          : s == "/"  ? CVariation::TData::TSet::eData_set_type_mosaic
01934          : s == "+"  ? CVariation::TData::TSet::eData_set_type_genotype
01935          : s == ","  ? CVariation::TData::TSet::eData_set_type_products
01936          : s == ";"  ? CVariation::TData::TSet::eData_set_type_haplotype //note that within context of list#a, ";" delimits genotype
01937          : s == "(+)" || s == "(;)" ? CVariation::TData::TSet::eData_set_type_individual
01938          : CVariation::TData::TSet::eData_set_type_unknown;
01939 }
01940 
01941 
01942 CRef<CVariation> CHgvsParser::x_list(TIterator const& i, const CContext& context)
01943 {
01944     if(!SGrammar::s_is_list(i->value.id())) {
01945         HGVS_ASSERT_RULE(i, eID_NONE);
01946     }
01947 
01948     CRef<CVariation> vr(new CVariation);
01949     TVariationSet& varset = vr->SetData().SetSet();
01950     varset.SetType(CVariation::TData::TSet::eData_set_type_unknown);
01951 
01952 
01953     for(TIterator it = i->children.begin(); it != i->children.end(); ++it) {
01954         //will process two elements from the children list: delimiter and following expression.
01955         //The first one does not have the delimiter. The delimiter determines the set-type.
01956         if(it != i->children.begin()) {
01957             if(SGrammar::s_is_list_a(i->value.id())) {
01958                 /*
01959                  * list#a is delimited by either ";"(HGVS-2.0) or "+"(HGVS_1.0);
01960                  * Both represent alleles within genotype.
01961                  * Note: the delimiter rule in the context is chset_p<>("+;"), i.e.
01962                  * a terminal, not a rule like list_delimiter; so calling
01963                  * x_list_delimiter(...) parser here would throw
01964                  */
01965                 varset.SetType(CVariation::TData::TSet::eData_set_type_genotype);
01966             } else {
01967                 CVariation::TData::TSet::EData_set_type set_type = x_list_delimiter(it, context);
01968                 if(varset.GetType() == CVariation::TData::TSet::eData_set_type_unknown) {
01969                     varset.SetType(set_type);
01970                 } else if(set_type != varset.GetType()) {
01971                     HGVS_THROW(eSemantic, "Non-unique delimiters within a list");
01972                 }
01973             }
01974             ++it;
01975         } 
01976 
01977         CRef<CVariation> vr;
01978         if(it->value.id() == SGrammar::eID_expr1) {
01979             vr = x_expr1(it, context);
01980         } else if(it->value.id() == SGrammar::eID_expr2) {
01981             vr = x_expr2(it, context);
01982         } else if(it->value.id() == SGrammar::eID_expr3) {
01983             vr = x_expr3(it, context);
01984         } else if(SGrammar::s_is_list(it->value.id())) {
01985             vr = x_list(it, context);
01986         } else {
01987             HGVS_ASSERT_RULE(it, eID_NONE);
01988         }
01989 
01990         varset.SetVariations().push_back(vr);
01991     }
01992 
01993     vr = x_unwrap_iff_singleton(*vr);
01994     return vr;
01995 }
01996 
01997 
01998 CRef<CVariation> CHgvsParser::x_root(TIterator const& i, const CContext& context)
01999 {
02000     HGVS_ASSERT_RULE(i, eID_root);
02001 
02002     CRef<CVariation> vr = x_list(i, context);
02003 
02004     RepackageAssertedSequence(*vr);
02005     AdjustMoltype(*vr, context.GetScope());
02006     CVariationUtil::s_FactorOutPlacements(*vr);
02007     
02008     vr->Index();
02009     return vr;
02010 }
02011 
02012 CRef<CVariation>  CHgvsParser::x_unwrap_iff_singleton(CVariation& v)
02013 {
02014     if(v.GetData().IsSet() && v.GetData().GetSet().GetVariations().size() == 1) {
02015         CRef<CVariation> first = v.SetData().SetSet().SetVariations().front();
02016         if(!first->IsSetPlacements() && v.IsSetPlacements()) {
02017             first->SetPlacements() = v.SetPlacements();
02018         }
02019         return first;
02020     } else {
02021         return CRef<CVariation>(&v);
02022     }
02023 }
02024 
02025 
02026 void CHgvsParser::sx_AppendMoltypeExceptions(CVariation& v, CScope& scope)
02027 {
02028     CVariationUtil util(scope);
02029     for(CTypeIterator<CVariantPlacement> it(Begin(v)); it; ++it) {
02030         CVariantPlacement& p = *it;
02031         CVariantPlacement::TMol mol = util.GetMolType(sequence::GetId(p.GetLoc(), NULL));
02032         if(p.GetMol() != mol) {
02033             CRef<CVariantPlacement> p2(new CVariantPlacement);
02034             p2->Assign(p);
02035             p2->SetMol(mol);
02036 
02037             string asserted_header = CHgvsParser::s_SeqIdToHgvsStr(p, &scope);
02038             string expected_header = CHgvsParser::s_SeqIdToHgvsStr(*p2, &scope);
02039 
02040             CRef<CVariationException> ex(new CVariationException);
02041             ex->SetCode(CVariationException::eCode_inconsistent_asserted_moltype);
02042             ex->SetMessage("Inconsistent mol-type. asserted:'" + asserted_header + "'; expected:'" + expected_header + "'");
02043             p.SetExceptions().push_back(ex);
02044         }
02045     }
02046 }
02047 
02048 CRef<CVariation> CHgvsParser::AsVariation(const string& hgvs, TOpFlags flags)
02049 {
02050     string hgvs2 = NStr::TruncateSpaces(hgvs); 
02051     tree_parse_info<> info = pt_parse(hgvs2.c_str(), *s_grammar, +space_p);
02052 
02053     if(!info.full) {
02054 #if 0
02055         CNcbiOstrstream ostr;
02056         tree_to_xml(ostr, info.trees, hgvs2.c_str() , CHgvsParser::SGrammar::s_GetRuleNames());
02057         string tree_str = CNcbiOstrstreamToString(ostr);
02058 #endif
02059         HGVS_THROW(eGrammatic, "Syntax error at pos " + NStr::SizetToString(info.length + 1) + " in " + hgvs2 + "");
02060     }
02061 
02062     CContext context(m_scope, m_seq_id_resolvers, hgvs);
02063     CRef<CVariation> vr = x_root(info.trees.begin(), context);
02064     vr->SetName(hgvs2);
02065     sx_AppendMoltypeExceptions(*vr, context.GetScope());
02066 
02067     CVariationUtil util(context.GetScope());
02068     util.CheckAmbiguitiesInLiterals(*vr);
02069 
02070     return vr;
02071 }
02072 
02073 
02074 
02075 void CHgvsParser::AttachHgvs(CVariation& v)
02076 {
02077     v.Index();
02078 
02079     //compute and attach placement-specific HGVS expressions
02080     for(CTypeIterator<CVariation> it(Begin(v)); it; ++it) {
02081         CVariation& v2 = *it;
02082         if(!v2.IsSetPlacements()) {
02083             continue;
02084         }
02085         NON_CONST_ITERATE(CVariation::TPlacements, it2, v2.SetPlacements()) {
02086             CVariantPlacement& p2 = **it2;
02087 
02088             if(!p2.GetLoc().GetId()) {
02089                 continue;
02090             }
02091 
02092             if(p2.GetMol() != CVariantPlacement::eMol_protein && v2.GetConsequenceParent()) {
02093                 //if this variation is in consequnece, only compute HGVS for protein variations
02094                 //(as otherwise it will throw - can't have HGVS expression for protein with nuc placement)
02095                 continue;
02096             }
02097 
02098             //compute hgvs-expression specific to the placement and the variation to which it is attached
02099             try {
02100                 string hgvs_expression = AsHgvsExpression(v2, CConstRef<CSeq_id>(p2.GetLoc().GetId()));
02101                 p2.SetHgvs_name(hgvs_expression);
02102             } catch (CException& e ) {
02103                 CNcbiOstrstream ostr;
02104                 ostr << MSerial_AsnText << p2;
02105                 string s = CNcbiOstrstreamToString(ostr);
02106                 NCBI_REPORT_EXCEPTION("Can't compute HGVS expression for " + s, e);
02107             }
02108         }
02109     }
02110 
02111     //If the root variation does not have placements (e.g. a container for placement-specific subvariations)
02112     //then compute the hgvs expression for the root placement and attach it to variation itself as a synonym.
02113     if(!v.IsSetPlacements()) {
02114         string root_output_hgvs = AsHgvsExpression(v);
02115         v.SetSynonyms().push_back(root_output_hgvs);
02116     }
02117 }
02118 
02119 
02120 
02121 };
02122 
02123 END_NCBI_SCOPE
02124 
Modified on Wed Mar 04 13:49:47 2015 by modify_doxy.py rev. 426318