NCBI C++ ToolKit
gnomon_objmgr.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gnomon_objmgr.cpp 68791 2015-09-02 14:23:40Z souvorov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio
27  *
28  * File Description: gnomon library parts requiring object manager support
29  * to allow apps not using these to be linked w/o object manager libs
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <algo/gnomon/gnomon.hpp>
35 #include <algo/gnomon/chainer.hpp>
38 #include "hmm.hpp"
39 #include "hmm_inlines.hpp"
40 
48 #include <objmgr/util/sequence.hpp>
52 
53 #include <stdio.h>
54 
55 #include <functional>
56 
58 BEGIN_SCOPE(gnomon)
60 
61 void debug()
62 {
63 }
64 
65 Int8 GetModelId(const CSeq_align& seq_align)
66 {
67  Int8 id = 0;
68  if(seq_align.CanGetId())
69  seq_align.GetId().back()->GetIdType(id); // 0 if no integer id
70 
71  return id;
72  // return CIdHandler::GetId(*seq_align.GetId().back());
73 }
74 
75 CAlignModel::CAlignModel(const CSeq_align& seq_align) :
76  CGeneModel(seq_align.GetSegs().GetSpliced().GetGenomic_strand()==eNa_strand_minus?eMinus:ePlus, GetModelId(seq_align), emRNA)
77 {
78 #ifdef _DEBUG
79  debug();
80 #endif
81 
82  const CSpliced_seg& sps = seq_align.GetSegs().GetSpliced();
83 
84  bool is_protein = false;
86  SetType(eProt);
87  is_protein = true;
88  }
89 
90  bool is_product_reversed = false;
91  if(sps.CanGetProduct_strand() && sps.GetProduct_strand()==eNa_strand_minus) {
92  Status() |= CGeneModel::eReversed;
93  is_product_reversed = true;
94  }
95 
96  SetTargetId(sps.GetProduct_id());
97 
98  int product_len = sps.CanGetProduct_length()?sps.GetProduct_length():0;
99  if (is_protein)
100  product_len *=3;
101  int prod_prev = -1;
102  bool prev_3_prime_splice = false;
103 
104  int target_len = product_len;
105 
106  vector<TSignedSeqRange> transcript_exons;
107  TInDels indels;
108 
109  if(!sps.CanGetGenomic_id())
110  NCBI_THROW(CGnomonException, eGenericError, "CSpliced_seg must have genomic id");
111 
112  const CSeq_id& genomic_id = sps.GetGenomic_id();
113 
114  string mismatches;
115  string mismstatus;
116 
117  if(seq_align.CanGetExt()) {
118  int count = 0;
119  ITERATE(CSeq_align::TExt, i, seq_align.GetExt()) {
120  if((*i)->IsSetType() && (*i)->GetType().IsStr()) {
121  string type = (*i)->GetType().GetStr();
122  if(type == "RNASeq-Counts") {
123  ITERATE(CUser_object::TData, j, (*i)->GetData()) {
124  if(*j && (*j)->CanGetLabel() && (*j)->GetLabel().IsStr()) {
125  string label = (*j)->GetLabel().GetStr();
126  if(NStr::EndsWith(label, "alignments") || label.find(' ') == string::npos)
127  count += (*j)->GetData().GetInt();
128  }
129  }
130  } else if(type == "MismatchedBases") {
131  mismatches = (*i)->GetData().front()->GetData().GetStr();
132  } else if(type == "MismatchedBasesStatus") {
133  mismstatus = (*i)->GetData().front()->GetData().GetStr();
134  }
135  }
136  }
137  if(count > 0)
138  SetWeight(count);
139  if(Strand() == eMinus) {
140  reverse(mismatches.begin(),mismatches.end());
141  reverse(mismstatus.begin(),mismstatus.end());
142  }
143  }
144 
145 #ifdef _DEBUG
146  bool ggap_model= false;
147 #endif
148 
149  ITERATE(CSpliced_seg::TExons, e_it, sps.GetExons()) {
150  const CSpliced_exon& exon = **e_it;
151  int prod_cur_start = exon.GetProduct_start().AsSeqPos();
152  int prod_cur_end = exon.GetProduct_end().AsSeqPos();
153  if (is_product_reversed) {
154  int tmp = prod_cur_start;
155  prod_cur_start = product_len - prod_cur_end -1;
156  prod_cur_end = product_len - tmp -1;
157  }
158  int nuc_cur_start = exon.GetGenomic_start();
159  int nuc_cur_end = exon.GetGenomic_end();
160 
161  // bool cur_5_prime_splice = exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases() && exon.GetAcceptor_before_exon().GetBases().size()==2;
162  bool cur_5_prime_splice = exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases();
163 
164  if (prod_prev+1 != prod_cur_start || !prev_3_prime_splice || !cur_5_prime_splice) {
165  AddHole();
166  if(!mismatches.empty()) // also will take care about first exon
167  mismatches = mismatches.substr(prod_cur_start - prod_prev -1);
168  }
169 
170  // prev_3_prime_splice = exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases() && exon.GetDonor_after_exon().GetBases().size()==2;
171  prev_3_prime_splice = exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases();
172 
173  string fs, ss;
175  fs = exon.GetAcceptor_before_exon().GetBases();
176  }
178  ss = exon.GetDonor_after_exon().GetBases();
179  }
180  if(Strand() == eMinus) {
181  swap(fs,ss);
182  }
183 
184  double eident = 0;
185  if(exon.CanGetScores() && exon.GetScores().CanGet()) {
186  CScore_set::Tdata scores = exon.GetScores().Get();
187  ITERATE(CScore_set::Tdata, it, scores) {
188  if((*it)->CanGetId() && (*it)->GetId().IsStr()) {
189  if((*it)->GetId().GetStr() == "idty") {
190  eident = (*it)->GetValue().GetReal();
191  break;
192  }
193  }
194  }
195  }
196 
197  if(!exon.CanGetGenomic_id() || !(exon.GetGenomic_id() < genomic_id || genomic_id < exon.GetGenomic_id())) { // normal exon
198  AddNormalExon(TSignedSeqRange(nuc_cur_start,nuc_cur_end), fs, ss, eident, Strand() == eMinus);
199  } else { // genomic gap
200  if(!exon.CanGetGenomic_strand())
201  NCBI_THROW(CGnomonException, eGenericError, "CSpliced_exon for gap filling must have genomic strand");
204  scope.AddDefaults();
205 
206  string transcript = GetDNASequence(fill_id,scope);
207  string fill_seq = transcript.substr(nuc_cur_start, nuc_cur_end-nuc_cur_start+1);
208 
209  CInDelInfo::SSource fill_src;
210  fill_src.m_acc = CIdHandler::ToString(*fill_id);
211  fill_src.m_strand = ePlus;
212  fill_src.m_range = TSignedSeqRange(nuc_cur_start, nuc_cur_end);
213 
214  if(exon.GetGenomic_strand() == eNa_strand_minus) {
215  fill_src.m_strand = eMinus;
216  ReverseComplement(fill_seq.begin(),fill_seq.end());
217  }
218 
219  AddGgapExon(eident, fill_seq, fill_src, Strand() == eMinus);
220 #ifdef _DEBUG
221  ggap_model = true;
222 #endif
223  }
224  transcript_exons.push_back(TSignedSeqRange(exon.GetProduct_start().AsSeqPos(), exon.GetProduct_end().AsSeqPos()));
225 
226  _ASSERT(transcript_exons.back().NotEmpty());
227 
228  int pos = 0;
229  int prod_pos = prod_cur_start;
230 
231  ITERATE(CSpliced_exon::TParts, p_it, exon.GetParts()) {
232  const CSpliced_exon_chunk& chunk = **p_it;
234  if(!mismstatus.empty() && (chunk.IsProduct_ins() || chunk.IsGenomic_ins() || chunk.IsMismatch())) {
235  if(mismstatus[0] == 'n')
236  indelstatus = CInDelInfo::eGenomeNotCorrect;
237  else if(mismstatus[0] == 'c')
238  indelstatus = CInDelInfo::eGenomeCorrect;
239  mismstatus = mismstatus.substr(1);
240  }
241  if (chunk.IsProduct_ins()) {
242  string v = kEmptyStr;
243  int product_ins = chunk.GetProduct_ins();
244  if(!mismatches.empty()) {
245  v = mismatches.substr(0,product_ins);
246  mismatches = mismatches.substr(product_ins);
247  }
248  if(Strand() == eMinus)
249  reverse(v.begin(),v.end());
250  CInDelInfo fs(Strand()==ePlus?nuc_cur_start+pos:nuc_cur_end-pos+1, product_ins, CInDelInfo::eDel, v);
251  if (Strand() == ePlus)
252  indels.push_back(fs);
253  else
254  indels.insert(indels.begin(), fs);
255  prod_pos += chunk.GetProduct_ins();
256  } else if (chunk.IsGenomic_ins()) {
257  const int genomic_ins = chunk.GetGenomic_ins();
258  if (Strand()==ePlus)
259  indels.push_back(CInDelInfo(nuc_cur_start+pos, genomic_ins, CInDelInfo::eIns));
260  else
261  indels.insert(indels.begin(), CInDelInfo(nuc_cur_end-pos-genomic_ins+1, genomic_ins, CInDelInfo::eIns));
262  pos += genomic_ins;
263  } else if (chunk.IsMatch()) {
264  pos += chunk.GetMatch();
265  prod_pos += chunk.GetMatch();
266  } else if (chunk.IsMismatch()) {
267  int mismatch_len = chunk.GetMismatch();
268  string v(mismatch_len,'N');
269  if(!mismatches.empty()) {
270  _ASSERT(mismatch_len <= (int)mismatches.length());
271  v = mismatches.substr(0,mismatch_len);
272  mismatches = mismatches.substr(mismatch_len);
273  }
274  if(Strand() == ePlus) {
275  indels.push_back(CInDelInfo(nuc_cur_start+pos, mismatch_len, CInDelInfo::eMism, v));
276  } else {
277  reverse(v.begin(),v.end());
278  indels.insert(indels.begin(), CInDelInfo(nuc_cur_end-pos-mismatch_len+1, mismatch_len, CInDelInfo::eMism, v));
279  }
280  pos += mismatch_len;
281  prod_pos += mismatch_len;
282  } else { // if (chunk.IsDiag())
283  pos += chunk.GetDiag();
284  prod_pos += chunk.GetDiag();
285  }
286  if(indelstatus != CInDelInfo::eUnknown) {
287  if(Strand() == ePlus)
288  indels.back().SetStatus(indelstatus);
289  else
290  indels.front().SetStatus(indelstatus);
291  }
292  }
293 
294  prod_prev = prod_cur_end;
295  }
296 
297  _ASSERT(mismatches.empty() || (product_len - prod_prev - 1 == (int)mismatches.length()));
298 
299  sort(transcript_exons.begin(),transcript_exons.end());
300  bool minusstrand = Strand() == eMinus;
301  EStrand orientation = (is_product_reversed == minusstrand) ? ePlus : eMinus;
302  if(orientation == eMinus)
303  reverse(transcript_exons.begin(),transcript_exons.end());
304 
305  if (is_protein) {
306  _ASSERT(orientation == Strand());
307  if (sps.CanGetModifiers()) {
309  if ((*m)->IsStop_codon_found()) {
310  target_len += 3;
311  if (Strand() == ePlus) {
312  ExtendRight( 3 );
313  _ASSERT((transcript_exons.back().GetTo()+1)%3 == 0);
314  transcript_exons.back().SetTo(transcript_exons.back().GetTo()+3);
315  } else {
316  ExtendLeft( 3 );
317  _ASSERT((transcript_exons.front().GetTo()+1)%3 == 0);
318  transcript_exons.front().SetTo(transcript_exons.front().GetTo()+3);
319  }
320  }
321  }
322  }
323  }
324 
325  //convert tandem indels into indel+mism
326  bool keepdoing = true;
327  while(keepdoing) {
328  keepdoing = false;
329  NON_CONST_ITERATE(TInDels, indl, indels) {
330  TInDels::iterator indl_next = indl;
331  if(++indl_next == indels.end())
332  break;
333 
334  if(indl->InDelEnd() == indl_next->Loc()) { // tandem
335  string new_seq = indl->GetInDelV()+indl_next->GetInDelV();
336  int new_seq_len = new_seq.size();
337  if(indl->GetType() == indl_next->GetType()) { // combine same
338  *indl = CInDelInfo(indl->Loc(), indl->Len()+indl_next->Len(), indl->GetType(), new_seq);
339  indels.erase(indl_next);
340  keepdoing = true;
341  } else if(!indl->IsMismatch() && !indl_next->IsMismatch()) { // tandem indels
342  if(indl->Len() == indl_next->Len()) {
343  *indl = CInDelInfo(indl->Loc(), indl->Len(), CInDelInfo::eMism, new_seq);
344  indels.erase(indl_next);
345  } else if(indl->Len() < indl_next->Len()) {
346  *indl = CInDelInfo(indl->Loc(), indl->Len(), CInDelInfo::eMism, new_seq.substr(0,indl->Len()));
347  *indl_next = CInDelInfo(indl->InDelEnd(), indl_next->Len()-indl->Len(), indl_next->GetType(), new_seq.substr(indl->Len()));
348  } else { // indl_next->Len() < indl->Len()
349  *indl = CInDelInfo(indl->Loc(), indl->Len()-indl_next->Len(), indl->GetType(), new_seq.substr(0,new_seq_len-indl_next->Len()));
350  *indl_next = CInDelInfo(indl->InDelEnd(), indl_next->Len(), CInDelInfo::eMism, new_seq.substr(new_seq_len-indl_next->Len()));
351  }
352  keepdoing = true;
353  }
354  }
355  }
356  }
357 
358  m_alignmap = CAlignMap(Exons(), transcript_exons, indels, orientation, target_len );
359  FrameShifts() = indels;
360 
361  TSignedSeqRange newlimits = m_alignmap.ShrinkToRealPoints(Limits(),is_protein);
362  if(newlimits != Limits()) {
363  Clip(newlimits,CAlignModel::eRemoveExons);
364  }
365 
366  for (CGeneModel::TExons::const_iterator piece_begin = Exons().begin(); piece_begin != Exons().end(); ++piece_begin) {
367  _ASSERT( !piece_begin->m_fsplice );
368 
369  if(piece_begin->Limits().Empty()) { // ggap
370  _ASSERT(piece_begin->m_ssplice);
371  ++piece_begin;
372  _ASSERT(piece_begin->Limits().NotEmpty());
373  }
374 
375  CGeneModel::TExons::const_iterator piece_end;
376  for (piece_end = piece_begin; piece_end != Exons().end() && piece_end->m_ssplice; ++piece_end) ;
377  _ASSERT( piece_end != Exons().end() );
378 
379  CGeneModel::TExons::const_iterator piece_end_g = piece_end;
380  if(piece_end_g->Limits().Empty()) { // ggap
381  _ASSERT(piece_end_g->m_fsplice);
382  --piece_end_g;
383  _ASSERT(piece_end_g->Limits().NotEmpty());
384  }
385 
386  TSignedSeqRange piece_range(piece_begin->GetFrom(),piece_end_g->GetTo());
387 
388  piece_range = m_alignmap.ShrinkToRealPoints(piece_range, is_protein); // finds first projectable interval (on codon boundaries for proteins)
389 
390  /*
391  TSignedSeqRange pr;
392  while(pr != piece_range) {
393  pr = piece_range;
394  ITERATE(TInDels, i, indels) { // here we check that no indels touch our interval from outside
395  if((i->IsDeletion() && i->Loc() == pr.GetFrom()) || ((i->IsInsertion() || i->IsMismatch()) && i->Loc()+i->Len() == pr.GetFrom()))
396  pr.SetFrom(pr.GetFrom()+1);
397  else if(i->Loc() == pr.GetTo()+1)
398  pr.SetTo(pr.GetTo()-1);
399  }
400  if(pr != piece_range)
401  piece_range = m_alignmap.ShrinkToRealPoints(pr, is_protein);
402  }
403  */
404 
405  _ASSERT(piece_range.NotEmpty());
406  _ASSERT(piece_range.IntersectingWith(piece_begin->Limits()) && piece_range.IntersectingWith(piece_end_g->Limits()));
407 
408  if(piece_range.GetFrom() != piece_begin->GetFrom() || piece_range.GetTo() != piece_end_g->GetTo()) {
409  _ASSERT(!ggap_model);
410  Clip(piece_range, CGeneModel::eDontRemoveExons);
411  }
412 
413  piece_begin = piece_end;
414  }
415 
416  if (is_protein) {
417  TSignedSeqRange reading_frame = m_alignmap.MapRangeOrigToEdited(Limits(), true);
418  TSignedSeqRange start, stop;
419  if (sps.CanGetModifiers()) {
421  if ((*m)->IsStart_codon_found()) {
422  start = TSignedSeqRange(reading_frame.GetFrom(),reading_frame.GetFrom()+2);
423  reading_frame.SetFrom(start.GetTo()+1);
424  } else if ((*m)->IsStop_codon_found()) {
425  stop = TSignedSeqRange(reading_frame.GetTo()-2,reading_frame.GetTo());
426  reading_frame.SetTo(stop.GetFrom()-1);
427  }
428  }
429  }
430 
431  CCDSInfo cds_info_t;
432  cds_info_t.SetReadingFrame(reading_frame, true);
433  if (start.NotEmpty()) {
434  cds_info_t.SetStart(start, false);
435  }
436  if (stop.NotEmpty()) {
437  cds_info_t.SetStop(stop, false);
438  }
439  CCDSInfo cds_info_g = cds_info_t.MapFromEditedToOrig(m_alignmap);
440  if(cds_info_g.ReadingFrame().NotEmpty()) // successful projection
441  SetCdsInfo(cds_info_g);
442  else
443  SetCdsInfo(cds_info_t);
444  }
445 
446  if (sps.IsSetPoly_a()) {
447  Status() |= CGeneModel::ePolyA;
448  }
449 
450  if(seq_align.CanGetScore()) {
451  const CSeq_align::TScore& score = seq_align.GetScore();
452  ITERATE(CSeq_align::TScore, it, score) {
453  if((*it)->CanGetId() && (*it)->GetId().IsStr()) {
454  string scr = (*it)->GetId().GetStr();
455  if((scr == "N of matches") || (scr == "num_ident") || (scr == "matches")) {
456  double ident = (*it)->GetValue().GetInt();
457  ident /= seq_align.GetAlignLength();
458  SetIdent(ident);
459  } else if(scr == "rank" && (*it)->GetValue().GetInt() == 1) {
460  Status() |= CGeneModel::eBestPlacement;
461  } else if(scr == "ambiguous_orientation") {
463  } else if(scr == "count") {
464  _ASSERT(Weight() == 1 || Weight() == (*it)->GetValue().GetInt());
465  SetWeight((*it)->GetValue().GetInt());
466  }
467  }
468  }
469  }
470 }
471 
472 
473 
474 string CGeneModel::GetCdsDnaSequence (const CResidueVec& contig_sequence) const
475 {
476  if(ReadingFrame().Empty())
477  return kEmptyStr;
478 
480  CCDSInfo cds_info = GetCdsInfo();
481  if(cds_info.IsMappedToGenome())
482  cds_info = cds_info.MapFromOrigToEdited(amap);
483  int cds_len = cds_info.Cds().GetLength();
484  int cds_start = cds_info.Cds().GetFrom()-TranscriptLimits().GetFrom();
485 
486  CResidueVec mrna;
487  amap.EditedSequence(contig_sequence,mrna);
488 
489  string cds_seq(cds_len,'A');
490  copy(mrna.begin()+cds_start, mrna.begin()+cds_start+cds_len, cds_seq.begin());
491 
492  if(Status() & eReversed)
493  ReverseComplement(cds_seq.begin(),cds_seq.end());
494 
495  return cds_seq;
496 }
497 
498 string CGeneModel::GetProtein (const CResidueVec& contig_sequence) const
499 {
500  string cds_seq = GetCdsDnaSequence(contig_sequence);
501  string prot_seq;
502 
503  objects::CSeqTranslator::Translate(cds_seq, prot_seq, objects::CSeqTranslator::fIs5PrimePartial);
504 
505  if(PStop()) {
506  CCDSInfo cds_info = GetCdsInfo();
507  if(cds_info.IsMappedToGenome()) {
508  CAlignMap amap = GetAlignMap();
509  cds_info = cds_info.MapFromOrigToEdited(amap);
510  }
511  ITERATE(CCDSInfo::TPStops, stp, cds_info.PStops()) {
512  if(stp->m_status == CCDSInfo::eSelenocysteine)
513  prot_seq[(stp->GetFrom()- cds_info.Cds().GetFrom())/3] = 'U';
514  }
515  }
516 
517  return prot_seq;
518 }
519 
520 string CGeneModel::GetProtein (const CResidueVec& contig_sequence, const CGenetic_code* gencode) const
521 {
522  string cds_seq = GetCdsDnaSequence(contig_sequence);
523  string prot_seq;
524 
525  objects::CSeqTranslator::Translate(cds_seq, prot_seq, objects::CSeqTranslator::fDefault, gencode );
526  if (prot_seq[0] == '-') {
527  string first_triplet = cds_seq.substr(0, 3);
528  string first_aa;
529  objects::CSeqTranslator::Translate(first_triplet, first_aa, objects::CSeqTranslator::fIs5PrimePartial, gencode );
530  prot_seq = first_aa+prot_seq.substr(1);
531  }
532 
533  if(PStop()) {
534  CCDSInfo cds_info = GetCdsInfo();
535  if(cds_info.IsMappedToGenome()) {
536  CAlignMap amap = GetAlignMap();
537  cds_info = cds_info.MapFromOrigToEdited(amap);
538  }
539  ITERATE(CCDSInfo::TPStops, stp, cds_info.PStops()) {
540  if(stp->m_status == CCDSInfo::eSelenocysteine)
541  prot_seq[(stp->GetFrom()- cds_info.Cds().GetFrom())/3] = 'U';
542  }
543  }
544 
545  return prot_seq;
546 }
547 
548 //
549 // helper function - convert a vector<TSignedSeqRange> into a compact CSeq_loc
550 //
551 namespace {
552 
553 CRef<CSeq_loc> s_ExonDataToLoc(const vector<TSignedSeqRange>& vec,
554  ENa_strand strand, const CSeq_id& id)
555 {
556  CRef<CSeq_loc> loc(new CSeq_loc());
557 
559  ITERATE (vector<TSignedSeqRange>, iter, vec) {
561  ival->SetFrom(iter->GetFrom());
562  ival->SetTo(iter->GetTo());
563  ival->SetStrand(strand);
564  ival->SetId().Assign(id);
565 
566  data.push_back(ival);
567  }
568 
569  if (data.size() == 1) {
570  loc->SetInt(*data.front());
571  } else {
572  loc->SetPacked_int().Set().swap(data);
573  }
574 
575  return loc;
576 }
577 
578 } //end unnamed namespace
579 
581 {
582  TGeneModelList genes = GetGenes();
583 
585 
586  annot->SetNameDesc("Gnomon gene scan output");
587 
588  CSeq_annot::C_Data::TFtable& ftable = annot->SetData().SetFtable();
589 
590  unsigned int counter = 0;
591  string locus_tag_base("GNOMON_");
592  ITERATE (TGeneModelList, it, genes) {
593  const CGeneModel& igene = *it;
594  int strand = igene.Strand();
595  TSignedSeqRange cds_limits = igene.RealCdsLimits();
596 
597  vector<TSignedSeqRange> mrna_vec;
598  copy(igene.Exons().begin(), igene.Exons().end(), back_inserter(mrna_vec));
599  vector<TSignedSeqRange> cds_vec;
600 
601  for (size_t j = 0; j < mrna_vec.size(); ++j) {
602  TSignedSeqRange intersect(mrna_vec[j] & cds_limits);
603  if (!intersect.Empty()) {
604  cds_vec.push_back(intersect);
605  }
606  }
607 
608  // stop-codon removal from cds
609  if (igene.HasStop()) {
610  if (strand == ePlus) {
611  _ASSERT(cds_vec.back().GetLength()>=3);
612  cds_vec.back().SetTo(cds_vec.back().GetTo() - 3);
613  } else {
614  _ASSERT(cds_vec.front().GetLength()>=3);
615  cds_vec.front().SetFrom(cds_vec.front().GetFrom() + 3);
616  }
617  }
618 
619  //
620  // create our mRNA
621  CRef<CSeq_feat> feat_mrna;
622  if (mrna_vec.size()) {
623  feat_mrna = new CSeq_feat();
624  feat_mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
625  feat_mrna->SetLocation
626  (*s_ExonDataToLoc(mrna_vec,
627  (strand == ePlus ? eNa_strand_plus : eNa_strand_minus), id));
628  } else {
629  continue;
630  }
631 
632  //
633  // create the accompanying CDS
634  CRef<CSeq_feat> feat_cds;
635  if (!cds_vec.empty()) {
636  feat_cds = new CSeq_feat();
637  feat_cds->SetData().SetCdregion().SetFrame(CCdregion::TFrame(1+(strand == ePlus?(igene.ReadingFrame().GetFrom()-cds_limits.GetFrom())%3:(cds_limits.GetTo()-igene.ReadingFrame().GetTo())%3)));
638 
639  feat_cds->SetLocation
640  (*s_ExonDataToLoc(cds_vec,
641  (strand == ePlus ? eNa_strand_plus : eNa_strand_minus), id));
642  }
643 
644  //
645  // create a dummy gene feature as well
646  CRef<CSeq_feat> feat_gene(new CSeq_feat());
647 
648  char buf[32];
649  sprintf(buf, "%04u", ++counter);
650  string name(locus_tag_base);
651  name += buf;
652  feat_gene->SetData().SetGene().SetLocus_tag(name);
653  feat_gene->SetLocation().SetInt()
654  .SetFrom(feat_mrna->GetLocation().GetTotalRange().GetFrom());
655  feat_gene->SetLocation().SetInt()
656  .SetTo(feat_mrna->GetLocation().GetTotalRange().GetTo());
657  feat_gene->SetLocation().SetInt().SetStrand
658  (strand == ePlus ? eNa_strand_plus : eNa_strand_minus);
659 
660  feat_gene->SetLocation().SetId
661  (sequence::GetId(feat_mrna->GetLocation(), 0));
662 
663  ftable.push_back(feat_gene);
664  ftable.push_back(feat_mrna);
665  if (feat_cds) {
666  ftable.push_back(feat_cds);
667  }
668  }
669  return annot;
670 }
671 
672 
673 //
674 //
675 // This function deduces the frame from 5p coordinate of the CDS which should be on the 5p codon boundary
676 //
677 //
678 double CCodingPropensity::GetScore(CConstRef<CHMMParameters> hmm_params, const CSeq_loc& cds, CScope& scope, int *const gccontent, double *const startscore)
679 {
680  *gccontent = 0;
681  const CSeq_id* seq_id = cds.GetId();
682  if (seq_id == NULL)
683  NCBI_THROW(CGnomonException, eGenericError, "CCodingPropensity: CDS has multiple ids or no id at all");
684 
685  // Need to know GC content in order to load correct models.
686  // Compute on the whole transcript, not just CDS.
687  CBioseq_Handle xcript_hand = scope.GetBioseqHandle(*seq_id);
688  CSeqVector xcript_vec = xcript_hand.GetSeqVector();
689  xcript_vec.SetIupacCoding();
690  unsigned int gc_count = 0;
691  CSeqVector_CI xcript_iter(xcript_vec);
692  for( ; xcript_iter; ++xcript_iter) {
693  if (*xcript_iter == 'G' || *xcript_iter == 'C') {
694  ++gc_count;
695  }
696  }
697  *gccontent = static_cast<unsigned int>(100.0 * gc_count / xcript_vec.size() + 0.5);
698 
699  const CMC3_CodingRegion<5>& cdr = dynamic_cast<const CMC3_CodingRegion<5>&>(hmm_params->GetParameter(CMC3_CodingRegion<5>::class_id(), *gccontent));
700  const CMC_NonCodingRegion<5>& ncdr = dynamic_cast<const CMC_NonCodingRegion<5>&>(hmm_params->GetParameter(CMC_NonCodingRegion<5>::class_id(), *gccontent));
701 
702  // Represent coding sequence as enum Nucleotides
703  CSeqVector vec(cds, scope);
704  vec.SetIupacCoding();
705  CEResidueVec seq;
706  seq.reserve(vec.size());
707  CSeqVector_CI iter(vec);
708  for( ; iter; ++iter) {
709  seq.push_back(fromACGT(*iter));
710  }
711 
712  // Sum coding and non-coding scores across coding sequence.
713  // Don't include stop codon!
714  double score = 0;
715  for (unsigned int i = 5; i < seq.size() - 3; ++i)
716  score += cdr.Score(seq, i, i % 3) - ncdr.Score(seq, i);
717 
718  if(startscore) {
719  //
720  // start evalustion needs 17 bases total (11 before ATG and 3 after)
721  // if there is not enough sequence it will be substituted by Ns which will degrade the score
722  //
723 
724  const CWMM_Start& start = dynamic_cast<const CWMM_Start&>(hmm_params->GetParameter(CWMM_Start::class_id(), *gccontent));
725 
726  int totallen = xcript_vec.size();
727  int left, right;
728  int extraNs5p = 0;
729  int extrabases = start.Left()+2; // (11) extrabases left of ATG needed for start (6) and noncoding (5) evaluation
730  if(cds.GetStrand() == eNa_strand_plus) {
731  int startposition = cds.GetTotalRange().GetFrom();
732  if(startposition < extrabases) {
733  left = 0;
734  extraNs5p = extrabases-startposition;
735  } else {
736  left = startposition-extrabases;
737  }
738  right = min(startposition+2+start.Right(),totallen-1);
739  } else {
740  int startposition = cds.GetTotalRange().GetTo();
741  if(startposition+extrabases >= totallen) {
742  right = totallen-1;
743  extraNs5p = startposition+extrabases-totallen+1;
744  } else {
745  right = startposition+extrabases;
746  }
747  left = max(0,startposition-2-start.Right());
748  }
749 
750 
751  CRef<CSeq_loc> startarea(new CSeq_loc());
752  CSeq_interval& d = startarea->SetInt();
753  d.SetStrand(cds.GetStrand());
754  CRef<CSeq_id> id(new CSeq_id());
755  id->Assign(*seq_id);
756  d.SetId(*id);
757  d.SetFrom(left);
758  d.SetTo(right);
759 
760  CSeqVector sttvec(*startarea, scope);
761  sttvec.SetIupacCoding();
762  CEResidueVec sttseq;
763  sttseq.resize(extraNs5p, enN); // fill with Ns if necessery
764  for(unsigned int i = 0; i < sttvec.size(); ++i) {
765  sttseq.push_back(fromACGT(sttvec[i]));
766  }
767  sttseq.resize(5+start.Left()+start.Right(), enN); // add Ns if necessery
768 
769  *startscore = start.Score(sttseq, extrabases+2);
770  if(*startscore != BadScore()) {
771  for(unsigned int i = 5; i < sttseq.size(); ++i) {
772  *startscore -= ncdr.Score(sttseq, i);
773  }
774  }
775  }
776 
777  return score;
778 }
779 
780 END_SCOPE(gnomon)
static char tmp[2048]
Definition: utf8.c:62
CBioseq_Handle –.
CSpliced_exon_chunk –.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:908
const TProduct_start & GetProduct_start(void) const
Get the Product_start member data.
vector< CInDelInfo > TInDels
const TExons & Exons() const
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:900
bool CanGetProduct_strand(void) const
Check if it is safe to call GetProduct_strand method.
char * buf
const TProduct_id & GetProduct_id(void) const
Get the Product_id member data.
TSignedSeqRange TranscriptLimits() const
void SetTo(TTo value)
Assign a value to To data member.
TProduct_strand GetProduct_strand(void) const
Get the Product_strand member data.
list< CRef< CSpliced_exon > > TExons
void SetFrom(TFrom value)
Assign a value to From data member.
list< CRef< CUser_object > > TExt
Definition: Seq_align_.hpp:402
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:864
TSignedSeqRange ReadingFrame() const
const Tdata & Get(void) const
Get the member data.
Definition: Score_set_.hpp:166
const TScore & GetScore(void) const
Get the Score member data.
Definition: Seq_align_.hpp:883
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:73
Int8 GetModelId(const CSeq_align &seq_align)
string GetDNASequence(CConstRef< objects::CSeq_id > id, CScope &scope)
Definition: id_handler.cpp:130
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1230
User-defined methods of the data storage class.
TSignedSeqRange ReadingFrame() const
const TParts & GetParts(void) const
Get the Parts member data.
position_type GetLength(void) const
Definition: range.hpp:158
CCDSInfo MapFromOrigToEdited(const CAlignMap &amap) const
TSeqPos AsSeqPos() const
Definition: Product_pos.cpp:56
string GetProtein(const CResidueVec &contig_sequence) const
TGenomic_ins GetGenomic_ins(void) const
Get the variant data.
double Score(const CEResidueVec &seq, int i, int codonshift) const
CCDSInfo MapFromEditedToOrig(const CAlignMap &amap) const
bool IsProduct_ins(void) const
Check if variant Product_ins is selected.
const TDonor_after_exon & GetDonor_after_exon(void) const
Get the Donor_after_exon member data.
CRef< objects::CSeq_annot > GetAnnot(const objects::CSeq_id &id)
list< CRef< CSeq_interval > > Tdata
bool CanGetScores(void) const
Check if it is safe to call GetScores method.
const TExons & GetExons(void) const
Get the Exons member data.
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
EStrand
list< CRef< CSpliced_exon_chunk > > TParts
bool CanGetModifiers(void) const
Check if it is safe to call GetModifiers method.
list< CRef< CScore > > Tdata
Definition: Score_set_.hpp:90
signed NCBI_INT8_TYPE Int8
Signed 8 byte sized integer.
Definition: ncbitype.h:143
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:895
const NCBI_NS_NCBI::CEnumeratedTypeValues *ENUM_METHOD_NAME() ENa_strand(void)
Access to ENa_strand's attributes (values, names) as defined in spec.
TThisType & SetTo(position_type to)
Definition: range.hpp:180
TSignedSeqRange RealCdsLimits() const
bool CanGetProduct_type(void) const
Check if it is safe to call GetProduct_type method.
void SetStrand(TStrand value)
Assign a value to Strand data member.
#define NULL
Definition: ncbistd.hpp:225
TSeqPos size(void) const
Definition: seq_vector.hpp:291
bool CanGetGenomic_id(void) const
Check if it is safe to call GetGenomic_id method.
#define kEmptyStr
Definition: ncbistr.hpp:120
bool Empty(const CNcbiOstrstream &src)
Definition: fileutil.cpp:518
TThisType & SetFrom(position_type from)
Definition: range.hpp:170
bool IsMatch(void) const
Check if variant Match is selected.
bool CanGetScore(void) const
Check if it is safe to call GetScore method.
Definition: Seq_align_.hpp:877
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetStop(TSignedSeqRange r, bool confirmed=false)
const TId & GetId(void) const
Get the Id member data.
Definition: Seq_align_.hpp:963
vector< CRef< CUser_field > > TData
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:893
bool CanGetId(void) const
Check if it is safe to call GetId method.
Definition: Seq_align_.hpp:957
int i
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1082
bool CanGetDonor_after_exon(void) const
Check if it is safe to call GetDonor_after_exon method.
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:612
const TProduct_end & GetProduct_end(void) const
Get the Product_end member data.
const CInputModel & GetParameter(const string &type, int cgcontent) const
Definition: hmm.cpp:758
TSignedSeqRange Cds() const
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:101
TGenomic_end GetGenomic_end(void) const
Get the Genomic_end member data.
bool HasStop() const
const CCDSInfo & GetCdsInfo() const
bool CanGetGenomic_strand(void) const
Check if it is safe to call GetGenomic_strand method.
const TAcceptor_before_exon & GetAcceptor_before_exon(void) const
Get the Acceptor_before_exon member data.
string GetCdsDnaSequence(const CResidueVec &contig_sequence) const
TDiag GetDiag(void) const
Get the variant data.
void SetStart(TSignedSeqRange r, bool confirmed=false)
bool IsMismatch(void) const
Check if variant Mismatch is selected.
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_align_.hpp:988
int Right() const
Definition: hmm.hpp:114
list< CRef< CSpliced_seg_modifier > > TModifiers
bool Empty(void) const
Definition: range.hpp:148
CSeqVector –.
Definition: seq_vector.hpp:64
void debug()
TSeqPos GetAlignLength(bool include_gaps=true) const
Get the length of this alignment.
Definition: Seq_align.cpp:1915
double BadScore()
static string class_id()
Definition: hmm.hpp:150
string Translate(const string &nuc_str, bool is_mito)
TProduct_length GetProduct_length(void) const
Get the Product_length member data.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void SetId(TId &value)
Assign a value to Id data member.
TMismatch GetMismatch(void) const
Get the variant data.
void EditedSequence(const Vec &original_sequence, Vec &edited_sequence, bool includeholes=false) const
Definition: gnomon_seq.cpp:620
EResidue fromACGT(TResidue c)
Definition: gnomon_seq.hpp:61
USING_SCOPE(ncbi::objects)
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
bool CanGetAcceptor_before_exon(void) const
Check if it is safe to call GetAcceptor_before_exon method.
bool CanGetProduct_length(void) const
Check if it is safe to call GetProduct_length method.
The Object manager core.
EStrand Strand() const
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
bool NotEmpty(void) const
Definition: range.hpp:152
T max(T x_, T y_)
bool CanGetBases(void) const
Check if it is safe to call GetBases method.
position_type GetTo(void) const
Definition: range.hpp:142
TMatch GetMatch(void) const
Get the variant data.
virtual CAlignMap GetAlignMap() const
Magic spell ;-) needed for some weird compilers... very empiric.
map< string, string > ss
unsigned int & Status()
T min(T x_, T y_)
const TModifiers & GetModifiers(void) const
Get the Modifiers member data.
CRange< TSignedSeqPos > TSignedSeqRange
Definition: range.hpp:420
position_type GetFrom(void) const
Definition: range.hpp:134
int Left() const
Definition: hmm.hpp:113
const TScores & GetScores(void) const
Get the Scores member data.
CScope –.
Definition: scope.hpp:90
void SetInt(TInt &v)
Definition: Seq_loc.hpp:965
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found...
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void SetReadingFrame(TSignedSeqRange r, bool protein=false)
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:923
static string ToString(const CSeq_id &id)
Definition: id_handler.cpp:68
bool IsGenomic_ins(void) const
Check if variant Genomic_ins is selected.
const TBases & GetBases(void) const
Get the Bases member data.
bool IsMappedToGenome() const
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
TGenomic_strand GetGenomic_strand(void) const
Get the Genomic_strand member data.
list< CGeneModel > GetGenes() const
vector< CRef< CScore > > TScore
Definition: Seq_align_.hpp:398
vector< SPStop > TPStops
bool PStop(bool includeall=true) const
void SetIupacCoding(void)
Set coding to either Iupacaa or Iupacna depending on molecule type.
#define _ASSERT
TGenomic_start GetGenomic_start(void) const
Get the Genomic_start member data.
namespace ncbi::objects::
Definition: Seq_feat.hpp:55
list< CGeneModel > TGeneModelList
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:70
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5530
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string...
Definition: ncbiexpt.hpp:546
bool CanGetExt(void) const
Check if it is safe to call GetExt method.
Definition: Seq_align_.hpp:982
double Score(const CEResidueVec &seq, int i) const
Definition: hmm.cpp:444
static double GetScore(CConstRef< CHMMParameters > hmm_params, const objects::CSeq_loc &cds, objects::CScope &scope, int *const gccontent, double *const startscore=0)
const TPStops & PStops() const
bool IsSetPoly_a(void) const
start of poly(A) tail on the transcript For sense transcripts: aligned product positions < poly-a <= ...
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:98
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
vector< TResidue > CResidueVec
bool CanGetGenomic_id(void) const
Check if it is safe to call GetGenomic_id method.
void ReverseComplement(const BidirectionalIterator &first, const BidirectionalIterator &last)
TSignedSeqRange m_range
bool CanGet(void) const
Check if it is safe to call Get method.
Definition: Score_set_.hpp:160
unsigned int
Definition: types.hpp:1153
TProduct_ins GetProduct_ins(void) const
Get the variant data.
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
Modified on Sat Jun 25 14:44:22 2016 by modify_doxy.py rev. 426318