NCBI C++ ToolKit
aligncollapser.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: aligncollapser.cpp 71138 2016-02-11 20:02:35Z souvorov $
2  ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Alexandre Souvorov
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
36 #include <corelib/ncbiargs.hpp>
37 #include <objmgr/bioseq_handle.hpp>
38 #include <objmgr/scope.hpp>
39 #include <objmgr/seq_vector.hpp>
40 #include <objmgr/util/sequence.hpp>
41 #include "gnomon_seq.hpp"
42 
43 
45 BEGIN_SCOPE(gnomon)
47 
/// Extracts one zero-terminated accession from a pool of accessions.
///
/// @param shift   index in id_pool of the first character of the accession
/// @param id_pool accessions stored back to back, each terminated by a 0 character
/// @return the characters from shift up to (not including) the next 0 character.
///         An out-of-range shift yields an empty string, and a missing terminator
///         ends the accession at the end of the pool instead of reading past it.
std::string GetTargetAcc(int shift, const std::deque<char>& id_pool) {
    std::string target;
    if(shift < 0)
        return target;

    // The size check keeps the scan inside the pool even when no terminator follows.
    for(std::deque<char>::size_type i = shift; i < id_pool.size() && id_pool[i] != 0; ++i)
        target.push_back(id_pool[i]);

    return target;
}
55 
// Rebuilds a full CAlignModel for the individual alignment `ali` from the intron chain
// (m_introns) and flags held by this CAlignCommon, then attaches the target id whose
// accession is stored in target_id_pool at offset ali.m_target_id.
// NOTE(review): the embedded listing numbers skip 58, 60, 64 and 68, so some statements
// (including the declaration of `a`) are not visible here - confirm against the repository.
56 CAlignModel CAlignCommon::GetAlignment(const SAlignIndividual& ali, const deque<char>& target_id_pool) const {
57 
59  if(isPolyA())
61  if(isCap())
62  a.Status() |= CGeneModel::eCap;
63  if(isUnknown())
 // the stored id may be negative; the model always receives its absolute value
65  a.SetID(ali.m_align_id >= 0 ? ali.m_align_id : -ali.m_align_id);
66  a.SetWeight(ali.m_weight);
67  if(ali.m_align_id < 0)
69 
 // no introns: the whole range is a single exon
70  if(m_introns.empty()) {
71  a.AddExon(ali.m_range);
72  } else {
 // fs/ss are the splice signatures passed to AddExon for the exon's left/right end
73  string fs;
74  string ss;
 // an intron signature holds 4 characters; which half belongs to the exon on the left
 // of the intron depends on the strand
75  if(!m_introns.front().m_sig.empty()) {
76  if(a.Strand() == ePlus)
77  ss = m_introns.front().m_sig.substr(0,2);
78  else
79  ss = m_introns.front().m_sig.substr(2,2);
80  }
 // first exon: from the alignment start to the first intron
81  a.AddExon(TSignedSeqRange(ali.m_range.GetFrom(), m_introns.front().m_range.GetFrom()), fs, ss);
 // internal exons: each one lies between two consecutive introns
82  for(int i = 0; i < (int)m_introns.size()-1; ++i) {
 // fs/ss are refreshed only when both flanking introns carry a signature;
 // otherwise the values from the previous iteration are passed again
83  if(!m_introns[i].m_sig.empty() && !m_introns[i+1].m_sig.empty()) {
84  if(a.Strand() == ePlus) {
85  fs = m_introns[i].m_sig.substr(2,2);
86  ss = m_introns[i+1].m_sig.substr(0,2);
87  } else {
88  fs = m_introns[i].m_sig.substr(0,2);
89  ss = m_introns[i+1].m_sig.substr(2,2);
90  }
91  }
92  a.AddExon(TSignedSeqRange(m_introns[i].m_range.GetTo(), m_introns[i+1].m_range.GetFrom()), fs, ss);
93  }
94  if(!m_introns.back().m_sig.empty()) {
95  if(a.Strand() == ePlus)
96  fs = m_introns.back().m_sig.substr(2,2);
97  else
98  fs = m_introns.back().m_sig.substr(0,2);
99  }
 // last exon: from the last intron to the alignment end, with no right-side signature
100  ss = "";
101  a.AddExon(TSignedSeqRange(m_introns.back().m_range.GetTo(), ali.m_range.GetTo()), fs, ss);
102  }
103 
 // build the alignment map from the reconstructed exons and wrap both into a CAlignModel
104  CAlignMap amap(a.Exons(), a.FrameShifts(), a.Strand());
105  CAlignModel align(a, amap);
106 
107  CRef<CSeq_id> target_id(CIdHandler::ToSeq_id(GetTargetAcc(ali.m_target_id, target_id_pool)));
108  align.SetTargetId(*target_id);
109 
110  return align;
111 };
112 
114  LeftAndLongFirstOrder(const deque<char>& idp) : id_pool(idp) {}
115  const deque<char>& id_pool;
116 
117  bool operator() (const SAlignIndividual& a, const SAlignIndividual& b) { // left and long first
118  if(a.m_range == b.m_range)
119  return GetTargetAcc(a.m_target_id,id_pool) < GetTargetAcc(b.m_target_id,id_pool);
120  else if(a.m_range.GetFrom() != b.m_range.GetFrom())
121  return a.m_range.GetFrom() < b.m_range.GetFrom();
122  else
123  return a.m_range.GetTo() > b.m_range.GetTo();
124  }
125 };
126 
127 bool OriginalOrder(const SAlignIndividual& a, const SAlignIndividual& b) { // the order in which alignmnets were added
128  return a.m_target_id < b.m_target_id;
129 }
130 
131 
132 
134 
135  m_flags = 0;
136  if(align.Type()&CGeneModel::eSR)
137  m_flags |= esr;
138  if(align.Type()&CGeneModel::eEST)
139  m_flags |= eest;
140  if(align.Status()&CGeneModel::ePolyA)
141  m_flags |= epolya;
142  if(align.Status()&CGeneModel::eCap)
143  m_flags |= ecap;
144 
147  m_flags |= eplus;
148  } else if(align.Strand() == ePlus){
149  m_flags |= eplus;
150  } else {
151  m_flags |= eminus;
152  }
153 
154  const CGeneModel::TExons& e = align.Exons();
155  for(int i = 1; i < (int)e.size(); ++i) {
156  if(e[i-1].m_ssplice && e[i].m_fsplice) {
157  string sig;
158  if(align.Strand() == ePlus)
159  sig = e[i-1].m_ssplice_sig+e[i].m_fsplice_sig;
160  else
161  sig = e[i].m_fsplice_sig+e[i-1].m_ssplice_sig;
162  SIntron intron(e[i-1].GetTo(), e[i].GetFrom(), align.Strand(), (align.Status()&CGeneModel::eUnknownOrientation) == 0, sig);
163  m_introns.push_back(intron);
164  }
165  }
166 }
167 
169  SAlignExtended(SAlignIndividual& ali, const set<int>& left_exon_ends, const set<int>& right_exon_ends) : m_ali(&ali), m_initial_right_end(ali.m_range.GetTo()) {
170 
172  right_exon_ends.lower_bound(m_ali->m_range.GetTo()); // leftmost compatible rexon
174  if(ri != right_exon_ends.end())
175  m_rlimb = *ri; // position of leftmost compatible rexon
176  m_rlima = -1;
177  if(ri != right_exon_ends.begin())
178  m_rlima = *(--ri); // position of the rightmost incompatible rexon
180  left_exon_ends.upper_bound(m_ali->m_range.GetFrom()); // leftmost not compatible lexon
182  if(li != left_exon_ends.end())
183  m_llimb = *li; // position of the leftmost not compatible lexon
184  }
185 
188  int m_rlimb;
189  int m_rlima;
190  int m_llimb;
191 };
192 
194  arg_desc->SetCurrentGroup("Collapsing and filtering");
195 
196  arg_desc->AddFlag("filtersr","Filter SR");
197  arg_desc->AddFlag("filterest","Filter EST");
198  arg_desc->AddFlag("filtermrna","Filter mRNA");
199  arg_desc->AddFlag("filterprots","Filter proteins");
200  arg_desc->AddFlag("collapsest","Collaps EST");
201  arg_desc->AddFlag("collapssr","Collaps SR");
202  arg_desc->AddFlag("fillgenomicgaps","Use provided selfspecies cDNA for genomic gap filling");
203 
204  arg_desc->AddDefaultKey("max-extension", "MaxExtension",
205  "Maximal extension for one-exon collapsed alignments",
207 
208  arg_desc->AddDefaultKey("min-consensus-support", "MinConsensusSupport",
209  "Minimal number of support for consensus intron",
211 
212  arg_desc->AddDefaultKey("min-non-consensussupport", "MinNonconsensusSupport",
213  "Minimal number of support for non-consensus intron",
215 
216  arg_desc->AddDefaultKey("high-identity", "HighIdentity",
217  "Minimal exon identity threshold for accepted introns",
218  CArgDescriptions::eDouble, "0.98");
219 
220  arg_desc->AddDefaultKey("min-support-fraction", "MinSupportFraction",
221  "Minimal splice expression relative exon expression",
222  CArgDescriptions::eDouble, "0.03");
223 
224  arg_desc->AddDefaultKey("end-pair-support-cutoff", "EndOairSupportCutoff",
225  "Minimal expression relative to the mean for introns with the same splice",
227 
228  arg_desc->AddDefaultKey("minest", "minest",
229  "Minimal EST support to trump expression checks",
231 
232  arg_desc->AddDefaultKey("min-edge-coverage", "MinEdgeCoverage",
233  "Minimal absolute expression for accepted single-exon alignmnets without polyA/Cap",
235 
236  arg_desc->AddDefaultKey("sharp-boundary", "SharpBoundary",
237  "Minimal relative expression for crossing splice",
239 
240  arg_desc->SetCurrentGroup("");
241 }
242 
243 CAlignCollapser::CAlignCollapser(string contig, CScope* scope, bool nofilteringcollapsing) : m_count(0), m_scope(scope) {
244  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
245 
246  if(nofilteringcollapsing) {
247  m_filtersr = false;
248  m_filterest = false;
249  m_filtermrna = false;
250  m_filterprots = false;
251  m_collapsest = false;
252  m_collapssr = false;
253  } else {
254  m_filtersr = args["filtersr"];
255  m_filterest = args["filterest"];
256  m_filtermrna = args["filtermrna"];
257  m_filterprots = args["filterprots"];
258  m_collapsest = args["collapsest"];
259  m_collapssr = args["collapssr"];
260  }
261  m_fillgenomicgaps = args["fillgenomicgaps"];
262 
263  if(m_scope != 0 && contig != "") {
264 
265  m_contig_name = contig;
266 
267  CRef<CSeq_id> contigid(new CSeq_id);
268  contigid->Assign(*CIdHandler::ToSeq_id(contig));
269  if(!contigid)
270  contigid = new CSeq_id(CSeq_id::e_Local, contig);
271 
272  CBioseq_Handle bh (m_scope->GetBioseqHandle(*contigid));
273  if (!bh) {
274  NCBI_THROW(CException, eUnknown, "contig '"+contig+"' retrieval failed");
275  }
276  CSeqVector sv (bh.GetSeqVector(CBioseq_Handle::eCoding_Iupac));
277  int length (sv.size());
278 
279  sv.GetSeqData(0, length, m_contig);
280 
281  m_contigrv.resize(length);
282  copy(m_contig.begin(),m_contig.end(),m_contigrv.begin());
283 
284  TIntMap::iterator current_gap = m_genomic_gaps_len.end();
285  for(int i = 0; i < length; ++i) {
286  if(sv.IsInGap(i)) {
287  if(current_gap == m_genomic_gaps_len.end())
288  current_gap = m_genomic_gaps_len.insert(TIntMap::value_type(i,1)).first;
289  else
290  ++current_gap->second;
291  } else {
292  current_gap = m_genomic_gaps_len.end();
293  }
294  }
295 
296  m_genomic_gaps_len[-1] = 1; // fake gap at the beginning
297  m_genomic_gaps_len[length] = 1; // fake gap at the end
298  }
299 }
300 
301 
303  return ali.m_weight < 0;
304 }
305 
306 #define COVERED_FRACTION 0.75
307 bool AlignmentIsSupportedBySR(const CAlignModel& align, const vector<double>& coverage, int mincoverage, int left_end) {
308 
309  int align_len = align.AlignLen();
310 
311  int covered_length = 0;
312  ITERATE(CGeneModel::TExons, i, align.Exons()) {
313  for(int p = i->Limits().GetFrom(); p <= i->Limits().GetTo(); ++p)
314  if(coverage[p-left_end] >= mincoverage)
315  ++covered_length;
316  }
317 
318  return (covered_length >= COVERED_FRACTION*align_len);
319 }
320 
321 bool isGoodIntron(int a, int b, EStrand strand, const CAlignCollapser::TAlignIntrons& introns, bool check_introns_on_both_strands) {
322  SIntron intron_oriented_nosig(a, b, strand, true, "");
323  SIntron intron_notoriented_nosig(a, b, ePlus, false, "");
324  bool good_intron = (introns.find(intron_oriented_nosig) != introns.end() || introns.find(intron_notoriented_nosig) != introns.end());
325  if(!good_intron && check_introns_on_both_strands) {
326  SIntron intron_otherstrand_nosig(a, b, OtherStrand(strand), true, "");
327  good_intron = (introns.find(intron_otherstrand_nosig) != introns.end());
328  }
329 
330  return good_intron;
331 }
332 
333 
334 #define END_PART_LENGTH 35
335 
// Clips the parts of `align` whose coverage in m_coverage is below clip_threshold times the
// mean coverage of the alignment's exons. Remaining end parts shorter than END_PART_LENGTH
// are dropped; the alignment may end up with no exons at all (ClearExons). The ePolyA/eCap
// status bits are removed when the corresponding end of the alignment was clipped.
336 void CAlignCollapser::ClipNotSupportedFlanks(CAlignModel& align, double clip_threshold) {
337 
 // mean coverage over all exon positions (m_coverage is indexed relative to m_left_end)
338  double cov = 0;
339  int nt = 0;
340  ITERATE(CGeneModel::TExons, e, align.Exons()) {
341  for(int i = e->GetFrom(); i <= e->GetTo(); ++i) {
342  cov += m_coverage[i-m_left_end];
343  ++nt;
344  }
345  }
346  cov /= nt;
347 
348  CAlignMap amap = align.GetAlignMap();
349  TSignedSeqRange old_limits = align.Limits();
350 
 // for eNotForChaining alignments the outer flanks are clipped, but only on a side where
 // more than 30 transcript bases are left unaligned
351  if(align.Type()&CGeneModel::eNotForChaining) {
352  TSignedSeqRange tlim = align.TranscriptLimits();
353  int not_aligned_left = tlim.GetFrom();
354  int not_aligned_right = align.TargetLen()-1-tlim.GetTo();
355  if(align.Orientation() == eMinus)
356  swap(not_aligned_left,not_aligned_right);
357 
358  if(not_aligned_left > 30) {
 // walk right over exon positions (jumping across introns) while coverage is low
359  int l = align.Limits().GetFrom();
360  int ie = 0;
361  while(l < align.Limits().GetTo() && m_coverage[l-m_left_end] < clip_threshold*cov) {
362  if(l < align.Exons()[ie].GetTo())
363  ++l;
364  else
365  l = align.Exons()[++ie].GetFrom();
366  }
367  if(l != align.Limits().GetFrom()) {
368  TSignedSeqRange seg = amap.ShrinkToRealPoints(TSignedSeqRange(l,align.Limits().GetTo()), false);
369  if(seg.Empty() || amap.FShiftedLen(seg,false) < END_PART_LENGTH) {
370  align.ClearExons();
371  return;
372  } else {
373  align.Clip(seg,CGeneModel::eRemoveExons);
374  }
375  }
376  }
377 
378  if(not_aligned_right > 30) {
 // mirror image of the left side: walk left from the right end while coverage is low
379  int r = align.Limits().GetTo();
380  int ie = align.Exons().size()-1;
381  while(r > align.Limits().GetFrom() && m_coverage[r-m_left_end] < clip_threshold*cov) {
382  if(r > align.Exons()[ie].GetFrom())
383  --r;
384  else
385  r = align.Exons()[--ie].GetTo();
386  }
387  if(r != align.Limits().GetTo()) {
388  TSignedSeqRange seg = amap.ShrinkToRealPoints(TSignedSeqRange(align.Limits().GetFrom(),r), false);
389  if(seg.Empty() || amap.FShiftedLen(seg,false) < END_PART_LENGTH) {
390  align.ClearExons();
391  return;
392  } else {
393  align.Clip(seg,CGeneModel::eRemoveExons);
394  }
395  }
396  }
397  }
398 
 // trim low-coverage exon ends on both sides of every hole (adjacent exons not joined by
 // a splice pair); the scan restarts after each modification because exons change
399  bool snap_to_codons = align.Type()&CAlignModel::eProt;
400  bool keepdoing = true;
401  while(keepdoing) {
402  keepdoing = false;
403  for (int k = 1; k < (int)align.Exons().size(); ++k) {
404  CModelExon exonl = align.Exons()[k-1];
405  CModelExon exonr = align.Exons()[k];
406  if(!(exonl.m_ssplice && exonr.m_fsplice)) {
 // segl: what remains on the left of the hole after trimming exonl from its right end
407  int l = exonl.GetTo();
408  TSignedSeqRange segl(align.Limits().GetFrom(),l);
409  for( ; l >= exonl.GetFrom() && m_coverage[l-m_left_end] < clip_threshold*cov; --l);
410  if(l != exonl.GetTo())
411  segl = amap.ShrinkToRealPoints(TSignedSeqRange(align.Limits().GetFrom(),max(align.Limits().GetFrom(),l)),snap_to_codons);
412 
 // segr: what remains on the right of the hole after trimming exonr from its left end
413  int r = exonr.GetFrom();
414  TSignedSeqRange segr(r,align.Limits().GetTo());
415  for( ; r <= exonr.GetTo() && m_coverage[r-m_left_end] < clip_threshold*cov; ++r);
416  if(r != exonr.GetFrom())
417  segr = amap.ShrinkToRealPoints(TSignedSeqRange(min(align.Limits().GetTo(),r),align.Limits().GetTo()), snap_to_codons);
418 
 // keep only the side(s) that are still at least END_PART_LENGTH long
419  if(segl.Empty() || amap.FShiftedLen(segl,false) < END_PART_LENGTH) {
420  if(segr.Empty() || amap.FShiftedLen(segr,false) < END_PART_LENGTH) {
421  align.ClearExons();
422  return;
423  } else {
424  align.Clip(segr,CGeneModel::eRemoveExons);
425  keepdoing = true;
426  break;
427  }
428  } else if(segr.Empty() || amap.FShiftedLen(segr,false) < END_PART_LENGTH) {
429  align.Clip(segl,CGeneModel::eRemoveExons);
430  keepdoing = true;
431  break;
432  } else if(l != exonl.GetTo() || r != exonr.GetFrom()) {
 // both sides survive but something was trimmed: widen the hole
433  align.CutExons(TSignedSeqRange(segl.GetTo()+1,segr.GetFrom()-1));
434  keepdoing = true;
435  break;
436  }
437  }
438  }
439  }
440 
 // remove pieces (runs of exons starting at an exon without m_fsplice and continuing
 // while m_ssplice is set) that are shorter than END_PART_LENGTH
441  for(int prev_exon = -1; prev_exon < (int)align.Exons().size()-1; ++prev_exon) {
442  int piece_begin = prev_exon+1;
443  if(align.Exons()[piece_begin].m_fsplice)
444  continue;
445  int piece_end = piece_begin;
446  for( ; piece_end < (int)align.Exons().size() && align.Exons()[piece_end].m_ssplice; ++piece_end);
447  int a = align.Exons()[piece_begin].GetFrom();
448  int b = align.Exons()[piece_end].GetTo();
449  if(amap.FShiftedLen(a, b, false) < END_PART_LENGTH) {
450  if(a == align.Limits().GetFrom() && b == align.Limits().GetTo()) {
 // the short piece is the whole alignment
451  align.ClearExons();
452  return;
453  } else if(a == align.Limits().GetFrom()) {
454  TSignedSeqRange seg(align.Exons()[piece_end+1].GetFrom(),align.Limits().GetTo());
455  align.Clip(seg, CGeneModel::eRemoveExons);
456  } else if(b == align.Limits().GetTo()) {
457  TSignedSeqRange seg(align.Limits().GetFrom(),align.Exons()[piece_begin-1].GetTo());
458  align.Clip(seg, CGeneModel::eRemoveExons);
459  } else {
460  TSignedSeqRange seg(a, b);
461  align.CutExons(seg);
462  }
463  }
464  }
465 
 // the flag is known to be set inside each branch, so ^= clears it
466  if((align.Status()&CGeneModel::ePolyA) &&
467  ((align.Strand() == ePlus && align.Limits().GetTo() != old_limits.GetTo()) ||
468  (align.Strand() == eMinus && align.Limits().GetFrom() != old_limits.GetFrom()))) { // clipped polyA
469 
470  align.Status() ^= CGeneModel::ePolyA;
471  }
472  if((align.Status()&CGeneModel::eCap) &&
473  ((align.Strand() == eMinus && align.Limits().GetTo() != old_limits.GetTo()) ||
474  (align.Strand() == ePlus && align.Limits().GetFrom() != old_limits.GetFrom()))) { // clipped cap
475 
476  align.Status() ^= CGeneModel::eCap;
477  }
478 }
479 
480 
481 #define CUT_MARGIN 15
482 
484 
485  CAlignMap amap = align.GetAlignMap();
486 
487  bool keepdoing = true;
488  while(keepdoing) {
489  keepdoing = false;
490  for (int k = 1; k < (int)align.Exons().size(); ++k) {
491  CModelExon exonl = align.Exons()[k-1];
492  CModelExon exonr = align.Exons()[k];
493  if(!(exonl.m_ssplice && exonr.m_fsplice) || isGoodIntron(exonl.GetTo(), exonr.GetFrom(), align.Strand(), m_align_introns, false))
494  continue;
495 
496  TSignedSeqRange segl;
497  if(exonl.GetTo()-CUT_MARGIN > align.Limits().GetFrom())
498  segl = amap.ShrinkToRealPoints(TSignedSeqRange(align.Limits().GetFrom(),exonl.GetTo()-CUT_MARGIN), true);
499 
500  TSignedSeqRange segr;
501  if(exonr.GetFrom()+CUT_MARGIN < align.Limits().GetTo())
502  segr = amap.ShrinkToRealPoints(TSignedSeqRange(exonr.GetFrom()+CUT_MARGIN,align.Limits().GetTo()), true);
503 
504  if(segl.Empty() || amap.FShiftedLen(segl,false) < END_PART_LENGTH) {
505  if(segr.Empty() || amap.FShiftedLen(segr,false) < END_PART_LENGTH) {
506  align.ClearExons();
507  return false;
508  } else {
509  align.Clip(segr,CGeneModel::eRemoveExons);
510  keepdoing = true;
511  break;
512  }
513  } else if(segr.Empty() || amap.FShiftedLen(segr,false) < END_PART_LENGTH) {
514  align.Clip(segl,CGeneModel::eRemoveExons);
515  keepdoing = true;
516  break;
517  } else {
518  align.CutExons(TSignedSeqRange(segl.GetTo()+1,segr.GetFrom()-1));
519  keepdoing = true;
520  break;
521  }
522  }
523  }
524 
525  return true;
526 }
527 
// Removes from a transcript alignment the introns that isGoodIntron() does not find in
// m_align_introns, and replaces `align` with the edited alignment.
// - Alignments without the eGapFiller status: every piece (run of spliced exons) loses its
//   flanking unsupported introns together with the exons outside of them; an unsupported
//   intron left inside makes the alignment "not good".
// - eGapFiller alignments: unsupported introns are cut out, leaving a hole.
// The ePolyA/eCap status bits are removed when the corresponding end was clipped.
// Returns false when the alignment is judged not good, true otherwise.
// NOTE(review): the embedded listing numbers skip 615, so the declaration of `te` is not
// visible here - confirm against the repository.
528 bool CAlignCollapser::RemoveNotSupportedIntronsFromTranscript(CAlignModel& align, bool check_introns_on_both_strands) const {
529 
530  CAlignMap amap = align.GetAlignMap();
531 
532  CGeneModel editedmodel = align;
533 
534  if(!(editedmodel.Status()&CGeneModel::eGapFiller)) { //remove flanking bad introns AND exons
535  editedmodel.ClearExons(); // empty alignment with all attributes and remove indels
 // process the alignment piece by piece; piece_begin is moved to piece_end at the bottom
 // of the body, so the loop increment steps to the first exon of the next piece
536  for (CAlignModel::TExons::const_iterator piece_begin = align.Exons().begin(); piece_begin != align.Exons().end(); ++piece_begin) {
537  _ASSERT( !piece_begin->m_fsplice );
538 
539  CAlignModel::TExons::const_iterator piece_end = piece_begin;
540  for ( ; piece_end != align.Exons().end() && piece_end->m_ssplice; ++piece_end) ;
541  _ASSERT( piece_end != align.Exons().end() );
542 
543  CAlignModel a = align;
544  a.Clip(TSignedSeqRange(piece_begin->Limits().GetFrom(),piece_end->Limits().GetTo()),CGeneModel::eRemoveExons); // only one piece
545 
546  //remove flanking bad introns
 // scan from the left until the first supported intron
547  int new_left = a.Limits().GetFrom();
548  for(int k = 1; k < (int)a.Exons().size(); ++k) {
549  CModelExon exonl = a.Exons()[k-1];
550  CModelExon exonr = a.Exons()[k];
551  if(isGoodIntron(exonl.GetTo(), exonr.GetFrom(), a.Strand(), m_align_introns, check_introns_on_both_strands))
552  break;
553  else
554  new_left = exonr.GetFrom();
555  }
 // scan from the right until the first supported intron, not going past new_left
556  int new_right = a.Limits().GetTo();
557  for(int k = (int)a.Exons().size()-1; k > 0 && a.Exons()[k-1].GetTo() > new_left; --k) {
558  CModelExon exonl = a.Exons()[k-1];
559  CModelExon exonr = a.Exons()[k];
560  if(isGoodIntron(exonl.GetTo(), exonr.GetFrom(), a.Strand(), m_align_introns, check_introns_on_both_strands))
561  break;
562  else
563  new_right = exonl.GetTo();
564  }
565 
566  TSignedSeqRange new_lim(new_left,new_right);
567  if(new_lim != a.Limits()) {
568  new_lim = amap.ShrinkToRealPoints(new_lim,false);
569  a.Clip(new_lim,CGeneModel::eRemoveExons);
570  _ASSERT(a.Limits().NotEmpty());
571  }
572 
 // append the trimmed piece to the edited model, separated from the previous piece by a hole
573  if(!editedmodel.Exons().empty())
574  editedmodel.AddHole();
575 
576  ITERATE(CGeneModel::TExons, e, a.Exons()) {
577  editedmodel.AddExon(e->Limits(), e->m_fsplice_sig, e->m_ssplice_sig, e->m_ident);
578  }
579  editedmodel.FrameShifts().insert(editedmodel.FrameShifts().end(),a.FrameShifts().begin(),a.FrameShifts().end());
580 
581  piece_begin = piece_end;
582  }
583  }
584 
585 
 // an EST reduced to a single exon with changed limits is not accepted
586  bool good_alignment = true;
587  if((align.Type()&CGeneModel::eEST) && (int)editedmodel.Exons().size() == 1 && editedmodel.Limits() != align.Limits())
588  good_alignment = false;
589 
590 
 // look for unsupported introns that are still present; the scan restarts after each cut
591  bool keepdoing = true;
592  while(keepdoing) {
593  keepdoing = false;
594  for (int k = 1; k < (int)editedmodel.Exons().size() && good_alignment; ++k) {
595  CModelExon exonl = editedmodel.Exons()[k-1];
596  CModelExon exonr = editedmodel.Exons()[k];
597  if(exonl.m_ssplice && exonr.m_fsplice && !isGoodIntron(exonl.GetTo(), exonr.GetFrom(), editedmodel.Strand(), m_align_introns, check_introns_on_both_strands)) {
598  if(editedmodel.Status()&CGeneModel::eGapFiller) {
 // gap fillers: cut the intron out (one base is taken from each flanking exon
 // before shrinking to real points) instead of rejecting the alignment
599  TSignedSeqRange segl = amap.ShrinkToRealPoints(TSignedSeqRange(editedmodel.Limits().GetFrom(),exonl.GetTo()-1), false);
600  TSignedSeqRange segr = amap.ShrinkToRealPoints(TSignedSeqRange(exonr.GetFrom()+1,editedmodel.Limits().GetTo()), false);
601  if(segl.NotEmpty() && segr.NotEmpty()) {
602  editedmodel.CutExons(TSignedSeqRange(segl.GetTo()+1,segr.GetFrom()-1));
603  keepdoing = true;
604  break;
605  }
606  } else {
607  good_alignment = false;
608  }
609  }
610  }
611  }
612 
 // collect the transcript range of every remaining exon for the new alignment map
613  vector<TSignedSeqRange> transcript_exons;
614  ITERATE(CGeneModel::TExons, e, editedmodel.Exons()) {
616  _ASSERT(te.NotEmpty());
617  transcript_exons.push_back(te);
618  }
619 
620  TSignedSeqRange old_limits = align.Limits();
621 
 // rebuild the alignment from the edited model, keeping orientation, target length and target id
622  CAlignMap editedamap(editedmodel.Exons(), transcript_exons, editedmodel.FrameShifts(), align.Orientation(), align.GetAlignMap().TargetLen());
623  CAlignModel editedalign(editedmodel, editedamap);
624  editedalign.SetTargetId(*align.GetTargetId());
625  align = editedalign;
626 
 // the flag is known to be set inside each branch, so ^= clears it
627  if((align.Status()&CGeneModel::ePolyA) &&
628  ((align.Strand() == ePlus && align.Limits().GetTo() != old_limits.GetTo()) ||
629  (align.Strand() == eMinus && align.Limits().GetFrom() != old_limits.GetFrom()))) { // clipped polyA
630 
631  align.Status() ^= CGeneModel::ePolyA;
632  }
633  if((align.Status()&CGeneModel::eCap) &&
634  ((align.Strand() == eMinus && align.Limits().GetTo() != old_limits.GetTo()) ||
635  (align.Strand() == ePlus && align.Limits().GetFrom() != old_limits.GetFrom()))) { // clipped cap
636 
637  align.Status() ^= CGeneModel::eCap;
638  }
639 
640  return good_alignment;
641 }
642 
643 #define MISM_PENALTY 10
644 #define INDEL_PENALTY 20
645 #define EXTRA_CUT 5
646 #define BIG_NOT_ALIGNED 20
// Re-trims the alignment of the transcript `trans` against the contig (m_contig) and
// replaces `align` with the result. Steps, in order:
//  1. exon ends without a splice are extended while transcript and contig bases are identical;
//  2. for every piece (run of spliced exons) the best scoring region is selected with a
//     running score (+1 match, -MISM_PENALTY mismatch, -INDEL_PENALTY gap, floored at 0);
//  3. for alignments with known orientation the piece ends are marked "NN" when they abut a
//     genomic gap, or moved to a nearby splice dinucleotide when enough transcript is left
//     unaligned beyond them;
//  4. the alignment is rebuilt from the edited exons and the indels strictly inside them.
// NOTE(review): the embedded listing numbers skip 843 and 878, so the declarations of `igap`
// are not visible here - confirm against the repository.
647 void CAlignCollapser::CleanSelfTranscript(CAlignModel& align, const string& trans) const {
648 
649  string transcript = trans; // transcript as it appears on the genome
650  if(align.Orientation() == eMinus)
651  ReverseComplement(transcript.begin(),transcript.end());
652 
653  int tlen = align.TargetLen();
654  _ASSERT(tlen == (int)transcript.size());
655 
656  //expand not spliced exons if identical
657  CGeneModel::TExons exons = align.Exons();
658  vector<TSignedSeqRange> transcript_exons;
659  transcript_exons.reserve(exons.size());
660  for(int ie = 0; ie < (int)exons.size(); ++ie) {
661  transcript_exons.push_back(align.TranscriptExon(ie));
662  }
 // for minus orientation mirror the transcript ranges so that they index the
 // reverse-complemented `transcript`
663  if(align.Orientation() == eMinus) {
664  for(int ie = 0; ie < (int)exons.size(); ++ie) {
665  TSignedSeqRange& te = transcript_exons[ie];
666  te = TSignedSeqRange(tlen-1-te.GetTo(),tlen-1-te.GetFrom());
667  }
668  }
669  for(int ie = 0; ie < (int)exons.size(); ++ie) {
670  if(!exons[ie].m_fsplice) {
 // extend to the left while bases match, without reaching the previous exon
671  int glim = (ie > 0) ? exons[ie-1].GetTo() : -1;
672  int tlim = (ie > 0) ? transcript_exons[ie-1].GetTo() : -1;
673  int g = exons[ie].GetFrom();
674  int t = transcript_exons[ie].GetFrom();
675  while(g > glim+1 && t > tlim+1 && transcript[t-1] == m_contig[g-1]) {
676  --t;
677  --g;
678  }
679  if(g < exons[ie].GetFrom()) {
680  exons[ie].AddFrom(g-exons[ie].GetFrom());
681  exons[ie].m_fsplice_sig.clear();
682  transcript_exons[ie].SetFrom(t);
683  }
684  }
685  if(!exons[ie].m_ssplice) {
 // extend to the right while bases match, without reaching the next exon
686  int glim = (ie+1 < (int)exons.size()) ? exons[ie+1].GetFrom() : m_contig.size();
687  int tlim = (ie+1 < (int)exons.size()) ? transcript_exons[ie+1].GetFrom() : transcript.size();
688  int g = exons[ie].GetTo();
689  int t = transcript_exons[ie].GetTo();
690  while(g < glim-1 && t < tlim-1 && transcript[t+1] == m_contig[g+1]) {
691  ++t;
692  ++g;
693  }
694  if(g > exons[ie].GetTo()) {
695  exons[ie].AddTo(g-exons[ie].GetTo());
696  exons[ie].m_ssplice_sig.clear();
697  transcript_exons[ie].SetTo(t);
698  }
699  }
700  }
701 
 // map built with ePlus because transcript coordinates were already mirrored above
702  CAlignMap amap(exons,transcript_exons, align.FrameShifts(), ePlus, tlen);
703 
704  CGeneModel::TExons edited_exons;
705  vector<TSignedSeqRange> edited_transcript_exons;
706 
 // first pass: select the best scoring region of every piece
707  for (int piece_begin = 0; piece_begin < (int)exons.size(); ++piece_begin) {
708  _ASSERT( !exons[piece_begin].m_fsplice );
709  int piece_end = piece_begin;
710  for( ; exons[piece_end].m_ssplice; ++piece_end);
711  _ASSERT(piece_end < (int)exons.size());
712 
713  TInDels indels = align.GetInDels(exons[piece_begin].GetFrom(), exons[piece_end].GetTo(), false);
714  TInDels::const_iterator indl = indels.begin();
715 
 // build the gapped transcript (tseq) and genomic (gseq) strings for the piece;
 // exons_to_align records the index of the last column of every exon
716  string tseq;
717  string gseq;
718  TIVec exons_to_align;
719  int tp = transcript_exons[piece_begin].GetFrom();
720  for(int ie = piece_begin; ie <= piece_end; ++ie) {
721  int gp = exons[ie].GetFrom();
722  while(gp <= exons[ie].GetTo()) {
723  if(indl == indels.end() || indl->Loc() != gp) {
724  tseq.push_back(transcript[tp++]);
725  gseq.push_back(m_contig[gp++]);
726  } else if(indl->IsDeletion()) {
727  tseq += transcript.substr(tp,indl->Len());
728  gseq.insert(gseq.end(),indl->Len(),'-');
729  tp += indl->Len();
730  ++indl;
731  } else {
732  tseq.insert(tseq.end(),indl->Len(),'-');
733  gseq += m_contig.substr(gp,indl->Len());
734  gp += indl->Len();
735  ++indl;
736  }
737  }
738  if(indl != indels.end() && indl->Loc() == gp) { // deletion at the end of exon
739  _ASSERT(indl->IsDeletion());
740  tseq += transcript.substr(tp,indl->Len());
741  gseq.insert(gseq.end(), indl->Len(), '-');
742  tp += indl->Len();
743  ++indl;
744  }
745  exons_to_align.push_back(gseq.size()-1);
746  }
747  _ASSERT(tseq.size() == gseq.size() && indl == indels.end());
748 
 // running score floored at 0: a matching non-'N' column adds 1, a gap column
 // subtracts INDEL_PENALTY, any other column subtracts MISM_PENALTY
749  TIVec score(tseq.size());
750  for(int i = 0; i < (int)score.size(); ++i) {
751  if(tseq[i] == gseq[i] && tseq[i] != 'N')
752  score[i] = 1;
753  else if(tseq[i] == '-' || gseq[i] == '-')
754  score[i] = -INDEL_PENALTY;
755  else
756  score[i] = -MISM_PENALTY;
757  if(i > 0)
758  score[i] += score[i-1];
759  score[i] = max(0,score[i]);
760  }
761 
 // the region ends at the maximal score and starts after the closest zero to its left
762  int align_right = max_element(score.begin(),score.end())-score.begin();
763 
764  if(score[align_right] > 0) { // there is at least one match
765  int align_left = align_right;
766  while(align_left > 0 && score[align_left-1] > 0)
767  --align_left;
768 
 // convert column indices to transcript positions by discounting '-' columns of tseq
769  int agaps = count(tseq.begin(), tseq.begin()+align_left, '-');
770  int bgaps = count(tseq.begin(), tseq.begin()+align_right, '-');
771  TSignedSeqRange trange(transcript_exons[piece_begin].GetFrom()+align_left-agaps, transcript_exons[piece_begin].GetFrom()+align_right-bgaps);
772 
773  TSignedSeqRange grange = amap.MapRangeEditedToOrig(trange, false);
774  _ASSERT(grange.NotEmpty());
775 
 // pb/pe: first and last exon of the piece overlapping the selected genomic range
776  int pb = piece_begin;
777  while(exons[pb].GetTo() < grange.GetFrom())
778  ++pb;
779  int pe = piece_end;
780  while(exons[pe].GetFrom() > grange.GetTo())
781  --pe;
782  _ASSERT(pe >= pb);
783 
784  double lident = 0; // left exon identity
785  int len = 0;
786  for(int i = align_left; i <= (pe > pb ? exons_to_align[pb-piece_begin] : align_right); ++i) {
787  ++len;
788  if(tseq[i] == gseq[i])
789  ++lident;
790  }
791  lident /= len;
792 
793  double rident = 0; // right exon identity
794  len = 0;
795  for(int i = align_right; i >= (pe > pb ? exons_to_align[pe-1-piece_begin]+1 : align_left); --i) {
796  ++len;
797  if(tseq[i] == gseq[i])
798  ++rident;
799  }
800  rident /= len;
801 
 // copy the surviving exons; the outermost ones get the new limits, lose their
 // outer splice and signature, and receive the recomputed identity
802  for( int ie = pb; ie <= pe; ++ie) {
803  CModelExon e = exons[ie];
804  TSignedSeqRange t = transcript_exons[ie];
805  if(ie == pb) {
806  e.m_fsplice = false;
807  e.Limits().SetFrom(grange.GetFrom());
808  t.SetFrom(trange.GetFrom());
809  e.m_fsplice_sig.clear();
810  e.m_ident = lident;
811  }
812  if(ie == pe) {
813  e.m_ssplice = false;
814  e.Limits().SetTo(grange.GetTo());
815  t.SetTo(trange.GetTo());
816  e.m_ssplice_sig.clear();
817  e.m_ident = rident;
818  }
819 
820  edited_exons.push_back(e);
821  edited_transcript_exons.push_back(t);
822  }
823  }
824  piece_begin = piece_end;
825  }
826 
827 
828  CGeneModel editedmodel = align;
829  editedmodel.ClearExons(); // empty alignment with all attributes
830  TInDels edited_indels;
831 
 // second pass: adjust the ends of every edited piece and assemble the new model
832  for (int piece_begin = 0; piece_begin < (int)edited_exons.size(); ++piece_begin) {
833  _ASSERT( !edited_exons[piece_begin].m_fsplice );
834  int piece_end = piece_begin;
835  for( ; edited_exons[piece_end].m_ssplice; ++piece_end);
836  _ASSERT(piece_end < (int)edited_exons.size());
837 
838  //find splices if possible
839  if(!(align.Status()&CGeneModel::eUnknownOrientation)) {
840  TSignedSeqRange& elim = edited_exons[piece_begin].Limits();
841  TSignedSeqRange& tlim = edited_transcript_exons[piece_begin];
 // distance from the end of the genomic gap that igap is stepped back to, to the piece start
842  int distance_to_lgap = -1;
844  if(igap != m_genomic_gaps_len.begin()) {
845  --igap;
846  distance_to_lgap = elim.GetFrom()-(igap->first+igap->second);
847  }
848  if(distance_to_lgap == 0) { // abutting gap
849  edited_exons[piece_begin].m_fsplice_sig = "NN";
850  } else if(tlim.GetFrom() > BIG_NOT_ALIGNED && (piece_begin == 0 || tlim.GetFrom() > edited_transcript_exons[piece_begin-1].GetTo()+1)) {
 // enough transcript is left unaligned on the left: look for the dinucleotide
 // from 2 bases before the exon start up to a few bases inside the exon
851  string splice = (align.Strand() == ePlus) ? "AG" : "AC";
852  for(int p = max(0,elim.GetFrom()-2); p <= min(elim.GetFrom()+EXTRA_CUT, elim.GetTo()-MISM_PENALTY)-2; ++p) {
853  if(m_contig[p] == splice[0] && m_contig[p+1] == splice[1]) {
854  tlim.SetFrom(tlim.GetFrom()+p+2-elim.GetFrom());
855 
 // keep the absolute number of errors and rescale identity to the new exon length
856  int del_len = 0;
857  ITERATE(TInDels, indl, align.FrameShifts()) {
858  if(indl->IsDeletion() && Include(elim, indl->Loc()))
859  del_len += indl->Len();
860  }
861  double errors = (1.-edited_exons[piece_begin].m_ident)*(elim.GetLength()+del_len);
862  elim.SetFrom(p+2);
863  edited_exons[piece_begin].m_ident = 1.-errors/(elim.GetLength()+del_len); // splices won't clip indels or mismatches
864  if(align.Strand() == eMinus)
865  ReverseComplement(splice.begin(),splice.end());
866  edited_exons[piece_begin].m_fsplice_sig = splice;
867  _ASSERT(elim.NotEmpty());
868 
869  break;
870  }
871  }
872  }
873  }
874  if(!(align.Status()&CGeneModel::eUnknownOrientation)) {
875  TSignedSeqRange& elim = edited_exons[piece_end].Limits();
876  TSignedSeqRange& tlim = edited_transcript_exons[piece_end];
 // distance from the piece end to the start of the genomic gap igap points at
877  int distance_to_rgap = -1;
879  if(igap != m_genomic_gaps_len.end())
880  distance_to_rgap = igap->first-elim.GetTo()-1;
881  if(distance_to_rgap == 0) { // abutting gap
882  edited_exons[piece_end].m_ssplice_sig = "NN";
883  } else if(tlen-tlim.GetTo()-1 > BIG_NOT_ALIGNED && (piece_end == (int)edited_exons.size()-1 || tlim.GetTo() < edited_transcript_exons[piece_end+1].GetFrom()-1)) {
 // mirror image of the left end: search from 2 bases past the exon end inward
884  string splice = (align.Strand() == ePlus) ? "GT" : "CT";
885  for(int p = min((int)m_contig.size()-1,elim.GetTo()+2); p >= max(elim.GetTo()-EXTRA_CUT, elim.GetFrom()+MISM_PENALTY)+2; --p) {
886  if(m_contig[p-1] == splice[0] && m_contig[p] == splice[1]) {
887  tlim.SetTo(tlim.GetTo()-elim.GetTo()+p-2);
888 
889  int del_len = 0;
890  ITERATE(TInDels, indl, align.FrameShifts()) {
891  if(indl->IsDeletion() && Include(elim, indl->Loc()))
892  del_len += indl->Len();
893  }
894  double errors = (1.-edited_exons[piece_end].m_ident)*(elim.GetLength()+del_len);
895  elim.SetTo(p-2);
896  edited_exons[piece_end].m_ident = 1.-errors/(elim.GetLength()+del_len); // splices won't clip indels
897  if(align.Strand() == eMinus)
898  ReverseComplement(splice.begin(),splice.end());
899  edited_exons[piece_end].m_ssplice_sig = splice;
900  _ASSERT(elim.NotEmpty());
901 
902  break;
903  }
904  }
905  }
906  }
907 
 // add the piece to the model, close it with a hole, and keep only the indels
 // located strictly inside the piece
908  for(int ie = piece_begin; ie <= piece_end; ++ie) {
909  CModelExon& e = edited_exons[ie];
910  editedmodel.AddExon(e.Limits(), e.m_fsplice_sig, e.m_ssplice_sig, e.m_ident);
911  }
912  editedmodel.AddHole();
913  ITERATE(TInDels, indl, align.FrameShifts()) {
914  if(indl->Loc() > edited_exons[piece_begin].GetFrom() && indl->Loc() < edited_exons[piece_end].GetTo())
915  edited_indels.push_back(*indl);
916  }
917 
918  piece_begin = piece_end;
919  }
920 
 // mirror the transcript ranges back for minus orientation before building the final map
921  if(align.Orientation() == eMinus) {
922  for(int ie = 0; ie < (int)edited_transcript_exons.size(); ++ie) {
923  TSignedSeqRange& te = edited_transcript_exons[ie];
924  te = TSignedSeqRange(tlen-1-te.GetTo(),tlen-1-te.GetFrom());
925  }
926  }
927  CAlignMap editedamap(editedmodel.Exons(),edited_transcript_exons, edited_indels, align.Orientation(), tlen);
928  editedmodel.FrameShifts() = edited_indels;
929  CAlignModel editedalign(editedmodel, editedamap);
930  editedalign.SetTargetId(*align.GetTargetId());
931 
932  align = editedalign;
933 }
934 
935 int TotalFrameShift(const TInDels& indels, int a, int b) {
936  int fs = 0;
937  ITERATE(TInDels, indl, indels) {
938  if(indl->IsMismatch() || !indl->IntersectingWith(a, b))
939  continue;
940  if(indl->IsInsertion())
941  fs += indl->Len();
942  else
943  fs -= indl->Len();
944  }
945 
946  return fs%3;
947 }
948 
949 
950 
952 
953  return TotalFrameShift(indels, range.GetFrom(), range.GetTo());
954 }
955 
956 
957 
959 
960  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
961 
963  int right_end = 0;
964 
965  size_t count_m_aligns = 0;
966  ITERATE(Tdata, i, m_aligns) {
967  ITERATE(deque<SAlignIndividual>, k, i->second) {
968  m_left_end = min(m_left_end, k->m_range.GetFrom());
969  right_end = max(right_end, k->m_range.GetTo());
970  ++count_m_aligns;
971  }
972  }
973  if (count_m_aligns == 0) {
974  for(TAlignModelList::iterator it = m_aligns_for_filtering_only.begin(); it != m_aligns_for_filtering_only.end(); ) {
975  TAlignModelList::iterator i = it++;
976  CAlignModel& align = *i;
979  }
980 
981  return;
982  }
983 
985  const SIntron& intron = it->first;
986  int a = intron.m_range.GetFrom();
987  int b = intron.m_range.GetTo();
988  m_left_end = min(m_left_end, a);
989  right_end = max(right_end, b);
990  }
991 
993  m_left_end = min(m_left_end,i->Limits().GetFrom());
994  right_end = max(right_end,i->Limits().GetTo());
995  }
996 
997  int len = right_end-m_left_end+1;
998 
999  cerr << "Before filtering: " << m_align_introns.size() << " introns, " << m_count << " alignments" << endl;
1000 
1001 
1002 
1003 #define COVERAGE_WINDOW 20
1004  //coverage calculation
1005  m_coverage.resize(len,0.);
1006  ITERATE(Tdata, i, m_aligns) {
1007  ITERATE(deque<SAlignIndividual>, k, i->second) {
1008  if(i->first.isSR()) {
1009  float weight = k->m_weight;
1010  TSignedSeqRange range = k->m_range;
1011  for(int l = range.GetFrom(); l <= range.GetTo(); ++l) // add coverage for all alignmnet range
1012  m_coverage[l-m_left_end] += weight;
1013  ITERATE(CAlignCommon::Tintrons, in, i->first.GetIntrons()) { // substract intron ranges
1014  for(int l = in->m_range.GetFrom()+1; l <= in->m_range.GetTo()-1; ++l)
1015  m_coverage[l-m_left_end] -= weight;
1016  }
1017  }
1018  }
1019  }
1020  vector<double> left_coverage(len,0.); // average from the left side (including point)
1021  double wsum = 0;
1022  for(int i = 0; i < len; ++i) {
1023  wsum += m_coverage[i];
1024  int ipast = i - COVERAGE_WINDOW;
1025  if(ipast >= 0)
1026  wsum -= m_coverage[ipast];
1027  left_coverage[i] = wsum/COVERAGE_WINDOW;
1028  }
1029  vector<double> right_coverage(len,0.); // average from the right side (including point)
1030  wsum = 0;
1031  for(int i = len-1; i >= 0; --i) {
1032  wsum += m_coverage[i];
1033  int ipast = i + COVERAGE_WINDOW;
1034  if(ipast < len)
1035  wsum -= m_coverage[ipast];
1036  right_coverage[i] = wsum/COVERAGE_WINDOW;
1037  }
1038 
1039  //initial intron filtering
1040  int minconsensussupport = args["min-consensus-support"].AsInteger();
1041  int minnonconsensussupport = args["min-non-consensussupport"].AsInteger();
1042  double minident = args["high-identity"].AsDouble();
1043  int minest = args["minest"].AsInteger();
1045  TAlignIntrons::iterator intron = it++;
1046  bool bad_intron = false;
1047  SIntronData& id = intron->second;
1048 
1049  if(id.m_selfsp_support) {
1050  if(id.m_est_support >= minest)
1051  id.m_keep_anyway = true;
1052  } else {
1053  bad_intron = true;
1054  }
1055 
1056  if(id.m_keep_anyway)
1057  continue;
1058 
1059  if(intron->first.m_sig == "CTAC" && intron->first.m_oriented)
1060  bad_intron = true;
1061 
1062  if(intron->first.m_sig == "GTAG") {
1063  if(id.m_weight < minconsensussupport)
1064  bad_intron = true;
1065  } else {
1066  if(id.m_weight < minnonconsensussupport)
1067  bad_intron = true;
1068  }
1069 
1070  if(id.m_ident < minident)
1071  bad_intron = true;
1072 
1073  if(bad_intron)
1074  m_align_introns.erase(intron);
1075  }
1076 
1077  //filter low expressed splices
1078  double minspliceexpression = args["min-support-fraction"].AsDouble();
1079  double minintronexpression = args["end-pair-support-cutoff"].AsDouble();
1080 
1082  TIntronsBySplice introns_by_left_splice;
1084  introns_by_left_splice.insert(TIntronsBySplice::value_type(intron->first.m_range.GetFrom(),intron));
1085  }
1086  for(TIntronsBySplice::iterator a = introns_by_left_splice.begin(); a != introns_by_left_splice.end(); ) {
1087  int splice = a->first;
1088  TIntronsBySplice::iterator b = introns_by_left_splice.upper_bound(splice); // first with different splice
1089 
1090  double weight = 0;
1091  int number = 0;
1092  for(TIntronsBySplice::iterator i = a; i != b; ++i) {
1093  ++number;
1094  weight += i->second->second.m_weight;
1095  }
1096  double mean = weight/number;
1097 
1098  for(TIntronsBySplice::iterator it = a; it != b; ) {
1099  TIntronsBySplice::iterator i = it++;
1100  SIntronData& id = i->second->second;
1101  if(!id.m_keep_anyway && (id.m_weight < minintronexpression*mean || weight < minspliceexpression*left_coverage[splice-m_left_end])) {
1102  id.m_weight = -1;
1103  introns_by_left_splice.erase(i);
1104  }
1105  }
1106 
1107  a = b;
1108  }
1109 
1110  TIntronsBySplice introns_by_right_splice;
1112  introns_by_right_splice.insert(TIntronsBySplice::value_type(intron->first.m_range.GetTo(),intron));
1113  }
1114  for(TIntronsBySplice::iterator a = introns_by_right_splice.begin(); a != introns_by_right_splice.end(); ) {
1115  int splice = a->first;
1116  TIntronsBySplice::iterator b = introns_by_right_splice.upper_bound(splice); // first with different splice
1117 
1118  double weight = 0;
1119  int number = 0;
1120  for(TIntronsBySplice::iterator i = a; i != b; ++i) {
1121  ++number;
1122  weight += i->second->second.m_weight;
1123  }
1124  double mean = weight/number;
1125 
1126  for(TIntronsBySplice::iterator it = a; it != b; ) {
1127  TIntronsBySplice::iterator i = it++;
1128  SIntronData& id = i->second->second;
1129  if(!id.m_keep_anyway && (id.m_weight < minintronexpression*mean || weight < minspliceexpression*right_coverage[splice-m_left_end])) {
1130  id.m_weight = -1;
1131  introns_by_right_splice.erase(i);
1132  }
1133  }
1134 
1135  a = b;
1136  }
1137 
1139  TAlignIntrons::iterator intron = it++;
1140  if(intron->second.m_weight < 0)
1141  m_align_introns.erase(intron);
1142  }
1143 
1144  //remove/clip alignmnets with bad introns
1145  for(Tdata::iterator it = m_aligns.begin(); it != m_aligns.end(); ) {
1146  Tdata::iterator data = it++;
1147  const CAlignCommon& alc = data->first;
1148 
1149  if((alc.isEST() && !m_filterest) || (alc.isSR() && !m_filtersr))
1150  continue;
1151 
1152  CAlignCommon::Tintrons introns = alc.GetIntrons();
1153  if(introns.empty())
1154  continue;
1155 
1156  //remove flanking bad introns
1157  int new_right = right_end;
1158  while(!introns.empty() && m_align_introns.find(introns.back()) == m_align_introns.end()) {
1159  new_right = introns.back().m_range.GetFrom();
1160  introns.pop_back();
1161  }
1162  int new_left = m_left_end;
1163  while(!introns.empty() && m_align_introns.find(introns.front()) == m_align_introns.end()) {
1164  new_left = introns.front().m_range.GetTo();
1165  introns.erase(introns.begin());
1166  }
1167 
1168  bool all_good = true;
1169  for(int i = 0; all_good && i < (int)introns.size(); ++i) {
1170  all_good = (m_align_introns.find(introns[i]) != m_align_introns.end());
1171  }
1172 
1173  if(all_good && introns.size() == alc.GetIntrons().size()) // all initial introns good
1174  continue;
1175 
1176  if(all_good && introns.size() > 0) { // clipped some flanked introns but not all
1177  const deque<char>& id_pool = m_target_id_pool[alc];
1178  ITERATE(deque<SAlignIndividual>, i, data->second) {
1179  CAlignModel align(alc.GetAlignment(*i, id_pool));
1180  align.Clip(TSignedSeqRange(new_left,new_right),CGeneModel::eRemoveExons);
1181  if(alc.isEST())
1182  align.Status() |= CGeneModel::eChangedByFilter;
1183  _ASSERT(align.Limits().NotEmpty());
1184  _ASSERT(align.Exons().size() == introns.size()+1);
1185  CAlignCommon c(align);
1186  m_aligns[c].push_back(SAlignIndividual(align, m_target_id_pool[c]));
1187  }
1188  }
1189 
1190  // delete initial alignments and ids
1191  m_target_id_pool.erase(data->first);
1192  m_aligns.erase(data);
1193  }
1194 
1195  //splices which should not be crossed
1196  double mincrossexpression = args["sharp-boundary"].AsDouble();
1197  TIVec left_plus(len,right_end); // closest left + strand splice 'on the right' from the current position
1198  TIVec left_minus(len,right_end); // closest left - strand splice 'on the right' from the current position
1199  TIVec right_plus(len,m_left_end); // closest right + strand splice 'on the left' from the current position
1200  TIVec right_minus(len,m_left_end); // closest right - strand splice 'on the left' from the current position
1202  const SIntron& intron = it->first;
1203  int a = intron.m_range.GetFrom();
1204  int b = intron.m_range.GetTo();
1205 
1206  double two_side_exon_coverage = max(left_coverage[a-m_left_end],right_coverage[b-m_left_end]);
1207 
1208  // if(right_coverage[a+1-m_left_end] < mincrossexpression*left_coverage[a-m_left_end]) {
1209  if(right_coverage[a+1-m_left_end] < mincrossexpression*two_side_exon_coverage) {
1210  if(!intron.m_oriented || intron.m_strand == ePlus)
1211  left_plus[a-m_left_end] = a;
1212  if(!intron.m_oriented || intron.m_strand == eMinus)
1213  left_minus[a-m_left_end] = a;
1214  }
1215 
1216  // if(left_coverage[b-1-m_left_end] < mincrossexpression*right_coverage[b-m_left_end]) {
1217  if(left_coverage[b-1-m_left_end] < mincrossexpression*two_side_exon_coverage) {
1218  if(!intron.m_oriented || intron.m_strand == ePlus)
1219  right_plus[b-m_left_end] = b;
1220  if(!intron.m_oriented || intron.m_strand == eMinus)
1221  right_minus[b-m_left_end] = b;
1222  }
1223  }
1224 
1225  for(int i = 1; i < len; ++i) {
1226  right_plus[i] = max(right_plus[i],right_plus[i-1]);
1227  right_minus[i] = max(right_minus[i],right_minus[i-1]);
1228  }
1229  for(int i = len-2; i >= 0; --i) {
1230  left_plus[i] = min(left_plus[i],left_plus[i+1]);
1231  left_minus[i] = min(left_minus[i],left_minus[i+1]);
1232  }
1233 
1234  //filter/cut low abandance one-exon and crossing splices
1235  int minsingleexpression = args["min-edge-coverage"].AsInteger();
1236  int trim = args["trim"].AsInteger();
1237  int total = 0;
1238  for(Tdata::iterator it = m_aligns.begin(); it != m_aligns.end(); ) {
1239  Tdata::iterator data = it++;
1240  const CAlignCommon& alc = data->first;
1241  deque<SAlignIndividual>& aligns = data->second;
1242 
1243  if((alc.isEST() && m_filterest) || (alc.isSR() && m_filtersr)) {
1244  if(alc.GetIntrons().empty()) { // not spliced
1245  NON_CONST_ITERATE(deque<SAlignIndividual>, i, aligns) {
1246  int a = i->m_range.GetFrom()+trim;
1247  int b = i->m_range.GetTo()-trim;
1248  if(b > a) {
1249  if((m_coverage[a-m_left_end] < minsingleexpression || m_coverage[b-m_left_end] < minsingleexpression) && !alc.isPolyA() && !alc.isCap())
1250  i->m_weight = -1;
1251  else if((alc.isUnknown() || alc.isPlus()) && ((right_plus[b-m_left_end] > a && !alc.isCap()) || (left_plus[a-m_left_end] < b && !alc.isPolyA())))
1252  i->m_weight = -1;
1253  else if((alc.isUnknown() || alc.isMinus()) && ((right_minus[b-m_left_end] > a && !alc.isPolyA()) || (left_minus[a-m_left_end] < b && !alc.isCap())))
1254  i->m_weight = -1;
1255  } else {
1256  i->m_weight = -1;
1257  }
1258  }
1259  } else { // spliced
1260  const deque<char>& id_pool = m_target_id_pool[alc];
1261  NON_CONST_ITERATE(deque<SAlignIndividual>, i, aligns) {
1262  CAlignModel align(alc.GetAlignment(*i, id_pool));
1263  TSignedSeqRange new_lim = align.Limits();
1264  if(align.Exons().front().Limits().GetLength() > trim) {
1265  int a = align.Exons().front().Limits().GetFrom()+trim;
1266  int b = align.Exons().front().Limits().GetTo();
1267  if((alc.isUnknown() || alc.isPlus()) && (right_plus[b-m_left_end] > a && !alc.isCap())) // crosses right plus splice
1268  new_lim.SetFrom(right_plus[b-m_left_end]);
1269  if((alc.isUnknown() || alc.isMinus()) && (right_minus[b-m_left_end] > a && !alc.isPolyA())) // crosses right minus splice
1270  new_lim.SetFrom(right_minus[b-m_left_end]);
1271  _ASSERT(new_lim.GetFrom() <= align.Exons().front().GetTo());
1272  }
1273  if(align.Exons().back().Limits().GetLength() > trim) {
1274  int a = align.Exons().back().Limits().GetFrom();
1275  int b = align.Exons().back().Limits().GetTo()-trim;
1276  if((alc.isUnknown() || alc.isPlus()) && (left_plus[a-m_left_end] < b && !alc.isPolyA())) // crosses left plus splice
1277  new_lim.SetTo(left_plus[a-m_left_end]);
1278  if((alc.isUnknown() || alc.isMinus()) && (left_minus[a-m_left_end] < b && !alc.isCap())) // crosses left minus splice
1279  new_lim.SetTo(left_minus[a-m_left_end]);
1280  _ASSERT(new_lim.GetTo() >= align.Exons().back().GetFrom());
1281  }
1282  i->m_range = new_lim;
1283 
1284  //delete if retained intron in internal exon
1285  for(int n = 1; n < (int)align.Exons().size()-1 && i->m_weight > 0; ++n) {
1286  int a = align.Exons()[n].Limits().GetFrom();
1287  int b = align.Exons()[n].Limits().GetTo();
1288 
1289  pair<TIntronsBySplice::iterator,TIntronsBySplice::iterator> eqr(introns_by_right_splice.end(),introns_by_right_splice.end());
1290  if((alc.isUnknown() || alc.isPlus()) && right_plus[b-m_left_end] > a) // crosses right plus splice
1291  eqr = introns_by_right_splice.equal_range(right_plus[b-m_left_end]);
1292  else if((alc.isUnknown() || alc.isMinus()) && right_minus[b-m_left_end] > a) // crosses right minus splice
1293  eqr = introns_by_right_splice.equal_range(right_minus[b-m_left_end]);
1294  for(TIntronsBySplice::iterator ip = eqr.first; ip != eqr.second; ++ip) {
1295  if(ip->second->first.m_range.GetFrom() > a)
1296  i->m_weight = -1;
1297  }
1298  }
1299  }
1300  }
1301 
1302  aligns.erase(remove_if(aligns.begin(),aligns.end(),AlignmentMarkedForDeletion),aligns.end());
1303  }
1304 
1305  total += aligns.size();
1306  if(aligns.empty())
1307  m_aligns.erase(data);
1308  }
1309 
1310 
1311  //filter other alignments
1312 
1313  //filter introns
1314  double clip_threshold = args["utrclipthreshold"].AsDouble();
1315  for(TAlignModelList::iterator it = m_aligns_for_filtering_only.begin(); it != m_aligns_for_filtering_only.end(); ) {
1316  TAlignModelList::iterator i = it++;
1317  CAlignModel& align = *i;
1318 
1319  if(align.Type()&CAlignModel::eProt) {
1320  CAlignModel a = align;
1322  m_aligns_for_filtering_only.push_front(a);
1323  }
1324 
1325  int intronnum = 0;
1326  ITERATE(CGeneModel::TExons, e, align.Exons()) {
1327  if(e->m_fsplice)
1328  ++intronnum;
1329  }
1330 
1331  if((align.Type()&CGeneModel::eEST) && !m_filterest)
1332  continue;
1333  if((align.Type()&CGeneModel::emRNA) && !m_filtermrna)
1334  continue;
1335  if((align.Type()&CGeneModel::eProt) && !m_filterprots)
1336  continue;
1337 
1338  if(!AlignmentIsSupportedBySR(align, m_coverage, minsingleexpression, m_left_end)) {
1339  if(align.Type()&(CGeneModel::emRNA|CGeneModel::eProt)) {
1340  continue;
1341  } else if(align.Type()&CGeneModel::eNotForChaining) {
1342  m_aligns_for_filtering_only.erase(i);
1343  continue;
1344  }
1345  }
1346 
1347  bool good_alignment = true;
1348 
1349  //clip alignmnets with bad introns
1350  if(align.Type()&CAlignModel::eProt) {
1351  good_alignment = RemoveNotSupportedIntronsFromProt(align);
1352  } else if(align.Type()&CGeneModel::eNotForChaining) {
1353  good_alignment = RemoveNotSupportedIntronsFromTranscript(align, true);
1354  } else {
1355  CAlignModel reversed = align;
1356  good_alignment = RemoveNotSupportedIntronsFromTranscript(align, false);
1358  reversed.ReverseComplementModel();
1359  bool good_reversed_alignment = RemoveNotSupportedIntronsFromTranscript(reversed, false);
1360  if(reversed.Exons().size() > align.Exons().size()) {
1361  align = reversed;
1362  good_alignment = good_reversed_alignment;
1363  }
1364  }
1365  }
1366 
1367  if(!align.Exons().empty())
1368  ClipNotSupportedFlanks(align, clip_threshold);
1369 
1370  if(align.Exons().empty() || (!good_alignment && !(align.Type()&CGeneModel::eNotForChaining)) || !AlignmentIsSupportedBySR(align, m_coverage, minsingleexpression, m_left_end)) {
1371  m_aligns_for_filtering_only.erase(i);
1372  continue;
1373  }
1374 
1375  ITERATE(CGeneModel::TExons, e, align.Exons()) {
1376  if(e->m_fsplice)
1377  --intronnum;
1378  }
1379 
1380  if(intronnum > 0 && !(align.Type()&CGeneModel::eNotForChaining))
1382  }
1383 
1384  TIVec self_coverage(len,0);
1385 
1386  //modify contig near correction indels which will ensure their clipping near self species cDNA edges (as mismatches)
1388  if(indl->GetStatus() != CInDelInfo::eGenomeNotCorrect)
1389  continue;
1390  if(indl->IsDeletion()) {
1391  m_contig[indl->Loc()] = tolower(m_contig[indl->Loc()]);
1392  m_contig[indl->Loc()-1] = tolower(m_contig[indl->Loc()]-1);
1393  } else {
1394  for(int p = indl->Loc(); p < indl->Loc()+indl->Len(); ++p)
1395  m_contig[p] = tolower(m_contig[p]);
1396  }
1397  }
1398 
1399 
1400  //clean self species cDNA edges and calculate self coverage
1401  for(TAlignModelList::iterator it = m_aligns_for_filtering_only.begin(); it != m_aligns_for_filtering_only.end(); ) {
1402  TAlignModelList::iterator i = it++;
1403  CAlignModel& align = *i;
1404 
1405  if(align.Status()&CGeneModel::eGapFiller) {
1406  string transcript = GetDNASequence(align.GetTargetId(),*m_scope);
1407 
1408  CleanSelfTranscript(align, transcript);
1409 
1410  ITERATE(CGeneModel::TExons, ie, align.Exons()) {
1411  int a = max(m_left_end, ie->GetFrom()); // TSA could be slightly extended in the area without alignments
1412  int b = min(right_end, ie->GetTo());
1413  for(int p = a; p <= b; ++p) {
1414  ++self_coverage[p-m_left_end];
1415  }
1416  }
1417  }
1418  }
1419 
1420  //restore contig
1421  NON_CONST_ITERATE(string, ip, m_contig)
1422  *ip = toupper(*ip);
1423 
1424  typedef pair<TSignedSeqRange,TInDels> TGapEnd;
1425  set<TGapEnd> right_gends; //rightmost exon befor gap
1426  set<TGapEnd> left_gends; // leftmost exon before gap
1427 
1428 #define MIN_EXON 10
1429 #define DESIRED_CHUNK 100
1430  //cut NotForChaining and fill gaps
1431  for(TAlignModelList::iterator it = m_aligns_for_filtering_only.begin(); it != m_aligns_for_filtering_only.end(); ) {
1432  TAlignModelList::iterator i = it++;
1433  CAlignModel& align = *i;
1434 
1435  if(!(align.Status()&CGeneModel::eGapFiller)) {
1436  if(align.Type()&CGeneModel::eNotForChaining)
1437  m_aligns_for_filtering_only.erase(i);
1438  continue;
1439  }
1440 
1441  //collect fshifts
1442  ITERATE(CGeneModel::TExons, ie, align.Exons()) {
1443  TInDels fs = align.GetInDels(ie->GetFrom(), ie->GetTo(), true);
1444  left_gends.insert(TGapEnd(ie->Limits(),fs));
1445  right_gends.insert(TGapEnd(ie->Limits(),fs));
1446  }
1447 
1448  if(!(align.Type()&CGeneModel::eNotForChaining)) {
1450  if(editedalign.Exons().size() > align.Exons().size()) {
1451  m_aligns_for_filtering_only.push_front(editedalign);
1452  m_aligns_for_filtering_only.erase(i);
1453  }
1454  } else {
1455  align.Status() &= ~CGeneModel::ePolyA;
1456  align.Status() &= ~CGeneModel::eCap;
1457  if(align.Exons().front().Limits().GetLength() > MIN_EXON) {
1458  CAlignModel a = align;
1459 
1460  TSignedSeqRange l = a.Exons().front().Limits();
1461  int len = l.GetLength();
1462  if(!align.Exons().front().m_ssplice && len > DESIRED_CHUNK) {
1463  l.SetTo(l.GetFrom()+DESIRED_CHUNK-1);
1464  len = DESIRED_CHUNK;
1465  }
1466  for(int ie = 0; len < DESIRED_CHUNK-2*MIN_EXON && a.Exons()[ie].m_ssplice; ++ie) {
1467  if(a.Exons()[ie+1].m_ssplice) {
1468  l.SetTo(a.Exons()[ie+1].GetTo());
1469  len += a.Exons()[ie+1].Limits().GetLength();
1470  } else {
1471  l.SetTo(min(a.Exons()[ie+1].GetTo(),a.Exons()[ie+1].GetFrom()+DESIRED_CHUNK-len-1));
1472  }
1473  }
1474  if(l.NotEmpty())
1475  l = a.GetAlignMap().ShrinkToRealPoints(l, false);
1476  if(l.NotEmpty()) {
1479  if(editedalign.Exons().size() > a.Exons().size()) {
1480  m_aligns_for_filtering_only.push_front(editedalign);
1481  }
1482  }
1483  }
1484 
1485  for(int ie = 0; ie < (int)align.Exons().size()-1; ++ie) {
1486  if((!align.Exons()[ie].m_ssplice || !align.Exons()[ie+1].m_fsplice) &&
1487  align.Exons()[ie].Limits().GetLength() > MIN_EXON && align.Exons()[ie+1].Limits().GetLength() > MIN_EXON) {
1488  CAlignModel a = align;
1489 
1490  int left = a.Exons()[ie].GetFrom();
1491  int len = a.Exons()[ie].Limits().GetLength();
1492  if(!a.Exons()[ie].m_fsplice && len > DESIRED_CHUNK) {
1493  left = a.Exons()[ie].GetTo()-DESIRED_CHUNK+1;
1494  len = DESIRED_CHUNK;
1495  }
1496  for(int iie = ie; len < DESIRED_CHUNK-2*MIN_EXON && a.Exons()[iie].m_fsplice; --iie) {
1497  if(a.Exons()[iie-1].m_fsplice) {
1498  left = a.Exons()[iie-1].GetFrom();
1499  len += a.Exons()[iie-1].Limits().GetLength();
1500  } else {
1501  left = max(a.Exons()[iie-1].GetFrom(),a.Exons()[iie-1].GetTo()-DESIRED_CHUNK+len+1);
1502  }
1503  }
1504  int right = a.Exons()[ie+1].GetTo();
1505  len = a.Exons()[ie+1].Limits().GetLength();
1506  if(!a.Exons()[ie+1].m_ssplice && len > DESIRED_CHUNK) {
1507  right = a.Exons()[ie+1].GetFrom()+DESIRED_CHUNK-1;
1508  len = DESIRED_CHUNK;
1509  }
1510  for(int iie = ie+1; len < DESIRED_CHUNK-2*MIN_EXON && a.Exons()[iie].m_ssplice; ++iie) {
1511  if(a.Exons()[iie+1].m_ssplice) {
1512  right = a.Exons()[iie+1].GetTo();
1513  len += a.Exons()[iie+1].Limits().GetLength();
1514  } else {
1515  right = min(a.Exons()[iie+1].GetTo(),a.Exons()[iie+1].GetFrom()+DESIRED_CHUNK-len-1);
1516  }
1517  }
1518  if(left >= 0 && right >= 0) {
1519  TSignedSeqRange l(left, right);
1520  l = a.GetAlignMap().ShrinkToRealPoints(l, false);
1521  if(l.NotEmpty()) {
1524  if(editedalign.Exons().size() > a.Exons().size()) {
1525  m_aligns_for_filtering_only.push_front(editedalign);
1526  }
1527  }
1528  }
1529  }
1530  }
1531 
1532  if(align.Exons().back().Limits().GetLength() > MIN_EXON) {
1533  CAlignModel a = align;
1534 
1535  TSignedSeqRange l = a.Exons().back().Limits();
1536  int len = l.GetLength();
1537  if(!align.Exons().back().m_fsplice && len > DESIRED_CHUNK) {
1538  l.SetFrom(a.Exons().back().GetTo()-DESIRED_CHUNK+1);
1539  len = DESIRED_CHUNK;
1540  }
1541  for(int ie = (int)a.Exons().size()-1; len < DESIRED_CHUNK-2*MIN_EXON && a.Exons()[ie].m_fsplice; --ie) {
1542  if(a.Exons()[ie-1].m_fsplice) {
1543  l.SetFrom(a.Exons()[ie-1].GetFrom());
1544  len += a.Exons()[ie-1].Limits().GetLength();
1545  } else {
1546  l.SetFrom(max(a.Exons()[ie-1].GetFrom(),a.Exons()[ie-1].GetTo()-DESIRED_CHUNK+len+1));
1547  }
1548  }
1549  if(l.NotEmpty())
1550  l = a.GetAlignMap().ShrinkToRealPoints(l, false);
1551  if(l.NotEmpty()) {
1554  if(editedalign.Exons().size() > a.Exons().size()) {
1555  m_aligns_for_filtering_only.push_front(editedalign);
1556  }
1557  }
1558  }
1559 
1560  m_aligns_for_filtering_only.erase(i);
1561  }
1562  }
1563 
1564  enum EnpPoint { eRightPlus = 1, eRightMinus = 2, eLeftPlus = 4, eLeftMinus = 8};
1565  vector<unsigned char> end_status(len, 0);
1566 
1567  //include gap's boundaries in no cross splices
1569  const CAlignModel& align = *i;
1570  for(int ie = 0; ie < (int)align.Exons().size(); ++ie) {
1571  if(align.Exons()[ie].Limits().Empty()) {
1572  if(ie > 0) {
1573  int a = align.Exons()[ie-1].GetTo();
1574  int al = a-m_left_end;
1575  // if(a < right_end && self_coverage[al+1] == 0) { // TSA could be slightly extended in the area without alignments; include only at drop
1576  if(a < right_end) {
1577  if((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand() == ePlus) {
1578  left_plus[al] = a;
1579  end_status[al] |= eLeftPlus;
1580  }
1581  if((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand()== eMinus) {
1582  left_minus[al] = a;
1583  end_status[al] |= eLeftMinus;
1584  }
1585  }
1586  }
1587  if(ie < (int)align.Exons().size()-1) {
1588  int b = align.Exons()[ie+1].GetFrom();
1589  int bl = b-m_left_end;
1590  // if(b > m_left_end && self_coverage[bl-1] == 0) { // TSA could be slightly extended in the area without alignments; include only at drop
1591  if(b > m_left_end) {
1592  if((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand() == ePlus) {
1593  right_plus[bl] = b;
1594  end_status[bl] |= eRightPlus;
1595  }
1596  if((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand()== eMinus) {
1597  right_minus[bl] = b;
1598  end_status[bl] |= eRightMinus;
1599  }
1600  }
1601  }
1602  }
1603  }
1604  }
1605 
1606  for(int i = 1; i < len; ++i) {
1607  right_plus[i] = max(right_plus[i],right_plus[i-1]);
1608  right_minus[i] = max(right_minus[i],right_minus[i-1]);
1609  }
1610  for(int i = len-2; i >= 0; --i) {
1611  left_plus[i] = min(left_plus[i],left_plus[i+1]);
1612  left_minus[i] = min(left_minus[i],left_minus[i+1]);
1613  }
1614 
1615 
1616 #define FS_FUZZ 10
1617 #define MAX_CLIP 200
1618 #define SMALL_CLIP 30
1619 
1620  //trim 3'/5' exons crossing splices (including hole boundaries)
1621  for(TAlignModelList::iterator it = m_aligns_for_filtering_only.begin(); it != m_aligns_for_filtering_only.end(); ) {
1622  TAlignModelList::iterator i = it++;
1623  CAlignModel& align = *i;
1624 
1626  continue;
1627 
1628  CAlignMap amap = align.GetAlignMap();
1629 
1630  if((align.Type()&CGeneModel::eEST) && !m_filterest)
1631  continue;
1632  if((align.Type()&CGeneModel::emRNA) && !m_filtermrna)
1633  continue;
1634  if((align.Type()&CGeneModel::eProt) && !m_filterprots)
1635  continue;
1636 
1637  bool snap_to_codons = align.Type()&CAlignModel::eProt;
1638  bool good_alignment = true;
1639 
1640  bool keepdoing = true;
1641  while(keepdoing && good_alignment) {
1642  keepdoing = false;
1643  for(int ie = 0; ie < (int)align.Exons().size(); ++ie) {
1644  const CModelExon& e = align.Exons()[ie];
1645 
1646  if(!e.m_fsplice && e.Limits().GetLength() > trim && e.GetTo() <= right_end &&
1647  (ie != 0 || (align.Strand() == ePlus && !(align.Status()&CGeneModel::eCap) && !align.HasStart()) || (align.Strand() == eMinus && !(align.Status()&CGeneModel::ePolyA) && !align.HasStop()))) {
1648  int l = e.GetFrom();
1649  int r = e.GetTo();
1650  int new_l = l;
1651 
1652  TIVec* rights = 0;
1653  EnpPoint endp;
1654  if(((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand() == ePlus) && right_plus[r-m_left_end] > l+trim) { // crosses right plus splice
1655  new_l = right_plus[r-m_left_end];
1656  rights = &right_plus;
1657  endp = eRightPlus;
1658  }
1659  if(((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand() == eMinus) && right_minus[r-m_left_end] > l+trim) { // crosses right minus splice
1660  new_l = right_minus[r-m_left_end];
1661  rights = &right_minus;
1662  endp = eRightMinus;
1663  }
1664 
1665  if(new_l != l && (end_status[new_l-m_left_end]&endp) && (align.Type()&CAlignModel::eProt)) {
1666  // try to extend
1667  while(new_l-l > MAX_CLIP && (end_status[new_l-m_left_end]&endp))
1668  new_l = max(l,(*rights)[new_l-1-m_left_end]);
1669  TInDels pindels = align.GetInDels(true);
1670  int firstclip = new_l;
1671  for(int putativel = new_l; (new_l > l+SMALL_CLIP || TotalFrameShift(pindels, l, new_l)) && (end_status[new_l-m_left_end]&endp) && new_l == putativel; ) {
1672  putativel = max(l,(*rights)[new_l-1-m_left_end]);
1673  for(set<TGapEnd>::iterator ig = left_gends.begin(); ig != left_gends.end(); ++ig) {
1674  if(ig->first.GetFrom() <= putativel && ig->first.GetTo() >= firstclip) {
1675  int prot_fs = TotalFrameShift(pindels, putativel, firstclip+FS_FUZZ);
1676  int tsa_fs = TotalFrameShift(ig->second, putativel-FS_FUZZ, firstclip);
1677  if(prot_fs == tsa_fs)
1678  new_l = putativel;
1679  }
1680  }
1681  }
1682  //check if undertrimmed
1683  if(end_status[new_l-m_left_end]&endp) {
1684  for(int i = 0; i < (int)pindels.size() && pindels[i].Loc() <= new_l+FS_FUZZ; ++i)
1685  new_l = max(new_l,pindels[i].Loc());
1686  }
1687  }
1688 
1689  if(new_l != l) {
1690  _ASSERT(new_l <= r);
1691  if((align.Type()&CGeneModel::eEST) && (int)align.Exons().size() == 1) {
1692  good_alignment = false;
1693  break;
1694  }
1695 
1696  TSignedSeqRange seg = amap.ShrinkToRealPoints(TSignedSeqRange(new_l,align.Limits().GetTo()),snap_to_codons);
1697  if(seg.Empty() || amap.FShiftedLen(seg,false) < END_PART_LENGTH) { // nothing left on right
1698  if(ie == 0 || amap.FShiftedLen(TSignedSeqRange(align.Limits().GetFrom(),align.Exons()[ie-1].GetTo())) < END_PART_LENGTH) { // no alignmnet left
1699  good_alignment = false;
1700  } else { // left side is kept
1701  align.Clip(TSignedSeqRange(align.Limits().GetFrom(),align.Exons()[ie-1].GetTo()),CGeneModel::eRemoveExons);
1702  }
1703  } else { // trim
1704  if(ie == 0) { // first exon
1705  if(align.Type()&CGeneModel::eProt)
1706  align.Clip(seg,CGeneModel::eRemoveExons);
1707  else
1708  align.CutExons(TSignedSeqRange(align.Limits().GetFrom(),seg.GetFrom()-1)); // Clip() is not friendly to gapfillers
1709  } else {
1710  align.CutExons(TSignedSeqRange(align.Exons()[ie-1].GetTo()+1,seg.GetFrom()-1));
1711  }
1712  }
1713  keepdoing = true;
1714  break;
1715  }
1716  }
1717 
1718  if(!e.m_ssplice && e.Limits().GetLength() > trim && e.GetFrom() >= m_left_end &&
1719  (ie != (int)align.Exons().size()-1 || (align.Strand() == ePlus && !(align.Status()&CGeneModel::ePolyA) && !align.HasStop()) || (align.Strand() == eMinus && !(align.Status()&CGeneModel::eCap) && !align.HasStart()))) {
1720  int l = e.GetFrom();
1721  int r = e.GetTo();
1722  int new_r = r;
1723 
1724  TIVec* lefts = 0;
1725  EnpPoint endp;
1726  if(((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand() == ePlus) && left_plus[l-m_left_end] < r-trim) { // crosses left plus splice
1727  new_r = left_plus[l-m_left_end];
1728  lefts = &left_plus;
1729  endp = eLeftPlus;
1730  }
1731  if(((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand() == eMinus) && left_minus[l-m_left_end] < r-trim) { // crosses left minus splice
1732  new_r = left_minus[l-m_left_end];
1733  lefts = &left_minus;
1734  endp = eLeftMinus;
1735  }
1736 
1737  if(new_r != r && (end_status[new_r-m_left_end]&endp) && (align.Type()&CAlignModel::eProt)) {
1738  // try to extend
1739  while(r-new_r > MAX_CLIP && (end_status[new_r-m_left_end]&endp))
1740  new_r = min(r,(*lefts)[new_r+1-m_left_end]);
1741  TInDels pindels = align.GetInDels(true);
1742  int firstclip = new_r;
1743  for(int putativer = new_r; (new_r < r-SMALL_CLIP || TotalFrameShift(pindels, new_r, r)) && (end_status[new_r-m_left_end]&endp) && new_r == putativer; ) {
1744  putativer = min(r,(*lefts)[new_r+1-m_left_end]);
1745  for(set<TGapEnd>::iterator ig = right_gends.begin(); ig != right_gends.end(); ++ig) {
1746  if(ig->first.GetFrom() <= firstclip && ig->first.GetTo() >= putativer) {
1747  int prot_fs = TotalFrameShift(pindels, firstclip-FS_FUZZ, putativer);
1748  int tsa_fs = TotalFrameShift(ig->second, firstclip, putativer+FS_FUZZ);
1749  if(prot_fs == tsa_fs)
1750  new_r = putativer;
1751  }
1752  }
1753  }
1754  //check if undertrimmed
1755  if(end_status[new_r-m_left_end]&endp) {
1756  for(int i = pindels.size()-1; i >= 0 && pindels[i].Loc() >= new_r-FS_FUZZ; --i)
1757  new_r = min(new_r,pindels[i].Loc()-1);
1758  }
1759  }
1760 
1761  if(new_r != r) {
1762  _ASSERT(new_r >= l);
1763  if((align.Type()&CGeneModel::eEST) && (int)align.Exons().size() == 1) {
1764  good_alignment = false;
1765  break;
1766  }
1767 
1768  TSignedSeqRange seg = amap.ShrinkToRealPoints(TSignedSeqRange(align.Limits().GetFrom(),new_r),snap_to_codons);
1769  if(seg.Empty() || amap.FShiftedLen(seg,false) < END_PART_LENGTH) { // nothing left on left
1770  if(ie == (int)align.Exons().size()-1 || amap.FShiftedLen(TSignedSeqRange(align.Exons()[ie+1].GetFrom(),align.Limits().GetTo())) < END_PART_LENGTH) { // no alignmnet left
1771  good_alignment = false;
1772  } else { // right side is kept
1773  align.Clip(TSignedSeqRange(align.Exons()[ie+1].GetFrom(),align.Limits().GetTo()),CGeneModel::eRemoveExons);
1774  }
1775  } else { // trim
1776  if(ie == (int)align.Exons().size()-1) { // last exon
1777  if(align.Type()&CGeneModel::eProt)
1778  align.Clip(seg,CGeneModel::eRemoveExons);
1779  else
1780  align.CutExons(TSignedSeqRange(seg.GetTo()+1, align.Limits().GetTo())); // Clip() is not friendly to gapfillers
1781  } else {
1782  align.CutExons(TSignedSeqRange(seg.GetTo()+1,align.Exons()[ie+1].GetFrom()-1));
1783  }
1784  }
1785  keepdoing = true;
1786  break;
1787  }
1788  }
1789  }
1790  }
1791 
1792  if(!good_alignment)
1793  m_aligns_for_filtering_only.erase(i);
1794  }
1795 
1796  //clean genomic gaps
1797  sort(m_correction_data.m_correction_indels.begin(),m_correction_data.m_correction_indels.end(),GenomicGapsOrder()); // accsession is used if the sequence is same
1798  m_correction_data.m_correction_indels.erase( unique(m_correction_data.m_correction_indels.begin(),m_correction_data.m_correction_indels.end()), m_correction_data.m_correction_indels.end() ); // uses == for CInDelInfo which ignores accession
1799 
1800  total += m_aligns_for_filtering_only.size();
1801 
1802 
1803  cerr << "After filtering: " << m_align_introns.size() << " introns, " << total << " alignments" << endl;
1804 }
1805 
1807 
// Body of a bool-returning routine over `align` and `clsset`. The index at the end of this listing
// declares CheckAndInsert(const CAlignModel& align, TAlignModelClusterSet& clsset) const, which matches
// the names used here.
// NOTE(review): the signature line itself is not present in this listing.
//
// Returns false, inserting nothing, if any exon with non-empty limits contains the location of a
// correction indel whose source range is non-empty. Otherwise inserts `align` into `clsset` and
// returns true.
1808  ITERATE(CGeneModel::TExons, i, align.Exons()) {
1809  if(i->Limits().NotEmpty()) {
// probe value: a 1-base deletion located at the exon start, used only as the search key
1810  CInDelInfo p(i->GetFrom(), 1, CInDelInfo::eDel);
// NOTE(review): lower_bound presumes m_correction_indels is ordered consistently with CInDelInfo's operator< - confirm
1811  TInDels::const_iterator ig = lower_bound(m_correction_data.m_correction_indels.begin(), m_correction_data.m_correction_indels.end(), p); // first equal or greater
// walk every correction indel located at or before the exon end
1812  for( ; ig != m_correction_data.m_correction_indels.end() && ig->Loc() <= i->GetTo(); ++ig) {
1813  if(ig->GetSource().m_range.NotEmpty()) // exon overlaps with inserted gap
1814  return false;
1815  }
1816  }
1817  }
1818 
1819  clsset.Insert(align);
1820  return true;
1821 }
1822 
1824 {
1825  bool operator() (TAlignModelList::const_iterator a, TAlignModelList::const_iterator b) { // left and long first
1826  if(a->Limits() == b->Limits()) {
1827  if(a->Ident() != b->Ident())
1828  return a->Ident() > b->Ident();
1829  else
1830  return a->TargetAccession() < b->TargetAccession();
1831  } else if(a->Limits().GetFrom() != b->Limits().GetFrom()) {
1832  return a->Limits().GetFrom() < b->Limits().GetFrom();
1833  } else {
1834  return a->Limits().GetTo() > b->Limits().GetTo();
1835  }
1836  }
1837 };
1838 
1839 // one-exon alignments are equal
1840 // gapfilled exons compared by seq; real exons compared by range and splices
1841 // real exon < gapfilled exon;
1842 bool OneExonCompare(const CModelExon& a, const CModelExon& b) {
1843  if(!a.m_seq.empty() || !b.m_seq.empty()) { // at least one is gapfilling
1844  return a.m_seq < b.m_seq;
1845  } else if(b.Limits().Empty()) { // b is from one-exon alignment
1846  return false;
1847  } else if(a.Limits().Empty()) { // a is from one-exon alignment and b is not
1848  return true;
1849  } else if(a.m_fsplice != b.m_fsplice) {
1850  return a.m_fsplice < b.m_fsplice;
1851  } else if(a.m_ssplice != b.m_ssplice) {
1852  return a.m_ssplice < b.m_ssplice;
1853  } else {
1854  return a.Limits() < b.Limits();
1855  }
1856 }
1857 
1859 {
1860  bool operator() (const CGeneModel::TExons& a, const CGeneModel::TExons& b) const {
1861  if(a.size() != b.size()) {
1862  return a.size() < b.size();
1863  } else {
1864  for(int i = 0; i < (int)a.size(); ++i) {
1865  if(OneExonCompare(a[i],b[i]))
1866  return true;
1867  if(OneExonCompare(b[i],a[i]))
1868  return false;
1869  }
1870  return false;
1871  }
1872  }
1873 };
1874 
// Body of a routine that filters the stored alignments and passes them to CheckAndInsert with `clsset`.
// NOTE(review): the signature (listing line 1875) and the loop header that declares `i`
// (listing line 1881) are not present in this listing. The index at the end of the listing declares
// GetOnlyOtherAlignments(TAlignModelClusterSet& clsset), presumably this function - confirm.
// nothing was added, nothing to do
1876  if(m_count == 0)
1877  return;
1878 
1879  FilterAlignments();
1880 
// for each alignment `i` (container not visible here): mRNA alignments whose target bioseq has
// MolInfo tech == tsa get the eTSA status bit before insertion
1882  if(i->Type() == CGeneModel::emRNA) {
1883  CBioseq_Handle bh (m_scope->GetBioseqHandle(*i->GetTargetId()));
1884  const CMolInfo* molinfo = GetMolInfo(bh);
1885  if(molinfo && molinfo->IsSetTech() && molinfo->GetTech() == CMolInfo::eTech_tsa)
1886  i->Status() |= CGeneModel::eTSA; // used to exclude from CDS projection
1887  }
// the bool result of CheckAndInsert is ignored here
1888  CheckAndInsert(*i, clsset);
1889  }
1890 }
1891 
1893 
// Body of the routine that collapses the stored alignments and inserts the survivors into `clsset`
// through CheckAndInsert, reporting counts on cerr.
// NOTE(review): the signature and listing lines 1906, 1915, 2008, 2023, 2025 and 2089 (loop headers,
// a typedef and one statement) are not present in this listing. The index at the end of the listing
// declares GetCollapsedAlgnments(TAlignModelClusterSet& clsset), presumably this function - confirm.
//
// Steps visible here:
//   1. FilterAlignments(), then read the "oep" and "max-extension" integer arguments
//   2. collect exon ends adjacent to the known introns
//   3. turn each group of individual alignments into CAlignModel objects in rnaseq_or_est, merging
//      compatible ones unless the group is SR and m_collapssr is off
//   4. orient unoriented one-exon SR alignments that are included in stranded intervals of exactly one strand
//   5. when m_collapsest && m_fillgenomicgaps, merge ESTs that share the same (normalized) exon structure
//   6. mark TSA mRNAs and insert the remaining alignments
1894  cerr << "Added " << m_count << " alignments to collapser for contig " << m_contig_name << endl;
1895 
1896  if(m_count == 0)
1897  return;
1898 
1899  FilterAlignments();
1900 
1901  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
1902  int oep = args["oep"].AsInteger();
1903  int max_extend = args["max-extension"].AsInteger();
1904 
// for every intron `it->first`: the intron's right end goes to left_exon_ends, its left end to right_exon_ends
1905  set<int> left_exon_ends, right_exon_ends;
1907  const SIntron& intron = it->first;
1908  int a = intron.m_range.GetFrom();
1909  int b = intron.m_range.GetTo();
1910  left_exon_ends.insert(b);
1911  right_exon_ends.insert(a);
1912  }
1913 
// for every (common part, individual alignments) entry `i`
1914  TAlignModelList rnaseq_or_est;
1916  const CAlignCommon& alc = i->first;
1917  const deque<char>& id_pool = m_target_id_pool[alc];
1918  deque<SAlignIndividual>& alideque = i->second;
1919  sort(alideque.begin(),alideque.end(),LeftAndLongFirstOrder(id_pool));
1920 
1921  if(alc.isSR() && !m_collapssr) { // don't collapse
1922  ITERATE(deque<SAlignIndividual>, k, alideque) {
1923  CAlignModel align(alc.GetAlignment(*k, id_pool));
1924  rnaseq_or_est.push_back(align);
1925  }
1926  } else {
// an end is "fixed" when it carries a cap or polyA for the given strand
1927  bool leftisfixed = (alc.isCap() && alc.isPlus()) || (alc.isPolyA() && alc.isMinus());
1928  bool rightisfixed = (alc.isPolyA() && alc.isPlus()) || (alc.isCap() && alc.isMinus());
1929  bool notspliced = alc.GetIntrons().empty();
1930 
1931  typedef list<SAlignExtended> TEA_List;
1932  TEA_List extended_aligns;
1933 
1934  NON_CONST_ITERATE(deque<SAlignIndividual>, k, alideque) {
1935  SAlignIndividual& aj = *k;
1936  bool collapsed = false;
1937 
// the iterator is advanced before the body because the current element may be erased
1938  for(TEA_List::iterator itloop = extended_aligns.begin(); itloop != extended_aligns.end(); ) {
1939  TEA_List::iterator ita = itloop++;
1940  SAlignIndividual& ai = *ita->m_ali;
1941 
1942  if(aj.m_range.GetFrom() >= min((leftisfixed ? ai.m_range.GetFrom():ai.m_range.GetTo())+1,ita->m_llimb)) { // extended align is completed
1943  CAlignModel align(alc.GetAlignment(ai, id_pool));
1944  rnaseq_or_est.push_back(align);
1945  extended_aligns.erase(ita);
1946  } else if(!collapsed) { // even if collapsed must check extended_aligns to the end to purge finished
1947  if(rightisfixed && ai.m_range.GetTo() != aj.m_range.GetTo())
1948  continue;
// unspliced alignment extending to the right: require an overlap of at least oep and limit the extension by max_extend
1949  if(notspliced && aj.m_range.GetTo() > ai.m_range.GetTo()) {
1950  if(ai.m_range.GetTo()-aj.m_range.GetFrom()+1 < oep)
1951  continue;
1952  if(aj.m_range.GetTo()-ita->m_initial_right_end > max_extend)
1953  continue;
1954  if(aj.m_range.GetFrom()-ai.m_range.GetFrom() > max_extend)
1955  continue;
1956  }
1957  if(aj.m_range.GetTo() > (alc.isEST() ? ita->m_initial_right_end : ita->m_rlimb) || aj.m_range.GetTo() <= ita->m_rlima)
1958  continue;
1959 
// merge aj into ai: accumulate weight and extend the right end if needed
1960  ai.m_weight += aj.m_weight;
1961  if(aj.m_range.GetTo() > ai.m_range.GetTo())
1962  ai.m_range.SetTo(aj.m_range.GetTo());
1963  collapsed = true;
1964  }
1965  }
1966 
1967  if(!collapsed)
1968  extended_aligns.push_back(SAlignExtended(aj,left_exon_ends,right_exon_ends));
1969  }
1970 
// flush whatever is still open
1971  ITERATE(TEA_List, ita, extended_aligns) {
1972  CAlignModel align(alc.GetAlignment(*ita->m_ali, id_pool));
1973  rnaseq_or_est.push_back(align);
1974  }
1975  }
1976  }
1977 
1978  //stranded intervals (start->len)
1979  TIntMap strandedplus_len;
1980  TIntMap strandedminus_len;
1981  ITERATE(TAlignModelList, ia, rnaseq_or_est) {
1982  const CAlignModel& align = *ia;
1983  if((align.Type()&CGeneModel::eSR) && !(align.Status()&CGeneModel::eUnknownOrientation) &&
1984  !(align.Status()&CGeneModel::ePolyA) && !(align.Status()&CGeneModel::eCap) && align.Exons().size() == 1) { // ORIENTED not spliced rnaseq
1985  TIntMap* mp = (align.Strand() == ePlus) ? &strandedplus_len : &strandedminus_len;
// NOTE(review): only the last interval (largest start) is checked, so this presumes alignments arrive in left-to-right order - confirm
1986  if(mp->empty() || mp->rbegin()->first+mp->rbegin()->second < align.Limits().GetFrom()) { // abutting intervals are united
1987  (*mp)[align.Limits().GetFrom()] = align.Limits().GetLength();
1988  } else {
1989  mp->rbegin()->second = max(mp->rbegin()->second, align.Limits().GetTo()-mp->rbegin()->first+1);
1990  }
1991  }
1992  }
1993 
1994  int forced_orientation = 0;
1995  NON_CONST_ITERATE(TAlignModelList, ia, rnaseq_or_est) {
1996  CAlignModel& align = *ia;
1997  if((align.Type()&CGeneModel::eSR) && (align.Status()&CGeneModel::eUnknownOrientation) && align.Exons().size() == 1) { // NOT ORIENTED not spliced rnaseq
// included == the stranded interval starting before the alignment's right end covers the whole alignment
1998  bool included_in_plus = false;
1999  TIntMap::iterator plus = strandedplus_len.lower_bound(align.Limits().GetTo());
2000  if(plus != strandedplus_len.begin() && (--plus)->first <= align.Limits().GetFrom() && plus->first+plus->second > align.Limits().GetTo())
2001  included_in_plus = true;
2002  bool included_in_minus = false;
2003  TIntMap::iterator minus = strandedminus_len.lower_bound(align.Limits().GetTo());
2004  if(minus != strandedminus_len.begin() && (--minus)->first <= align.Limits().GetFrom() && minus->first+minus->second > align.Limits().GetTo())
2005  included_in_minus = true;
2006 
// act only when exactly one strand includes the alignment
2007  if(included_in_plus != included_in_minus) {
2009  align.SetStrand(included_in_plus ? ePlus : eMinus);
2010  ++forced_orientation;
2011  }
2012  }
2013  }
2014  cerr << "Forced orintation: " << forced_orientation << endl;
2015 
2016  int total = 0;
2017  ITERATE(TAlignModelList, ia, rnaseq_or_est) {
2018  if(CheckAndInsert(*ia, clsset))
2019  ++total;
2020  }
2021 
2022  if(m_collapsest && m_fillgenomicgaps) { // collapse ests used for gapfilling
// NOTE(review): the TEstHolder typedef is not visible; it is used below as a map from an exon list to a sortable sequence of list iterators
2024  TEstHolder est_for_collapsing;
// key normalization: a single exon gets empty limits; for multi-exon ESTs the outer end of the first/last
// exon is collapsed onto its inner end unless the adjacent splice signature is "XX"
2026  if(i->Type() == CGeneModel::eEST) {
2027  CGeneModel::TExons exons = i->Exons();
2028  if(exons.size() == 1) {
2029  exons.front().Limits() = TSignedSeqRange::GetEmpty();
2030  _ASSERT(exons.front().m_seq.empty());
2031  } else {
2032  if(exons.front().m_ssplice_sig != "XX")
2033  exons.front().Limits().SetFrom(exons.front().GetTo());
2034  if(exons.back().m_fsplice_sig != "XX")
2035  exons.back().Limits().SetTo(exons.back().GetFrom());
2036  }
2037  est_for_collapsing[exons].push_back(i);
2038  }
2039  }
2040 
2041  NON_CONST_ITERATE(TEstHolder, i, est_for_collapsing) {
2042  sort(i->second.begin(),i->second.end(),LeftAndLongFirstOrderForAligns());
// work on a list copy so that absorbed guests can be erased while iterating
2043  list<TAlignModelList::iterator> ests(i->second.begin(),i->second.end());
2044  for(list<TAlignModelList::iterator>::iterator ihost = ests.begin(); ihost != ests.end(); ++ihost) {
2045  CAlignModel& host = **ihost;
2046  set<int>::const_iterator ri = right_exon_ends.lower_bound(host.Limits().GetTo()); // leftmost compatible rexon
2047  int rlima = -1;
2048  if(ri != right_exon_ends.begin())
2049  rlima = *(--ri); // position of the rightmost incompatible rexon
2050  set<int>::const_iterator li = left_exon_ends.upper_bound(host.Limits().GetFrom()); // leftmost not compatible lexon
2051  int llimb = numeric_limits<int>::max() ;
2052  if(li != left_exon_ends.end())
2053  llimb = *li; // position of the leftmost not compatible lexon
2054 
2055  list<TAlignModelList::iterator>::iterator iloop = ihost;
2056  for(++iloop; iloop != ests.end(); ) {
2057  list<TAlignModelList::iterator>::iterator iguest = iloop++;
2058  CAlignModel& guest = **iguest;
2059 
2060  if(guest.Limits().GetFrom() >= min(host.Limits().GetTo()+1,llimb)) // host is completed
2061  break;
2062 
2063  if(guest.Limits().GetTo() > host.Limits().GetTo() || guest.Limits().GetTo() <= rlima)
2064  continue;
2065 
// strand and orientation status must agree
2066  if(host.Strand() != guest.Strand() || (host.Status()&CGeneModel::eUnknownOrientation) != (guest.Status()&CGeneModel::eUnknownOrientation))
2067  continue;
// polyA: both or neither, and the polyA end (right on plus, left on minus) must coincide
2068  if((guest.Status()&CGeneModel::ePolyA) || (host.Status()&CGeneModel::ePolyA)) {
2069  if((guest.Status()&CGeneModel::ePolyA) != (host.Status()&CGeneModel::ePolyA)
2070  || (guest.Strand() == ePlus && guest.Limits().GetTo() != host.Limits().GetTo())
2071  || (guest.Strand() == eMinus && guest.Limits().GetFrom() != host.Limits().GetFrom()))
2072  continue;
2073  }
// cap: both or neither, and the cap end (left on plus, right on minus) must coincide
2074  if((guest.Status()&CGeneModel::eCap) || (host.Status()&CGeneModel::eCap)) {
2075  if((guest.Status()&CGeneModel::eCap) != (host.Status()&CGeneModel::eCap)
2076  || (guest.Strand() == eMinus && guest.Limits().GetTo() != host.Limits().GetTo())
2077  || (guest.Strand() == ePlus && guest.Limits().GetFrom() != host.Limits().GetFrom()))
2078  continue;
2079  }
2080 
// absorb the guest: its weight goes to the host and the guest alignment is removed from the stored list
2081  host.SetWeight(host.Weight()+guest.Weight());
2082  m_aligns_for_filtering_only.erase(*iguest);
2083  ests.erase(iguest);
2084  }
2085  }
2086  }
2087  }
2088 
// for each remaining alignment `i` (loop header not visible): flag TSA mRNAs, then insert
2090  if(i->Type() == CGeneModel::emRNA) {
2091  CBioseq_Handle bh (m_scope->GetBioseqHandle(*i->GetTargetId()));
2092  const CMolInfo* molinfo = GetMolInfo(bh);
2093  if(molinfo && molinfo->IsSetTech() && molinfo->GetTech() == CMolInfo::eTech_tsa)
2094  i->Status() |= CGeneModel::eTSA; // used to exclude from CDS projection
2095  }
2096  if(CheckAndInsert(*i, clsset))
2097  ++total;
2098  }
2099 
2100  cerr << "After collapsing: " << total << " alignments" << endl;
2101 }
2102 
2103 
2104 #define MAX_DIST_TO_FLANK_GAP 10000
2106 
// Body of the routine that builds and returns an edited copy of `align` in which unaligned transcript
// sequence is attached as gap-filling exons (empty genomic limits, "XX" splice signatures, with the
// sequence and its source), and records a matching CInDelInfo::eDel entry in
// m_correction_data.m_correction_indels for every such exon.
// `fill` is a bit mask tested against efill_left, efill_middle and efill_right.
// NOTE(review): the signature and listing lines 2141 and 2189 (the declarations of `ig` for the left
// and right flanks) are not present in this listing. The index at the end of the listing declares
// FillGapsInAlignmentAndAddToGenomicGaps(const CAlignModel& align, int fill), presumably this function - confirm.
2107  CGeneModel editedmodel = align;
2108  editedmodel.ClearExons(); // empty alignment with all attributes
2109  vector<TSignedSeqRange> transcript_exons;
2110 
// flank filling is skipped for targets whose accession contains "ChainerTSA"
2111  string acc = align.TargetAccession();
2112  bool chainer_tsa = (acc.find("ChainerTSA") != string::npos);
2113 
2114  string left_seq, right_seq;
2115  CInDelInfo::SSource left_src;
2116  CInDelInfo::SSource right_src;
2117  TSignedSeqRange left_texon, right_texon;
2118  TSignedSeqRange tlim = align.TranscriptLimits();
2119  string transcript = GetDNASequence(align.GetTargetId(),*m_scope);
// more than 30 unaligned transcript bases before the aligned part; not taken when the alignment is both ePolyA and eReversed
2120  if(tlim.GetFrom() > 30 && ((align.Status()&CGeneModel::ePolyA) == 0 || (align.Status()&CGeneModel::eReversed) == 0)) {
2121  left_seq = transcript.substr(0,tlim.GetFrom());
2122  left_texon = TSignedSeqRange(0,tlim.GetFrom()-1);
2123  left_src.m_acc = align.TargetAccession();
2124  left_src.m_strand = ePlus;
2125  left_src.m_range = left_texon;
2126  }
// more than 30 unaligned transcript bases after the aligned part; not taken when the alignment is ePolyA and not eReversed
2127  if(tlim.GetTo() < align.TargetLen()-30 && ((align.Status()&CGeneModel::ePolyA) == 0 || (align.Status()&CGeneModel::eReversed) != 0)) {
2128  right_seq = transcript.substr(tlim.GetTo()+1);
2129  right_texon = TSignedSeqRange(tlim.GetTo()+1,align.TargetLen()-1);
2130  right_src.m_acc = align.TargetAccession();
2131  right_src.m_strand = ePlus;
2132  right_src.m_range = right_texon;
2133  }
// so far left/right were in transcript order; for minus orientation swap them to get genomic order
2134  if(align.Orientation() == eMinus) {
2135  swap(left_seq, right_seq);
2136  swap(left_texon, right_texon);
2137  swap(left_src, right_src);
2138  }
2139 
2140  if(!left_seq.empty() && (fill&efill_left) != 0 && !chainer_tsa) {
// NOTE(review): `ig` is declared on a line missing from this listing; it is used as an iterator into m_genomic_gaps_len
2142  if(ig != m_genomic_gaps_len.begin() && (--ig)->first > align.Limits().GetFrom()-MAX_DIST_TO_FLANK_GAP) { // there is gap on left
2143  transcript_exons.push_back(left_texon);
2144  editedmodel.AddExon(TSignedSeqRange::GetEmpty(), "XX", "XX", 1, left_seq, left_src);
2145 
// the exon above keeps the sequence as is; the genomic correction gets the reverse complement for minus orientation
2146  if(align.Orientation() == eMinus) {
2147  ReverseComplement(left_seq.begin(),left_seq.end());
2148  left_src.m_strand = eMinus;
2149  }
2150  m_correction_data.m_correction_indels.push_back(CInDelInfo(max(0,ig->first+2*ig->second/3), left_seq.length(), CInDelInfo::eDel, left_seq, left_src)); // 1/3 of gap length will separate genes abutting the same gap
2151  }
2152  }
2153 
// copy the real exons; between exons not joined by splices either fill the hole or add a hole
2154  for(int i = 0; i < (int)align.Exons().size(); ++i) {
2155  transcript_exons.push_back(align.TranscriptExon(i));
2156  const CModelExon& e = align.Exons()[i];
2157  editedmodel.AddExon(e.Limits(),e.m_fsplice_sig, e.m_ssplice_sig, e.m_ident);
2158 
2159  if(i < (int)align.Exons().size()-1 && (!e.m_ssplice || !align.Exons()[i+1].m_fsplice)) {
2160  if((fill&efill_middle) != 0) {
2161  TSignedSeqRange texon = align.GetAlignMap().MapRangeOrigToEdited(TSignedSeqRange(e.GetTo(),align.Exons()[i+1].GetFrom()),false);
2162  TIntMap::iterator ig = m_genomic_gaps_len.lower_bound(e.GetTo()); // first gap on right
2163  if(ig != m_genomic_gaps_len.end() && ig->first < align.Exons()[i+1].GetFrom() && texon.GetLength() > 2) { // there is a gap
// texon was mapped from the two flanking exon ends; drop one position on each side
2164  texon.SetFrom(texon.GetFrom()+1);
2165  texon.SetTo(texon.GetTo()-1);
2166  transcript_exons.push_back(texon);
2167  string seq = transcript.substr(texon.GetFrom(),texon.GetLength());
2168  CInDelInfo::SSource src;
2169  src.m_acc = align.TargetAccession();
2170  src.m_strand = ePlus;
2171  src.m_range = texon;
2172  editedmodel.AddExon(TSignedSeqRange::GetEmpty(), "XX", "XX", 1, seq, src);
2173 
2174  if(align.Orientation() == eMinus) {
2175  ReverseComplement(seq.begin(),seq.end());
2176  src.m_strand = eMinus;
2177  }
// the correction is located at the gap start plus half of the gap length
2178  m_correction_data.m_correction_indels.push_back(CInDelInfo(ig->first+ig->second/2, seq.length(), CInDelInfo::eDel, seq, src));
2179  } else {
2180  editedmodel.AddHole();
2181  }
2182  } else {
2183  editedmodel.AddHole();
2184  }
2185  }
2186  }
2187 
2188  if(!right_seq.empty() && (fill&efill_right) != 0 && !chainer_tsa) {
// NOTE(review): `ig` is declared on a line missing from this listing; it is used as an iterator into m_genomic_gaps_len
2190  if(ig != m_genomic_gaps_len.end() && ig->first < align.Limits().GetTo()+MAX_DIST_TO_FLANK_GAP) { // there is gap on right
2191  transcript_exons.push_back(right_texon);
2192  editedmodel.AddExon(TSignedSeqRange::GetEmpty(), "XX", "XX", 1, right_seq, right_src);
2193 
2194  if(align.Orientation() == eMinus) {
2195  ReverseComplement(right_seq.begin(),right_seq.end());
2196  right_src.m_strand = eMinus;
2197  }
2198  m_correction_data.m_correction_indels.push_back(CInDelInfo(ig->first+ig->second/3, right_seq.length(), CInDelInfo::eDel, right_seq, right_src)); // 1/3 of gap length will separate genes abutting the same gap
2199  }
2200  }
2201 
// rebuild the alignment map for the edited exon set and return the result with the original frameshifts and target id
2202  CAlignMap editedamap(editedmodel.Exons(), transcript_exons, align.FrameShifts(), align.Orientation(), align.GetAlignMap().TargetLen());
2203  editedmodel.FrameShifts() = align.FrameShifts();
2204  CAlignModel editedalign(editedmodel, editedamap);
2205  editedalign.SetTargetId(*align.GetTargetId());
2206 
2207  return editedalign;
2208 }
2209 
2210 #define COLLAPS_CHUNK 500000
2212 
// Body of the routine that accepts one incoming alignment `a`: it records correction data, or
// registers intron evidence and stores the alignment for later collapsing/filtering.
// NOTE(review): the signature and listing lines 2223, 2232-2233, 2247 and 2282 are not present in this
// listing. The index at the end of the listing declares AddAlignment(const CAlignModel&), presumably
// this function with the parameter named `a` - confirm.
2213  string acc = a.TargetAccession();
// alignments whose accession contains "CorrectionData" only carry corrections and are never stored
2214  if(acc.find("CorrectionData") != string::npos) {
2215  if(!m_genomic_gaps_len.empty()) {
2216  TIntMap::iterator gap = m_genomic_gaps_len.upper_bound(a.Limits().GetTo()); // gap clearly on the right (could be end)
2217  if(gap != m_genomic_gaps_len.begin())
2218  --gap; // existing gap (not end)
// ignore corrections that overlap a genomic gap
2219  if(gap->first <= a.Limits().GetTo() && gap->first+gap->second-1 >= a.Limits().GetFrom()) // overlap
2220  return;
2221  }
2222 
2224 
2225  TInDels corrections = a.FrameShifts();
2226  ITERATE(TInDels, i, corrections) {
// a mismatch is stored base by base as a replacement at its genomic location
2227  if(i->IsMismatch()) {
2228  string seq = i->GetInDelV();
2229  for(int l = 0; l < i->Len(); ++l)
2230  m_correction_data.m_replacements[i->Loc()+l] = seq[l];
2231  } else {
// NOTE(review): the statements of this branch (listing lines 2232-2233) are missing from this listing
2234  }
2235  }
2236 
2237  return;
2238  }
2239 
2240  if((a.Type()&CGeneModel::eSR) && !a.Continuous()) // ignore SR with internal gaps
2241  return;
2242 
// work on a copy; the gap-filler flag is dropped when gap filling is off
2243  CAlignModel align(a);
2244  if(!m_fillgenomicgaps)
2245  align.Status() &= ~CGeneModel::eGapFiller;
2246 
// NOTE(review): the condition guarding this return (listing line 2247) is missing from this listing
2248  return;
2249 
// unoriented alignments are kept on the plus strand
2250  if((align.Status()&CGeneModel::eUnknownOrientation) && align.Strand() == eMinus)
2251  align.ReverseComplementModel();
2252 
// register every intron (adjacent exons joined by splices on both sides) in m_align_introns
2253  const CGeneModel::TExons& e = align.Exons();
2254  for(unsigned int l = 1; l < e.size(); ++l) {
2255  if(e[l-1].m_ssplice && e[l].m_fsplice) {
2256  string sig;
2257  if(align.Strand() == ePlus)
2258  sig = e[l-1].m_ssplice_sig+e[l].m_fsplice_sig;
2259  else
2260  sig = e[l].m_fsplice_sig+e[l-1].m_ssplice_sig;
2261  SIntron intron(e[l-1].GetTo(),e[l].GetFrom(), align.Strand(), (align.Status()&CGeneModel::eUnknownOrientation) == 0, sig);
2262  SIntronData& id = m_align_introns[intron];
2263 
// introns from alignment types that are not being filtered are kept unconditionally
2264  if(((align.Type()&CGeneModel::eSR) && !m_filtersr) ||
2265  ((align.Type()&CGeneModel::eEST) && !m_filterest) ||
2266  ((align.Type()&CGeneModel::emRNA) && !m_filtermrna) ||
2267  ((align.Type()&CGeneModel::eProt) && !m_filterprots)) {
2268 
2269  id.m_keep_anyway = true;
2270  }
2271 
// self-species support: any SR, or a gap filler with a GTAG intron flanked by exons longer than 15 with identity above 0.99
2272  if((align.Type()&CGeneModel::eSR) ||
2273  (align.Status()&CGeneModel::eGapFiller && sig == "GTAG" &&
2274  e[l-1].Limits().GetLength() > 15 && e[l-1].m_ident > 0.99 &&
2275  e[l].Limits().GetLength() > 15 && e[l].m_ident > 0.99)) {
2276 
2277  id.m_selfsp_support = true;
2278  }
2279 
2280  id.m_weight += align.Weight();
2281 
// NOTE(review): the condition guarding the next statement (listing line 2282) is missing from this listing
2283  id.m_est_support += align.Weight()+0.5;
2284 
2285  double ident = min(e[l-1].m_ident,e[l].m_ident);
2286  if(ident == 0.)
2287  ident = 1.; // collapsed SRA and proteins don't have ident information
2288  id.m_ident = max(id.m_ident,ident);
2289  }
2290  }
2291 
// SR alignments, and ESTs that are not gap fillers when m_collapsest is set, are stored in compact form for collapsing
2292  if((align.Type()&CGeneModel::eSR) || ((align.Type()&CGeneModel::eEST) && !(align.Status()&CGeneModel::eGapFiller) && m_collapsest)) { // add alignments for collapsing
2293  if(align.Continuous()) {
2294  CAlignCommon c(align);
2295  m_aligns[c].push_back(SAlignIndividual(align, m_target_id_pool[c]));
2296  } else {
// a non-continuous alignment is stored as its parts
2297  TAlignModelList aligns = GetAlignParts(align, false);
2298  ITERATE(TAlignModelList, i, aligns) {
2299  CAlignCommon c(*i);
2300  m_aligns[c].push_back(SAlignIndividual(*i, m_target_id_pool[c]));
2301  }
2302  }
2303  } else {
2304  m_aligns_for_filtering_only.push_back(align);
2305  }
2306 
// every COLLAPS_CHUNK added alignments: report progress and collapse identical ones
2307  if(++m_count%COLLAPS_CHUNK == 0) {
2308  cerr << "Added " << m_count << " alignments to collapser" << endl;
2309  CollapsIdentical();
2310  }
2311 }
2312 
// Body of the routine that, for each entry `i` of a map (its `second` is a deque<SAlignIndividual>,
// its `first` indexes m_target_id_pool), merges individual alignments with identical ranges and then
// compacts the character pool of target ids.
// NOTE(review): the signature and the outer loop header (listing lines 2313-2314) are not present in
// this listing; presumably this is CollapsIdentical(), which the alignment-adding code in this file calls - confirm.
2315  deque<SAlignIndividual>& alideque = i->second;
2316  deque<char>& id_pool = m_target_id_pool[i->first];
2317  if(!alideque.empty()) {
2318 
2319  //remove identicals
// after sorting, `ali` is the last retained element and `farp` scans ahead
2320  sort(alideque.begin(),alideque.end(),LeftAndLongFirstOrder(id_pool));
2321  deque<SAlignIndividual>::iterator ali = alideque.begin();
2322  for(deque<SAlignIndividual>::iterator farp = ali+1; farp != alideque.end(); ++farp) {
2323  _ASSERT(farp > ali);
2324  if(farp->m_range == ali->m_range) {
// same range: add the weight to the retained element and zero out the duplicate's id characters in the pool
2325  ali->m_weight += farp->m_weight;
2326  for(deque<char>::iterator p = id_pool.begin()+farp->m_target_id; *p != 0; ++p) {
2327  _ASSERT(p < id_pool.end());
2328  *p = 0;
2329  }
2330  } else {
2331  *(++ali) = *farp;
2332  }
2333  }
2334  _ASSERT(ali-alideque.begin()+1 <= (int)alideque.size());
2335  alideque.resize(ali-alideque.begin()+1); // ali - last retained element
2336 
2337 
2338 
2339  //clean up id pool and reset shifts
// NOTE(review): OriginalOrder presumably restores the order in which ids were appended to the pool, so `ali` can advance in step with `farp` - confirm
2340  sort(alideque.begin(),alideque.end(),OriginalOrder);
// `id` is the write position, `farp` the read position, `shift` the number of zero characters skipped so far
2341  deque<char>::iterator id = id_pool.begin();
2342  int shift = 0;
2343  ali = alideque.begin();
2344  for(deque<char>::iterator farp = id; farp != id_pool.end(); ) {
2345  while(farp != id_pool.end() && *farp == 0) {
2346  ++farp;
2347  ++shift;
2348  }
2349  if(farp != id_pool.end()) {
2350 
// the surviving id starts here: move the owner's offset back by the skipped characters
2351  if(farp-id_pool.begin() == ali->m_target_id) {
2352  ali->m_target_id -= shift;
2353  _ASSERT(ali->m_target_id >= 0);
2354  ++ali;
2355  }
2356 
2357 
2358  _ASSERT(farp >= id);
// copy the id characters, then its terminating zero
2359  while(*farp != 0) {
2360  *id++ = *farp++;
2361  }
2362  *id++ = *farp++;
2363  }
2364  }
2365  id_pool.resize(id-id_pool.begin()); // id - next after last retained element
2366 
2367  _ASSERT(ali == alideque.end());
2368  }
2369  }
2370 }
2371 
2372 
2373 END_SCOPE(gnomon)
2374 END_SCOPE(ncbi)
2375 
2376 
T minus(T x_)
CBioseq_Handle –.
vector< CInDelInfo > TInDels
string m_fsplice_sig
void SetID(Int8 id)
const TExons & Exons() const
const_iterator lower_bound(const key_type &key) const
Definition: map.hpp:154
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:783
size_type size() const
Definition: map.hpp:148
Set coding to printable coding (Iupacna or Iupacaa)
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
TSignedSeqRange TranscriptLimits() const
TIntMap m_genomic_gaps_len
void SetStrand(EStrand s)
void ClipNotSupportedFlanks(CAlignModel &align, double clip_threshold)
CResidueVec m_contigrv
int FShiftedLen(TSignedSeqRange ab, ERangeEnd lend, ERangeEnd rend) const
Definition: gnomon_seq.cpp:977
vector< SIntron > Tintrons
virtual void Clip(TSignedSeqRange limits, EClipMode mode, bool ensure_cds_invariant=true)
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
Definition: ncbiargs.cpp:2256
int tolower(Uchar c)
Definition: ncbictype.hpp:72
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:73
bool Continuous() const
#define COLLAPS_CHUNK
bool isEST() const
string GetDNASequence(CConstRef< objects::CSeq_id > id, CScope &scope)
Definition: id_handler.cpp:130
TAlignIntrons m_align_introns
struct parameters_t * pb[]
double Weight() const
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1337
Defines command line argument related classes.
void GetOnlyOtherAlignments(TAlignModelClusterSet &clsset)
static TThisType GetWhole(void)
Definition: range.hpp:272
#define COVERED_FRACTION
position_type GetLength(void) const
Definition: range.hpp:158
void ReverseComplementModel()
static void SetupArgDescriptions(CArgDescriptions *arg_desc)
container_type::const_iterator const_iterator
Definition: map.hpp:53
int toupper(Uchar c)
Definition: ncbictype.hpp:73
vector< double > m_coverage
int TargetLen() const
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
string m_ssplice_sig
void GetCollapsedAlgnments(TAlignModelClusterSet &clsset)
bool isMinus() const
void AddExon(TSignedSeqRange exon, const string &fs="", const string &ss="", double ident=0, const string &seq="", const CInDelInfo::SSource &src=CInDelInfo::SSource())
SAlignIndividual * m_ali
#define INDEL_PENALTY
EStrand Orientation() const
#define gp
EStrand
const Tintrons & GetIntrons() const
TThisType & SetTo(position_type to)
Definition: range.hpp:180
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:99
#define MAX_CLIP
const_iterator end() const
Definition: set.hpp:136
TThisType & SetFrom(position_type from)
Definition: range.hpp:170
void AddAlignment(const CAlignModel &align)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:776
int i
static const char ip[]
Definition: des.c:81
bool m_oriented
list< CAlignModel > TAlignModelList
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:612
bool CheckAndInsert(const CAlignModel &align, TAlignModelClusterSet &clsset) const
int Type() const
void CleanSelfTranscript(CAlignModel &align, const string &trans) const
double m_ident
bool AlignmentMarkedForDeletion(const SAlignIndividual &ali)
int TargetLen() const
TSignedSeqRange TranscriptExon(int i) const
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:566
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
const_iterator end() const
Definition: map.hpp:152
bool HasStop() const
void SetTargetId(const objects::CSeq_id &id)
const_iterator begin() const
Definition: set.hpp:135
#define END_PART_LENGTH
const_iterator find(const key_type &key) const
Definition: map.hpp:153
bool RemoveNotSupportedIntronsFromProt(CAlignModel &align)
static TThisType GetEmpty(void)
Definition: range.hpp:306
int TotalFrameShift(const TInDels &indels, int a, int b)
bool RemoveNotSupportedIntronsFromTranscript(CAlignModel &align, bool check_introns_on_both_strands) const
CAlignCollapser(string contig="", CScope *scope=0, bool nofilteringcollapsing=false)
TInDels & FrameShifts()
vector< int > TIVec
TSignedSeqRange m_range
TInDels GetInDels(bool fs_only) const
const_iterator lower_bound(const key_type &key) const
Definition: set.hpp:138
bool isGoodIntron(int a, int b, EStrand strand, const CAlignCollapser::TAlignIntrons &introns, bool check_introns_on_both_strands)
bool Empty(void) const
Definition: range.hpp:148
CSeqVector –.
Definition: seq_vector.hpp:64
int g(Seg_Gsm *spe, Seq_Mtf *psm, Thd_Gsm *tdg)
Definition: thrddgri.c:44
USING_SCOPE(sequence)
bool Include(TSignedSeqRange big, TSignedSeqRange small)
#define DESIRED_CHUNK
TSignedSeqRange m_range
parent_type::iterator iterator
Definition: set.hpp:80
#define COVERAGE_WINDOW
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet)
Add description for flag argument.
Definition: ncbiargs.cpp:2273
TInDels m_correction_indels
bool empty() const
Definition: map.hpp:149
virtual CAlignMap GetAlignMap() const
void ClearExons()
SAlignExtended(SAlignIndividual &ali, const set< int > &left_exon_ends, const set< int > &right_exon_ends)
TAlignModelList m_aligns_for_filtering_only
list< TSignedSeqRange > m_confirmed_intervals
EStrand OtherStrand(EStrand s)
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:190
#define MAX_DIST_TO_FLANK_GAP
EStrand Strand() const
bool AlignmentIsSupportedBySR(const CAlignModel &align, const vector< double > &coverage, int mincoverage, int left_end)
void Insert(const typename Cluster::TModel &a)
virtual void CutExons(TSignedSeqRange hole)
#define EXTRA_CUT
bool NotEmpty(void) const
Definition: range.hpp:152
T max(T x_, T y_)
position_type GetTo(void) const
Definition: range.hpp:142
LeftAndLongFirstOrder(const deque< char > &idp)
CArgDescriptions –.
Definition: ncbiargs.hpp:514
TSignedSeqPos GetTo() const
Magic spell ;-) needed for some weird compilers... very empiric.
map< string, string > ss
unsigned int & Status()
T min(T x_, T y_)
map< int, char > m_replacements
TSignedSeqRange ShrinkToRealPoints(TSignedSeqRange orig_range, bool snap_to_codons=false) const
Definition: gnomon_seq.cpp:748
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition: pointer.h:1084
string GetTargetAcc(int shift, const deque< char > &id_pool)
TSignedSeqRange Limits() const
CRange< TSignedSeqPos > TSignedSeqRange
Definition: range.hpp:420
position_type GetFrom(void) const
Definition: range.hpp:134
bool operator()(TAlignModelList::const_iterator a, TAlignModelList::const_iterator b)
CException –.
Definition: ncbiexpt.hpp:709
void SetCurrentGroup(const string &group)
Set current arguments group name.
Definition: ncbiargs.cpp:2445
int AlignLen() const
CScope –.
Definition: scope.hpp:90
bool isPlus() const
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:484
CArgs –.
Definition: ncbiargs.hpp:356
bool isSR() const
void erase(iterator pos)
Definition: map.hpp:167
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
string TargetAccession() const
#define MIN_EXON
const TSignedSeqRange & Limits() const
bool operator()(const CGeneModel::TExons &a, const CGeneModel::TExons &b) const
int len
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1153
bool HasStart() const
container_type::value_type value_type
Definition: map.hpp:52
SCorrectionData m_correction_data
#define CUT_MARGIN
yy_size_t n
const CMolInfo * GetMolInfo(const CBioseq &bioseq)
Retrieve the MolInfo object for a given bioseq handle.
Definition: sequence.cpp:173
static BOOL number
Definition: pcregrep.c:167
#define _ASSERT
bool isCap() const
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:278
virtual void CutExons(TSignedSeqRange hole)
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:70
void SetWeight(double w)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string...
Definition: ncbiexpt.hpp:546
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:459
#define BIG_NOT_ALIGNED
#define const
Definition: zconf.h:217
#define FS_FUZZ
#define MISM_PENALTY
CAlignModel GetAlignment(const SAlignIndividual &ali, const deque< char > &target_id_pool) const
T plus(T x_)
CAlignModel FillGapsInAlignmentAndAddToGenomicGaps(const CAlignModel &align, int fill)
const_iterator begin() const
Definition: map.hpp:151
bool operator()(const SAlignIndividual &a, const SAlignIndividual &b)
const deque< char > & id_pool
std::istream & in(std::istream &in_, double &x_)
static CRef< CSeq_id > ToSeq_id(const string &str)
Definition: id_handler.cpp:73
bool isUnknown() const
bool OriginalOrder(const SAlignIndividual &a, const SAlignIndividual &b)
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:567
vector< CModelExon > TExons
const_iterator upper_bound(const key_type &key) const
Definition: map.hpp:155
#define SMALL_CLIP
Tintrons m_introns
const_iterator upper_bound(const key_type &key) const
Definition: set.hpp:139
TSignedSeqRange m_range
void ReverseComplement(const BidirectionalIterator &first, const BidirectionalIterator &last)
bool isPolyA() const
TSignedSeqRange MapRangeOrigToEdited(TSignedSeqRange orig_range, ERangeEnd lend, ERangeEnd rend) const
Definition: gnomon_seq.cpp:922
TSignedSeqPos GetFrom() const
list< Model > GetAlignParts(const Model &algn, bool settrimflags)
CConstRef< objects::CSeq_id > GetTargetId() const
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:68
bool OneExonCompare(const CModelExon &a, const CModelExon &b)
Modified on Tue Jul 25 19:51:25 2017 by modify_doxy.py rev. 533848