NCBI C++ ToolKit
depth_filter.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: depth_filter.cpp 72122 2016-04-18 01:34:13Z whlavina $
2  * ===========================================================================
3  *
4  * PUBliC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Nathan Bouk
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
37 
38 
39 using namespace ncbi;
40 using namespace objects;
41 
42 
43 
44 
45 
46 struct SRangeDepth {
47  SRangeDepth(TSeqRange range) : Range(range), Depth(1) { ; }
48  SRangeDepth(TSeqRange range, size_t depth) : Range(range), Depth(depth) { ; }
49 
51  size_t Depth;
52 };
53 bool operator<(const SRangeDepth& A, const SRangeDepth& B) {
54  if(A.Range.GetFrom() != B.Range.GetFrom())
55  return A.Range.GetFrom() < B.Range.GetFrom();
56  else
57  return A.Range.GetTo() < B.Range.GetTo();
58 }
59 
60 
62 public:
64 
65  void AddRange(TSeqRange NewRange);
66 
67  void CollapseOnDepth(size_t CollapseDepth);
68 
69  size_t MinDepthForRange(TSeqRange CheckRange) const;
70 
71  void ZeroFill();
72 
73  void CalcStats() const;
74 
75  void Print() const {
76  ITERATE(vector<SRangeDepth>, RangeIter, x_Ranges) {
77  cerr << RangeIter->Range << " : " << RangeIter->Depth << endl;
78  }
79  }
80 private:
81 
82  vector<SRangeDepth> x_Ranges;
83 
84 };
85 
87 {
88  if(x_Ranges.empty()) {
89  SRangeDepth NewRangeDepth(NewRange);
90  x_Ranges.push_back(NewRangeDepth);
91  return;
92  }
93 
94  vector<SRangeDepth> News;
95  TSeqRange RemainRange = NewRange;
96 
97  ERASE_ITERATE(vector<SRangeDepth>, RangeDepthIter, x_Ranges) {
98 
99  if(RemainRange.Empty()) {
100  break;
101  }
102 
103  if( RangeDepthIter->Range == RemainRange) {
104  RangeDepthIter->Depth++;
105  RemainRange = TSeqRange();
106  break;
107  }
108 
109  // past it
110  if(RemainRange.GetTo() < RangeDepthIter->Range.GetFrom()) {
111  break;
112  }
113 
114  bool EraseCurr = false;
115  TSeqRange Inter = RangeDepthIter->Range.IntersectionWith(RemainRange);
116 
117  if(Inter.NotEmpty()) {
118  SRangeDepth InterRangeDepth(Inter, RangeDepthIter->Depth + 1);
119  News.push_back(InterRangeDepth);
120  EraseCurr = true;
121 
122 
123  if(RemainRange.GetFrom() < Inter.GetFrom()) {
124  TSeqRange NewPrevRange;
125  NewPrevRange.SetFrom(RemainRange.GetFrom());
126  NewPrevRange.SetTo(Inter.GetFrom()-1);
127  SRangeDepth NewPrev(NewPrevRange, 1);
128  News.push_back(NewPrev);
129  }
130  if(RemainRange.GetTo() >= Inter.GetTo()) {
131  TSeqRange NewAfterRange;
132  NewAfterRange.SetFrom(Inter.GetTo()+1);
133  NewAfterRange.SetTo(RemainRange.GetTo());
134  RemainRange = NewAfterRange;
135  }
136 
137  if(RangeDepthIter->Range.GetFrom() < Inter.GetFrom()) {
138  TSeqRange OldPrevRange;
139  OldPrevRange.SetFrom(RangeDepthIter->Range.GetFrom());
140  OldPrevRange.SetTo(Inter.GetFrom()-1);
141  SRangeDepth OldPrev(OldPrevRange, RangeDepthIter->Depth);
142  News.push_back(OldPrev);
143  }
144  if(RangeDepthIter->Range.GetTo() > Inter.GetTo()) {
145  TSeqRange OldAfterRange;
146  OldAfterRange.SetFrom(Inter.GetTo()+1);
147  OldAfterRange.SetTo(RangeDepthIter->Range.GetTo());
148  SRangeDepth OldAfter(OldAfterRange, RangeDepthIter->Depth);
149  News.push_back(OldAfter);
150  // VECTOR_ERASE(RangeDepthIter, x_Ranges);
151  // break;
152  }
153  }
154 
155  if(EraseCurr) {
156  VECTOR_ERASE(RangeDepthIter, x_Ranges);
157  }
158  }
159 
160  if(RemainRange.NotEmpty()) {
161  SRangeDepth RemainRangeDepth(RemainRange);
162  x_Ranges.push_back(RemainRangeDepth);
163  }
164 
165 
166  x_Ranges.insert(x_Ranges.end(), News.begin(), News.end());
167  sort(x_Ranges.begin(), x_Ranges.end());
168 }
169 
170 
171 void CDepthCollection::CollapseOnDepth(size_t CollapseDepth)
172 {
173  vector<SRangeDepth> Result;
174 
175  ITERATE(vector<SRangeDepth>, RangeDepthIter, x_Ranges) {
176 
177  // break the merging on empty results,
178  // or the segments aren't adjacent
179  if (Result.empty() ||
180  !RangeDepthIter->Range.AbuttingWith( Result.back().Range )) {
181  Result.push_back(*RangeDepthIter);
182  continue;
183  }
184 
185  // or when the two are not on the same side of the line
186  if ((Result.back().Depth <= CollapseDepth) ^
187  (RangeDepthIter->Depth <= CollapseDepth)) {
188  Result.push_back(*RangeDepthIter);
189  continue;
190  }
191 
192  // merge both unders, or both overs, together
193  Result.back().Depth = min(Result.back().Depth, RangeDepthIter->Depth);
194  Result.back().Range += RangeDepthIter->Range;
195  }
196 
197  x_Ranges.swap(Result);
198  sort(x_Ranges.begin(), x_Ranges.end());
199 }
200 
201 
203 {
204  size_t Result = numeric_limits<size_t>::max();
205  ITERATE(vector<SRangeDepth>, RangeIter, x_Ranges) {
206  if(CheckRange.IntersectingWith(RangeIter->Range)) {
207  Result = min(Result, RangeIter->Depth);
208  }
209  }
210  return Result;
211 }
212 
214 {
215  vector<SRangeDepth> News;
216  TSeqRange Prev = x_Ranges.front().Range;
217 
218  ITERATE(vector<SRangeDepth>, RangeIter, x_Ranges) {
219 
220  if (Prev != RangeIter->Range &&
221  !Prev.AbuttingWith(RangeIter->Range)) {
222  TSeqRange Zero;
223  Zero.SetFrom(Prev.GetTo()+1);
224  Zero.SetTo(RangeIter->Range.GetFrom()-1);
225  SRangeDepth ZeroRD(Zero, 0);
226  News.push_back(ZeroRD);
227  }
228  Prev = RangeIter->Range;
229  }
230 
231  x_Ranges.insert(x_Ranges.end(), News.begin(), News.end());
232  sort(x_Ranges.begin(), x_Ranges.end());
233 }
234 
236 {
237  size_t AccumBases = 0;
238  size_t AccumBaseDepth = 0;
239  size_t AccumRangeDepth = 0;
240  size_t RangeCount = 0;
241 
242  vector<size_t> SortBaseDepths;
243  vector<size_t> SortRangeDepths;
244 
245  // mean mode median
246  ITERATE(vector<SRangeDepth>, RangeDepthIter, x_Ranges) {
247  AccumBases += RangeDepthIter->Range.GetLength();
248  AccumBaseDepth += (RangeDepthIter->Range.GetLength() * RangeDepthIter->Depth);
249  AccumRangeDepth += RangeDepthIter->Depth;
250  RangeCount++;
251 
252  for(size_t i = 0; i < RangeDepthIter->Range.GetLength(); i++) {
253  SortBaseDepths.push_back(RangeDepthIter->Depth);
254  }
255  SortRangeDepths.push_back(RangeDepthIter->Depth);
256  }
257 
258  double BaseMean = (double(AccumBaseDepth) / double(AccumBases));
259  double RangeMean = (double(AccumRangeDepth) / double(RangeCount));
260 
261  sort(SortBaseDepths.begin(), SortBaseDepths.end());
262  sort(SortRangeDepths.begin(), SortRangeDepths.end());
263 
264  size_t BaseMedian = SortBaseDepths[SortBaseDepths.size()/2];
265  size_t RangeMedian = SortRangeDepths[SortRangeDepths.size()/2];
266 
267  size_t PrevValue = SortRangeDepths.front();
268  size_t PrevCounts = 0;
269  size_t BestValue=PrevValue, BestCounts=0;
270  ITERATE(vector<size_t>, ValueIter, SortRangeDepths) {
271  if( (*ValueIter) == PrevValue) {
272  PrevCounts++;
273  } else {
274  if(PrevCounts > BestCounts) {
275  BestValue = PrevValue;
276  BestCounts = PrevCounts;
277  }
278  PrevValue = *ValueIter;
279  PrevCounts = 1;
280  }
281  }
282  size_t RangeMode = BestValue;
283 
284  PrevValue = SortBaseDepths.front();
285  PrevCounts = 0;
286  BestValue=PrevValue, BestCounts=0;
287  ITERATE(vector<size_t>, ValueIter, SortBaseDepths) {
288  if( (*ValueIter) == PrevValue) {
289  PrevCounts++;
290  } else {
291  if(PrevCounts > BestCounts) {
292  BestValue = PrevValue;
293  BestCounts = PrevCounts;
294  }
295  PrevValue = *ValueIter;
296  PrevCounts = 1;
297  }
298  }
299  size_t BaseMode = BestValue;
300 
301 
302  cerr << "BaseMean: " << BaseMean << endl;
303  cerr << "RangeMean: " << RangeMean << endl;
304  cerr << "BaseMedian: " << BaseMedian << endl;
305  cerr << "RangeMedian: " << RangeMedian << endl;
306  cerr << "BaseMode: " << BaseMode << endl;
307  cerr << "RangeMode: " << RangeMode << endl;
308 
309 }
310 
312 
313 typedef list<CRef<CSeq_align> > TAlignList;
314 
315 void
317  list<CRef<CSeq_align> >& Output,
318  int FilterOnRow,
319  size_t DepthCutoff,
320  double PctIdentRescue)
321 {
322  //int FilterOnRow = 1; // filter on Query or Subjt coverage
323  //double PctIdentRescue = 97.0; // >= Keep, pre-empting DepthCutoff.
324  //TSeqPos DepthCutoff = 3; // <= Keep, > Toss.
325 
326  CDepthCollection Depths;
327 
328  // calculate depths
329  ITERATE(TAlignList, AlignIter, Input) {
330  TSeqRange Range = (*AlignIter)->GetSeqRange(FilterOnRow);
331  Depths.AddRange(Range);
332  }
333  Depths.ZeroFill();
334  Depths.CollapseOnDepth(DepthCutoff);
335 
336  //Depths.Print();
337  //Depths.CalcStats();
338 
339 
340  // filter into output
341  ITERATE(TAlignList, AlignIter, Input) {
342  TSeqRange Range = (*AlignIter)->GetSeqRange(FilterOnRow);
343 
344  TSeqPos CurrDepth = Depths.MinDepthForRange(Range);
345 
346  if(CurrDepth <= DepthCutoff) {
347  Output.push_back(*AlignIter);
348  } else {
349  // (make an exception for very high pct-ident alignments)
350  // the region might have one perfect hit under all the cruft
351  double PctIdentUngap = 0.0;
352  bool HasScore = (*AlignIter)->GetNamedScore(CSeq_align::eScore_PercentIdentity_Ungapped, PctIdentUngap);
353  if(!HasScore) {
354  int NumIdent = 0;
355  HasScore = (*AlignIter)->GetNamedScore(CSeq_align::eScore_IdentityCount, NumIdent);
356  if(HasScore) {
357  TSeqPos AlignLen = (*AlignIter)->GetAlignLength(false);
358  PctIdentUngap = (double(NumIdent) / AlignLen) * 100.0;
359  }
360  }
361 
362  if(PctIdentUngap >= PctIdentRescue) {
363  Output.push_back(*AlignIter);
364  }
365  }
366  }
367 
368 }
369 
370 
371 void
373  list<CRef<CSeq_align> >& Output,
374  size_t DepthCutoff, double PctIdentRescue)
375 {
376  TAlignList FilteredQ, FilteredS;
377 
378  FilterOneRow(Input, FilteredQ, 0, DepthCutoff, PctIdentRescue);
379  FilterOneRow(Input, FilteredS, 1, DepthCutoff, PctIdentRescue);
380 
381  // Walk the input in unison with the pair of intermediate filtered
382  // lists (sharing the same collation), and emit results in common.
383  TAlignList::const_iterator AlignIterQ = FilteredQ.begin(),
384  EndQ = FilteredQ.end();
385  TAlignList::const_iterator AlignIterS = FilteredS.begin(),
386  EndS = FilteredS.end();
387  ITERATE(TAlignList, AlignIter, Input) {
388  // Break if we reach the end of a filtered list.
389  if (AlignIterQ == EndQ || AlignIterS == EndS) break;
390  // Emit result if it's in common with both filtered lists.
391  if (*AlignIter == *AlignIterQ && *AlignIter == *AlignIterS) {
392  Output.push_back(*AlignIter);
393  }
394  // Move to next in the filtered lists, if this alignment was present.
395  if (*AlignIter == *AlignIterQ) ++AlignIterQ;
396  if (*AlignIter == *AlignIterS) ++AlignIterS;
397  }
398 }
399 
401 
static void FilterOneRow(const list< CRef< objects::CSeq_align > > &Input, list< CRef< objects::CSeq_align > > &Output, int FilterOnRow, size_t DepthCutoff=5, double PctIdentRescue=95.0)
static void FilterBothRows(const list< CRef< objects::CSeq_align > > &Input, list< CRef< objects::CSeq_align > > &Output, size_t DepthCutoff=5, double PctIdentRescue=95.0)
void Print() const
void CollapseOnDepth(size_t CollapseDepth)
vector< SRangeDepth > x_Ranges
void AddRange(TSeqRange NewRange)
size_t MinDepthForRange(TSeqRange CheckRange) const
void CalcStats() const
@ eScore_PercentIdentity_Ungapped
Definition: Seq_align.hpp:164
@ eScore_IdentityCount
Definition: Seq_align.hpp:145
static unsigned char depth[2 *(256+1+29)+1]
bool operator<(const SRangeDepth &A, const SRangeDepth &B)
list< CRef< CSeq_align > > TAlignList
#define A(i)
Definition: ecp_curves.c:936
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
Definition: ncbimisc.hpp:843
#define VECTOR_ERASE(Var, Cont)
Use this macro inside body of ERASE_ITERATE cycle to erase from vector-like container.
Definition: ncbimisc.hpp:852
bool NotEmpty(void) const
Definition: range.hpp:152
bool AbuttingWith(const TThisType &r) const
Definition: range.hpp:336
bool IntersectingWith(const TThisType &r) const
Definition: range.hpp:331
TThisType IntersectionWith(const TThisType &r) const
Definition: range.hpp:312
bool Empty(void) const
Definition: range.hpp:148
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
void SetFrom(TFrom value)
Assign a value to From data member.
Definition: Range_.hpp:231
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetTo(TTo value)
Assign a value to To data member.
Definition: Range_.hpp:278
int i
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
Magic spell ;-) needed for some weird compilers... very empiric.
T max(T x_, T y_)
T min(T x_, T y_)
SRangeDepth(TSeqRange range)
TSeqRange Range
SRangeDepth(TSeqRange range, size_t depth)
Modified on Wed Apr 17 13:09:08 2024 by modify_doxy.py rev. 669887