NCBI C++ ToolKit
validerror_align.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: validerror_align.cpp 72346 2016-05-03 11:46:11Z bollin $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko......
27  *
28  * File Description:
29  * validation of seq_align
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 
42 #include <objmgr/seqdesc_ci.hpp>
43 #include <objmgr/util/sequence.hpp>
44 
51 
52 #include <map>
53 #include <vector>
54 #include <algorithm>
55 
58 
59 
63 
65 
66 
67 // ================================ Public ================================
68 
69 
71  CValidError_base(imp)
72 {
73 }
74 
75 
77 {
78 }
79 
80 
82 {
83  if (!align.IsSetSegs()) {
85  "Segs: This alignment is missing all segments. This is a non-correctable error -- look for serious formatting problems.",
86  align);
87  return;
88  }
89 
90  const CSeq_align::TSegs& segs = align.GetSegs();
91  CSeq_align::C_Segs::E_Choice segtype = segs.Which();
92  switch ( segtype ) {
93 
95  x_ValidateDendiag(segs.GetDendiag(), align);
96  break;
97 
99  x_ValidateDenseg(segs.GetDenseg(), align);
100  break;
101 
103  x_ValidateStd(segs.GetStd(), align);
104  break;
106  x_ValidatePacked(segs.GetPacked(), align);
107  break;
108 
110  // call recursively
111  ITERATE(CSeq_align_set::Tdata, sali, segs.GetDisc().Get()) {
112  ValidateSeqAlign(**sali);
113  }
114  return;
115 
118  "Segs: This alignment is missing all segments. This is a non-correctable error -- look for serious formatting problems.",
119  align);
120  return;
121  break;
124  // ignore new segtype warnings in genomic gpipe sequence
125  if (m_Imp.IsGpipe() && m_Imp.IsGenomic()) {
126  return;
127  }
129  "Segs: This alignment has an undefined or unsupported Seqalign segtype "
130  + NStr::IntToString(segtype), align);
131  return;
132  break;
133  default:
134  // ignore new segtype warnings in genomic gpipe sequence
135  if (m_Imp.IsGpipe() && m_Imp.IsGenomic()) {
136  return;
137  }
139  "Segs: This alignment has an undefined or unsupported Seqalign segtype "
140  + NStr::IntToString(segtype), align);
141  return;
142  break;
143  } // end of switch statement
144 
145  if (segtype != CSeq_align::C_Segs::e_Denseg
146  && align.IsSetType()
147  && (align.GetType() == CSeq_align::eType_partial
148  || align.GetType() == CSeq_align::eType_global)) {
149  PostErr(eDiag_Error, eErr_SEQ_ALIGN_UnexpectedAlignmentType, "UnexpectedAlignmentType: This is not a DenseSeg alignment.", align);
150  }
151  try {
152  x_ValidateAlignPercentIdentity (align, false);
153  } catch (CException &) {
154  } catch (std::exception &) {
155  }
156 
157 }
158 
159 
160 // ================================ Private ===============================
161 
// One IUPAC nucleotide ambiguity code together with the unambiguous bases
// it may stand for.
typedef struct ambchar {
    char         ambig_char;   // the ambiguity code itself, e.g. 'R'
    const char * match_list;   // NUL-terminated list of bases it covers, e.g. "AG"
} AmbCharData;

// Ambiguity codes recognised by s_AmbiguousMatch.  'N' is intentionally not
// listed here: it is handled separately because it matches everything.
static const AmbCharData ambiguity_list[] = {
    { 'R', "AG" },
    { 'Y', "CT" },
    { 'M', "AC" },
    { 'K', "GT" },
    { 'S', "CG" },
    { 'W', "AT" },
    { 'H', "ACT" },
    { 'B', "CGT" },
    { 'V', "ACG" },
    { 'D', "AGT" }};

// Number of entries in ambiguity_list.
static const int num_ambiguities = sizeof (ambiguity_list) / sizeof (AmbCharData);

// Returns true when `code` is one of the ambiguity codes in ambiguity_list
// and `base` is one of the bases that code covers.  The comparison is
// case-sensitive, and a NUL `base` is never covered.
static bool s_AmbiguityCovers (char code, char base)
{
    for (int i = 0; i < num_ambiguities; i++) {
        if (ambiguity_list[i].ambig_char != code) {
            continue;
        }
        // scan the (at most three character) list directly instead of
        // building a temporary search string for a substring search
        for (const char * p = ambiguity_list[i].match_list; *p != 0; ++p) {
            if (*p == base) {
                return true;
            }
        }
    }
    return false;
}

// Decides whether two residues in an alignment column are compatible.
//
// Two characters match when
//   - they are identical, or
//   - either of them is 'N', or
//   - one of them is an ambiguity code whose base list contains the other.
// Two different ambiguity codes (other than 'N') are not considered a match,
// even when their base lists overlap.
static bool s_AmbiguousMatch (char a, char b)
{
    if (a == b || a == 'N' || b == 'N') {
        return true;
    }
    return s_AmbiguityCovers(a, b) || s_AmbiguityCovers(b, a);
}
205 
206 
207 static size_t s_GetNumIdsToUse (const CDense_seg& denseg)
208 {
209  size_t dim = denseg.GetDim();
210  if (!denseg.IsSetIds()) {
211  dim = 0;
212  } else if (denseg.GetIds().size() < dim) {
213  dim = denseg.GetIds().size();
214  }
215  return dim;
216 }
217 
218 
220 {
221  if (!align.IsSetScore()) {
222  return false;
223  }
224  ITERATE(CSeq_align::TScore, it, align.GetScore()) {
225  if ((*it)->IsSetId() && (*it)->GetId().IsStr() &&
226  NStr::EqualNocase((*it)->GetId().GetStr(), "pct_identity_ungap") &&
227  (*it)->IsSetValue() && (*it)->GetValue().IsReal()) {
228  if ((*it)->GetValue().GetReal() > 50.0) {
229  return true;
230  } else {
231  return false;
232  }
233  }
234  }
235  return false;
236 }
237 
238 
240 {
241  int dim = denseg.GetDim();
242  if (dim != s_GetNumIdsToUse(denseg)) {
243  return false;
244  }
245 
246  bool is_tpa = false;
247  for (CDense_seg::TDim row = 0; row < dim && !is_tpa; ++row) {
248  CRef<CSeq_id> id = denseg.GetIds()[row];
249  CBioseq_Handle bsh = scope.GetBioseqHandle(*id);
250  if (bsh) {
251  CSeqdesc_CI desc_ci(bsh, CSeqdesc::e_User);
252  while (desc_ci && !is_tpa) {
253  if (desc_ci->GetUser().IsSetType() && desc_ci->GetUser().GetType().IsStr()
254  && NStr::EqualNocase(desc_ci->GetUser().GetType().GetStr(), "TpaAssembly")) {
255  is_tpa = true;
256  }
257  ++desc_ci;
258  }
259  }
260  }
261 
262  return is_tpa;
263 }
264 
265 
266 bool CValidError_align::IsTpaAlignment(const CSparseAln& sparse_aln, CScope& scope)
267 {
268  // check to see if alignment is TPA
269  bool is_tpa = false;
270  for (CSparseAln::TDim row = 0; row < sparse_aln.GetDim() && !is_tpa; ++row) {
271  const CSeq_id& id = sparse_aln.GetSeqId(row);
272  CBioseq_Handle bsh = scope.GetBioseqHandle(id);
273  if (bsh) {
274  CSeqdesc_CI desc_ci(bsh, CSeqdesc::e_User);
275  while (desc_ci && !is_tpa) {
276  if (desc_ci->GetUser().IsSetType() && desc_ci->GetUser().GetType().IsStr()
277  && NStr::EqualNocase(desc_ci->GetUser().GetType().GetStr(), "TpaAssembly")) {
278  is_tpa = true;
279  }
280  ++desc_ci;
281  }
282  }
283  }
284  return is_tpa;
285 }
286 
// Computes the percentage of alignment columns in which all residues agree
// (per s_AmbiguousMatch) and, when that percentage is below 50, emits the
// "PercentIdentity: ..." message for `align`.
//
// Nothing is reported when the alignment has no segments, when
// AlignmentScorePercentIdOk(align) returns true, when IsTpaAlignment() returns
// true, or (dense-seg only) when the declared dimension differs from
// s_GetNumIdsToUse().
//
// align          the alignment being validated
// internal_gaps  when true, a column containing a gap ('-') is not counted as
//                a matching column
//
// Whenever sequence data cannot be obtained (ids_missing), the result is
// forced to 0 matches over 1 column, i.e. 0 percent.
287 void CValidError_align::x_ValidateAlignPercentIdentity (const CSeq_align& align, bool internal_gaps)
288 {
 // col doubles as the column cursor and, at the end, as the denominator
289  TSeqPos col = 0;
290  size_t num_match = 0;
 // NOTE(review): match_25 is incremented and reset below but never read
 // anywhere in this function.
291  size_t match_25 = 0;
292  bool ids_missing = false;
293 
294  // Now calculate Percent Identity
295  if (!align.IsSetSegs()) {
296  return;
297  } else if (AlignmentScorePercentIdOk(align)) {
298  return;
299  } else if (align.GetSegs().IsDenseg()) {
 // --- dense-seg alignments: walk the columns through a CAlnVec ---
300  const CDense_seg& denseg = align.GetSegs().GetDenseg();
301  // first, make sure this isn't a TPA alignment
302  if (IsTpaAlignment(denseg, *m_Scope)) {
303  return;
304  }
305 
 // give up silently when there are fewer usable ids than declared rows
306  int dim = denseg.GetDim();
307  if (dim != s_GetNumIdsToUse(denseg)) {
308  return;
309  }
310 
311  try {
312  CRef<CAlnVec> av(new CAlnVec(denseg, *m_Scope));
 // '-' marks gaps and '.' marks end gaps so the two can be told apart below
313  av->SetGapChar('-');
314  av->SetEndChar('.');
315 
316  TSeqPos aln_len = av->GetAlnStop() + 1;
317 
318  try {
319  while (col < aln_len && !ids_missing) {
320  string column;
321  av->GetColumnVector(column, col);
322  if (internal_gaps && NStr::Find(column, "-") != string::npos) {
323  // do nothing
324  } else {
325  bool match = true;
326  // don't care about end gaps, ever
327  NStr::ReplaceInPlace(column, ".", "");
328  // if we cared about internal gaps, it would have been handled above
329  NStr::ReplaceInPlace(column, "-", "");
 // a column left blank after stripping keeps match == true and is
 // therefore counted as a matching column
330  if (!NStr::IsBlank(column)) {
 // compare every pair of remaining residues; it1 advances each time
 // it2 reaches the end, and the loop stops at the first mismatch
331  string::iterator it1 = column.begin();
332  string::iterator it2 = it1;
333  ++it2;
334  while (match && it2 != column.end()) {
335  if (!s_AmbiguousMatch(*it1, *it2)) {
336  match = false;
337  }
338  ++it2;
339  if (it2 == column.end()) {
340  ++it1;
341  it2 = it1;
342  ++it2;
343  }
344  }
345  }
346  if (match) {
347  ++num_match;
348  ++match_25;
349  }
350  }
351  col++;
352  if (col % 25 == 0) {
353  match_25 = 0;
354  }
355  }
356  } catch (CException &x1) {
357  // if sequence is not in scope,
358  // the above is impossible
359  // report 0 %, same as C Toolkit
 // col jumps to the full length; for the "iterator out of range" case the
 // matches counted so far are kept, otherwise ids_missing zeroes the result
360  col = aln_len;
361  if (NStr::StartsWith(x1.GetMsg(), "iterator out of range")) {
362  // bad offsets
363  } else {
364  ids_missing = true;
365  }
366  } catch (std::exception &) {
367  // if sequence is not in scope,
368  // the above is impossible
369  // report 0 %, same as C Toolkit
370  col = aln_len;
371  ids_missing = true;
372  }
373  } catch (CException &) {
374  // if AlnVec can't resolve seq id,
375  // the above is impossible
376  // report 0 %, same as C Toolkit
377  col = 1;
378  num_match = 0;
379  ids_missing = true;
380  }
381  } else if (align.GetSegs().IsStd() && !(FindSegmentGaps(align.GetSegs().GetStd(), m_Scope)).empty()) {
 // --- std-seg alignments for which FindSegmentGaps reports any gap: force 0 percent ---
382  col = 1;
383  num_match = 0;
384  ids_missing = true;
385  } else {
 // --- everything else: build an anchored alignment and read it through CSparseAln ---
386  try {
387  TIdExtract id_extract;
388  TAlnIdMap aln_id_map(id_extract, 1);
389  aln_id_map.push_back (align);
390  TAlnStats aln_stats (aln_id_map);
391 
392  // Create user options
393  CAlnUserOptions aln_user_options;
394  TAnchoredAlnVec anchored_alignments;
395 
396  CreateAnchoredAlnVec (aln_stats, anchored_alignments, aln_user_options);
397 
398  /// Build a single anchored aln
399  CAnchoredAln out_anchored_aln;
400 
401  /// Optionally, create an id for the alignment pseudo sequence
402  /// (otherwise one would be created automatically)
403  CRef<CSeq_id> seq_id (new CSeq_id("lcl|PSEUDO ALNSEQ"));
404  CRef<CAlnSeqId> aln_seq_id(new CAlnSeqId(*seq_id));
405  TAlnSeqIdIRef pseudo_seqid(aln_seq_id);
406 
407  BuildAln(anchored_alignments,
408  out_anchored_aln,
409  aln_user_options,
410  pseudo_seqid);
411 
412  CSparseAln sparse_aln(out_anchored_aln, *m_Scope);
413 
414  // check to see if alignment is TPA
415  if (IsTpaAlignment(sparse_aln, *m_Scope)) {
416  return;
417  }
418 
 // per-row alignment string plus the start/stop values used by the
 // column test further down
419  vector <string> aln_rows;
420  vector <TSeqPos> row_starts;
421  vector <TSeqPos> row_stops;
422 
423  for (CSparseAln::TDim row = 0; row < sparse_aln.GetDim() && !ids_missing; ++row) {
424  try {
425  string sequence;
426  sparse_aln.GetAlnSeqString
427  (row,
428  sequence,
429  sparse_aln.GetAlnRange());
430  aln_rows.push_back (sequence);
 // NOTE(review): row_starts receives the result of GetSeqPosFromAlnPos while
 // row_stops receives the result of GetAlnPosFromSeqPos; both are later
 // compared against the same column index - confirm the coordinate
 // systems are meant to differ.
431  TSignedSeqPos aln_start = sparse_aln.GetSeqAlnStart(row);
432  TSignedSeqPos start = sparse_aln.GetSeqPosFromAlnPos(row, aln_start);
433  row_starts.push_back (start);
434  row_stops.push_back (sparse_aln.GetAlnPosFromSeqPos(row, sparse_aln.GetSeqAlnStop(row)));
435  } catch (CException &) {
436  ids_missing = true;
437  } catch (std::exception &) {
438  // if sequence is not in scope,
439  // the above is impossible
440  ids_missing = true;
441  }
442  }
443 
 // NOTE(review): any_data is declared outside the column loop and never
 // reset, so once one column has data every later column is treated as
 // having data too.
444  bool any_data = false;
445  if (!ids_missing) {
446  TSeqPos aln_len = sparse_aln.GetAlnRange().GetLength();
447  while (col < aln_len) {
448  string column;
449  bool match = true;
450  for (size_t row = 0; row < aln_rows.size() && match; row++) {
 // NOTE(review): this requires row_starts[row] >= col and
 // row_stops[row] <= col; if starts are normally <= stops the test is
 // rarely true and the comparisons may be inverted - confirm.
451  if (row_starts[row] >= col && row_stops[row] <= col
452  && aln_rows[row].length() > col) {
453  string nt = aln_rows[row].substr(col - row_starts[row], 1);
454  if (NStr::Equal (nt, "-")) {
455  if (internal_gaps) {
456  match = false;
457  }
458  } else {
459  column += nt;
460  }
461  any_data = true;
462  }
463  }
464  if (!any_data) {
465  match = false;
466  }
467  if (match) {
468  if (!NStr::IsBlank (column)) {
 // same all-pairs comparison as in the dense-seg branch
469  string::iterator it1 = column.begin();
470  string::iterator it2 = it1;
471  ++it2;
472  while (match && it2 != column.end()) {
473  if (!s_AmbiguousMatch (*it1, *it2)) {
474  match = false;
475  }
476  ++it2;
477  if (it2 == column.end()) {
478  ++it1;
479  it2 = it1;
480  ++it2;
481  }
482  }
483  }
484  if (match) {
485  ++num_match;
486  }
487  }
488  col++;
489  }
490  }
491  } catch (CException &) {
492  ids_missing = true;
493  } catch (std::exception &) {
494  ids_missing = true;
495  }
496  }
497 
498  if (ids_missing) {
499  // if no columns, set col to one, so that we'll get a zero percent id error
500  col = 1;
501  num_match = 0;
502  }
503 
 // col == 0 (no columns examined) skips the report and avoids dividing by zero
504  if (col > 0) {
 // integer division: the percentage is truncated, not rounded
505  size_t pct_id = (num_match * 100) / col;
506  if (pct_id < 50) {
 // NOTE(review): the line that opens this call (listing line 507) is absent
 // from this listing; only its message and `align` arguments are visible.
508  "PercentIdentity: This alignment has a percent identity of " + NStr::NumericToString (pct_id) + "%",
509  align);
510  }
511  }
512 }
513 
514 
516 (const TDenseg& denseg,
517  const CSeq_align& align)
518 {
519  // assert dim >= 2
520  x_ValidateDim(denseg, align);
521 
522  size_t dim = denseg.GetDim();
523  size_t numseg = denseg.GetNumseg();
524  string label;
525  denseg.GetIds()[0]->GetLabel (&label);
526 
527 
528  string context;
529  size_t bar_pos = NStr::Find(label, "|");
530  if ( bar_pos != string::npos ) {
531  context = label.substr(bar_pos+1);
532  } else {
533  context = label;
534  }
535 
536 
537  // assert dim == Ids.size()
538  if ( dim != denseg.GetIds().size() ) {
540  "SeqId: The Seqalign has more or fewer ids than the number of rows in the alignment (context "
541  + context + "). Look for possible formatting errors in the ids.", align);
542  }
543 
544  // assert numseg == Lens.size()
545  if ( numseg != denseg.GetLens().size() ) {
547  "Mismatch between specified numseg (" +
548  NStr::SizetToString(numseg) +
549  ") and number of Lens (" +
550  NStr::SizetToString(denseg.GetLens().size()) + ")",
551  align);
552  }
553 
554  // assert dim * numseg == Starts.size()
555  if ( dim * numseg != denseg.GetStarts().size() ) {
557  "The number of Starts (" +
558  NStr::SizetToString(denseg.GetStarts().size()) +
559  ") does not match the expected size of dim * numseg (" +
560  NStr::SizetToString(dim * numseg) + ")", align);
561  }
562 
563  x_ValidateStrand(denseg, align);
564  x_ValidateFastaLike(denseg, align);
565  x_ValidateSegmentGap(denseg, align);
566 
567 #if 0
568  // commented out in C Toolkit
569  // look for short alignment
570  int align_len = 0;
571  for (size_t i = 0; i < numseg; i++) {
572  align_len += denseg.GetLens()[i];
573  }
574  bool is_short = false;
575  for (size_t i = 0; i < dim && !is_short; i++) {
577  if (bsh && bsh.IsSetInst() && bsh.IsSetInst_Length() && align_len < bsh.GetInst_Length()) {
578  is_short = true;
579  }
580  }
581  if (is_short) {
582  PostErr (eDiag_Info, eErr_SEQ_ALIGN_ShortAln, "This alignment is shorter than at least one non-farpointer sequence.", align);
583  }
584 #endif
585 
586  // operations that require remote fetching
587  if ( m_Imp.IsRemoteFetch() ) {
588  x_ValidateSeqId(align);
589  x_ValidateSeqLength(denseg, align);
590  }
591 }
592 
593 
594 
595 
597 (const TPacked& packed,
598  const CSeq_align& align)
599 {
600 
601  // assert dim >= 2
602  x_ValidateDim(packed, align);
603 
604  size_t dim = packed.GetDim();
605  size_t numseg = packed.GetNumseg();
606 
607  // assert dim == Ids.size()
608  if ( dim != packed.GetIds().size() ) {
610  "SeqId: The Seqalign has more or fewer ids than the number of rows in the alignment. Look for possible formatting errors in the ids.", align);
611  }
612 
613  // assert numseg == Lens.size()
614  if ( numseg != packed.GetLens().size() ) {
616  "Mismatch between specified numseg (" +
617  NStr::SizetToString(numseg) +
618  ") and number of Lens (" +
619  NStr::SizetToString(packed.GetLens().size()) + ")",
620  align);
621  }
622 
623  x_ValidateSegmentGap(packed, align);
624 
625  if ( m_Imp.IsRemoteFetch() ) {
626  x_ValidateSeqId(align);
627  x_ValidateSeqLength(packed, align);
628  }
629 }
630 
631 
633 (const TDendiag& dendiags,
634  const CSeq_align& align)
635 {
636  size_t num_dendiag = 0;
637  ITERATE( TDendiag, dendiag_iter, dendiags ) {
638  ++num_dendiag;
639 
640  const CDense_diag& dendiag = **dendiag_iter;
641  size_t dim = dendiag.GetDim();
642 
643  // assert dim >= 2
644  x_ValidateDim(dendiag, align, num_dendiag);
645 
646  string label;
647  dendiag.GetIds()[0]->GetLabel (&label);
648  string context;
649  size_t bar_pos = NStr::Find(label, "|");
650  if ( bar_pos != string::npos ) {
651  context = label.substr(bar_pos+1);
652  } else {
653  context = label;
654  }
655 
656  // assert dim == Ids.size()
657  if ( dim != dendiag.GetIds().size() ) {
659  "SeqId: In segment " + NStr::SizetToString (num_dendiag)
660  + ", there are more or fewer rows than there are seqids (context "
661  + context + "). Look for possible formatting errors in the ids.", align);
662  }
663 
664  // assert dim == Starts.size()
665  if ( dim != dendiag.GetStarts().size() ) {
667  "Mismatch between specified dimension (" +
668  NStr::SizetToString(dim) +
669  ") and number ofStarts (" +
670  NStr::SizetToString(dendiag.GetStarts().size()) +
671  ") in dendiag " + NStr::SizetToString(num_dendiag), align);
672  }
673 
674  // assert dim == Strands.size() (if exist)
675  if ( dendiag.IsSetStrands() ) {
676  if ( dim != dendiag.GetStrands().size() ) {
678  "Mismatch between specified dimension (" +
679  NStr::SizetToString(dim) +
680  ") and number of Strands (" +
681  NStr::SizetToString(dendiag.GetStrands().size()) +
682  ") in dendiag " + NStr::SizetToString(num_dendiag), align);
683  }
684  }
685 
686  if ( m_Imp.IsRemoteFetch() ) {
687  x_ValidateSeqLength(dendiag, num_dendiag, align);
688  }
689  }
690  if ( m_Imp.IsRemoteFetch() ) {
691  x_ValidateSeqId(align);
692  }
693  x_ValidateSegmentGap (dendiags, align);
694 }
695 
696 
698 (const TStd& std_segs,
699  const CSeq_align& align)
700 {
701  size_t num_stdseg = 0;
702  ITERATE( TStd, stdseg_iter, std_segs) {
703  ++num_stdseg;
704 
705  const CStd_seg& stdseg = **stdseg_iter;
706  size_t dim = stdseg.GetDim();
707 
708  // assert dim >= 2
709  x_ValidateDim(stdseg, align, num_stdseg);
710 
711  // assert dim == Loc.size()
712  if ( dim != stdseg.GetLoc().size() ) {
713  string label;
714  stdseg.GetLoc()[0]->GetId()->GetLabel(&label);
715  string context;
716  size_t bar_pos = NStr::Find(label, "|");
717  if ( bar_pos != string::npos ) {
718  context = label.substr(bar_pos+1);
719  } else {
720  context = label;
721  }
723  "SeqId: In segment " + NStr::SizetToString (num_stdseg)
724  + ", there are more or fewer rows than there are seqids (context "
725  + context + "). Look for possible formatting errors in the ids.", align);
726  }
727 
728  // assert dim == Ids.size()
729  if ( stdseg.IsSetIds() ) {
730  if ( dim != stdseg.GetIds().size() ) {
732  "Mismatch between specified dimension (" +
733  NStr::SizetToString(dim) +
734  ") and number of Seq-ids (" +
735  NStr::SizetToString(stdseg.GetIds().size()) + ")",
736  align);
737  }
738  }
739  }
740 
741  x_ValidateStrand(std_segs, align);
742  x_ValidateSegmentGap(std_segs, align);
743 
744  if ( m_Imp.IsRemoteFetch() ) {
745  x_ValidateSeqId(align);
746  x_ValidateSeqLength(std_segs, align);
747  }
748 }
749 
750 
751 template <typename T>
753 (T& obj,
754  const CSeq_align& align,
755  size_t part)
756 {
757  bool rval = false;
758 
759  if ( !obj.IsSetDim() || obj.GetDim() == 0) {
760  if (part > 0) {
762  "Segs: Segment " + NStr::SizetToString (part) + "has dimension zero", align);
763  } else {
765  "Dim: This alignment has dimension zero", align);
766  }
767  } else if (obj.GetDim() == 1) {
768  string msg = "";
769  EErrType et;
770  if (part > 0) {
772  msg = "Segs: Segment " + NStr::SizetToString (part) + " apparently has only one sequence. Each portion of the alignment must have at least two sequences.";
773  } else {
775  msg = "Dim: This seqalign apparently has only one sequence. Each alignment must have at least two sequences.";
776  }
778  if (id) {
780  if (bsh) {
781  int version = 0;
782  const string& label = GetAccessionFromObjects(bsh.GetCompleteBioseq(), NULL, *m_Scope, &version);
783  msg += " context " + label;
784  }
785  }
786  PostErr (eDiag_Error, et, msg, align);
787  } else {
788  rval = true;
789  }
790 
791  return rval;
792 }
793 
794 
795 //===========================================================================
796 // x_ValidateStrand:
797 //
798 // Check if the strand is consistent in SeqAlignment of global
799 // or partial type.
800 //===========================================================================
801 
803 (const TDenseg& denseg,
804  const CSeq_align& align)
805 {
806  if ( !denseg.IsSetStrands() ) {
807  return;
808  }
809 
810  size_t dim = denseg.GetDim();
811  size_t numseg = denseg.GetNumseg();
812  const CDense_seg::TStrands& strands = denseg.GetStrands();
813 
814  // go through id for each alignment sequence
815  for ( size_t id = 0; id < dim; ++id ) {
816  ENa_strand strand1 = strands[id];
817 
818  for ( size_t seg = 0; seg < numseg; ++seg ) {
819  ENa_strand strand2 = strands[id + (seg * dim)];
820 
821  // skip undefined strand
822  if ( strand2 == eNa_strand_unknown ||
823  strand2 == eNa_strand_other ) {
824  continue;
825  }
826 
827  if ( strand1 == eNa_strand_unknown ||
828  strand1 == eNa_strand_other ) {
829  strand1 = strand2;
830  continue;
831  }
832 
833  // strands should be same for a given seq-id
834  if ( strand1 != strand2 ) {
836  "Strand: The strand labels for SeqId " +
837  denseg.GetIds()[id]->AsFastaString() +
838  " are inconsistent across the alignment. "
839  "The first inconsistent region is the " +
840  NStr::SizetToString(seg + 1) + "(th) region, near sequence position "
841  + NStr::SizetToString(denseg.GetStarts()[id + (seg * dim)]), align);
842  break;
843  }
844  }
845  }
846 }
847 
848 
850 (const TStd& std_segs,
851  const CSeq_align& align)
852 {
854  map< string, bool> reported;
855  int region = 1;
856 
857  ITERATE ( TStd, stdseg, std_segs ) {
858  ITERATE ( CStd_seg::TLoc, loc_iter, (*stdseg)->GetLoc() ) {
859  const CSeq_loc& loc = **loc_iter;
860 
861  if ( !IsOneBioseq(loc, m_Scope) ) {
862  // !!! should probably be an error
863  continue;
864  }
865  CConstRef<CSeq_id> id(&GetId(loc, m_Scope));
866  string id_label = id->AsFastaString();
867 
868  ENa_strand strand = GetStrand(loc, m_Scope);
869 
870  if ( strand == eNa_strand_unknown ||
871  strand == eNa_strand_other ) {
872  continue;
873  }
874 
875  if ( strands[id_label] == eNa_strand_unknown ||
876  strands[id_label] == eNa_strand_other ) {
877  strands[id_label] = strand;
878  reported[id_label] = false;
879  } else if (!reported[id_label]
880  && strands[id_label] != strand ) {
881  TSeqPos start = loc.GetStart(eExtreme_Positional);
883  "Strand: The strand labels for SeqId " + id_label +
884  " are inconsistent across the alignment. The first inconsistent region is the "
885  + NStr::IntToString (region) + "(th) region, near sequence position "
886  + NStr::IntToString (start), align);
887  reported[id_label] = true;
888  }
889  }
890  region++;
891  }
892 }
893 
894 
896 {
897  size_t match = 0;
898  size_t min_len = b1.GetInst().GetLength();
899  if (b2.GetInst().GetLength() < min_len) {
900  min_len = b2.GetInst().GetLength();
901  }
902  if (min_len == 0) {
903  return 0;
904  }
905  if (b1.IsAa() && !b2.IsAa()) {
906  return 0;
907  } else if (!b1.IsAa() && b2.IsAa()) {
908  return 0;
909  }
910 
911  try {
914  for ( CSeqVector_CI sv1_iter(sv1), sv2_iter(sv2); (sv1_iter) && (sv2_iter); ++sv1_iter, ++sv2_iter ) {
915  if (*sv1_iter == *sv2_iter || *sv1_iter == 'N' || *sv2_iter == 'N') {
916  match++;
917  }
918  }
919 
920  match = (match * 100) / min_len;
921 
922  } catch (CException& ) {
923  match = 0;
924  }
925  return match;
926 }
927 
928 
929 //===========================================================================
930 // x_ValidateFastaLike:
931 //
932 // Check if an alignment is FASTA-like.
933 // Alignment is FASTA-like if all gaps are at the end with dimensions > 2.
934 //===========================================================================
935 
937 (const TDenseg& denseg,
938  const CSeq_align& align)
939 {
940  // check only global or partial type
941  if ( (align.GetType() != CSeq_align::eType_global &&
942  align.GetType() != CSeq_align::eType_partial) ||
943  denseg.GetDim() <= 2 ) {
944  return;
945  }
946 
947  size_t dim = denseg.GetDim();
948  size_t numseg = denseg.GetNumseg();
949 
950  vector<string> fasta_like;
951 
952  for ( int id = 0; id < s_GetNumIdsToUse(denseg); ++id ) {
953  bool gap = false;
954 
955  const CDense_seg::TStarts& starts = denseg.GetStarts();
956  for ( size_t seg = 0; seg < numseg; ++ seg ) {
957  // if start value is -1, set gap flag to true
958  if ( starts[id + (dim * seg)] < 0 ) {
959  gap = true;
960  } else if ( gap ) {
961  // if a positive start value is found after the initial -1
962  // start value, it's not fasta like.
963  //no need to check this sequence further
964  return;
965  }
966 
967  if ( seg == numseg - 1) {
968  // if no more positive start value are found after the initial
969  // -1 start value, it's fasta like
970  fasta_like.push_back(denseg.GetIds()[id]->AsFastaString());
971  }
972  }
973  }
974 
975  if ( !fasta_like.empty() ) {
976  CDense_seg::TIds::const_iterator id_it = denseg.GetIds().begin();
977  string context = (*id_it)->GetSeqIdString();
978  CBioseq_Handle master_seq = m_Scope->GetBioseqHandle(**id_it);
979  bool is_fasta_like = false;
980  if (master_seq) {
981  ++id_it;
982  while (id_it != denseg.GetIds().end() && !is_fasta_like) {
983  CBioseq_Handle seq = m_Scope->GetBioseqHandle(**id_it);
984  if (!seq || s_PercentBioseqMatch (master_seq, seq) < 50) {
985  is_fasta_like = true;
986  }
987  ++id_it;
988  }
989  } else {
990  is_fasta_like = true;
991  }
992  if (is_fasta_like) {
994  "Fasta: This may be a fasta-like alignment for SeqId: "
995  + fasta_like.front() + " in the context of " + context, align);
996  }
997  }
998 
999 }
1000 
1001 
1002 
1003 
1005 {
1006  TSegmentGapV seggaps;
1007  size_t align_pos = 0;
1008 
1009  int numseg = denseg.GetNumseg();
1010  int dim = denseg.GetDim();
1011  const CDense_seg::TStarts& starts = denseg.GetStarts();
1012 
1013  for (size_t seg = 0; seg < numseg; ++seg) {
1014  bool seggap = true;
1015  for (int id = 0; id < dim; ++id) {
1016  if (starts[seg * dim + id] != -1) {
1017  seggap = false;
1018  break;
1019  }
1020  }
1021  if (seggap) {
1022  // no sequence is present in this segment
1023  string label = "";
1024  if (denseg.IsSetIds() && denseg.GetIds().size() > 0) {
1025  denseg.GetIds()[0]->GetLabel(&label, CSeq_id::eContent);
1026  }
1027  if (NStr::IsBlank(label)) {
1028  label = "unknown";
1029  }
1030  seggaps.push_back(TSegmentGapV::value_type(seg, align_pos, label));
1031  }
1032  if (denseg.IsSetLens() && denseg.GetLens().size() > (unsigned int)seg) {
1033  align_pos += denseg.GetLens()[seg];
1034  }
1035  }
1036  return seggaps;
1037 }
1038 
1040 {
1041  ITERATE(TSegmentGapV, itr, seggaps) {
1042  // no sequence is present in this segment
1044  "Segs: Segment " + NStr::SizetToString(itr->seg_num + 1) + " (near alignment position "
1045  + NStr::SizetToString(itr->align_pos) + ") in the context of "
1046  + itr->label + " contains only gaps. Each segment must contain at least one actual sequence -- look for columns with all gaps and delete them.",
1047  align);
1048  }
1049 }
1050 
1051 
1052 //===========================================================================
1053 // x_ValidateSegmentGap:
1054 //
1055 // Check if there is a gap for all sequences in a segment.
1056 //===========================================================================
1057 
1059 (const TDenseg& denseg,
1060  const CSeq_align& align)
1061 {
1062  TSegmentGapV seggaps = FindSegmentGaps(denseg, m_Scope);
1063  x_ReportSegmentGaps(seggaps, align);
1064 }
1065 
1066 
1068 {
1069  TSegmentGapV seggaps;
1070 
1071  static Uchar bits[] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
1072 
1073  size_t numseg = packed.GetNumseg();
1074  size_t dim = packed.GetDim();
1075  const CPacked_seg::TPresent& present = packed.GetPresent();
1076 
1077  size_t align_pos = 0;
1078  for (size_t seg = 0; seg < numseg; ++seg) {
1079  size_t id = 0;
1080  for (; id < dim; ++id) {
1081  size_t i = id + (dim * seg);
1082  if ((present[i / 8] & bits[i % 8])) {
1083  break;
1084  }
1085  }
1086  if (id == dim) {
1087  // no sequence is present in this segment
1088  string label = "";
1089  if (packed.IsSetIds() && packed.GetIds().size() > 0) {
1090  packed.GetIds()[0]->GetLabel(&label, CSeq_id::eContent);
1091  }
1092  if (NStr::IsBlank(label)) {
1093  label = "Unknown";
1094  }
1095  seggaps.push_back(TSegmentGapV::value_type(seg, align_pos, label));
1096  }
1097  if (packed.IsSetLens() && packed.GetLens().size() > seg) {
1098  align_pos += packed.GetLens()[seg];
1099  }
1100  }
1101 
1102  return seggaps;
1103 }
1104 
1105 
1107 (const TPacked& packed,
1108  const CSeq_align& align)
1109 {
1110  TSegmentGapV seggaps = FindSegmentGaps(packed, m_Scope);
1111  x_ReportSegmentGaps(seggaps, align);
1112 }
1113 
1114 
1116 {
1117  TSegmentGapV seggaps;
1118 
1119  size_t seg = 0;
1120  size_t align_pos = 0;
1121  ITERATE(TStd, stdseg, std_segs) {
1122  bool gap = true;
1123  size_t len = 0;
1124  string label = "";
1125  ITERATE(CStd_seg::TLoc, loc, (*stdseg)->GetLoc()) {
1126  if (!(*loc)->IsEmpty() && !(*loc)->IsNull()) {
1127  gap = false;
1128  break;
1129  } else if (len == 0) {
1130  len = GetLength(**loc, scope);
1131  if (NStr::IsBlank(label)) {
1132  (*loc)->GetId()->GetLabel(&label, CSeq_id::eContent);
1133  }
1134  }
1135  }
1136  if (gap) {
1137  if (NStr::IsBlank(label)) {
1138  label = "Unknown";
1139  }
1140  seggaps.push_back(TSegmentGapV::value_type(seg, align_pos, label));
1141  }
1142  align_pos += len;
1143  ++seg;
1144  }
1145  return seggaps;
1146 }
1147 
1148 
1150 (const TStd& std_segs,
1151  const CSeq_align& align)
1152 {
1153  TSegmentGapV seggaps = FindSegmentGaps(std_segs, m_Scope);
1154  x_ReportSegmentGaps(seggaps, align);
1155 }
1156 
1157 
1159 {
1160  TSegmentGapV seggaps;
1161 
1162  size_t seg = 0;
1163  TSeqPos align_pos = 1;
1164  ITERATE(TDendiag, diag_seg, dendiags) {
1165  if (!(*diag_seg)->IsSetDim() || (*diag_seg)->GetDim() == 0) {
1166  string label = "";
1167  if ((*diag_seg)->IsSetIds() && (*diag_seg)->GetIds().size() > 0) {
1168  (*diag_seg)->GetIds().front()->GetLabel(&label);
1169  }
1170  if (NStr::IsBlank(label)){
1171  label = "Unknown";
1172  }
1173  seggaps.push_back(TSegmentGapV::value_type(seg, align_pos, label));
1174  }
1175  if ((*diag_seg)->IsSetLen()) {
1176  align_pos += (*diag_seg)->GetLen();
1177  }
1178  ++seg;
1179  }
1180 
1181  return seggaps;
1182 }
1183 
1184 
1186 (const TDendiag& dendiags,
1187  const CSeq_align& align)
1188 {
1189  TSegmentGapV seggaps = FindSegmentGaps(dendiags, m_Scope);
1190  x_ReportSegmentGaps(seggaps, align);
1191 }
1192 
1193 
1194 //===========================================================================
1195 // x_ValidateSeqId:
1196 //
1197 // Validate SeqId in sequence alignment.
1198 //===========================================================================
1199 
1201 {
 // Collects the Seq-ids used by the alignment (x_GetIds) and, for each
 // local id, checks that m_Scope can produce a Bioseq handle for it.
 // Ids that are not local are not looked up here.
1202  vector< CRef< CSeq_id > > ids;
1203  x_GetIds(align, ids);
1204 
1205  ITERATE( vector< CRef< CSeq_id > >, id_iter, ids ) {
1206  const CSeq_id& id = **id_iter;
1207  if ( id.IsLocal() ) {
 // An unresolved local id yields the "could not be found" message
 // below, which names the id in FASTA form.
1208  if ( !m_Scope->GetBioseqHandle(id) ) {
1210  "SeqId: The sequence corresponding to SeqId " +
1211  id.AsFastaString() + " could not be found.",
1212  align);
1213  }
1214  }
1215  }
1216 }
1217 
1218 
1220 (const CSeq_align& align,
1221  vector< CRef< CSeq_id > >& ids)
1222 {
 // Fills ids with the Seq-ids referenced by the alignment's segments.
 // Any previous content of ids is discarded first. Which ids are
 // gathered depends on the selected Segs variant; variants not handled
 // below leave ids empty.
1223  ids.clear();
1224 
1225  switch ( align.GetSegs().Which() ) {
1226 
 // Dense-diag list: append the ids of every diag, in order.
1228  ITERATE( TDendiag, diag_seg, align.GetSegs().GetDendiag() ) {
1229  const vector< CRef< CSeq_id > >& diag_ids = (*diag_seg)->GetIds();
1230  copy(diag_ids.begin(), diag_ids.end(), back_inserter(ids));
1231  }
1232  break;
1233 
 // Dense-seg: take its id list as-is.
1235  ids = align.GetSegs().GetDenseg().GetIds();
1236  break;
1237 
 // Packed-seg: append its id list.
1239  copy(align.GetSegs().GetPacked().GetIds().begin(),
1240  align.GetSegs().GetPacked().GetIds().end(),
1241  back_inserter(ids));
1242  break;
1243 
 // Std-seg list: one id per location, obtained through GetId(loc, scope).
 // NOTE(review): the const_cast drops the constness of the id returned
 // by GetId so that it can be stored in a CRef<CSeq_id>; presumably
 // callers only read these ids - confirm nothing modifies them.
1245  ITERATE( TStd, std_seg, align.GetSegs().GetStd() ) {
1246  ITERATE( CStd_seg::TLoc, loc, (*std_seg)->GetLoc() ) {
1247  CSeq_id* idp = const_cast<CSeq_id*>(&GetId(**loc, m_Scope));
1248  CRef<CSeq_id> ref(idp);
1249  ids.push_back(ref);
1250  }
1251  }
1252  break;
1253 
1254  default:
1255  break;
1256  }
1257 }
1258 
1259 
1260 string s_DescribeSegment(const CSeq_id& id, const CSeq_id& id_context, size_t segment, size_t pos, bool use_in = false)
1261 {
1262  string label;
1263  id.GetLabel(&label);
1264  string context;
1265  id_context.GetLabel(&context, CSeq_id::eContent);
1266 
1267  string seg_string = "sequence " + label + "," + (use_in ? " in " : " ") +
1268  "segment " + NStr::NumericToString(segment) +
1269  " (near sequence position " + NStr::NumericToString(pos) +
1270  ")" + (use_in ? ", " : " ") + "context " + context;
1271  return seg_string;
1272 }
1273 
1274 
1276 (const CSeq_align& align,
1277 const CSeq_id& id,
1278 const CSeq_id& id_context,
1279 size_t segment,
1280 size_t pos,
1281 EErrType et,
1282 EDiagSev sev,
1283 const string& prefix,
1284 const string& message)
1285 {
1286  PostErr(sev, et, prefix + ": In " + s_DescribeSegment(id, id_context, segment, pos) + ", " + message, align);
1287 }
1288 
1289 static const string kAlignmentTooLong = "the alignment claims to contain residue coordinates that are past the end of the sequence. Either the sequence is too short, or there are extra characters or formatting errors in the alignment";
1290 
1292 (const CSeq_align& align,
1293  const CSeq_id& id,
1294  const CSeq_id& id_context,
1295  size_t segment,
1296  size_t pos)
1297 {
 // Forwards to x_ReportAlignErr with the "Start" prefix and the shared
 // kAlignmentTooLong explanation for the given id, segment and position.
1298  x_ReportAlignErr(align, id, id_context, segment, pos,
1300  "Start", kAlignmentTooLong);
1301 }
1302 
1303 
1305 (const CSeq_align& align,
1306 const CSeq_id& id,
1307 const CSeq_id& id_context,
1308 size_t segment,
1309 size_t pos)
1310 {
 // Forwards to x_ReportAlignErr with the "Start" prefix and the shared
 // kAlignmentTooLong explanation for the given id, segment and position.
1311  x_ReportAlignErr(align, id, id_context, segment, pos,
1313  "Start", kAlignmentTooLong);
1314 }
1315 
1316 
1317 //===========================================================================
1318 // x_ValidateSeqLength:
1319 //
1320 // Check segment length, start and end point in Dense_diag, Dense_seg,
1321 // Packed_seg and Std_seg.
1322 //===========================================================================
1323 
1324 // Make sure that, in Dense_diag alignment, segment length is not greater
1325 // than Bioseq length
1327 (const CDense_diag& dendiag,
1328  size_t dendiag_num,
1329  const CSeq_align& align)
1330 {
1331  size_t dim = dendiag.GetDim();
1332  TSeqPos len = dendiag.GetLen();
1333  const CDense_diag::TIds& ids = dendiag.GetIds();
1334 
1335  const CSeq_id& context_id = *(ids[0]);
1336  CDense_diag::TStarts::const_iterator starts_iter =
1337  dendiag.GetStarts().begin();
1338 
1339  for ( size_t id = 0; id < dim; ++id ) {
1340  TSeqPos bslen = GetLength(*(ids[id]), m_Scope);
1341  TSeqPos start = *starts_iter;
1342 
1343  const CSeq_id& seq_id = *(ids[id]);
1344 
1345  // verify start
1346  if ( start >= bslen ) {
1347  x_ReportStartMoreThanBiolen(align, seq_id, context_id, 1, start);
1348  }
1349 
1350  // verify length
1351  if ( start + len > bslen ) {
1352  x_ReportSumLenStart(align, seq_id, context_id, 1, start);
1353  }
1354  ++starts_iter;
1355  }
1356 }
1357 
1358 
1359 
1360 
1362 (const TDenseg& denseg,
1363  const CSeq_align& align)
1364 {
 // For every id row of the Dense-seg, checks each segment that is present
 // in that row (start != -1):
 //   - start + segment length must not exceed the Bioseq length, and
 //   - start + segment length must equal the start of the next present
 //     segment in the same row.
 // Rows flagged as minus strand are walked from the last segment to the
 // first. The first id of the Dense-seg is used as context in messages.
 //
 // NOTE(review): ids[0] is dereferenced without checking that ids is
 // non-empty, and starts is indexed by id + seg * dim without checking
 // starts.size(); presumably the caller has validated the dimensions
 // first - confirm.
 // NOTE(review): id, seg and next_seg are int but are compared with
 // size_t values (ids.size(), numseg).
1365  int dim = denseg.GetDim();
1366  size_t numseg = denseg.GetNumseg();
1367  const CDense_seg::TIds& ids = denseg.GetIds();
1368  const CDense_seg::TStarts& starts = denseg.GetStarts();
1369  const CDense_seg::TLens& lens = denseg.GetLens();
1370  bool minus = false;
1371  const CSeq_id& id_context = *ids[0];
1372 
 // Do not walk more segments than there are lengths.
1373  if (numseg > lens.size()) {
1374  numseg = lens.size();
1375  }
1376 
1377  for ( int id = 0; id < ids.size(); ++id ) {
1378  TSeqPos bslen = GetLength(*(ids[id]), m_Scope);
 // The strand entry at index id decides the walking direction for the
 // whole row.
 // NOTE(review): assumes strands has at least ids.size() entries - confirm.
1379  minus = denseg.IsSetStrands() &&
1380  denseg.GetStrands()[id] == eNa_strand_minus;
1381 
1382  for ( int seg = 0; seg < numseg; ++seg ) {
 // starts holds dim entries per segment; for a minus row the
 // segment order is reversed.
1383  size_t curr_index =
1384  id + (minus ? numseg - seg - 1 : seg) * dim;
1385  // no need to verify if segment is not present
1386  if ( starts[curr_index] == -1 ) {
1387  continue;
1388  }
1389  size_t lens_index = minus ? numseg - seg - 1 : seg;
1390 
1391  // verify that start plus segment does not exceed total bioseq len
1392  if ( starts[curr_index] + lens[lens_index] > bslen ) {
1393  x_ReportSumLenStart(align, *(ids[id]), id_context, seg + 1, starts[curr_index]);
1394  }
1395 
1396  // find the next segment that is present
1397  size_t next_index = curr_index;
1398  int next_seg;
1399  for ( next_seg = seg + 1; next_seg < numseg; ++next_seg ) {
1400  next_index =
1401  id + (minus ? numseg - next_seg - 1 : next_seg) * dim;
1402 
1403  if ( starts[next_index] != -1 ) {
1404  break;
1405  }
1406  }
 // No later segment is present in this row: nothing to compare against.
1407  if ( next_seg == numseg || next_index == curr_index ) {
1408  continue;
1409  }
1410 
1411  // length plus start should be equal to the closest next
1412  // start that is not -1
1413  if ( starts[curr_index] + (TSignedSeqPos)lens[lens_index] !=
1414  starts[next_index] ) {
1416  "Start/Length: There is a problem with " +
1417  s_DescribeSegment(*(ids[id]), id_context, seg + 1, starts[curr_index], true) +
1418  ": the segment is too long or short or the next segment has an incorrect start position", align);
1419  }
1420  }
1421  }
1422 }
1423 
1424 
1426 (const TPacked& packed,
1427  const CSeq_align& align)
1428 {
1429  static Uchar bits[] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
1430 
1431  if (!packed.IsSetDim() || !packed.IsSetIds() || !packed.IsSetPresent() || !packed.IsSetNumseg()) {
1432  return;
1433  }
1434 
1435  size_t dim = packed.GetDim();
1436  size_t numseg = packed.GetNumseg();
1437 
1438  const CPacked_seg::TPresent& present = packed.GetPresent();
1439  CPacked_seg::TIds::const_iterator id_it = packed.GetIds().begin();
1440  const CSeq_id& id_context = **id_it;
1441 
1442  for ( size_t id = 0; id < dim && id_it != packed.GetIds().end(); ++id, ++id_it ) {
1443  CBioseq_Handle bsh = m_Scope->GetBioseqHandle (**id_it);
1444  if (bsh) {
1445  string label;
1446  (*id_it)->GetLabel (&label);
1447  TSeqPos seg_start = packed.GetStarts()[id];
1448  if (seg_start >= bsh.GetBioseqLength()) {
1449  x_ReportStartMoreThanBiolen(align, **id_it, id_context, 1, seg_start);
1450  }
1451  for ( size_t seg = 0; seg < numseg; ++seg ) {
1452  size_t i = id + seg * dim;
1453  if ( i/8 < present.size() && (present[i / 8] & bits[i % 8]) ) {
1454  seg_start += packed.GetLens()[seg];
1455  if (seg_start > bsh.GetBioseqLength()) {
1456  x_ReportSumLenStart(align, **id_it, id_context, seg + 1, seg_start);
1457  }
1458  }
1459  }
1460  }
1461  }
1462 }
1463 
1464 
1466 (const TStd& std_segs,
1467  const CSeq_align& align)
1468 {
 // Checks the locations of every Std-seg against the length of the Bioseq
 // they refer to. seg is the 1-based segment number used in messages; the
 // id of the first location of each segment is used as context.
 //
 // NOTE(review): GetLoc().front() assumes the segment has at least one
 // location, and CSeq_loc::GetId() is dereferenced although it returns
 // NULL when a location has no id or several ids - confirm callers rule
 // these cases out.
1469  int seg = 1;
1470  ITERATE( TStd, iter, std_segs ) {
1471  const CStd_seg& stdseg = **iter;
1472  const CSeq_id& id_context = *(stdseg.GetLoc().front()->GetId());
1473 
1474  ITERATE ( CStd_seg::TLoc, loc_iter, stdseg.GetLoc() ) {
1475  const CSeq_loc& loc = **loc_iter;
1476 
 // An Empty location is reported twice (a "Length" and a "Start"
 // problem) at position 0. The id used for the report is copied
 // from the segment's id list when available, else a local id "?".
 // NOTE(review): the id list is indexed with the segment number
 // (seg - 1), not with the position of the location inside the
 // segment - confirm this is intended.
1477  if (loc.IsEmpty()) {
1478  CRef<CSeq_id> id(new CSeq_id());
1479  if (stdseg.IsSetIds() && stdseg.GetIds().size() >= seg) {
1480  id->Assign(*(stdseg.GetIds()[seg - 1]));
1481  } else {
1482  id->SetLocal().SetStr("?");
1483  }
1484  x_ReportAlignErr(align, *id, id_context, seg, 0,
1486  "Length", "End point is less than zero in segment");
1487  x_ReportAlignErr(align, *id, id_context, seg, 0,
1489  "Start", "Start point is less than zero");
1490  }
 // Whole, Empty and Null locations have no range to check.
1491  if ( loc.IsWhole() || loc.IsEmpty() || loc.IsNull() ) {
1492  continue;
1493  }
1494 
 // Locations that do not map to a single Bioseq are skipped.
1495  if ( !IsOneBioseq(loc, m_Scope) ) {
1496  continue;
1497  }
1498 
1499  TSeqPos from = loc.GetTotalRange().GetFrom();
1500  TSeqPos to = loc.GetTotalRange().GetTo();
1501  TSeqPos loclen = GetLength( loc, m_Scope);
1502  TSeqPos bslen = GetLength(GetId(loc, m_Scope), m_Scope);
 // NOTE(review): bslen_str is not referenced in the visible lines.
1503  string bslen_str = NStr::UIntToString(bslen);
1504 
1505  const CSeq_id& id = *(loc.GetId());
1506 
 // NOTE(review): TSeqPos is unsigned, so bslen - 1 wraps when bslen
 // is 0 and the two comparisons below then never fire.
1507  if ( from > bslen - 1 ) {
1508  x_ReportStartMoreThanBiolen(align, id, id_context, seg, from);
1509  }
1510 
1511  if ( to > bslen - 1 ) {
1512  x_ReportAlignErr(align, id, id_context, seg, from,
1514  "Length", kAlignmentTooLong);
1515  }
1516 
1517  if ( loclen > bslen ) {
1518  x_ReportAlignErr(align, id, id_context, seg, from,
1520  "Length", kAlignmentTooLong);
1521  }
1522  }
1523  seg++;
1524  }
1525 }
1526 
1527 
TLen GetLen(void) const
Get the Len member data.
T minus(T x_)
CBioseq_Handle –.
void CreateAnchoredAlnVec(_TAlnStats &aln_stats, TAnchoredAlnVec &out_vec, const CAlnUserOptions &options)
Create anchored alignment from each seq-align in the stats.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:908
bool IsSetStrands(void) const
Check if a value has been assigned to Strands data member.
void push_back(const CSeq_align &aln)
Adding an alignment.
Definition: aln_tests.hpp:87
vector< TSignedSeqPos > TStarts
Definition: Dense_seg_.hpp:107
Set coding to printable coding (Iupacna or Iupacaa)
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:965
void ValidateSeqAlign(const CSeq_align &align)
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const TInst & GetInst(void) const
void x_ValidateStd(const TStd &stdsegs, const CSeq_align &align)
vector< CRef< CSeq_id > > TIds
Definition: Dense_seg_.hpp:106
Sparse alignment.
Definition: sparse_aln.hpp:50
static bool s_AmbiguousMatch(char a, char b)
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
Definition: ncbistr.hpp:5138
void x_ValidatePacked(const TPacked &packed, const CSeq_align &align)
const string version
version string
Definition: variables.hpp:66
static TSegmentGapV FindSegmentGaps(const TPacked &packed, CScope *scope)
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:102
const TStarts & GetStarts(void) const
Get the Starts member data.
const TDendiag & GetDendiag(void) const
Get the variant data.
Definition: Seq_align_.hpp:713
#define T(s)
Definition: common.h:225
static size_t s_GetNumIdsToUse(const CDense_seg &denseg)
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2628
const TScore & GetScore(void) const
Get the Score member data.
Definition: Seq_align_.hpp:883
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:73
User-defined methods of the data storage class.
const objects::CSeq_id & GetSeqId(TNumrow row) const
Get seq-id for the row.
Definition: sparse_aln.cpp:264
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:953
Search in all loaded TSEs in the scope.
Definition: scope.hpp:126
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:644
void SetEndChar(TResidue gap_char)
Definition: alnvec.hpp:362
TDim GetDim(void) const
Alignment dimension (number of sequence rows in the alignment)
Definition: sparse_aln.cpp:69
string & GetAlnSeqString(TNumrow row, string &buffer, const TSignedRange &rq_aln_rng, bool force_translation=false) const
Fetch alignment sequence data.
Definition: sparse_aln.cpp:549
position_type GetLength(void) const
Definition: range.hpp:158
const TIds & GetIds(void) const
Get the Ids member data.
CSeq_align::C_Segs::TDenseg TDenseg
Definition: validatorp.hpp:977
void x_ReportStartMoreThanBiolen(const CSeq_align &align, const CSeq_id &id, const CSeq_id &id_context, size_t segment, size_t pos)
bool IsSetType(void) const
type of object within class Check if a value has been assigned to Type data member.
const TLoc & GetLoc(void) const
Get the Loc member data.
Definition: Std_seg_.hpp:347
CSeq_align::C_Segs::TDendiag TDendiag
Definition: validatorp.hpp:976
mapping pieces together
Definition: Seq_align_.hpp:103
TSignedSeqPos GetSeqAlnStart(TNumrow row) const
Definition: sparse_aln.cpp:271
static bool AlignmentScorePercentIdOk(const CSeq_align &align)
void x_ValidateStrand(const TDenseg &denseg, const CSeq_align &align)
CSeq_align::C_Segs::TPacked TPacked
Definition: validatorp.hpp:978
bool IsStd(void) const
Check if variant Std is selected.
Definition: Seq_align_.hpp:733
TDim GetDim(void) const
Get the Dim member data.
TRng GetAlnRange(void) const
Get whole alignment range.
Definition: sparse_aln.cpp:238
vector< TSeqPos > TLens
Definition: Dense_seg_.hpp:108
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:895
const NCBI_NS_NCBI::CEnumeratedTypeValues *ENUM_METHOD_NAME() ENa_strand(void)
Access to ENa_strand's attributes (values, names) as defined in spec.
Warning message.
Definition: ncbidiag.hpp:646
Default IAlnSeqId implementation based on CSeq_id_Handle.
Definition: aln_seqid.hpp:114
#define NULL
Definition: ncbistd.hpp:225
void x_ValidateFastaLike(const TDenseg &denseg, const CSeq_align &align)
bool IsSetScore(void) const
for whole alignment Check if a value has been assigned to Score data member.
Definition: Seq_align_.hpp:871
static size_t s_PercentBioseqMatch(CBioseq_Handle b1, CBioseq_Handle b2)
TDim GetDim(void) const
Get the Dim member data.
Definition: Std_seg_.hpp:285
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
const char * match_list
bool IsSetIds(void) const
sequences in order Check if a value has been assigned to Ids data member.
Definition: Dense_seg_.hpp:480
TType GetType(void) const
Get the Type member data.
Definition: Seq_align_.hpp:796
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Std_seg_.hpp:322
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3296
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:893
void x_ReportAlignErr(const CSeq_align &align, const CSeq_id &id, const CSeq_id &id_context, size_t segment, size_t pos, EErrType et, EDiagSev sev, const string &prefix, const string &message)
const TType & GetType(void) const
Get the Type member data.
int i
void BuildAln(TAnchoredAlnVec &in_alns, CAnchoredAln &out_aln, const CAlnUserOptions &options, TAlnSeqIdIRef pseudo_seqid=TAlnSeqIdIRef())
Build anchored alignment from a set of alignments.
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:612
bool x_ValidateDim(T &obj, const CSeq_align &align, size_t part=0)
const TStrands & GetStrands(void) const
Get the Strands member data.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5045
unsigned char Uchar
Alias for unsigned char.
Definition: ncbitype.h:95
bool IsAa(void) const
CSeq_align::C_Segs::TStd TStd
Definition: validatorp.hpp:979
void x_ValidateSegmentGap(const TDenseg &denseg, const CSeq_align &align)
string GetAccessionFromObjects(const CSerialObject *obj, const CSeq_entry *ctx, CScope &scope, int *version)
Definition: utilities.cpp:419
struct ambchar AmbCharData
vector< char > TPresent
Definition: Packed_seg_.hpp:96
CAnchoredAln::TDim TDim
Synonym of TNumrow.
Definition: sparse_aln.hpp:56
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:101
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Definition: Seq_id.cpp:1406
EErrType
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2762
TSignedSeqPos GetAlnPosFromSeqPos(TNumrow row, TSeqPos seq_pos, ESearchDirection dir=eNone, bool try_reverse_dir=true) const
Map sequence position to alignment coordinates.
Definition: sparse_aln.cpp:373
string & GetColumnVector(string &buffer, TSeqPos aln_pos, TResidueCount *residue_count=0, bool gaps_in_count=false) const
Definition: alnvec.cpp:984
Container mapping seq-aligns to vectors of participating seq-ids.
Definition: aln_tests.hpp:55
list< CRef< CSeq_align > > Tdata
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
void x_ValidateSeqId(const CSeq_align &align)
Informational message.
Definition: ncbidiag.hpp:645
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:897
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:644
IAlnSeqId extracting functor.
static const int num_ambiguities
Error message.
Definition: ncbidiag.hpp:647
CSeqVector –.
Definition: seq_vector.hpp:64
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5514
static int match(register const unsigned char *eptr, register const uschar *ecode, const unsigned char *mstart, int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, int flags, unsigned int rdepth)
Definition: pcre_exec.c:431
Options for different alignment manager operations.
user defined object
Definition: Seqdesc_.hpp:124
Query-anchored alignment can be 2 or multi-dimensional.
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Dense_seg_.hpp:492
numerical value
Definition: Na_strand.hpp:63
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
CValidError_imp & m_Imp
Definition: validatorp.hpp:930
bool IsGenomic(void) const
Definition: validatorp.hpp:592
bool IsSetIds(void) const
Check if a value has been assigned to Ids data member.
Definition: Std_seg_.hpp:310
const TDisc & GetDisc(void) const
Get the variant data.
Definition: Seq_align_.cpp:197
bool IsDenseg(void) const
Check if variant Denseg is selected.
Definition: Seq_align_.hpp:727
Definition: map.hpp:337
bool IsOneBioseq(const CSeq_loc &loc, CScope *scope)
Returns true if all embedded CSeq_ids represent the same CBioseq, else false.
USING_SCOPE(sequence)
static const char * column
Definition: stats.c:23
vector< TSegmentGap > TSegmentGapV
Definition: validatorp.hpp:974
position_type GetTo(void) const
Definition: range.hpp:142
Helper class which collects seq-align statistics: seq-ids participating in alignments and rows...
Definition: aln_stats.hpp:56
void x_GetIds(const CSeq_align &align, vector< CRef< CSeq_id > > &ids)
vector< ENa_strand > TStrands
Definition: Dense_seg_.hpp:109
string s_DescribeSegment(const CSeq_id &id, const CSeq_id &id_context, size_t segment, size_t pos, bool use_in=false)
bool IsSetSegs(void) const
Check if a value has been assigned to Segs data member.
Definition: Seq_align_.hpp:896
const TPacked & GetPacked(void) const
Get the variant data.
Definition: Seq_align_.cpp:175
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition: pointer.h:1084
vector< CRef< CSeq_loc > > TLoc
Definition: Std_seg_.hpp:93
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:408
TInst_Length GetInst_Length(void) const
position_type GetFrom(void) const
Definition: range.hpp:134
ENa_strand GetStrand(const CSeq_loc &loc, CScope *scope=0)
Returns eNa_strand_unknown if multiple Bioseqs in loc Returns eNa_strand_other if multiple strands in...
vector< CRef< CAnchoredAln > > TAnchoredAlnVec
Collection of anchored alignments.
CException –.
Definition: ncbiexpt.hpp:709
const Tdata & Get(void) const
Get the member data.
TSignedSeqPos GetSeqPosFromAlnPos(TNumrow for_row, TSeqPos aln_pos, ESearchDirection dir=eNone, bool try_reverse_dir=true) const
Definition: sparse_aln.cpp:385
TSeqPos GetBioseqLength(void) const
CScope –.
Definition: scope.hpp:90
bool IsGpipe(void) const
Definition: validatorp.hpp:589
void x_ValidateAlignPercentIdentity(const CSeq_align &align, bool internal_gaps)
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found...
static const AmbCharData ambiguity_list[]
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void x_ValidateDendiag(const TDendiag &dendiags, const CSeq_align &align)
static CRef< CScope > m_Scope
int len
void x_ReportSumLenStart(const CSeq_align &align, const CSeq_id &id, const CSeq_id &id_context, size_t segment, size_t pos)
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:923
void x_ValidateSeqLength(const TDenseg &denseg, const CSeq_align &align)
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
CConstRef< CSeq_id > GetReportableSeqIdForAlignment(const CSeq_align &align, CScope &scope)
Definition: utilities.cpp:391
vector< CRef< CSeq_id > > TIds
Definition: Dense_diag_.hpp:93
vector< CRef< CScore > > TScore
Definition: Seq_align_.hpp:398
const TIds & GetIds(void) const
Get the Ids member data.
bool IsSetType(void) const
Check if a value has been assigned to Type data member.
Definition: Seq_align_.hpp:777
static bool EqualNocase(const CTempString str, SIZE_TYPE pos, SIZE_TYPE n, const char *pattern)
Case-insensitive equality of a substring with a pattern.
Definition: ncbistr.hpp:5460
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:70
virtual ~CValidError_align(void)
void SetGapChar(TResidue gap_char)
Definition: alnvec.hpp:333
E_Choice
Choice variants.
Definition: Seq_align_.hpp:131
User-defined methods of the data storage class.
TSeqPos GetAlnStop(TNumseg seg) const
Definition: alnmap.hpp:488
static bool Equal(const CTempString str, SIZE_TYPE pos, SIZE_TYPE n, const char *pattern, ECase use_case=eCase)
Test for equality of a substring with a pattern.
Definition: ncbistr.hpp:5486
TSignedSeqPos GetSeqAlnStop(TNumrow row) const
Definition: sparse_aln.cpp:278
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
bool IsSetInst(void) const
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
void x_ReportSegmentGaps(const TSegmentGapV &seggaps, const CSeq_align &align)
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:443
static bool IsTpaAlignment(const CDense_seg &denseg, CScope &scope)
bool IsEmpty(void) const
Check if variant Empty is selected.
Definition: Seq_loc_.hpp:516
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:64
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:98
const TStd & GetStd(void) const
Get the variant data.
Definition: Seq_align_.hpp:739
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsSetInst_Length(void) const
static string NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:4380
unsigned int
Definition: types.hpp:1153
void x_ValidateDenseg(const TDenseg &denseg, const CSeq_align &align)
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_align_.hpp:678
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
Modified on Fri Jun 24 18:26:07 2016 by modify_doxy.py rev. 426318