NCBI C++ ToolKit
redo_alignment.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* ===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================*/
24 
25 /** @file redo_alignment.c
26  * Routines for redoing a set of alignments, using either
27  * composition matrix adjustment or the Smith-Waterman algorithm (or
28  * both.)
29  *
30  * @author Alejandro Schaffer, E. Michael Gertz
31  */
32 
33 #include <stdlib.h>
34 #include <assert.h>
35 #include <math.h>
43 
44 /** The natural log of 2, defined in newer systems as M_LN2 in math.h, but
45  missing in older systems. */
46 #define LOCAL_LN2 0.69314718055994530941723212145818
47 
48 /** Define COMPO_INTENSE_DEBUG to be true to turn on rigorous but
49  * expensive consistency tests in the composition_adjustment
50  * module.
51  *
52  * This macro is usually used as part of a C-conditional
53  * if (COMPO_INTENSE_DEBUG) {
54  * perform expensive tests
55  * }
56  * The C compiler will then validate the code to perform the tests, but
57  * will almost always strip the code if COMPO_INTENSE_DEBUG is false.
58  */
59 #ifndef COMPO_INTENSE_DEBUG
60 #define COMPO_INTENSE_DEBUG 0
61 #endif
62 
63 /** by what factor might initially reported E-value exceed true E-value */
64 #define EVALUE_STRETCH 5
65 
66 /** -1/0/1 if a is less than/greater than/equal to b */
67 #ifndef CMP
68 #define CMP(a,b) ((a)>(b) ? 1 : ((a)<(b) ? -1 : 0))
69 #endif
70 
71 /** Test of whether one set of HSP bounds is contained in another */
72 #define KAPPA_CONTAINED_IN_HSP(a,b,c,d,e,f) \
73 ((a <= c && b >= c) && (d <= f && e >= f))
74 /** A macro that defines the mathematical "sign" function */
75 #define KAPPA_SIGN(a) (((a) > 0) ? 1 : (((a) < 0) ? -1 : 0))
76 
77 /** For translated subject sequences, the number of amino acids to
78  include before and after the existing aligned segment when
79  generating a composition-based scoring system. */
80 static const int kWindowBorder = 200;
81 
82 /** pseudocounts for relative-entropy-based score matrix adjustment */
83 static const int kReMatrixAdjustmentPseudocounts = 20;
84 
85 /**
86  * s_WindowInfo - a struct whose instances represent a range
87  * of data in a sequence. */
88 typedef struct s_WindowInfo
89 {
90  BlastCompo_SequenceRange query_range; /**< range of the query
91  included in this window */
92  BlastCompo_SequenceRange subject_range; /**< range of the subject
93  included in this window */
94  BlastCompo_Alignment * align; /**< list of existing alignments
95  contained in this window */
96  int hspcnt; /**< number of alignment in
97  this window */
99 
100 
101 /* Documented in redo_alignment.h. */
104  EMatrixAdjustRule matrix_adjust_rule,
105  int queryStart, int queryEnd, int queryIndex,
106  int matchStart, int matchEnd, int frame,
107  void * context)
108 {
110  if (align != NULL) {
111  align->score = score;
112  align->matrix_adjust_rule = matrix_adjust_rule;
113  align->queryIndex = queryIndex;
114  align->queryStart = queryStart;
115  align->queryEnd = queryEnd;
116  align->matchStart = matchStart;
117  align->matchEnd = matchEnd;
118  align->frame = frame;
119  align->context = context;
120  align->next = NULL;
121  }
122  return align;
123 }
124 
125 
126 /* Documented in redo_alignment.h. */
127 void
129  void (*free_context)(void*))
130 {
131  BlastCompo_Alignment * align; /* represents the current
132  alignment in loops */
133  align = *palign; *palign = NULL;
134  while (align != NULL) {
135  /* Save the value of align->next, because align is to be deleted. */
136  BlastCompo_Alignment * align_next = align->next;
137 
138  if (free_context != NULL && align->context != NULL) {
139  free_context(align->context);
140  }
141  free(align);
142  align = align_next;
143  }
144 }
145 
146 
147 /**
148  * Reverse a list of BlastCompo_Alignments. */
149 static void
151 {
152  BlastCompo_Alignment *list; /* the forward list */
153  BlastCompo_Alignment *new_list; /* the reversed list */
154  list = *plist; new_list = NULL;
155  while (list != NULL) {
156  BlastCompo_Alignment * list_next = list->next;
157  list->next = new_list;
158  new_list = list;
159  list = list_next;
160  }
161  *plist = new_list;
162 }
163 
164 
165 /**
166  * Compare two BlastCompo_Alignments. */
167 static int
169  const BlastCompo_Alignment * b)
170 {
171  int result;
172  if (0 == (result = CMP(b->score, a->score)) &&
173  0 == (result = CMP(a->matchStart, b->matchStart)) &&
174  0 == (result = CMP(b->matchEnd, a->matchEnd)) &&
175  0 == (result = CMP(a->queryStart, b->queryStart))) {
176  /* if all other tests cannot distinguish the alignments, then
177  * the final test is the result */
178  result = CMP(b->queryEnd, a->queryEnd);
179  }
180  return result;
181 }
182 
183 /** Temporary function to determine whether alignments are sorted */
184 static int
186 {
187  BlastCompo_Alignment * align;
188  for (align = alignments; align != NULL; align = align->next) {
189  if (align->next && align->next->score > align->score) {
190  return 0;
191  }
192  }
193  return 1;
194 }
195 
196 
197 /** Calculate the length of a list of BlastCompo_Alignment objects.
198  * This is an O(n) operation */
199 static int
201 {
202  int length = 0;
203  for ( ; list != NULL; list = list->next) {
204  length++;
205  }
206  return length;
207 }
208 
209 
210 /**
211  * Sort a list of Blast_Compo_Alignment objects, using s_AlignmentCmp
212  * comparison function. The mergesort sorting algorithm is used.
213  *
214  * @param *plist the list to be sorted
215  * @param hspcnt the length of the list
216  */
217 static void
219 {
220  if (COMPO_INTENSE_DEBUG) {
221  assert(s_DistinctAlignmentsLength(*plist) == hspcnt);
222  }
223  if(hspcnt > 1) {
224  BlastCompo_Alignment * list = *plist;
225  BlastCompo_Alignment *leftlist, *rightlist, **tail;
226  int i, leftcnt, rightcnt;
227 
228  /* Split the list in half */
229  leftcnt = hspcnt/2;
230  rightcnt = hspcnt - leftcnt;
231 
232  leftlist = list;
233  /* Find the point to split the list; this loop splits lists
234  correctly only when list != NULL and leftcnt > 0, which is
235  necessarily the case because hspcnt > 1 */
236  assert(list != NULL && leftcnt > 0);
237  for (i = 0; i < leftcnt - 1 && list->next != NULL; i++) {
238  list = list->next;
239  }
240  rightlist = list->next;
241  list->next = NULL;
242 
243  if (COMPO_INTENSE_DEBUG) {
244  assert(s_DistinctAlignmentsLength(rightlist) == rightcnt);
245  assert(s_DistinctAlignmentsLength(leftlist) == leftcnt);
246  }
247  /* Sort the two lists */
248  if (leftcnt > 1)
249  s_DistinctAlignmentsSort(&leftlist, leftcnt);
250  if (rightcnt > 1)
251  s_DistinctAlignmentsSort(&rightlist, rightcnt);
252 
253  /* And then merge them */
254  list = NULL;
255  tail = &list;
256  while (leftlist != NULL || rightlist != NULL) {
257  if (leftlist == NULL) {
258  *tail = rightlist;
259  rightlist = NULL;
260  } else if (rightlist == NULL) {
261  *tail = leftlist;
262  leftlist = NULL;
263  } else {
264  BlastCompo_Alignment * elt;
265  if (s_AlignmentCmp(leftlist, rightlist) < 0) {
266  elt = leftlist;
267  leftlist = leftlist->next;
268  } else {
269  elt = rightlist;
270  rightlist = rightlist->next;
271  }
272  *tail = elt;
273  tail = &elt->next;
274  }
275  }
276  *plist = list;
277  if (COMPO_INTENSE_DEBUG) {
278  assert(s_DistinctAlignmentsLength(list) == hspcnt);
280  }
281  }
282 }
283 
284 
285 /**
286  * Copy a BlastCompo_Alignment, setting the next field to NULL
287  */
288 static BlastCompo_Alignment *
290 {
291  return BlastCompo_AlignmentNew(align->score,
292  align->matrix_adjust_rule,
293  align->queryStart,
294  align->queryEnd,
295  align->queryIndex,
296  align->matchStart,
297  align->matchEnd, align->frame,
298  align->context);
299 
300 }
301 
302 /**
303  * Do given alignments have the same left or right end point
304  */
305 static Boolean
307  const BlastCompo_Alignment* align)
308 {
309  ASSERT(newAlign->frame == align->frame);
310  return ((align->queryStart == newAlign->queryStart &&
311  align->matchStart == newAlign->matchStart)
312  || (align->queryEnd == newAlign->queryEnd &&
313  align->matchEnd == newAlign->matchEnd));
314 }
315 
316 
317 /**
318  * Does at least one end point of newAlign is contined within align and lies
319  * on the same diagnol as the same end point of align
320  */
321 static Boolean
323  const BlastCompo_Alignment* align)
324 {
325  /* ASSERT(newAlign->frame == align->frame); */
326  /* is start of newAlign contained within align */
327  Boolean start_contained =
329  newAlign->queryStart,
330  align->matchStart, align->matchEnd,
331  newAlign->matchStart);
332  /* is end of newAlign contained within align */
333  Boolean end_contained =
335  newAlign->queryEnd,
336  align->matchStart, align->matchEnd,
337  newAlign->matchEnd);
338 
339  /* does either end point of newAlign lie on the same diagonal as the same
340  end point of align */
341  Boolean result = (start_contained &&
342  (newAlign->queryStart - newAlign->matchStart ==
343  align->queryStart - align->matchStart)) ||
344  (end_contained &&
345  (newAlign->queryEnd - newAlign->matchEnd ==
346  align->queryEnd - align->matchEnd));
347 
348  return result;
349 }
350 
351 /**
352  * Given a list of alignments and a new alignment, create a new list
353  * of alignments that conditionally includes the new alignment.
354  *
355  * If there is an equal or higher-scoring alignment in the preexisting
356  * list of alignments that shares an endpoint with the new alignment,
357  * then preexisting list is returned. Otherwise, a new list is
358  * returned with the new alignment as its head and the elements of
359  * preexisting list that do not share an endpoint with the new
360  * alignment as its tail. The order of elements is preserved.
361  *
362  * Typically, a list of alignments is built one alignment at a time
363  * through a call to s_WithDistinctEnds. All alignments in the resulting
364  * list have distinct endpoints. Which items are retained in the list
365  * depends on the order in which they were added.
366  *
367  * Note that an endpoint is a triple, specifying a frame, a location
368  * in the query and a location in the subject. In other words,
369  * alignments that are not in the same frame never share endpoints.
370  *
371  * @param p_newAlign on input the alignment that may be added to
372  * the list; on output NULL
373  * @param p_oldAlignments on input the existing list of alignments;
374  * on output the new list
375  * @param free_align_context a routine to be used to free the context
376  * field of an alignment, if any alignment is
377  * freed; may be NULL
378  * @param is_same_adjustment were all compared HSPs computed with the same
379  * composition adjustment; Nearly identical
380  * alignments are computed with exact subjects,
381  * and others with segged subjects, hence nearly
382  * identical alignments may extend farther; this
383  * makes comparing end points more difficult
384  */
385 static void
387  BlastCompo_Alignment **p_oldAlignments,
388  void free_align_context(void *),
389  Boolean is_same_adjustment)
390 {
391  /* Deference the input parameters. */
392  BlastCompo_Alignment * newAlign = *p_newAlign;
393  BlastCompo_Alignment * oldAlignments = *p_oldAlignments;
394  BlastCompo_Alignment * align; /* represents the current
395  alignment in loops */
396  int include_new_align; /* true if the new alignment
397  may be added to the list */
398  *p_newAlign = NULL;
399  include_new_align = 1;
400 
401  for (align = oldAlignments; align != NULL; align = align->next) {
402 
403  /* If all HSPs were computed with the same adjustment (and subject
404  segging), then simply compare end points. Otherwise, check if the
405  end point of the new HSPs is contained within an old one and
406  whether it lies on the same diagonal as the end point of the old
407  HSP. */
408 
409  if (align->frame == newAlign->frame &&
410  ((is_same_adjustment && s_IsSameEndPoint(newAlign, align)) ||
411  (!is_same_adjustment && s_IsSimilarEndPoint(newAlign, align)))) {
412  /* At least one of the endpoints of newAlign matches an endpoint
413  of align. */
414  if (newAlign->score <= align->score) {
415  /* newAlign cannot be added to the list. */
416  include_new_align = 0;
417  break;
418  }
419  }
420  }
421  if (include_new_align) {
422  /* tail of the list being created */
423  BlastCompo_Alignment **tail;
424 
425  tail = &newAlign->next;
426  align = oldAlignments;
427  while (align != NULL) {
428  /* Save align->next because align may be deleted. */
429  BlastCompo_Alignment * align_next = align->next;
430  align->next = NULL;
431  if (align->frame == newAlign->frame &&
432  ((align->queryStart == newAlign->queryStart &&
433  align->matchStart == newAlign->matchStart)
434  || (align->queryEnd == newAlign->queryEnd &&
435  align->matchEnd == newAlign->matchEnd))) {
436  /* The alignment shares an end with newAlign; */
437  /* delete it. */
438  BlastCompo_AlignmentsFree(&align, free_align_context);
439  } else { /* The alignment does not share an end with newAlign; */
440  /* add it to the output list. */
441  *tail = align;
442  tail = &align->next;
443  }
444  align = align_next;
445  } /* end while align != NULL */
446  *p_oldAlignments = newAlign;
447  } else { /* do not include_new_align */
448  BlastCompo_AlignmentsFree(&newAlign, free_align_context);
449  } /* end else do not include newAlign */
450 }
451 
452 
453 /** Release the data associated with this object. */
455 {
456  if (self->buffer) free(self->buffer);
457  self->data = NULL; self->buffer = NULL;
458 }
459 
460 
461 
462 /**
463  * Create and initialize a new s_WindowInfo.
464  *
465  * Parameters to this function correspond directly to fields of
466  * s_WindowInfo.
467  */
468 static s_WindowInfo *
469 s_WindowInfoNew(int begin, int end, int context,
470  int queryOrigin, int queryLength, int query_index,
471  BlastCompo_Alignment * align)
472 {
473  s_WindowInfo * window; /* new window to be returned */
474 
475  window = malloc(sizeof(s_WindowInfo));
476  if (window != NULL) {
477  window->subject_range.begin = begin;
478  window->subject_range.end = end;
479  window->subject_range.context = context;
480  window->query_range.begin = queryOrigin;
481  window->query_range.end = queryOrigin + queryLength;
482  window->query_range.context = query_index;
483  window->align = align;
484  window->hspcnt = 0;
485  for ( ; align != NULL; align = align->next) {
486  window->hspcnt++;
487  }
488  }
489  return window;
490 }
491 
492 
493 /**
494  * Swap the query and subject range
495  *
496  */
497 static void
499 {
501  range.begin = self->subject_range.begin;
502  range.end = self->subject_range.end;
503  range.context = self->subject_range.context;
504  self->subject_range.begin = self->query_range.begin;
505  self->subject_range.end = self->query_range.end;
506  self->subject_range.context = self->query_range.context;
507  self->query_range.begin = range.begin;
508  self->query_range.end = range.end;
509  self->query_range.context = range.context;
510  return;
511 }
512 
513 
514 /**
515  * Free an instance of s_WindowInfo.
516  *
517  * @param *window on entry the window to be freed; on exit NULL
518  */
519 static void
521 {
522  if (*window != NULL) {
523  BlastCompo_AlignmentsFree(&(*window)->align, NULL);
524  free(*window);
525  }
526  *window = NULL;
527 }
528 
529 
530 /**
531  * Join two instance of s_WindowInfo into a single window
532  *
533  * @param win1 on entry, one of the two windows to be joined; on exit
534  * the combined window
535  * @param *pwin2 on entry, the other window to be joined, on exit NULL
536  */
537 static void
539 {
540  /* the second window, which will be deleted when this routine exits */
541  s_WindowInfo * win2 = *pwin2;
542  BlastCompo_Alignment *align, **tail;
543  /* subject ranges for the two windows */
544  BlastCompo_SequenceRange * sbjct_range1 = &win1->subject_range;
545  BlastCompo_SequenceRange * sbjct_range2 = &win2->subject_range;
546 
547  assert(sbjct_range1->context == sbjct_range2->context);
548  assert(win1->query_range.context == win2->query_range.context);
549 
550  sbjct_range1->begin = MIN(sbjct_range1->begin, sbjct_range2->begin);
551  sbjct_range1->end = MAX(sbjct_range1->end, sbjct_range2->end);
552  win1->hspcnt += win2->hspcnt;
553 
554  tail = &win1->align;
555  for (align = win1->align; align != NULL; align = align->next) {
556  tail = &align->next;
557  }
558  *tail = win2->align;
559  win2->align = NULL;
560 
561  s_WindowInfoFree(pwin2);
562 }
563 
564 
565 /**
566  * A comparison routine used to sort a list of windows, first by frame
567  * and then by location.
568  */
569 static int
570 s_LocationCompareWindows(const void * vp1, const void *vp2)
571 {
572  /* w1 and w2 are the windows being compared */
573  s_WindowInfo * w1 = *(s_WindowInfo **) vp1;
574  s_WindowInfo * w2 = *(s_WindowInfo **) vp2;
575  /* the subject ranges of the two windows */
578  /* the query indices of the two windows */
579  /* the query ranges of the two windows */
582 
583  int result; /* result of the comparison */
584  if (0 == (result = CMP(qr1->context, qr2->context)) &&
585  0 == (result = CMP(sr1->context, sr2->context)) &&
586  0 == (result = CMP(sr1->begin, sr2->begin)) &&
587  0 == (result = CMP(sr1->end, sr2->end)) &&
588  0 == (result = CMP(qr1->begin, qr2->begin))) {
589  result = CMP(qr1->end, qr2->end);
590  }
591  return result;
592 }
593 
594 
595 /**
596  * A comparison routine used to sort a list of windows by position in
597  * the subject, ignoring strand and frame. Ties are broken
598  * deterministically.
599  */
600 static int
601 s_SubjectCompareWindows(const void * vp1, const void *vp2)
602 {
603  /* w1 and w2 are the windows being compared */
604  s_WindowInfo * w1 = *(s_WindowInfo **) vp1;
605  s_WindowInfo * w2 = *(s_WindowInfo **) vp2;
606  /* the subject ranges of the two windows */
609  /* the query ranges of the two windows */
612 
613  int result; /* result of the comparison */
614  if (0 == (result = CMP(sr1->begin, sr2->begin)) &&
615  0 == (result = CMP(sr1->end, sr2->end)) &&
616  0 == (result = CMP(sr1->context, sr2->context)) &&
617  0 == (result = CMP(qr1->begin, qr2->begin)) &&
618  0 == (result = CMP(qr1->end, qr2->end))) {
619  result = CMP(qr1->context, qr2->context);
620  }
621  return result;
622 }
623 
624 static
625 int s_GetTranslatedLength(int length,int frame, int is_pos_based) {
626  if(is_pos_based) {
627  int f = GET_SEQ_FRAME(frame);
628  int nucl_length = GET_NUCL_LENGTH(length);
629  return GET_TRANSLATED_LENGTH(nucl_length, f);
630  }
631 
632  return ((length - ABS(frame) + 1)/3);
633 }
634 
635 /**
636  * Read a list of alignments from a translated search and create a
637  * new array of pointers to s_WindowInfo so that each alignment is
638  * contained in exactly one window. See s_WindowsFromAligns for the
639  * meaning of the parameters. (@sa s_WindowsFromAligns).
640  *
641  * @return 0 on success, -1 on out-of-memory
642  */
643 static int
645  BlastCompo_QueryInfo * query_info,
646  int hspcnt, int border, int sequence_length,
647  s_WindowInfo ***pwindows, int * nWindows,
648  int subject_is_translated, int is_pos_based)
649 {
650  int k; /* iteration index */
651  s_WindowInfo ** windows; /* the output list of windows */
652  int length_joined; /* the current length of the
653  list of joined windows */
654  BlastCompo_Alignment * align; /* represents the current
655  alignment in the main loop */
656  *nWindows = 0;
657  windows = *pwindows = calloc(hspcnt, sizeof(s_WindowInfo*));
658  if (windows == NULL)
659  goto error_return;
660 
661  *nWindows = hspcnt;
662  for (align = alignments, k = 0;
663  align != NULL;
664  align = align->next, k++) {
665  int frame; /* translation frame */
666  int query_index; /* index of the query contained in the
667  current HSP */
668  int query_length; /* length of the current query */
669  int translated_length; /* length of the translation of the entire
670  nucleotide sequence in this frame */
671  int begin, end; /* interval in amino acid coordinates of
672  the translated window */
673  /* copy of the current alignment to add to the window */
674  BlastCompo_Alignment * align_copy;
675  frame = align->frame;
676  query_index = align->queryIndex;
677  query_length = query_info[query_index].seq.length;
678  translated_length = s_GetTranslatedLength(sequence_length, frame, is_pos_based);
679 
680  align_copy = s_AlignmentCopy(align);
681  if (align_copy == NULL)
682  goto error_return;
683 
684  if (subject_is_translated) {
685  begin = MAX(0, align->matchStart - border);
686  end = MIN(translated_length, align->matchEnd + border);
687  windows[k] = s_WindowInfoNew(begin, end, frame, 0,
688  query_length, query_index, align_copy);
689  } else {
690  begin = MAX(0, align->queryStart - border);
691  end = MIN(query_length, align->queryEnd + border);
692  /* for blastx, temporarily swap subject and query ranges*/
693  windows[k] = s_WindowInfoNew(begin, end, query_index, 0,
694  sequence_length, 0, align_copy);
695  }
696  if (windows[k] == NULL)
697  {
698  BlastCompo_AlignmentsFree(&align_copy, NULL);
699  goto error_return;
700  }
701  }
702  qsort(windows, hspcnt, sizeof(s_WindowInfo*),
704 
705  /* Join windows that overlap or are too close together. */
706  length_joined = 0;
707  for (k = 0; k < hspcnt; k++) { /* for all windows in the
708  original list */
709  s_WindowInfo * window; /* window at this value of k */
710  s_WindowInfo * nextWindow; /* window at the next
711  value of k, or NULL if
712  no such window
713  exists */
714  window = windows[k];
715  nextWindow = ( k + 1 < hspcnt ) ? windows[k+1] : NULL;
716 
717  if(nextWindow != NULL &&
718  window->subject_range.context ==
719  nextWindow->subject_range.context &&
720  window->query_range.context == nextWindow->query_range.context &&
721  window->subject_range.end >= nextWindow->subject_range.begin) {
722  /* Join the current window with the next window. Do not add the
723  current window to the output list. */
724  s_WindowInfoJoin(nextWindow, &windows[k]);
725  } else {
726  /* Don't join the current window with the next window. Add the
727  current window to the output list instead */
728  windows[length_joined] = window;
729  length_joined++;
730  } /* end else don't join the current window with the next window */
731  } /* end for all windows in the original list */
732  *nWindows = length_joined;
733 
734  for (k = length_joined; k < hspcnt; k++) {
735  windows[k] = NULL;
736  }
737 
738  /* for blastx, swap query and subject range */
739  if (!subject_is_translated) {
740  for (k=0; k<length_joined; k++) {
741  s_WindowSwapRange(windows[k]);
742  }
743  }
744 
745  for (k = 0; k < length_joined; k++) {
746  s_DistinctAlignmentsSort(&windows[k]->align, windows[k]->hspcnt);
747  }
748  qsort(windows, *nWindows, sizeof(s_WindowInfo*),
750  return 0; /* normal return */
751 
752 error_return:
753  if (windows)
754  {
755  for (k = 0; k < *nWindows; k++) {
756  if (windows[k] != NULL)
757  s_WindowInfoFree(&windows[k]);
758  }
759  free(windows);
760  }
761  *pwindows = NULL;
762  return -1;
763 }
764 
765 
766 /**
767  * Read a list of alignments from a protein search and create a
768  * new array of pointers to s_WindowInfo so that each alignment is
769  * contained in exactly one window. See s_WindowsFromAligns for the
770  * meaning of the parameters. (@sa s_WindowsFromAligns).
771  *
772  * @return 0 on success, -1 on out-of-memory
773  */
774 static int
776  BlastCompo_QueryInfo * query_info,
777  int numQueries,
778  int sequence_length,
779  s_WindowInfo ***pwindows,
780  int * nWindows)
781 {
782  BlastCompo_Alignment * align;
783  int query_index; /* index of the query */
784  int query_length; /* length of an individual query */
785  int window_index; /* index of a window in the window list */
786 
787  /* new list of windows */
788  s_WindowInfo ** windows =
789  calloc(numQueries, sizeof(s_WindowInfo*));
790  *nWindows = 0;
791  if (windows == NULL)
792  goto error_return;
793  *nWindows = numQueries;
794  for (align = alignments; align != NULL; align = align->next) {
795  BlastCompo_Alignment * copiedAlign;
796 
797  query_index = align->queryIndex;
798  query_length = query_info[query_index].seq.length;
799 
800  if (windows[query_index] == NULL) {
801  windows[query_index] =
802  s_WindowInfoNew(0, sequence_length, 0,
803  0, query_length, query_index, NULL);
804  if (windows[query_index] == NULL)
805  goto error_return;
806  }
807  copiedAlign = s_AlignmentCopy(align);
808  if (copiedAlign == NULL)
809  goto error_return;
810  copiedAlign->next = windows[query_index]->align;
811  windows[query_index]->align = copiedAlign;
812  windows[query_index]->hspcnt++;
813  }
814  window_index = 0;
815  for (query_index = 0; query_index < numQueries; query_index++) {
816  if (windows[query_index] != NULL) {
817  windows[window_index] = windows[query_index];
818  s_AlignmentsRev(&windows[window_index]->align);
819  window_index++;
820  }
821  }
822  /* shrink to fit */
823  {
824  s_WindowInfo ** new_windows = NULL;
825  if (window_index > 0)
826  new_windows = realloc(windows, window_index * sizeof(s_WindowInfo*));
827 
828  if (new_windows == NULL) {
829  goto error_return;
830  } else {
831  windows = new_windows;
832  *nWindows = window_index;
833  }
834  }
835  qsort(windows, *nWindows, sizeof(s_WindowInfo*),
837  *pwindows = windows;
838  /* Normal return */
839  return 0;
840 
841 error_return:
842  for (window_index = 0; window_index < *nWindows; window_index++) {
843  s_WindowInfoFree(&windows[window_index]);
844  }
845  free(windows);
846  return -1;
847 }
848 
849 
850 /**
851  * Read a list of alignments from a search (protein or translated) and
852  * create a new array of pointers to s_WindowInfo so that each
853  * alignment is contained in exactly one window.
854  *
855  * @param alignments a list of alignments from a translated
856  * search
857  * @param query_info information about the query/queries used
858  * in the search
859  * @param hspcnt number of alignments
860  * @param numQueries number of queries
861  * @param border border around windows; windows with
862  * overlapping borders will be joined.
863  * @param sequence_length length of the subject sequence, in
864  * nucleotides for translated searches or
865  * in amino acids for protein searches
866  * @param *pwindows the new array of windows
867  * @param nWindows the length of *pwindows
868  * @param subject_is_translated is the subject sequence translated?
869  *
870  * @return 0 on success, -1 on out-of-memory
871  */
872 static int
874  BlastCompo_QueryInfo * query_info, int hspcnt,
875  int numQueries, int border, int sequence_length,
876  s_WindowInfo ***pwindows, int * nWindows,
877  int query_is_translated, int subject_is_translated, int is_pos_based)
878 {
879  if (subject_is_translated || query_is_translated) {
880  return s_WindowsFromTranslatedAligns(alignments, query_info,
881  hspcnt, border,
882  sequence_length,
883  pwindows, nWindows,
884  subject_is_translated, is_pos_based);
885  } else {
886  return s_WindowsFromProteinAligns(alignments, query_info,
887  numQueries, sequence_length,
888  pwindows, nWindows);
889  }
890 }
891 
892 
893 /**
894  * Compute the amino acid composition of the sequence.
895  *
896  * @param composition the computed composition.
897  * @param alphsize the size of the alphabet
898  * @param seq subject/query sequence data
899  * @param range the range of the given sequence data in
900  * the complete sequence
901  * @param align an alignment of the query to the
902  * subject range
903  * @param for_subject is this done for subject or for query?
904  */
905 static void
907  int alphsize,
910  BlastCompo_Alignment * align,
911  Boolean query_is_translated,
912  Boolean subject_is_translated)
913 {
914  Uint1 * data; /* sequence data for the subject */
915  int length; /* length of the subject portion of the alignment */
916  /* [left, right) is the interval of the subject to use when
917  * computing composition. The endpoints are offsets into the
918  * subject_range. */
919  int left, right;
920 
921  data = seq->data;
922  length = range->end - range->begin;
923  if (query_is_translated || subject_is_translated) {
924  int start;
925  int end;
926  start = ((query_is_translated) ?
927  align->queryStart : align->matchStart) - range->begin;
928  end = ((query_is_translated) ?
929  align->queryEnd : align->matchEnd ) - range->begin;
930  Blast_GetCompositionRange(&left, &right, data, length, start, end);
931  } else {
932  /* Use the whole subject to compute the composition */
933  left = 0;
934  right = length;
935  }
936  Blast_ReadAaComposition(composition, alphsize, &data[left], right-left);
937 }
938 
939 
940 /**
941  * Compute an e-value from a score and a set of statistical parameters
942  */
943 static double
944 s_EvalueFromScore(int score, double Lambda, double logK, double searchsp)
945 {
946  return searchsp * exp(-(Lambda * score) + logK);
947 }
948 
949 
950 /**
951  * The number of bits by which the score of a previously computed
952  * alignment must exceed the score of the HSP under consideration for
953  * a containment relationship to be reported by the isContained
954  * routine. */
955 #define KAPPA_BIT_TOL 2.0
956 
957 /**
958  * Return true if an alignment is contained in a previously-computed
959  * alignment of sufficiently high score.
960  *
961  * @param in_align the alignment to be tested
962  * @param alignments list of alignments
963  * @param lambda Karlin-Altschul statistical parameter
964  */
965 static Boolean
967  BlastCompo_Alignment * alignments,
968  double lambda)
969 {
970  BlastCompo_Alignment * align; /* represents the current alignment
971  in the main loop */
972  /* Endpoints of the alignment */
973  int query_offset = in_align->queryStart;
974  int query_end = in_align->queryEnd;
975  int subject_offset = in_align->matchStart;
976  int subject_end = in_align->matchEnd;
977  double score = in_align->score;
978  double scoreThresh = score + KAPPA_BIT_TOL * LOCAL_LN2/lambda;
979 
980  for (align = alignments; align != NULL; align = align->next ) {
981  /* for all elements of alignments */
982  if (KAPPA_SIGN(in_align->frame) == KAPPA_SIGN(align->frame)) {
983  /* hsp1 and hsp2 are in the same query/subject frame */
985  (align->queryStart, align->queryEnd, query_offset,
986  align->matchStart, align->matchEnd, subject_offset) &&
988  (align->queryStart, align->queryEnd, query_end,
989  align->matchStart, align->matchEnd, subject_end) &&
990  scoreThresh <= align->score) {
991  return 1;
992  }
993  }
994  }
995  return 0;
996 }
997 
998 
999 /* Documented in redo_alignment.h. */
1000 void
1002 {
1003  if (*pparams != NULL) {
1004  Blast_MatrixInfoFree(&(*pparams)->matrix_info);
1005  free((*pparams)->gapping_params);
1006  free(*pparams);
1007  *pparams = NULL;
1008  }
1009 }
1010 
1011 
1012 /* Documented in redo_alignment.h. */
1015  BlastCompo_GappingParams ** pgapping_params,
1016  ECompoAdjustModes compo_adjust_mode,
1017  int positionBased,
1018  int query_is_translated,
1019  int subject_is_translated,
1020  int ccat_query_length, int cutoff_s,
1021  double cutoff_e, int do_link_hsps,
1022  const Blast_RedoAlignCallbacks * callbacks,
1023  double near_identical_cutoff)
1024 {
1026  if (params) {
1027  params->matrix_info = *pmatrix_info;
1028  *pmatrix_info = NULL;
1029  params->gapping_params = *pgapping_params;
1030  *pgapping_params = NULL;
1031 
1032  params->compo_adjust_mode = compo_adjust_mode;
1033  params->positionBased = positionBased;
1035  params->query_is_translated = query_is_translated;
1036  params->subject_is_translated = subject_is_translated;
1037  params->ccat_query_length = ccat_query_length;
1038  params->cutoff_s = cutoff_s;
1039  params->cutoff_e = cutoff_e;
1040  params->do_link_hsps = do_link_hsps;
1041  params->callbacks = callbacks;
1042  params->near_identical_cutoff = near_identical_cutoff;
1043  } else {
1044  free(*pmatrix_info); *pmatrix_info = NULL;
1045  free(*pgapping_params); *pgapping_params = NULL;
1046  }
1047  return params;
1048 }
1049 
1050 #define MINIMUM_LENGTH_NEAR_IDENTICAL 50
1051 
1052 
1053 /* this function is called for each align in the window, hence we need both
1054  arguments */
1056  s_WindowInfo* window,
1057  BlastCompo_Alignment* align,
1058  double cutoff)
1059 {
1060  int queryIndex, queryLength, align_len;
1061 
1062  /* if cutoff > 0, use the more flexible test that allows for multiple HSPs,
1063  and gaps in the alignments */
1064  if (cutoff > 0) {
1065  queryIndex = align->queryIndex;
1066  queryLength = query_info[queryIndex].seq.length;
1067  if ((align->matchEnd - align->matchStart +1) <
1068  (MIN(queryLength, MINIMUM_LENGTH_NEAR_IDENTICAL)))
1069  return(FALSE);
1070 
1071  align_len = MIN(align->queryEnd - align->queryStart,
1072  align->matchEnd - align->matchStart);
1073 
1074  if ((double)align->score / (double)align_len < cutoff) {
1075  return (FALSE);
1076  }
1077  }
1078  else {
1079  /* Otherwise fallback to the old near identical test. This is left for
1080  compatibility with the code in c toolkit's tools/kappa.c */
1081 
1082  if ((window->hspcnt > 1) || (window->hspcnt < 1))
1083  return(FALSE);
1084 
1085  queryIndex = align->queryIndex;
1086  queryLength = query_info[queryIndex].seq.length;
1087  if ((align->queryEnd - align->queryStart) !=
1088  (align->matchEnd - align->matchStart))
1089  return(FALSE);
1090 
1091  if ((align->matchEnd - align->matchStart +1) <
1092  (MIN(queryLength, MINIMUM_LENGTH_NEAR_IDENTICAL)))
1093  return(FALSE);
1094  }
1095 
1096  return(TRUE);
1097 }
1098 
1099 /* Documented in redo_alignment.h. */
1100 int
1102  Blast_RedoAlignParams * params,
1103  BlastCompo_Alignment * incoming_aligns, int hspcnt,
1104  double Lambda,
1105  BlastCompo_MatchingSequence * matchingSeq,
1106  int ccat_query_length, BlastCompo_QueryInfo query_info[],
1107  int numQueries, int ** matrix, int alphsize,
1108  Blast_CompositionWorkspace * NRrecord,
1109  double *pvalueForThisPair,
1110  int compositionTestIndex,
1111  double *LambdaRatio)
1112 {
1113  int status = 0; /* return status */
1114 
1115  s_WindowInfo **windows; /* array of windows */
1116  int nWindows; /* length of windows */
1117  int window_index; /* loop index */
1118  int query_index; /* index of the current query */
1119  /* which mode of composition adjustment is actually used? */
1120  EMatrixAdjustRule matrix_adjust_rule = eDontAdjustMatrix;
1121 
1122  /* fields of params, as local variables */
1123  Blast_MatrixInfo * scaledMatrixInfo = params->matrix_info;
1124  ECompoAdjustModes compo_adjust_mode = params->compo_adjust_mode;
1125  int RE_pseudocounts = params->RE_pseudocounts;
1126  int query_is_translated = params->query_is_translated;
1127  int subject_is_translated = params->subject_is_translated;
1128  BlastCompo_GappingParams * gapping_params = params->gapping_params;
1129  const Blast_RedoAlignCallbacks * callbacks = params->callbacks;
1130 
1131  assert((int) compo_adjust_mode < 2 || !params->positionBased);
1132  for (query_index = 0; query_index < numQueries; query_index++) {
1133  alignments[query_index] = NULL;
1134  }
1135  status =
1136  s_WindowsFromAligns(incoming_aligns, query_info, hspcnt, numQueries,
1137  kWindowBorder, matchingSeq->length, &windows,
1138  &nWindows, query_is_translated, subject_is_translated, params->positionBased);
1139  if (status != 0) {
1140  goto function_level_cleanup;
1141  }
1142  /* for all windows */
1143  for (window_index = 0; window_index < nWindows; window_index++) {
1144  s_WindowInfo * window; /* the current window */
1145  BlastCompo_Alignment * in_align; /* the current alignment */
1146  int hsp_index; /* index of the current alignment */
1147  /* data for the current window */
1150  /* the composition of this query */
1151  Blast_AminoAcidComposition * query_composition;
1152  Boolean nearIdenticalStatus = TRUE; /* are query and subject nearly
1153  identical in the aligned part?*/
1154  Boolean oldNearIdenticalStatus = FALSE; /* were query and subject nearly
1155  identical for previous alignmnet
1156  between the same query and subject*/
1157 
1158  Boolean subject_maybe_biased = TRUE; /* False means that the subject
1159  sequence does not have biased
1160  segments. True means we do
1161  not know */
1162  int num_adjustments = 0; /* number of matrix adjustments done in this
1163  window; nearly identical alignments may be
1164  computed with different composition
1165  adjustment than other alignments due to using
1166  full subjects instead of segged ones */
1167 
1168  window = windows[window_index];
1169  query_index = window->align->queryIndex;
1170  query_composition = &query_info[query_index].composition;
1171 
1172  /* for all alignments in this window */
1173  for (in_align = window->align, hsp_index = 0;
1174  in_align != NULL;
1175  in_align = in_align->next, hsp_index++) {
1176 
1177 
1178  /* if the fist HSP for the pair or subject may have biased
1179  segments; if the subject is not biased, then segging is not
1180  necessary so we do not have to check for near identical status */
1181  if (hsp_index == 0 || subject_maybe_biased) {
1182  nearIdenticalStatus = s_preliminaryTestNearIdentical(query_info,
1183  window,
1184  in_align,
1185  params->near_identical_cutoff);
1186  }
1187 
1188  /* if the first HSP for the pair or subject may have biased
1189  segments and near identical status is different than for the
1190  previous HSP */
1191  if (hsp_index == 0 ||
1192  (subject_maybe_biased &&
1193  (nearIdenticalStatus != oldNearIdenticalStatus))) {
1194 
1195  /* release previously allocated sequence data */
1198  status = callbacks->get_range(
1199  matchingSeq,
1200  &window->subject_range,
1201  &subject,
1202  &query_info[query_index].seq,
1203  &window->query_range,
1204  &query,
1205  query_info[query_index].words,
1206  window->align,
1207  nearIdenticalStatus,
1208  compo_adjust_mode,
1209  FALSE,
1210  &subject_maybe_biased
1211  );
1212 
1213  if (status != 0) {
1214  goto window_index_loop_cleanup;
1215  }
1216  }
1217 
1218  /* do frequency count for partial translated query */
1219  if (query_is_translated) {
1220  s_GetComposition(query_composition,
1221  alphsize, &query,
1222  &window->query_range,
1223  in_align, TRUE, FALSE);
1224  }
1225  /* if in_align is not contained in a higher-scoring
1226  * alignment */
1227  if ( !s_IsContained(in_align, alignments[query_index], Lambda) ) {
1228  BlastCompo_Alignment * newAlign; /* the new alignment */
1229  /* adjust_search_failed is true only if Blast_AdjustScores
1230  * is called and returns a nonzero value */
1231  int adjust_search_failed = 0;
1232  if (compo_adjust_mode != eNoCompositionBasedStats &&
1233  (subject_is_translated || hsp_index == 0
1234  || (nearIdenticalStatus != oldNearIdenticalStatus))) {
1235  Blast_AminoAcidComposition subject_composition;
1236  s_GetComposition(&subject_composition,
1237  alphsize, &subject,
1238  &window->subject_range,
1239  in_align, FALSE, subject_is_translated);
1240  adjust_search_failed =
1241  Blast_AdjustScores(matrix, query_composition,
1242  query.length,
1243  &subject_composition,
1244  subject.length,
1245  scaledMatrixInfo, compo_adjust_mode,
1246  RE_pseudocounts, NRrecord,
1247  &matrix_adjust_rule,
1248  callbacks->calc_lambda,
1249  pvalueForThisPair,
1250  compositionTestIndex,
1251  LambdaRatio);
1252  if (adjust_search_failed < 0) { /* fatal error */
1253  status = adjust_search_failed;
1254  goto window_index_loop_cleanup;
1255  }
1256  num_adjustments++;
1257  }
1258 
1259  if ( !adjust_search_failed ) {
1260  newAlign = callbacks->redo_one_alignment(
1261  in_align,
1262  matrix_adjust_rule,
1263  &query,
1264  &window->query_range,
1265  ccat_query_length,
1266  &subject,
1267  &window->subject_range,
1268  matchingSeq->length,
1269  gapping_params
1270  );
1271  if (newAlign && newAlign->score >= params->cutoff_s) {
1272  s_WithDistinctEnds(&newAlign, &alignments[query_index],
1273  callbacks->free_align_traceback,
1274  num_adjustments == 1);
1275  } else {
1276  BlastCompo_AlignmentsFree(&newAlign,
1277  callbacks->
1278  free_align_traceback);
1279  }
1280  }
1281  } /* end if in_align is not contained...*/
1282  oldNearIdenticalStatus = nearIdenticalStatus;
1283  } /* end for all alignments in this window */
1284  window_index_loop_cleanup:
1285  if (subject.data != NULL)
1287  if (query.data != NULL)
1289  if (status != 0)
1290  goto function_level_cleanup;
1291  } /* end for all windows */
1292  function_level_cleanup:
1293  if (status != 0) {
1294  for (query_index = 0; query_index < numQueries; query_index++) {
1295  BlastCompo_AlignmentsFree(&alignments[query_index],
1296  callbacks->free_align_traceback);
1297  }
1298  }
1299  for (window_index = 0; window_index < nWindows; window_index++) {
1300  s_WindowInfoFree(&windows[window_index]);
1301  }
1302  free(windows);
1303 
1304  return status;
1305 }
1306 
1307 
1308 /* Documented in redo_alignment.h. */
1309 int
1311  Blast_RedoAlignParams * params,
1312  BlastCompo_Alignment * incoming_aligns,
1313  int hspcnt,
1314  double Lambda, double logK,
1315  BlastCompo_MatchingSequence * matchingSeq,
1316  BlastCompo_QueryInfo query_info[],
1317  int numQueries,
1318  int ** matrix, int alphsize,
1319  Blast_CompositionWorkspace * NRrecord,
1320  Blast_ForbiddenRanges * forbidden,
1321  BlastCompo_Heap * significantMatches,
1322  double *pvalueForThisPair,
1323  int compositionTestIndex,
1324  double *LambdaRatio)
1325 {
1326  int status = 0; /* status return value */
1327  s_WindowInfo **windows = NULL; /* array of windows */
1328  int nWindows; /* length of windows */
1329  int window_index; /* loop index */
1330  int query_index; /* index of the current query */
1331  /* which mode of composition adjustment is actually used? */
1332  EMatrixAdjustRule matrix_adjust_rule = eDontAdjustMatrix;
1333 
1334  /* fields of params, as local variables */
1335  Blast_MatrixInfo * scaledMatrixInfo = params->matrix_info;
1336  ECompoAdjustModes compo_adjust_mode = params->compo_adjust_mode;
1337  int positionBased = params->positionBased;
1338  int RE_pseudocounts = params->RE_pseudocounts;
1339  int query_is_translated = params->query_is_translated;
1340  int subject_is_translated = params->subject_is_translated;
1341  int do_link_hsps = params->do_link_hsps;
1342  int ccat_query_length = params->ccat_query_length;
1343  BlastCompo_GappingParams * gapping_params = params->gapping_params;
1344  const Blast_RedoAlignCallbacks * callbacks = params->callbacks;
1345 
1346  int gap_open = gapping_params->gap_open;
1347  int gap_extend = gapping_params->gap_extend;
1348 
1349  assert((int) compo_adjust_mode < 2 || !positionBased);
1350  for (query_index = 0; query_index < numQueries; query_index++) {
1351  alignments[query_index] = NULL;
1352  }
1353  /* Find the multiple translation windows used by tblastn queries. */
1354  status =
1355  s_WindowsFromAligns(incoming_aligns, query_info, hspcnt, numQueries,
1356  kWindowBorder, matchingSeq->length, &windows,
1357  &nWindows, query_is_translated, subject_is_translated, params->positionBased);
1358  if (status != 0)
1359  goto function_level_cleanup;
1360  /* We are performing a Smith-Waterman alignment */
1361  for (window_index = 0; window_index < nWindows; window_index++) {
1362  /* for all window */
1363  s_WindowInfo * window = NULL; /* the current window */
1365  /* subject data for this window */
1366  BlastCompo_SequenceData query = {0,}; /* query data for this window */
1367  Blast_AminoAcidComposition * query_composition;
1368  double searchsp; /* effective search space */
1369  Boolean nearIdenticalStatus; /*are query and subject nearly
1370  identical in the aligned part?*/
1371  /* adjust_search_failed is true only if Blast_AdjustScores
1372  * is called and returns a nonzero value */
1373  int adjust_search_failed = FALSE;
1374 
1375 
1376  window = windows[window_index];
1377  query_index = window->query_range.context;
1378  query_composition = &query_info[query_index].composition;
1379 
1380  nearIdenticalStatus = s_preliminaryTestNearIdentical(query_info,
1381  window,
1382  window->align,
1383  params->near_identical_cutoff);
1384 
1385  status =
1386  callbacks->get_range(matchingSeq, &window->subject_range,
1387  &subject,
1388  &query_info[query_index].seq,
1389  &window->query_range,
1390  &query,
1391  query_info[query_index].words,
1392  window->align, nearIdenticalStatus,
1393  compo_adjust_mode, TRUE,
1394  NULL);
1395 
1396  if (status != 0)
1397  goto window_index_loop_cleanup;
1398 
1399  /* do frequency count for partial translated query */
1400  if (query_is_translated) {
1401  s_GetComposition(query_composition,
1402  alphsize, &query,
1403  &window->query_range,
1404  window->align, TRUE, FALSE);
1405  }
1406  searchsp = query_info[query_index].eff_search_space;
1407 
1408  /* For Smith-Waterman alignments, adjust the search using the
1409  * composition of the highest scoring alignment in window */
1410  if (compo_adjust_mode != eNoCompositionBasedStats) {
1411  Blast_AminoAcidComposition subject_composition;
1412  s_GetComposition(&subject_composition, alphsize,
1413  &subject, &window->subject_range,
1414  window->align, FALSE, subject_is_translated);
1415  adjust_search_failed =
1416  Blast_AdjustScores(matrix,
1417  query_composition, query.length,
1418  &subject_composition, subject.length,
1419  scaledMatrixInfo, compo_adjust_mode,
1420  RE_pseudocounts, NRrecord,
1421  &matrix_adjust_rule, callbacks->calc_lambda,
1422  pvalueForThisPair,
1423  compositionTestIndex,
1424  LambdaRatio);
1425  if (adjust_search_failed < 0) { /* fatal error */
1426  status = adjust_search_failed;
1427  goto window_index_loop_cleanup;
1428  }
1429  }
1430  if ( !adjust_search_failed ) {
1431  /* BlastCompo_AdjustSearch ran without error; compute the new
1432  alignments. */
1433  int aSwScore; /* score computed by the
1434  * Smith-Waterman algorithm. */
1435  int alignment_is_significant; /* True if the score/evalue of
1436  * the Smith-Waterman alignment
1437  * is significant. */
1438  Blast_ForbiddenRangesClear(forbidden);
1439  do {
1440  int matchEnd, queryEnd; /* end points of the alignments
1441  * computed by the Smith-Waterman
1442  * algorithm. */
1443  status =
1444  Blast_SmithWatermanScoreOnly(&aSwScore, &matchEnd,
1445  &queryEnd,
1446  subject.data,
1447  subject.length,
1448  query.data,
1449  query.length, matrix,
1450  gap_open, gap_extend,
1451  positionBased,
1452  forbidden);
1453  if (status != 0)
1454  goto window_index_loop_cleanup;
1455 
1456  if (do_link_hsps) {
1457  alignment_is_significant = aSwScore >= params->cutoff_s;
1458  } else {
1459  double newSwEvalue; /* evalue as computed by the
1460  * Smith-Waterman algorithm */
1461  newSwEvalue =
1462  s_EvalueFromScore(aSwScore, Lambda, logK, searchsp);
1463 
1464  alignment_is_significant = newSwEvalue < params->cutoff_e;
1465  if (alignments[query_index] == NULL) {
1466  /* this is the most significant alignment; if
1467  * it will not be accepted, no alignments from
1468  * this match will */
1469  alignment_is_significant =
1470  alignment_is_significant &&
1472  &significantMatches[query_index],
1473  newSwEvalue, aSwScore, matchingSeq->index);
1474  }
1475  }
1476  if (alignment_is_significant) {
1477  /* the redone alignment */
1478  BlastCompo_Alignment * newAlign;
1479  int matchStart, queryStart; /* the start of the
1480  * alignment in the
1481  * match/query sequence */
1482  int updatedScore; /* score found by the SW
1483  algorithm run in reverse */
1484  status =
1485  Blast_SmithWatermanFindStart(&updatedScore,
1486  &matchStart,
1487  &queryStart,
1488  subject.data,
1489  subject.length,
1490  query.data,
1491  matrix, gap_open,
1492  gap_extend,
1493  matchEnd,
1494  queryEnd,
1495  aSwScore,
1496  positionBased,
1497  forbidden);
1498  if (status != 0) {
1499  goto window_index_loop_cleanup;
1500  }
1501  status =
1502  callbacks->
1503  new_xdrop_align(&newAlign, &queryEnd, &matchEnd,
1504  queryStart, matchStart, aSwScore,
1505  &query, &window->query_range,
1506  ccat_query_length,
1507  &subject, &window->subject_range,
1508  matchingSeq->length,
1509  gapping_params, matrix_adjust_rule);
1510  if (status != 0) {
1511  goto window_index_loop_cleanup;
1512  }
1513  newAlign->next = alignments[query_index];
1514  alignments[query_index] = newAlign;
1515 
1516  if (window->hspcnt > 1) {
1517  /* We may compute more alignments; make the range
1518  of the current alignment forbidden */
1519  status =
1520  Blast_ForbiddenRangesPush(forbidden,
1521  queryStart, queryEnd,
1522  matchStart, matchEnd);
1523  }
1524  if (status != 0) {
1525  goto window_index_loop_cleanup;
1526  }
1527  }
1528  /* end if the next local alignment is significant */
1529  } while (alignment_is_significant && window->hspcnt > 1);
1530  /* end do..while the next local alignment is significant, and
1531  * the original blast search found more than one alignment. */
1532  } /* end if BlastCompo_AdjustSearch ran without error. */
1533 window_index_loop_cleanup:
1534  if (subject.data != NULL)
1536  if (query.data != NULL)
1538  if (status != 0)
1539  goto function_level_cleanup;
1540  } /* end for all windows */
1541 
1542 function_level_cleanup:
1543  if (status != 0) {
1544  for (query_index = 0; query_index < numQueries; query_index++) {
1545  BlastCompo_AlignmentsFree(&alignments[query_index],
1546  callbacks->free_align_traceback);
1547  }
1548  }
1549  for (window_index = 0; window_index < nWindows; window_index++) {
1550  s_WindowInfoFree(&windows[window_index]);
1551  }
1552  free(windows);
1553 
1554  return status;
1555 }
1556 
1557 
1558 /* Documented in redo_alignment.h. */
1559 int
1561  BlastCompo_Heap significantMatches[],
1562  int numQueries)
1563 {
1564  int i;
1565  for (i = 0; i < numQueries; i++) {
1566  if (BlastCompo_HeapFilledToCutoff(&significantMatches[i])) {
1567  double ecutoff = significantMatches[i].ecutoff;
1568  /* Only matches with evalue <= ethresh will be saved. */
1569  if (evalue <= EVALUE_STRETCH * ecutoff) {
1570  /* The evalue if this match is sufficiently small
1571  * that we want to redo it to try to obtain an
1572  * alignment with evalue smaller than ecutoff. */
1573  return FALSE;
1574  }
1575  } else {
1576  return FALSE;
1577  }
1578  }
1579  return TRUE;
1580 }
Declares a "heap" data structure that is used to store computed alignments when composition adjustmen...
int BlastCompo_HeapFilledToCutoff(const BlastCompo_Heap *self)
Return true if only matches with evalue <= self->ecutoff may be inserted.
Definition: compo_heap.c:405
int BlastCompo_HeapWouldInsert(BlastCompo_Heap *self, double eValue, int score, int subject_index)
Return true if self may insert a match that had the given eValue, score and subject_index.
Definition: compo_heap.c:252
Definitions used in compositional score matrix adjustment.
int Blast_AdjustScores(int **matrix, const Blast_AminoAcidComposition *query_composition, int queryLength, const Blast_AminoAcidComposition *subject_composition, int subjectLength, const Blast_MatrixInfo *matrixInfo, ECompoAdjustModes composition_adjust_mode, int RE_pseudocounts, Blast_CompositionWorkspace *NRrecord, EMatrixAdjustRule *matrix_adjust_rule, double calc_lambda(double *, int, int, double), double *pvalueForThisPair, int compositionTestIndex, double *ratioToPassBack)
Compute a compositionally adjusted scoring matrix.
void Blast_MatrixInfoFree(Blast_MatrixInfo **ss)
Free memory associated with a Blast_MatrixInfo object.
void Blast_GetCompositionRange(int *pleft, int *pright, const Uint1 *subject_data, int length, int start, int finish)
Get the range of a sequence to be included when computing a composition.
void Blast_ReadAaComposition(Blast_AminoAcidComposition *composition, int alphsize, const Uint1 *sequence, int length)
Compute the true amino acid composition of a sequence, ignoring ambiguity characters and other nonsta...
Constants used in compositional score matrix adjustment.
ECompoAdjustModes
An collection of constants that specify all permissible modes of composition adjustment.
@ eNoCompositionBasedStats
Don't use composition based statistics.
EMatrixAdjustRule
An collection of constants that specify all rules that may be used to generate a compositionally adju...
@ eDontAdjustMatrix
static Uint4 border[]
char data[12]
Definition: iconv.c:80
#define NULL
Definition: ncbistd.hpp:225
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int i
range(_Ty, _Ty) -> range< _Ty >
unsigned int a
Definition: ncbi_localip.c:102
Type and macro definitions from C toolkit that are not defined in C++ toolkit.
#define MIN(a, b)
returns smaller of a and b.
Definition: ncbi_std.h:112
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
#define TRUE
bool replacment for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacment for C indicating false.
Definition: ncbi_std.h:101
#define ABS(a)
returns absolute value of a (|a|)
Definition: ncbi_std.h:122
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
double lambda(size_t dimMatrix_, const Int4 *const *scoreMatrix_, const double *q_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
Declarations for several linear algebra routines.
static int s_DistinctAlignmentsLength(BlastCompo_Alignment *list)
Calculate the length of a list of BlastCompo_Alignment objects.
void BlastCompo_AlignmentsFree(BlastCompo_Alignment **palign, void(*free_context)(void *))
Recursively free all alignments in the singly linked list whose head is *palign.
static void s_SequenceDataRelease(BlastCompo_SequenceData *self)
Release the data associated with this object.
#define KAPPA_CONTAINED_IN_HSP(a, b, c, d, e, f)
Test of whether one set of HSP bounds is contained in another.
static Boolean s_IsSimilarEndPoint(const BlastCompo_Alignment *newAlign, const BlastCompo_Alignment *align)
Does at least one end point of newAlign is contined within align and lies on the same diagnol as the ...
static Boolean s_IsContained(BlastCompo_Alignment *in_align, BlastCompo_Alignment *alignments, double lambda)
Return true if an alignment is contained in a previously-computed alignment of sufficiently high scor...
static s_WindowInfo * s_WindowInfoNew(int begin, int end, int context, int queryOrigin, int queryLength, int query_index, BlastCompo_Alignment *align)
Create and initialize a new s_WindowInfo.
#define KAPPA_SIGN(a)
A macro that defines the mathematical "sign" function.
struct s_WindowInfo s_WindowInfo
s_WindowInfo - a struct whose instances represent a range of data in a sequence.
static void s_GetComposition(Blast_AminoAcidComposition *composition, int alphsize, BlastCompo_SequenceData *seq, BlastCompo_SequenceRange *range, BlastCompo_Alignment *align, Boolean query_is_translated, Boolean subject_is_translated)
Compute the amino acid composition of the sequence.
int Blast_RedoOneMatchSmithWaterman(BlastCompo_Alignment **alignments, Blast_RedoAlignParams *params, BlastCompo_Alignment *incoming_aligns, int hspcnt, double Lambda, double logK, BlastCompo_MatchingSequence *matchingSeq, BlastCompo_QueryInfo query_info[], int numQueries, int **matrix, int alphsize, Blast_CompositionWorkspace *NRrecord, Blast_ForbiddenRanges *forbidden, BlastCompo_Heap *significantMatches, double *pvalueForThisPair, int compositionTestIndex, double *LambdaRatio)
Recompute all alignments for one query/subject pair using the Smith-Waterman algorithm and possibly a...
static void s_AlignmentsRev(BlastCompo_Alignment **plist)
Reverse a list of BlastCompo_Alignments.
static int s_WindowsFromProteinAligns(BlastCompo_Alignment *alignments, BlastCompo_QueryInfo *query_info, int numQueries, int sequence_length, s_WindowInfo ***pwindows, int *nWindows)
Read a list of alignments from a protein search and create a new array of pointers to s_WindowInfo so...
#define COMPO_INTENSE_DEBUG
Define COMPO_INTENSE_DEBUG to be true to turn on rigorous but expensive consistency tests in the comp...
int Blast_RedoOneMatch(BlastCompo_Alignment **alignments, Blast_RedoAlignParams *params, BlastCompo_Alignment *incoming_aligns, int hspcnt, double Lambda, BlastCompo_MatchingSequence *matchingSeq, int ccat_query_length, BlastCompo_QueryInfo query_info[], int numQueries, int **matrix, int alphsize, Blast_CompositionWorkspace *NRrecord, double *pvalueForThisPair, int compositionTestIndex, double *LambdaRatio)
Recompute all alignments for one query/subject pair using composition-based statistics or composition...
int BlastCompo_EarlyTermination(double evalue, BlastCompo_Heap significantMatches[], int numQueries)
Return true if a heuristic determines that it is unlikely to be worthwhile to redo a query-subject pa...
static int s_GetTranslatedLength(int length, int frame, int is_pos_based)
#define LOCAL_LN2
The natural log of 2, defined in newer systems as M_LN2 in math.h, but missing in older systems.
#define MINIMUM_LENGTH_NEAR_IDENTICAL
#define EVALUE_STRETCH
by what factor might initially reported E-value exceed true E-value
static const int kReMatrixAdjustmentPseudocounts
pseudocounts for relative-entropy-based score matrix adjustment
static void s_WindowInfoFree(s_WindowInfo **window)
Free an instance of s_WindowInfo.
#define KAPPA_BIT_TOL
The number of bits by which the score of a previously computed alignment must exceed the score of the...
static int s_WindowsFromAligns(BlastCompo_Alignment *alignments, BlastCompo_QueryInfo *query_info, int hspcnt, int numQueries, int border, int sequence_length, s_WindowInfo ***pwindows, int *nWindows, int query_is_translated, int subject_is_translated, int is_pos_based)
Read a list of alignments from a search (protein or translated) and create a new array of pointers to...
static BlastCompo_Alignment * s_AlignmentCopy(const BlastCompo_Alignment *align)
Copy a BlastCompo_Alignment, setting the next field to NULL.
static int s_SubjectCompareWindows(const void *vp1, const void *vp2)
A comparison routine used to sort a list of windows by position in the subject, ignoring strand and f...
static void s_WithDistinctEnds(BlastCompo_Alignment **p_newAlign, BlastCompo_Alignment **p_oldAlignments, void free_align_context(void *), Boolean is_same_adjustment)
Given a list of alignments and a new alignment, create a new list of alignments that conditionally in...
static int s_WindowsFromTranslatedAligns(BlastCompo_Alignment *alignments, BlastCompo_QueryInfo *query_info, int hspcnt, int border, int sequence_length, s_WindowInfo ***pwindows, int *nWindows, int subject_is_translated, int is_pos_based)
Read a list of alignments from a translated search and create a new array of pointers to s_WindowInfo...
static int s_LocationCompareWindows(const void *vp1, const void *vp2)
A comparison routine used to sort a list of windows, first by frame and then by location.
static int s_AlignmentCmp(const BlastCompo_Alignment *a, const BlastCompo_Alignment *b)
Compare two BlastCompo_Alignments.
static Boolean s_IsSameEndPoint(const BlastCompo_Alignment *newAlign, const BlastCompo_Alignment *align)
Do given alignments have the same left or right end point.
static int s_AlignmentsAreSorted(BlastCompo_Alignment *alignments)
Temporary function to determine whether alignments are sorted.
static const int kWindowBorder
For translated subject sequences, the number of amino acids to include before and after the existing ...
void Blast_RedoAlignParamsFree(Blast_RedoAlignParams **pparams)
Free a set of Blast_RedoAlignParams.
BlastCompo_Alignment * BlastCompo_AlignmentNew(int score, EMatrixAdjustRule matrix_adjust_rule, int queryStart, int queryEnd, int queryIndex, int matchStart, int matchEnd, int frame, void *context)
Create a new BlastCompo_Alignment; parameters to this function correspond directly to fields of Blast...
static void s_WindowInfoJoin(s_WindowInfo *win1, s_WindowInfo **pwin2)
Join two instance of s_WindowInfo into a single window.
static Boolean s_preliminaryTestNearIdentical(BlastCompo_QueryInfo query_info[], s_WindowInfo *window, BlastCompo_Alignment *align, double cutoff)
#define CMP(a, b)
-1/0/1 if a is less than/greater than/equal to b
Blast_RedoAlignParams * Blast_RedoAlignParamsNew(Blast_MatrixInfo **pmatrix_info, BlastCompo_GappingParams **pgapping_params, ECompoAdjustModes compo_adjust_mode, int positionBased, int query_is_translated, int subject_is_translated, int ccat_query_length, int cutoff_s, double cutoff_e, int do_link_hsps, const Blast_RedoAlignCallbacks *callbacks, double near_identical_cutoff)
Create new Blast_RedoAlignParams object.
static void s_WindowSwapRange(s_WindowInfo *self)
Swap the query and subject range.
static void s_DistinctAlignmentsSort(BlastCompo_Alignment **plist, int hspcnt)
Sort a list of Blast_Compo_Alignment objects, using s_AlignmentCmp comparison function.
static double s_EvalueFromScore(int score, double Lambda, double logK, double searchsp)
Compute an e-value from a score and a set of statistical parameters.
Definitions used to redo a set of alignments, using either composition matrix adjustment or the Smith...
#define GET_NUCL_LENGTH(l)
#define GET_TRANSLATED_LENGTH(l, f)
#define GET_SEQ_FRAME(f)
Definitions for computing Smith-Waterman alignments.
int Blast_ForbiddenRangesPush(Blast_ForbiddenRanges *self, int queryStart, int queryEnd, int matchStart, int matchEnd)
Add some ranges to self.
void Blast_ForbiddenRangesClear(Blast_ForbiddenRanges *self)
Reset self to be empty.
int Blast_SmithWatermanFindStart(int *score_out, int *matchSeqStart, int *queryStart, const Uint1 *subject_data, int subject_length, const Uint1 *query_data, int **matrix, int gapOpen, int gapExtend, int matchSeqEnd, int queryEnd, int score_in, int positionSpecific, const Blast_ForbiddenRanges *forbiddenRanges)
Find the left-hand endpoints of the locally optimal Smith-Waterman alignment given the score and righ...
int Blast_SmithWatermanScoreOnly(int *score, int *matchSeqEnd, int *queryEnd, const Uint1 *subject_data, int subject_length, const Uint1 *query_data, int query_length, int **matrix, int gapOpen, int gapExtend, int positionSpecific, const Blast_ForbiddenRanges *forbiddenRanges)
Compute the score and right-hand endpoints of the locally optimal Smith-Waterman alignment,...
#define assert(x)
Definition: srv_diag.hpp:58
Within the composition adjustment module, an object of type BlastCompo_Alignment represents a distinc...
int frame
the subject frame
int matchStart
the start of the alignment in the subject
int score
the score of this alignment
int matchEnd
one past the end of the alignment in the subject
int queryStart
the start of the alignment in the query
EMatrixAdjustRule matrix_adjust_rule
how the score matrix was computed
struct BlastCompo_Alignment * next
the next alignment in the list
int queryIndex
index of the query in a concatenated query
int queryEnd
one past the end of the alignment in the query
void * context
traceback info for a gapped alignment
Parameters used to compute gapped alignments.
int gap_open
penalty for opening a gap
int gap_extend
penalty for extending a gapped alignment by one residue
A BlastCompo_Heap represents a collection of alignments between one query sequence and several matchi...
Definition: compo_heap.h:82
double ecutoff
matches with evalue below ecutoff may always be inserted in the BlastCompo_Heap
Definition: compo_heap.h:89
A BlastCompo_MatchingSequence represents a subject sequence to be aligned with the query.
Int4 index
index of this sequence in the database
Int4 length
length of this matching sequence
Collected information about a query.
Blast_AminoAcidComposition composition
the composition of the query
BlastCompo_SequenceData seq
sequence data for the query
double eff_search_space
effective search space of searches involving this query
Uint8 * words
list words in the query, needed for testing whether the query and a subject are nearly identical
BlastCompo_SequenceData - represents a string of amino acids or nucleotides.
int length
the length of data.
Uint1 * data
amino acid or nucleotide data
BlastCompo_SequenceRange - a struct whose instances represent a range of data in a sequence.
int begin
the starting index of the range
int end
one beyond the last item in the range
int context
integer identifier for this window, can indicate a translation frame or an index into a set of sequen...
Represents the composition of an amino-acid sequence, in the ncbistdaa alphabet.
Work arrays used to perform composition-based matrix adjustment.
An instance of Blast_ForbiddenRanges is used by the Smith-Waterman algorithm to represent ranges in t...
Information about a amino-acid substitution matrix.
Callbacks used by Blast_RedoOneMatch and Blast_RedoOneMatchSmithWaterman routines.
redo_one_alignment_type * redo_one_alignment
free_align_traceback_type * free_align_traceback
get_range_type * get_range
calc_lambda_type * calc_lambda
A parameter block for the Blast_RedoOneMatch and Blast_RedoOneMatchSmithWaterman routines.
const Blast_RedoAlignCallbacks * callbacks
callback functions used by the Blast_RedoAlign* functions
int query_is_translated
true if the query is translated
int cutoff_s
cutoff score for saving alignments when HSP linking is used
ECompoAdjustModes compo_adjust_mode
composition adjustment mode
int ccat_query_length
length of the concatenated query, or just the length of the query if not concatenated
int do_link_hsps
if true, then HSP linking and sum statistics are used to computed e-values
int subject_is_translated
true if the subject is translated
Blast_MatrixInfo * matrix_info
information about the scoring matrix used
BlastCompo_GappingParams * gapping_params
parameters for performing a gapped alignment
int RE_pseudocounts
number of pseudocounts to use in relative-entropy based composition adjustment.
int positionBased
true if the search is position-based
double near_identical_cutoff
alignments with raw score per residue above this value will be tested if sequences are near identical...
double cutoff_e
cutoff e-value for saving alignments
static string subject
static string query
s_WindowInfo - a struct whose instances represent a range of data in a sequence.
BlastCompo_SequenceRange subject_range
range of the subject included in this window
BlastCompo_SequenceRange query_range
range of the query included in this window
BlastCompo_Alignment * align
list of existing alignments contained in this window
int hspcnt
number of alignment in this window
else result
Definition: token2.c:20
static CS_CONTEXT * context
Definition: will_convert.c:21
void free(voidpf ptr)
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Wed Apr 17 13:08:42 2024 by modify_doxy.py rev. 669887