NCBI C Toolkit Cross Reference

C/tools/kappa.c


  1 static char const rcsid[] = "$Id: kappa.c,v 6.91 2008/07/24 13:13:03 madden Exp $";
  2 
  3 /* $Id: kappa.c,v 6.91 2008/07/24 13:13:03 madden Exp $ 
  4 *   ==========================================================================
  5 *
  6 *                            PUBLIC DOMAIN NOTICE
  7 *               National Center for Biotechnology Information
  8 *
  9 *  This software/database is a "United States Government Work" under the
 10 *  terms of the United States Copyright Act.  It was written as part of
 11 *  the author's official duties as a United States Government employee and
 12 *  thus cannot be copyrighted.  This software/database is freely available
 13 *  to the public for use. The National Library of Medicine and the U.S.
 14 *  Government have not placed any restriction on its use or reproduction.
 15 *
 16 *  Although all reasonable efforts have been taken to ensure the accuracy
 17 *  and reliability of the software and data, the NLM and the U.S.
 18 *  Government do not and cannot warrant the performance or results that
 19 *  may be obtained by using this software or data. The NLM and the U.S.
 20 *  Government disclaim all warranties, express or implied, including
 21 *  warranties of performance, merchantability or fitness for any particular
 22 *  purpose.
 23 *
 24 *  Please cite the author in any work or product based on this material.
 25 *
 26 * ===========================================================================*/
 27 
 28 /*****************************************************************************
 29 
 30 File name: kappa.c
 31 
 32 Authors: Alejandro Schaffer, Mike Gertz
 33 
 34 Contents: Utilities for doing Smith-Waterman alignments and adjusting
 35     the scoring system for each match in blastpgp
 36 
 37 Functions in this file call functions in the composition adjustment
 38 module to perform adjustment and recalculation of alignments.  This
 39 file is intended to be (thin) glue code that:
 40 
 41 - adapts C toolkit data structures by the composition adjustment module; 
 42 - calls functions in the composition adjustment module;
 43 - converts the output of the composition adjustment module into the
 44   format needed by the C toolkit.
 45 
 46 This file is very similar to the blast_kappa.c file, which is glue
 47 code the performs the same function for the C++ toolkit.  The sole
 48 external symbol from this file should be RedoAlignmentCore. Static
 49 functions in this file have a 'Kappa_' prefix.  Please adhere to this
 50 convention to avoid a name clash with functions in blast_kappa.c (the
 51 name clash can matter in debuggers and search engines.)
 52 
 53  $Revision: 6.91 $
 54 
 55  Revision 6.75  2005/11/07 15:28:56  coulouri
 56  From Mike Gertz:
 57 
 58  Record the composition mode used for each aligment; formerly the
 59  composition mode was recorded for each subject sequence.  For blastp,
 60  the composition is only done once per subject sequence so the former
 61  behavior was correct, but it was incorrect for tblastn.
 62 
 63  Fixed a bug when the query frame is checked in an intermediate
 64  containment test.  For tblastn, this effectively disabled the test.
 65  This bug can effect tblastn alignments when SmithWaterman is not used
 66  in the rare occasion that omitting the intermediate containment test
 67  causes a different alignment to be found.
 68 
 69  Added a sort key to location_compare windows to sort first by query
 70  index.  For tblastn alignments when query concatination is used, not
 71  having this sort key may on rare occassion cause a different alignment
 72  to be reported.
 73 
 74  Cast a few Nlm_FloatHi values to Int4 to suppress compiler warnings.
 75 
 76  Revision 6.74  2005/09/29 17:40:18  coulouri
 77  from mike gertz:
 78      - Many changes to support query concatenation; including changing
 79        the API of RedoAlignementCore.
 80      - In RedoAlignmentCore, only call BlastLinkHsps if
 81        search->pbp->do_sum_stats is true.
 82      - In RedoAlignmentCore, in the non-Smith-Waterman case, compute a
 83        separate composition for each HSP, even if multiple HSPs are in
 84        the same window.
 85 
 86  Revision 6.73  2005/09/13 16:23:01  coulouri
 87  correct include paths for windows
 88 
 89  Revision 6.72  2005/09/13 14:55:06  coulouri
 90  From Mike Gertz: use libblastcompadj
 91 
 92  Revision 6.71  2005/09/08 19:48:36  coulouri
 93  From Mike Gertz:
 94  Changed the behavior of a SWheap so that space for only a small number
 95  (100) of hits is allocated initially, but the SWheap will grow to hold
 96  as many hits as are needed.
 97 
 98  Revision 6.70  2005/09/08 14:01:49  coulouri
 99  From Mike Gertz:
100    - Fixed a bug in which the extent of the subject match was being
101      calculated incorrectly setting forbidden ranges for a
102      Smith-Waterman alignment.
103 
104  Revision 6.69  2005/08/31 20:33:25  coulouri
105      New code to use the value of options->kappa_expect_value as the cutoff
106      e-value within RedoAlignmentCore. The code saves and restores the
107      value of search->pbp->cutoff_e.
108 
109  Revision 6.68  2005/08/30 18:20:46  coulouri
110  From Mike Gertz:
111      Changes to routines used in mode > 1 of compositional adjustment of
112      scoring matrices:
113        - Removed REscaleInitialScores as it is now redundant; rescaling
114          is now done as part of the call s_ScatterScores.
115        - Refactored Kappa_AdjustBXZ into the routines
116          Kappa_SetNonstandardAaScores and Kappa_AverageScores.
117        - Kappa_SetNonstandardAaScores sets scores for U, * and -,
118          unlike its predecessor Kappa_AdjustBXZ.
119        - Fixed a bug in the formula used to compute the score of (B,B)
120          matches.
121        - In s_RoundScoreMatrix, check whether the unrounded value is <
122          BLAST_SCORE_MIN and if so set the rounded value to
123          BLAST_SCORE_MIN; do not read the incoming value of the "matrix"
124          parameter.
125 
126      Changes to mode 1 (traditional) composition-based statistics:
127        - scaleMatrix renamed s_ScaleMatrx with slightly different
128          parameter list.
129        - s_ScaleMatrix does not scale substitution scores for the
130          nonstandard X, U, - and * characters (but does scale B and Z).
131          The previous version scaled scores for X and *.
132 
133      Made minor changes to Kappa_CompositionBasedStats and
134      Kappa_AdjustSearch to call the new routines described above.
135 
136  Revision 6.67  2005/08/05 12:04:53  coulouri
137  From Mike Gertz:
138  - Move setting gap_align->translate2 to Kappa_RecordInitialSearch;
139    for tblastn and some option settings it would not otherwise get set.
140  - Remove a now redundant setting of gap_align->translate2 in the
141    HitToGapAlign routine.
142  - Fixes to comments.
143 
144  Revision 6.66  2005/08/02 14:40:29  coulouri
145  From Mike Gertz:
146  - Fixes to comments
147  - Added enum constant eGapChar; renamed eStarChar to eStopChar.
148  - Change the integer type of some variables to suppress warnings about
149    assigning a wider type to a narrower type.
150  - Made the routines BLbasicSmithWatermanScoreOnly and BLSmithWatermanFindStart
151    static.
152  - Renamed s_ScatterFreqRatios -> s_ScatterScores; renamed parameters.
153  - In NewAlignmentUsingXdrop, use the translate2 field from gap_align
154    to set the same field in the edit block.
155  - Changed the Kappa_WindowInfo datatype to hold a list of HSPs in the
156    window; added logic in several places, notably WindowsFromHSPs, to
157    generate and maintain these lists.
158  - Refactored Kappa_AdjustSearch.  Use a more sophisticated rule,
159    implemented in the new Kappa_GetSubjectComposition routine, to
160    determine the subject sequence composition for tblastn.
161  - Removed unused parameters from several routines.
162  - In RedoAlignmentCore, delete NRrecord to avoid a memory leak.
163 
164  Revision 6.65  2005/07/28 14:34:06  coulouri
165  From Mike Gertz:
166  - Made minor fixes to whitespace and formatting.
167  - Fixed the comment to SWheap to accurately reflect recent changes in
168    the way SWheapRecordCompare is used.
169  - Changed some #define'd constants to enums or "static const"
170    variables.  Renamed these constants appropriately.
171  - Made the array alphaConvert and the routine scalePosMatrix static.
172  - Made sure that gap_align->translate2 is set for tblastn.
173  - Fixed a bug Kappa_SequenceGetTranslatedWindow: the wrong formula had
174    been used to compute num_nucleotides.
175 
176  Revision 6.64  2005/07/26 13:07:20  coulouri
177  - Changed #include "NRdefs.h" to #include <NRdefs.h> to be consistent
178    with toolbox conventions.
179 
180  - Removed the unused kScoreMatrixRange constant
181 
182  - Extended comments for some routines.  Fixed spelling and other
183    errors in comments.
184 
185  - Fixed white space and formatting in several locations.
186 
187  - Renamed functions:
188    o permuteLetterProbs -> s_GatherLetterProbs
189    o permuteFreqRatios  -> s_ScatterFreqRatios
190    o adjustBXZ -> Kappa_AdjustBXZ
191    o roundScoreMatrix -> s_RoundScoreMatrix
192 
193  - Eliminated the use of a temporary variable in REscaleInitialScores.
194 
195  - In sScatterFreqRatios, replaced the NRrecord parameter by a pointer
196    to the frequency ratio matrix contained in the record.
197 
198  Revision 6.63  2005/07/14 20:19:45  coulouri
199     - In Kappa_SequenceGetWindow, change all selenocysteine (U)
200       residues in the subject sequence to X.
201     - In scaleMatrix, do not compute the log of frequency ratios that are
202       zero, set the matrix element to BLAST_SCORE_MIN instead.
203     - Removed the startMatrix parameter to scaleMatrix, as it is no
204       longer used.
205 
206  Revision 6.61  2005/06/08 19:31:31  papadopo
207  From Michael Gertz:
208  1. The use of the score for comparing collections of alignments
209     was removed in March 2004; revert to the previous rule
210  2. Added a new field "bestScore" to the SWheapRecord structure
211  3. Various additional changes to support the use of score as a
212     key in a SWheapRecord
213  4. The comparison function is now used to determine whether new HSPs
214     may be added to the heap.  Previously, the evalue only was used
215  5. Removed a complex test that sometimes terminated computation of
216     Smith-Waterman alignments for a given subject sequence. It is more
217     consistent with all other modes of operation to omit this test
218  6. Removed the "capacity" parameter to SWheapInitialize and used the
219     heapThreshold parameter to set the capacity for the heap
220 
221  Revision 6.60  2005/05/18 21:27:33  papadopo
222  make fillResidueProbability unconditional in Kappa_AdjustSearch
223 
224  Revision 6.59  2005/05/16 18:10:13  kans
225  include Mode_condition.h to get prototype for chooseMode
226 
227  Revision 6.58  2005/05/16 17:46:48  papadopo
228  From Mike Gertz/Alejandro Schaffer: Added support for compositional
229  adjustment of score matrices, both conditionally and unconditionally.
230  The adjustParameters argument to RedoAlignmentCore is no longer Boolean
231  because there are now 4 allowed options:
232        0 no adjustment
233        1 composition-based statistics
234        2 conditional compositional adjustment
235        3 unconditional compositional adjustment
236 
237  Revision 6.57  2005/04/13 17:18:02  coulouri
238  changes to WindowsFromHSPs for tblastn composition-based statistics
239 
240  Revision 6.56  2005/01/24 15:52:54  camacho
241  doxygen fixes from Mike Gertz
242 
243  Revision 6.55  2005/01/20 16:28:59  camacho
244  doxygen fixes
245 
246  Revision 6.54  2005/01/18 14:54:13  camacho
247  Change in tie-breakers for score comparison, suggestion by Mike Gertz
248 
249  Revision 6.53  2004/11/24 15:42:33  coulouri
250  do not dereference null pointer
251 
252  Revision 6.52  2004/11/23 21:29:02  camacho
253  Changes from Mike Gertz:
254  - If no alignments are found, I no longer create an empty HSP list and
255    process it; I just move on to the next match.
256  - For tblastn, culling by containment now occurs before link_hsps as
257    it should. This can not have any effect on the current output, as
258    tblastn + RedoAlignmentCore is not enabled.
259  - I changed some Nlm_Mallocs, etc. to MemNew.
260  - Other cosmetic changes, doxygen friendly comments
261 
262  Revision 6.51  2004/11/01 20:43:48  camacho
263  Added error handling for missing PSSM
264 
265  Revision 6.50  2004/10/27 21:00:02  camacho
266  Change the order of elements in Score* returned by GetScoreSetFromBlastHsp to
267  be consistent with score ordering in other contexts of BLAST.
268 
269  Revision 6.49  2004/09/09 13:47:47  papadopo
270  from Michael Gertz: remove unnecessary check for the presence of Selenocysteine in translated sequences
271 
272  Revision 6.48  2004/08/23 19:37:38  papadopo
273  From Michael Gertz: fix memory leak in getStartFreqRatios
274 
275  Revision 6.47  2004/08/23 17:12:21  papadopo
276  From Michael Gertz: Backported some changes to
277  getStartFreqRatios and computeScaledStandardMatrix
278  from algo/blast/core/blast_kappa.c code. Also a few
279  straightforward changes to getStartFreqRatios,
280  because the code has diverged slightly
281 
282  Revision 6.46  2004/07/28 17:19:54  kans
283  HeapSort callbacks are int LIBCALLBACK for compilation on the PC
284 
285  Revision 6.45  2004/07/27 19:59:00  papadopo
286  Changes by Michael Gertz to allow RedoAlignmentCore to be used
287  for tblastn searches. The current version of the code uses two heuristics:
288  1) a heuristic that skips doing re-alignment for an HSP if the old
289     alignment of the HSP is contained in a higher-scoring HSP that
290     has already been re-aligned.
291  2) a heuristic that attempts to ensure that all alignments in the list
292     of redone alignments has distinct ends.
293  The original code also used these heuristics, but the code that
294  implements the heuristics has been rewritten. As a result,
295  RedoAlignmentCore will occasionally report better-scoring alignments.
296  This is still a heuristic; which HSPs are reported still depends on
297  the order in which they are re-aligned.
298 
299  Revision 6.44  2004/06/23 14:53:58  camacho
300  Use renamed FreqRatios structure from posit.h
301 
302  Revision 6.43  2004/06/22 14:16:56  camacho
303  Use SFreqRatios structure from algo/blast/core to obtain underlying matrices' frequency ratios.
304 
305  Revision 6.42  2004/06/18 15:50:25  madden
306  For secondary alignments with Smith-Waterman on, use the E-value from the X-drop alignment computed by ALIGN that has more flexibility in which positions are allowed in the dynamic program.
307 
308  Add a sort of alignments for the same query-subject pair because the use of X-drop alignments occasionally reorders such alignments.
309 
310  Changes from Mike Gertz, submitted by Alejandro Schaffer.
311 
312  Revision 6.41  2004/06/14 21:11:05  papadopo
313  From Michael Gertz:
314  - Added several casts where casts occur in blast_kappa.c.  These casts
315       should have no real effect; the log of blast_kappa.c indicates that
316       they suppress compiler warnings.
317  - Changed the type of one variable that holds a score from
318       Nlm_FloatHi to Int4.
319  - moved the definition Kappa_ForbiddenRanges and relevant
320       routines earlier in the file.
321  - fixed some comments.
322  - made a few (~5) changes in whitespace.
323 
324  Revision 6.40  2004/06/03 16:10:50  dondosha
325  Fix in Kappa_SearchParametersNew: allocate correct number of rows for matrices
326 
327  Revision 6.39  2004/03/31 18:12:13  papadopo
328  Mike Gertz' refactoring of RedoAlignmentCore
329 
330  Revision 6.38  2004/02/06 19:25:16  camacho
331  Alejandro Schaffer's corrections to RedoAlignmentCore pointed out by
332  Mike Gertz:
333  1. Corrected the rule for assigning newSW->isfirstAlignment
334  2. Changed upper bound on assignment to forbiddenRanges
335  3. Assigned a value to newScore earlier
336  4. Eliminated use of skipThis
337  5. Restored value of search->gapAlign at the end
338 
339  Revision 6.37  2004/01/27 20:31:52  madden
340  remove extra setting of kbp_gap
341 
342  Revision 6.36  2004/01/06 17:48:44  dondosha
343  Do not free Karlin block in RedoAlignmentCore, because its pointer is passed outside
344 
345  Revision 6.35  2003/12/01 19:15:27  madden
346  Add one byte to filteredMatchingSequence to prevent ABR/ABW
347 
348  Revision 6.34  2003/10/22 20:37:19  madden
349  Set kbp to rescaled values, use upper-case for SCALING_FACTOR define
350 
351  Revision 6.33  2003/10/02 19:59:34  kans
352  BlastGetGapAlgnTbck needed FALSE instead of NULL in two parameters - Mac compiler complaint
353 
354  Revision 6.32  2003/10/02 19:31:24  madden
355  In RedoAlignmentCore, call procedure BlastGetGapAlgnTbck instead of ALIGN
356  to redo alignments; this allows the endpoints of the alignment to change.
357  Because  BlastGetGapAlgnTbck returns a list of SeqAlign's while ALIGN
358  passes back a single-alignment, the post-processing of the redone
359  alignments is changed including the addition of the procedure
360  concatenateListOfSeqaligns.
361 
362  Revision 6.30  2003/05/30 17:25:36  coulouri
363  add rcsid
364 
365  Revision 6.29  2003/05/13 16:02:53  coulouri
366  make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
367 
368  Revision 6.28  2002/12/19 14:40:35  kans
369  changed C++-style comment to C-style
370 
371  Revision 6.27  2002/12/10 22:58:42  bealer
372  Keep mappings to sequences from readdb so that "num_ident" code does not
373  segfault with multiple databases.
374 
375  Revision 6.26  2002/11/06 20:31:14  dondosha
376  Added recalculation of the number of identities when computing seqalign from SWResults
377 
378  Revision 6.25  2002/10/16 13:33:58  madden
379  Corrected initialization of initialUngappedLambda in RedoAlignmentCore
380 
381  Revision 6.24  2002/09/03 13:55:14  kans
382  changed NULL to 0 for Mac compiler
383 
384  Revision 6.23  2002/08/29 15:47:49  camacho
385  Changed RedoAlignmentCore to work without readdb facility
386 
387  Revision 6.22  2002/08/20 15:43:08  camacho
388  Fixed memory leak in getStartFreqRatios
389 
390  Revision 6.21  2002/05/23 20:14:21  madden
391  Correction for last checkin to cover SmithWaterman type alignment
392 
393  Revision 6.20  2001/12/28 18:02:33  dondosha
394  Keep score and scoreThisAlign for each local alignment, so as to allow tie-breaking by score
395 
396  Revision 6.19  2001/07/26 12:52:25  madden
397  Fix memory leaks
398 
399  Revision 6.18  2001/07/09 15:12:47  shavirin
400  Functions BLbasicSmithWatermanScoreOnly() and BLSmithWatermanFindStart()
401  used to calculate Smith-waterman alignments on low level become external.
402 
403  Revision 6.17  2001/05/25 19:40:46  vakatov
404  Nested comment typo fixed
405 
406  Revision 6.16  2001/04/13 20:47:36  madden
407  Eliminated use of PRO_K_MULTIPLIER in adjusting E-values Added allocateStartFreqs and freeStartFreqs and getStartFreqRatios to enable the score matrix scaling to work entirely with frequency ratios and round to integers only at the very end of the scaling calculation.
408 
409  Revision 6.15  2001/03/20 15:07:34  madden
410  Fix from AS for (near) exact matches
411 
412  Revision 6.14  2001/01/04 13:48:44  madden
413  Correction for 3-parameter gapping
414 
415  Revision 6.13  2000/12/05 19:31:33  madden
416  Avoid duplicate insertion into heap when ((doThis == FALSE) && (currentState = SWPurging))
417 
418  Revision 6.12  2000/10/16 21:08:05  madden
419  segResult takes BioseqPtr as argument, produced from readdb_get_bioseq
420 
421  Revision 6.11  2000/10/13 19:32:58  madden
422  Fix for bug that caused crash
423 
424  Revision 6.10  2000/10/10 21:46:03  shavirin
425  Added support for BLOSUM50, BLOSUM90, PAM250 with -t T
426 
427  Revision 6.9  2000/10/10 19:45:51  shavirin
428  Checked for NULL hsp_array in the function RedoAlignmentCore().
429 
430  Revision 6.8  2000/08/18 21:28:24  madden
431  support for BLOSUM62_20A and BLOSUM62_20B, prevent LambdaRatio from getting above 1.0
432 
433  Revision 6.7  2000/08/09 21:10:15  shavirin
434  Added paramter discontinuous to the function newConvertSWalignsToSeqAligns()
435 
436  Revision 6.6  2000/08/08 21:45:04  shavirin
437  Removed initialization of decline_aline parameter to INT2_MAX.
438 
439  Revision 6.5  2000/08/03 22:20:10  shavirin
440  Added creation of the default posFreqs array if it is empty in
441  RedoAlignmentCore(). Fixed some memory leaks.
442 
443  Revision 6.4  2000/08/02 18:00:34  shavirin
444  Fixed memory leak in RedoAlignmentCore. Fixed a problem specific
445  to BLOSUM45 and BLOSUM80 in the procedure scalePosMatrix().
446 
447  Revision 6.3  2000/07/26 19:34:13  kans
448  removed unix-only headers
449 
450  Revision 6.2  2000/07/26 17:06:31  lewisg
451  add LIBCALLs
452 
453  Revision 6.1  2000/07/25 17:40:05  shavirin
454  Initial revision.
455 
456 
457 ******************************************************************************/
458 
459 #include<assert.h>
460 #include<string.h>
461 
462 #include <ncbi.h>
463 #include <blast.h>
464 #include <objseq.h>
465 #include <objsset.h>
466 #include <sequtil.h>
467 #include <seqport.h>
468 #include <tofasta.h>
469 #include <blastpri.h>
470 #include <txalign.h>
471 #include <simutil.h>
472 #include <posit.h>
473 #include <gapxdrop.h>
474 #include <fcntl.h>
475 #include <profiles.h>
476 
477 #include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
478 #include <algo/blast/composition_adjustment/matrix_frequency_data.h>
479 #include <algo/blast/composition_adjustment/composition_adjustment.h>
480 #include <algo/blast/composition_adjustment/compo_heap.h>
481 #include <algo/blast/composition_adjustment/smith_waterman.h>
482 #include <algo/blast/composition_adjustment/redo_alignment.h>
483 #include <algo/blast/composition_adjustment/unified_pvalues.h>
484 
485 
486 /** Define KAPPA_PRINT_DIAGNOSTICS to be true to turn on printing of
487  * diagnostic information from some routines.
488  *
489  * This macro is usually used as part of a C-conditional
490  * @code
491  * if (KAPPA_PRINT_DIAGNOSTICS) {
492  *     perform expensive tests
493  * }
494  * @endcode
495  *
496  * The C compiler will then validate the code that prints the
497  * diagnostics, but will almost always strip the code if
498  * KAPPA_PRINT_DIAGNOSTICS is false.
499  */
500 #ifndef KAPPA_PRINT_DIAGNOSTICS
501 #define KAPPA_PRINT_DIAGNOSTICS 0
502 #endif
503 
504 /**
505  * Create a score set from the data in an HSP.
506  *
507  * @param hsp               the HSP of interest [in]
508  * @param logK              Karlin-Altschul statistical parameter [in]
509  * @param lambda            Karlin-Altschul statistical parameter [in]
510  * @param scoreDivisor      the value by which reported scores are to be
511  *                          divided [in]
512  */
513 static ScorePtr
514 Kappa_GetScoreSetFromBlastHsp(
515   const BLAST_HSPPtr hsp,
516   Nlm_FloatHi lambda,
517   Nlm_FloatHi logK,
518   Nlm_FloatHi scoreDivisor)
519 {
520   ScorePtr      score_set = NULL;       /* the new score set */
521   int          score;          /* the score, scaled using scoreDivisor */
522   Nlm_FloatHi   bit_score;      /* the integer-valued score, in bits */
523   Nlm_FloatHi   evalue;         /* the e-value, with numbers too close to zero
524                                    set to zero */
525 
526   score = Nlm_Nint(((Nlm_FloatHi) hsp->score) / scoreDivisor);
527   if (score > 0) {
528     MakeBlastScore(&score_set, "score", 0.0, score);
529   }
530 
531   evalue = (hsp->evalue >= 1.0e-180) ? hsp->evalue : 0;
532   if (hsp->num > 1) {
533     MakeBlastScore(&score_set, "sum_n", 0.0, hsp->num);
534     MakeBlastScore(&score_set, "sum_e", evalue, 0);
535     if( hsp->ordering_method == BLAST_SMALL_GAPS) {
536       MakeBlastScore(&score_set, "small_gap", 0.0, 1);
537     }
538   } else {
539     MakeBlastScore(&score_set, "e_value", evalue, 0);
540   }
541 
542   /* Compute the bit score using the newly computed scaled score. */
543   bit_score = (score*lambda*scoreDivisor - logK)/NCBIMATH_LN2;
544   if (bit_score >= 0.) {
545     MakeBlastScore(&score_set, "bit_score", bit_score, 0);
546   }
547 
548   if (hsp->ordering_method > 3) {
549     /* In new tblastn this means splice junction was found */
550     MakeBlastScore(&score_set, "splice_junction", 0.0, 1);
551   }
552 
553   if (hsp->num_ident > 0) {
554     MakeBlastScore(&score_set, "num_ident", 0.0, hsp->num_ident);
555   }
556 
557   MakeBlastScore(&score_set, "comp_adjustment_method",0.0,
558                  hsp->comp_adjustment_method);
559   return score_set;
560 }
561 
562 
563 /**
564  * Converts a collection of alignments represented by a BLAST_HitList
565  * into a collection of instances of SeqAlign. Returns the new
566  * collection.
567  *
568  * @param hitlist           the hitlist to convert to SeqAligns [in]
569  * @param subject_id        the identifier of the subject sequence [in]
570  * @param query_id          the identifier of the query sequence [in]
571  * @param queryOrigin       origin of this specific query in the
572  *                          concatenated query [in]
573  * @param queryLength       length of this specific query [in]
574  * @param lambda            Karlin-Altschul statistical parameter [in]
575  * @param logK              Karlin-Altschul statistical parameter [in]
576  * @param scoreDivisor      value by which reported scores are to be
577  *                          divided [in]
578  * @param bestEvalue)       the best (smallest) e-value of all the alignments
579  *                          in the hitlist [out]
580  */
581 static SeqAlignPtr
582 Kappa_SeqAlignsFromHitlist(
583   BLAST_HitListPtr hitlist,
584   SeqIdPtr subject_id,
585   SeqIdPtr query_id,
586   int queryOrigin,
587   int queryLength,
588   Nlm_FloatHi lambda,
589   Nlm_FloatHi logK,
590   Nlm_FloatHi scoreDivisor)
591 {
592   SeqAlignPtr aligns = NULL;  /* list of SeqAligns to be returned */
593   int        hsp_index;
594 
595   for( hsp_index = hitlist->hspcnt - 1; hsp_index >= 0; hsp_index-- ) {
596     /* iterate in reverse order over all HSPs in the hitlist */
597     BLAST_HSPPtr hsp;           /* HSP for this iteration */
598     SeqAlignPtr seqAlign;       /* the new SeqAlign */
599 
600     hsp      = hitlist->hsp_array[hsp_index];
601 
602     /* Shift the edit block to query local coordinates */
603     hsp->gap_info->start1           -= queryOrigin;
604     hsp->gap_info->length1          -= queryOrigin;
605     hsp->gap_info->original_length1  = queryLength;
606 
607     seqAlign = GapXEditBlockToSeqAlign(hsp->gap_info, subject_id, query_id);
608     seqAlign->score =
609         Kappa_GetScoreSetFromBlastHsp(hsp, lambda, logK, scoreDivisor);
610 
611     seqAlign->next  = aligns;
612     aligns          = seqAlign;
613   }
614   return aligns;
615 }
616 
617 
618 /**
619  * Adjusts the E-values in a BLAST_HitList to be composites of
620  * a composition-based P-value and a score/alignment-based P-value
621  *
622  * @param hitlist           the hitlist whose E-values need to be adjusted
623  * @param comp_p_value      P-value from sequence composition
624  * @param search            is the structure with all the information about
625  *                          the search
626  * @param LambdaRatio       the ratio between the observed value of Lambda
627  *                          and the predicted value of lambda (used to print
628  *                          diagnostics)
629  * @param subject_id        the subject id of this sequence (used to print
630  *                          diagnostics)
631  **/
632 static void
633 Kappa_AdjustEvaluesForComposition(
634   BLAST_HitListPtr hitlist,
635   Nlm_FloatHi comp_p_value,
636   BlastSearchBlkPtr search,
637   Nlm_FloatHi LambdaRatio,
638   int subject_id)
639 {
640   int        hsp_index;
641 
642   for (hsp_index = 0;  hsp_index < hitlist->hspcnt;  hsp_index++) {
643     /* for all HSPs */
644     BLAST_HSPPtr hsp;         /* HSP for this iteration */
645     double align_p_value;     /* P-value for the alignment score */
646     double combined_p_value;  /* combination of two P-values */
647     double old_e_value;       /* alignment E-value */
648     double db_to_sequence_scale;  /* scale factor for converting a database
649                                      e-value to a sequence e-value */
650 
651     hsp = hitlist->hsp_array[hsp_index];
652     old_e_value = hsp->evalue;
653     /* Convert the database E-value to the sequence E-value */
654     db_to_sequence_scale =
655       (MAX((search->subject->length - search->length_adjustment), 1.0))/
656       search->dblen_eff;
657     hsp->evalue *= db_to_sequence_scale;
658 
659     align_p_value = BlastKarlinEtoP(hsp->evalue);
660     combined_p_value = Blast_Overall_P_Value(comp_p_value,align_p_value);
661     hsp->evalue = BlastKarlinPtoE(combined_p_value);
662     hsp->evalue /= db_to_sequence_scale;
663 
664     if (KAPPA_PRINT_DIAGNOSTICS) {
665       int    sequence_gi; /*GI of a sequence*/
666       int    context_index; /* query context index of this HSP */
667       context_index = hsp->context;
668       
669       if (search->rdfp) {
670         SeqIdPtr sip; /*used to extract sequence from database*/
671         char    *sname = NULL; /*string for defline*/
672         char    *buff = NULL; /*string for defline*/
673         char    *string_descriptor = NULL; /*string for defline*/
674 
675         sname = (char *) malloc(SeqIdBufferSize * sizeof(Char));
676         buff = (char *) malloc(SeqIdBufferSize * sizeof(Char));
677         string_descriptor =
678           (char *) malloc(SeqIdBufferSize * sizeof(Char));
679         sip = NULL;
680         readdb_get_defline(search->rdfp,subject_id,&sname);
681         readdb_get_descriptor(search->rdfp, subject_id, &sip,
682                               &string_descriptor);
683         if (sip)
684           SeqIdWrite(sip, buff, PRINTID_FASTA_LONG, sizeof(buff));
685         (void) GetAccessionFromSeqId(SeqIdFindBest(sip,SEQID_GI),
686                                      &sequence_gi, &sname);
687         sip = SeqIdSetFree(sip);
688         free(string_descriptor);
689         free(buff);
690         free(sname);
691       } else {
692         sequence_gi = (-1);
693       }
694       printf("GI %d Lambda ratio %e comp. p-value %e; "
695              "adjust E-value of query length %d match length "
696              "%d from %e to %e\n",
697              sequence_gi, LambdaRatio, comp_p_value,
698              search->context[context_index].query->length,
699              search->subject->length, old_e_value, hsp->evalue);
700     } /* end if (KAPPA_PRINT_DIAGNOSTICS) */
701   } /* end for all HSPs */
702 }
703 
704 /**
705  * Converts a list of objects of type BlastCompo_Alignment to an
706  * new object of type BLAST_HitList and returns the result. Conversion
707  * in this direction is lossless.  The list passed to this routine is
708  * freed to ensure that there is no aliasing of fields between the
709  * list of BlastCompo_Alignments and the new hitlist.
710  *
711  * @param search            general search information
712  * @param alignments        a list of distinct alignments
713  */
714 static BLAST_HitListPtr
715 Kappa_SortedHitlistFromAligns(
716   BlastSearchBlkPtr search,
717   BlastCompo_Alignment ** alignments)
718 {
719   /* The context of the query is always zero in current
720    * implementations of RedoAlignmentCore. */
721   const Int2 context = 0;
722 
723   BLAST_HitListPtr hitlist;             /* the new hitlist */
724   BlastCompo_Alignment * align;      /* represents the current
725                                          * alignment in loops */
726   if(search->current_hitlist != NULL) {
727     search->current_hitlist->hspcnt_max = search->current_hitlist->hspcnt;
728     BlastHitListPurge(search->current_hitlist);
729   } else {
730     search->current_hitlist =  BlastHitListNew(search);
731   }
732   hitlist = search->current_hitlist;
733   hitlist->do_not_reallocate = FALSE;
734 
735   for(align = *alignments; align != NULL; align = align->next) {
736     BLAST_HSPPtr hsp;           /* the new HSP for this alignment */
737     /* queryExtent and matchExtent represent the extent of the
738        alignment in the query and subject sequences respectively. */
739     int queryExtent = align->queryEnd - align->queryStart;
740     int matchExtent = align->matchEnd - align->matchStart;
741 
742     BlastSaveCurrentHsp(search, align->score, align->queryStart,
743                         align->matchStart, matchExtent, context);
744 
745     hsp = hitlist->hsp_array[hitlist->hspcnt - 1];
746 
747     hsp->query.length   = queryExtent;
748     hsp->query.end      = align->queryEnd;
749 
750     hsp->subject.length = matchExtent;
751     hsp->subject.end    = align->matchEnd;
752 
753     hsp->subject.frame  = (Int2) align->frame;
754     hsp->evalue         = 0.0;  /* E-values are computed after the full
755                                    hitlist has been created. */
756     hsp->gap_info       = (GapXEditBlockPtr) align->context;
757     switch (align->matrix_adjust_rule) {
758     case eDontAdjustMatrix:
759         hsp->comp_adjustment_method = eNoCompositionBasedStats;
760       break;
761     case eCompoScaleOldMatrix:
762         hsp->comp_adjustment_method = eCompositionBasedStats;
763       break;
764     default:
765         hsp->comp_adjustment_method = eCompositionMatrixAdjust;
766       break;
767     }
768     /* Break the aliasing between align->context and hsp->gap_info. */
769     align->context = NULL;
770   }
771   BlastCompo_AlignmentsFree(alignments, NULL);
772   if (hitlist->hspcnt > 0) {
773     HeapSort(hitlist->hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr),
774              score_compare_hsps);
775   }
776   return hitlist;
777 }
778 
779 
780 /**
781  * Calculate expect values for a list of alignments and remove those
782  * that don't meet an e-value cutoff from the list.
783  *
784  * @param *pbestScore      the largest score observed in the list
785  * @param *pbestEvalue     the best (smallest) e-value observed in the
786  *                         list
787  * @param full_subject_length   the untranslated length of the subject
788  *                              sequence
789  * @param search           contains search parameters used to compute
790  *                         e-values and the e-value cutoff
791  * @param do_link_hsps     use hsp linking when computing e-values
792  * @param pvalueForThisPair  composition p-value
793  * @param LambdaRatio        lambda ratio, if available
794  * @param subject_id         index of subject
795  */
796 static void
797 Kappa_HitlistEvaluateAndPurge(int * pbestScore, double *pbestEvalue,
798                              int full_subject_length,
799                              BlastSearchBlkPtr search,
800                              int do_link_hsps,
801                              double pvalueForThisPair,
802                              double LambdaRatio,
803                              int subject_id)
804 {
805   BLAST_HitListPtr hitlist; /* a hitlist containing the newly-computed
806                                    * alignments */
807   Nlm_FloatHi bestEvalue;   /* best e-value among alignments in the
808                                      hitlist */
809   int bestScore;           /* best score among alignments in the
810                                hitlist */
811   int hsp_index;           /* index of the current HSP */
812 
813   hitlist = search->current_hitlist;
814   search->subject->length  = full_subject_length;
815   if (do_link_hsps) {
816     BlastLinkHsps(search);
817   } else {
818     BlastGetNonSumStatsEvalue(search);
819   }
820   if ((0 <= pvalueForThisPair) && (pvalueForThisPair <= 1))
821     Kappa_AdjustEvaluesForComposition(hitlist, pvalueForThisPair,
822                                       search, LambdaRatio, subject_id);
823   BlastReapHitlistByEvalue(search);
824   /* Find the e-value of the best alignment in the list -- the list
825    * is typically sorted by score and not by e-value, so a search is
826    * necessary. */
827   bestEvalue = DBL_MAX;
828   bestScore  = 0;
829   for (hsp_index = 0;  hsp_index < hitlist->hspcnt;  hsp_index++) {
830     if( hitlist->hsp_array[hsp_index]->evalue < bestEvalue ) {
831       bestEvalue = hitlist->hsp_array[hsp_index]->evalue;
832     }
833     if( hitlist->hsp_array[hsp_index]->score > bestScore ) {
834       bestScore = hitlist->hsp_array[hsp_index]->score;
835     }
836   }
837   *pbestScore  = bestScore;
838   *pbestEvalue = bestEvalue;
839 }
840 
841 
842 /**
843  * A callback routine: compute lambda for the given score frequencies.
844  * (@sa calc_lambda_type).
845  */
846 static double
847 Kappa_CalcLambda(double probs[], int min_score, int max_score, double lambda0)
848 {
849   int i, n;
850   double avg;
851   BLAST_ScoreFreq freq;
852 
853   n = max_score - min_score + 1;
854   avg = 0.0;
855   for (i = 0;  i < n;  i++) {
856     avg += (min_score + i) * probs[i];
857   }        
858   freq.obs_min = min_score;
859   freq.obs_max = max_score;
860   freq.sprob = &probs[-min_score];
861   freq.score_avg = avg;
862   
863   return impalaKarlinLambdaNR(&freq, lambda0);
864 }
865 
866 
867 /**
868  * Get a matrix of the frequency ratios that underlie the score
869  * matrix being used on this pass. The returned matrix is
870  * position-specific, so if we are in the first pass, use query to
871  * convert the 20x20 standard matrix into a position-specific
872  * variant. matrixName is the name of the underlying 20x20 score
873  * matrix used. numPositions is the length of the query;
874  * startNumerator is the matrix of frequency ratios as stored in
875  * posit.h. It needs to be divided by the frequency of the second
876  * character to get the intended ratio
877  *
878  * @param returnRatios      an allocated matrix to hold the frequency
879  *                          ratios [out]
880  * @param search            general search information [in]
881  * @param query             the query sequence [in]
882  * @param matrixName        name of the underlying matrix [in]
883  * @param startNumerator    matrix of frequency ratios [in]
884  * @param numPositions      length of the query [in]
885  * @param positionSpecific  is this a position-specific search? [in]
886  */
887 static void
888 Kappa_GetStartFreqRatios(Nlm_FloatHi ** returnRatios,
889                    BlastSearchBlkPtr search,
890                    Uint1Ptr query,
891                    const char *matrixName,
892                    Nlm_FloatHi **startNumerator,
893                    int numPositions,
894                    Boolean positionSpecific)
895 {
896    int i,j;
897    FreqRatios * stdFreqRatios = NULL;
898    /* a small cutoff used to determine whether it is necessary
899     * to reverse the multiplication done in posit.c */
900    const double posEpsilon = 0.0001;
901 
902    stdFreqRatios = PSIMatrixFrequencyRatiosNew(matrixName);
903    if (positionSpecific) {
904     for(i = 0; i < numPositions; i++) {
905        for(j = 0; j < PROTEIN_ALPHABET; j++) {
906          returnRatios[i][j] = stdFreqRatios->data[query[i]][j];
907        }
908      }
909    } else {
910      for (i = 0; i < PROTEIN_ALPHABET; i++) {
911        for (j = 0; j < PROTEIN_ALPHABET; j++) {
912          returnRatios[i][j] = stdFreqRatios->data[i][j];
913        }
914      }
915    }
916    stdFreqRatios = PSIMatrixFrequencyRatiosFree(stdFreqRatios);
917 
918    if (positionSpecific) {
919      Nlm_FloatHi *standardProb; /*probabilities of each letter*/
920      BLAST_ResFreqPtr stdrfp; /* gets standard frequencies in prob field */
921 
922      stdrfp = BlastResFreqNew(search->sbp);
923      BlastResFreqStdComp(search->sbp,stdrfp);
924      standardProb = &stdrfp->prob[0];
925 
926      /*reverse multiplication done in posit.c*/
927      for(i = 0; i < numPositions; i++) {
928        for(j = 0; j < PROTEIN_ALPHABET; j++) {
929          if ((standardProb[query[i]] > posEpsilon) &&
930              (standardProb[j] > posEpsilon) &&
931              (j != eStopChar) && (j != Xchar) &&
932              (startNumerator[i][j] > posEpsilon))
933          {
934            returnRatios[i][j] = startNumerator[i][j]/standardProb[j];
935          }
936        }
937      }
938      stdrfp = BlastResFreqDestruct(stdrfp);
939    }
940 }
941 
942 
943 /** SCALING_FACTOR is a multiplicative factor used to get more bits of
944  * precision in the integer matrix scores. It cannot be arbitrarily
945  * large because we do not want total alignment scores to exceed
946  * -(INT4_MIN) */
947 #define SCALING_FACTOR 32
948 
949 
950 /**
951  * produce a scaled-up version of the position-specific matrix
952  * starting from posFreqs
953  *
954  * @param fillPosMatrix     is the matrix to be filled
955  * @param nonposMatrix      is the underlying position-independent matrix,
956  *                          used to fill positions where frequencies are
957  *                          irrelevant
958  * @param matrixName        name of the position-independent matrix
959  * @param posFreq           frequency ratios for the position-dependent
960  *                          matrix
961  * @param query             query sequence data
962  * @param queryLength       length of the query
963  * @param sbp               stores various parameters of the search
964  */
965 static void
966 Kappa_ScalePosMatrix(BLAST_Score **fillPosMatrix, BLAST_Score **nonposMatrix,
967                      const Char *matrixName, Nlm_FloatHi **posFreqs,
968                      Uint1 *query, int queryLength, BLAST_ScoreBlkPtr sbp,
969                      Nlm_FloatHi localScalingFactor)
970 {
971 
972      posSearchItems *posSearch; /*used to pass data into scaling routines*/
973      compactSearchItems *compactSearch; /*used to pass data into scaling routines*/
974      int i; /*loop index*/
975      BLAST_ResFreqPtr stdrfp; /* gets standard frequencies in prob field */
976      int a; /*index over characters*/
977 
978 
979      posSearch = (posSearchItems *) MemNew (1 * sizeof(posSearchItems));
980      compactSearch = (compactSearchItems *) MemNew (1 * sizeof(compactSearchItems));
981      posSearch->posMatrix = (BLAST_Score **) MemNew((queryLength + 1) * sizeof(BLAST_Score *));
982      posSearch->posPrivateMatrix = fillPosMatrix;
983      posSearch->posFreqs = posFreqs;
984      posSearch->stdFreqRatios = PSIMatrixFrequencyRatiosNew(matrixName);
985      for(i = 0; i <= queryLength; i++) 
986        posSearch->posMatrix[i] = (BLAST_Score *) MemNew(PROTEIN_ALPHABET * sizeof(BLAST_Score));
987 
988      compactSearch->query = (Uint1Ptr) query;
989      compactSearch->qlength = queryLength;
990      compactSearch->alphabetSize = PROTEIN_ALPHABET;
991      compactSearch->gapped_calculation = TRUE;
992      compactSearch->matrix = nonposMatrix;
993      compactSearch->lambda =  sbp->kbp_gap_std[0]->Lambda;
994      compactSearch->kbp_std = sbp->kbp_std;
995      compactSearch->kbp_psi = sbp->kbp_psi;
996      compactSearch->kbp_gap_psi = sbp->kbp_gap_psi;
997      compactSearch->kbp_gap_std = sbp->kbp_gap_std;
998      compactSearch->lambda_ideal = sbp->kbp_ideal->Lambda;
999      compactSearch->K_ideal = sbp->kbp_ideal->K;
1000 
1001      stdrfp = BlastResFreqNew(sbp);
1002      BlastResFreqStdComp(sbp,stdrfp); 
1003      compactSearch->standardProb = MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
1004      for(a = 0; a < compactSearch->alphabetSize; a++)
1005        compactSearch->standardProb[a] = stdrfp->prob[a];
1006      stdrfp = BlastResFreqDestruct(stdrfp);
1007 
1008      posFreqsToMatrix(posSearch,compactSearch);
1009      impalaScaling(posSearch, compactSearch, localScalingFactor, FALSE);
1010 
1011      for(i = 0; i <= queryLength; i++)
1012        MemFree(posSearch->posMatrix[i]);
1013 
1014      MemFree(compactSearch->standardProb);
1015      MemFree(posSearch->posMatrix);
1016      PSIMatrixFrequencyRatiosFree(posSearch->stdFreqRatios);
1017      MemFree(posSearch);
1018      MemFree(compactSearch);
1019 }
1020 
1021 
1022 /** Convert an array of HSPs into a list of BlastCompo_Alignment
1023  * objects.
1024  *
1025  * @param search              search parameters
1026  * @param hsp_array           existing array of HSPs
1027  * @param hspcnt              length of hsp_array
1028  * @param localScalingFactor  factor by which scores are scaled in the
1029  *                            new list of alignments
1030  * @return                    the new list of alignments.
1031  */
1032 static BlastCompo_Alignment * 
1033 Kappa_ResultHspToDistinctAlign(BlastSearchBlkPtr search,
1034                                BLASTResultHsp hsp_array[], int hspcnt,
1035                                double localScalingFactor)
1036 {
1037   BlastCompo_Alignment *aligns;  /* list of alignments to be returned */ 
1038   BlastCompo_Alignment **tail;   /* next location to store a pointer to
1039                                        a new alignment */
1040   int hsp_index;                    /* loop index */
1041 
1042   aligns = NULL;
1043   tail = &aligns;
1044   for (hsp_index = 0;  hsp_index < hspcnt;  hsp_index++) {
1045     BlastCompo_Alignment *new_align;  /* a new alignment */
1046     int queryIndex, queryEnd, matchEnd;
1047     BLASTResultHsp * hsp = &hsp_array[hsp_index];
1048     queryEnd = hsp->query_offset + hsp->query_length;
1049     matchEnd = hsp->subject_offset + hsp->subject_length;
1050     if(search->mult_queries != NULL) {
1051       queryIndex = GetQueryNum(search->mult_queries,
1052                                hsp->query_offset, queryEnd - 1, 0);
1053     } else {
1054       queryIndex = 0;
1055     }
1056     new_align =
1057       BlastCompo_AlignmentNew((int) (hsp->score * localScalingFactor),
1058                                  eDontAdjustMatrix,
1059                                  hsp->query_offset, queryEnd,
1060                                  queryIndex, hsp->subject_offset,
1061                                  matchEnd, hsp->subject_frame, hsp);
1062     if (new_align == NULL) {
1063       ErrPostEx(SEV_FATAL, E_NoMemory, 0, 
1064                 "Failed to allocate a new alignment");
1065     }
1066     *tail = new_align;
1067     tail = &new_align->next;
1068   }
1069   return aligns;
1070 }
1071 
1072 
1073 /**
1074  * Redo a S-W alignment using an x-drop alignment.  The result will
1075  * usually be the same as the S-W alignment. The call to ALIGN
1076  * attempts to force the endpoints of the alignment to match the
1077  * optimal endpoints determined by the Smith-Waterman algorithm.
1078  * ALIGN is used, so that if the data structures for storing BLAST
1079  * alignments are changed, the code will not break
1080  *
1081  * @param query         the query data
1082  * @param queryStart    start of the alignment in the query sequence
1083  * @param queryEnd      end of the alignment in the query sequence,
1084  *                      as computed by the Smith-Waterman algorithm
1085  * @param subject       the subject (database) sequence
1086  * @param matchStart    start of the alignment in the subject sequence
1087  * @param matchEnd      end of the alignment in the query sequence,
1088  *                      as computed by the Smith-Waterman algorithm
1089  * @param frame         translation frame of the database sequence (0
1090  *                      if not translated)
1091  * @param gap_align     parameters for a gapped alignment
1092  * @param score         score computed by the Smith-Waterman algorithm
1093  */
1094 static void
1095 Kappa_SWFindFinalEndsUsingXdrop(
1096   BlastCompo_SequenceData * query,
1097   int queryStart,
1098   int queryEnd,
1099   BlastCompo_SequenceData * subject,
1100   int matchStart,
1101   int matchEnd,
1102   int frame,
1103   GapAlignBlkPtr gap_align,
1104   int score)
1105 {
1106   int XdropAlignScore;         /* alignment score obtained using X-dropoff
1107                                  * method rather than Smith-Waterman */
1108   int doublingCount = 0;       /* number of times X-dropoff had to be
1109                                  * doubled */
1110   int *alignScript, *dummy;    /* the alignment script that will be
1111                                    generated below by the ALIGN
1112                                    routine. */
1113   GapXEditBlockPtr editBlock;   /* traceback info for this alignment */
1114   /* Extent of the alignment as computed by an x-drop alignment
1115    * (usually the same as (queryEnd - queryStart) and (matchEnd -
1116    * matchStart)) */
1117   int queryExtent, matchExtent;
1118 
1119   gap_align->query_start = queryStart;
1120   gap_align->subject_start = matchStart;
1121   do {
1122     alignScript =
1123       (int *) MemNew((subject->length + query->length + 3) * sizeof(int));
1124 
1125     XdropAlignScore =
1126       ALIGN(&query->data[queryStart - 1], &subject->data[matchStart - 1],
1127             queryEnd - queryStart + 1, matchEnd - matchStart + 1,
1128             alignScript, &queryExtent, &matchExtent, &dummy,
1129             gap_align, queryStart - 1, FALSE);
1130     gap_align->score = XdropAlignScore;
1131     gap_align->query_stop = gap_align->query_start + queryExtent - 1;
1132     gap_align->subject_stop = gap_align->subject_start + matchExtent - 1;
1133 
1134     gap_align->x_parameter *= 2;
1135     doublingCount++;
1136     if((XdropAlignScore < score) && (doublingCount < 3)) {
1137       MemFree(alignScript);
1138     }
1139   } while((XdropAlignScore < score) && (doublingCount < 3));
1140   editBlock =
1141     TracebackToGapXEditBlock(NULL, NULL, queryExtent, matchExtent,
1142                              alignScript, queryStart, matchStart);
1143   alignScript = MemFree(alignScript);
1144   editBlock->length1          = query->length;
1145   editBlock->length2          = subject->length;
1146   editBlock->discontinuous    = gap_align->discontinuous;
1147   editBlock->translate2       = gap_align->translate2;
1148   editBlock->frame2           = (Int2) frame;
1149 
1150   gap_align->edit_block = editBlock;
1151 }
1152 
1153 
1154 typedef struct Kappa_SequenceLocalData {
1155   Uint1         prog_number;    /**< identifies the type of blast search being
1156                                      performed. The type of search determines
1157                                      how sequence data should be obtained. */
1158   CharPtr       genetic_code;   /**< genetic code for translated searches */
1159   BioseqPtr     bsp_db;         /**< An object that represents this sequence
1160                                    in low level routines. */
1161   Boolean       bioseq_needs_unlock;    /**< true if the bsp_db must be
1162                                            disposed of by a call to
1163                                            BioseqUnlock, rather than
1164                                            BioseqFree */
1165   ReadDBFILEPtr rdfp;           /**< A pointer to a database from which
1166                                      sequences may be obtained */
1167 } Kappa_SequenceLocalData;
1168 
1169 
1170 /**
1171  * Initialize a new matching sequence, obtaining information about the
1172  * sequence from the search.
1173  *
1174  * @param self              object to be initialized
1175  * @param search            search information
1176  * @param subject_index     index of the matching sequence in the database
1177  */
1178 static void
1179 Kappa_MatchingSequenceInitialize(
1180   BlastCompo_MatchingSequence * self,
1181   BlastSearchBlkPtr search,
1182   int subject_index)
1183 {
1184   Kappa_SequenceLocalData * local_data;
1185   
1186   local_data = self->local_data = MemNew(sizeof(Kappa_SequenceLocalData));
1187   self->index = subject_index;
1188   
1189   local_data->prog_number  = search->prog_number;
1190   local_data->rdfp         = search->rdfp;
1191   if(local_data->prog_number ==  blast_type_tblastn) {
1192     local_data ->genetic_code = search->db_genetic_code;
1193   } else {
1194     /* the sequence will not be translated */
1195     local_data ->genetic_code = NULL;
1196   }
1197   if(local_data->rdfp) {
1198     local_data->rdfp->parameters |= READDB_KEEP_HDR_AND_SEQ;
1199     local_data->bsp_db = readdb_get_bioseq(local_data->rdfp, self->index );
1200     local_data->bioseq_needs_unlock = FALSE;
1201   } else {
1202     local_data->bsp_db = BioseqLockById(search->subject_info->sip);
1203     local_data->bioseq_needs_unlock = TRUE;
1204   }
1205   self->length = local_data->bsp_db->length;
1206 }
1207 
1208 
1209 /** Release the resources associated with a matching sequence. */
1210 static void
1211 Kappa_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)
1212 {
1213   Kappa_SequenceLocalData * local_data = self->local_data;
1214   if(local_data->bsp_db) {
1215     if(local_data->bioseq_needs_unlock) {
1216       BioseqUnlock(local_data->bsp_db);
1217       local_data->bsp_db = NULL;
1218     } else {
1219       local_data->bsp_db = BioseqFree(local_data->bsp_db);
1220     }
1221   } 
1222   MemFree(local_data);
1223   self->local_data = NULL;
1224 }
1225 
1226 #define MINUMUM_FRACTION_NEAR_IDENTICAL 0.98
1227 
1228 /**
1229  * Test whether the aligned parts of two sequences that
1230  * have a high-scoring gapless alignment are nearly identical
1231  *
1232  * @param seqData           subject sequence
1233  * @param queryData         query sequence
1234  * @param queryOffset   offset for align if there are multiple queries
1235  * @param align             information about the alignment
1236  * @param rangeOffset       offset for subject sequence (used for tblastn)
1237  *
1238  * @return                  TRUE if the aligned portions are nearly identical
1239  */
1240 static Boolean testNearIdentical(const BlastCompo_SequenceData *seqData, 
1241                                  const BlastCompo_SequenceData *queryData, 
1242                                  const int queryOffset,
1243                                  const BlastCompo_Alignment *align,
1244                                  const int rangeOffset)
1245 {
1246   int numIdentical = 0;
1247   double fractionIdentical;
1248   int qPos, sPos; /*positions in query and subject;*/
1249   int qEnd; /*end of query*/
1250 
1251   qPos = align->queryStart - queryOffset;
1252   qEnd = align->queryEnd - queryOffset;
1253   sPos = align->matchStart - rangeOffset;
1254   while (qPos < qEnd)  {
1255       if (queryData->data[qPos] == seqData->data[sPos])
1256         numIdentical++;
1257       sPos++;
1258       qPos++;
1259   }
1260   fractionIdentical = ((double) numIdentical/
1261   (double) (align->queryEnd - align->queryStart));
1262   if (fractionIdentical >= MINUMUM_FRACTION_NEAR_IDENTICAL)
1263     return(TRUE);
1264   else
1265     return(FALSE);
1266 }
1267 
1268 
1269 /** Character to use for masked residues */
1270 #define BLASTP_MASK_RESIDUE eXchar
1271 /** Default instructions and mask residue for SEG filtering */
1272 #define BLASTP_MASK_INSTRUCTIONS "S 10 1.8 2.1"
1273 
1274 
1275 /**
1276  * Obtain a string of translated data
1277  *
1278  * @param self          the sequence from which to obtain the data [in]
1279  * @param range         the range, in amino acid coordinates, of data
1280  *                      to get; includes the translation frame [in]
1281  * @param seqData       the resulting data [out]
1282  * @param queryData     the query sequence [in]
1283  * @param queryOffset   offset for align if there are multiple queries
1284  * @param align          information about the alignment between query and subject
1285  * @param shouldTestIdentical did alignment pass a preliminary test in
1286  *                       redo_alignment.c that indicates the sequence
1287  *                        pieces may be near identical
1288  */
1289 static void
1290 Kappa_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,
1291                                  const BlastCompo_SequenceRange * range,
1292                                  BlastCompo_SequenceData * seqData,
1293                                  const BlastCompo_SequenceData * queryData,
1294                                  const int queryOffset,
1295                                  const BlastCompo_Alignment *align,
1296                                  const Boolean shouldTestIdentical)
1297 {
1298   int i;
1299   int nucleotide_start;        /* position of the first nucleotide to be
1300                                  * translated */
1301   int num_nucleotides;         /* number of nucleotides to translate */
1302   Int2 translation_frame = (Int2) range->context;
1303   Kappa_SequenceLocalData * local_data = self->local_data;
1304 
1305   nucleotide_start   = 3 * range->begin;
1306   num_nucleotides    =
1307     3 * (range->end - range->begin) + ABS(translation_frame) - 1;
1308   { /* scope of nucleotide_data */
1309     Uint1Ptr nucleotide_data = MemNew((num_nucleotides + 1) * sizeof(Uint1));
1310     { /* scope of spp */
1311       SeqPortPtr spp;       /* a SeqPort used to read the sequence data */
1312       Uint1 strand;         /* a flag indicating which strand to read */
1313       Uint1 nucleotide;     /* an individual nucleotide */
1314 
1315       if (translation_frame >= 0) {
1316           strand = Seq_strand_plus;
1317       } else {
1318           strand = Seq_strand_minus;
1319       }
1320       spp    = SeqPortNew(local_data->bsp_db, FIRST_RESIDUE, LAST_RESIDUE,
1321                           strand, Seq_code_ncbi4na);
1322       SeqPortSeek(spp, nucleotide_start, SEEK_SET);
1323 
1324       for(i = 0;
1325           i < num_nucleotides &&
1326             (nucleotide = SeqPortGetResidue(spp)) != SEQPORT_EOF;
1327           i++ ) {
1328         /* for all nucleotides in the translation range */
1329         nucleotide_data[i] = nucleotide;
1330       }
1331 
1332       spp = SeqPortFree(spp);
1333     }  /* end scope of spp */
1334     seqData->buffer =
1335       GetTranslation(nucleotide_data, num_nucleotides, translation_frame,
1336                      &seqData->length, local_data->genetic_code);
1337     seqData->data = seqData->buffer + 1; /* This is a protein sequence,
1338                                             so the first byte is nil. */
1339     MemFree(nucleotide_data);
1340   } /* end scope of nucleotide_data */
1341 
1342 #ifndef KAPPA_NO_SEG_SEQUENCE_TBLASTN
1343   { /* scope of variables used for SEG filtering */
1344     BioseqPtr bsp_temp; /* a Bioseq for the translated sequence */
1345     ObjectIdPtr oip;    /* a unique ObjectId for the translated sequence */
1346     SeqLocPtr seg_slp;  /* a SeqLoc for SEG filtering */
1347     bsp_temp     = BlastMakeTempProteinBioseq(seqData->data, seqData->length,
1348                                               Seq_code_ncbistdaa);
1349     bsp_temp->id = SeqIdSetFree(bsp_temp->id);
1350 
1351     oip = (ObjectIdPtr) UniqueLocalId();
1352     ValNodeAddPointer(&(bsp_temp->id), SEQID_LOCAL, oip);
1353     SeqMgrAddToBioseqIndex(bsp_temp);
1354 
1355     seg_slp = BlastBioseqFilter(bsp_temp, BLASTP_MASK_INSTRUCTIONS);
1356     if (seg_slp) {
1357       HackSeqLocId(seg_slp, local_data->bsp_db->id);
1358       if ((!shouldTestIdentical) || 
1359             (shouldTestIdentical && 
1360              (!testNearIdentical(seqData, queryData, queryOffset, align, range->begin))))
1361          BlastMaskTheResidues(seqData->data, seqData->length,
1362                            BLASTP_MASK_RESIDUE, seg_slp, FALSE, 0);
1363       seg_slp = SeqLocSetFree(seg_slp);
1364     }
1365 
1366     SeqMgrDeleteFromBioseqIndex(bsp_temp);
1367     bsp_temp->id = SeqIdSetFree(bsp_temp->id);
1368     bsp_temp     = BioseqFree(bsp_temp);
1369   } /* end scope of variables used for SEG filtering */
1370 #endif
1371 }
1372 
1373 
1374 /**
1375  * Obtain the sequence data that lies within the given range.
1376  *
1377  * @param self          sequence information [in]
1378  * @param range         the range, in amino acid coordinates, of data
1379  *                      to get [in]
1380  * @param seqData       the sequence data obtained [out]
1381  * @param queryData     the query sequence [in]
1382  * @param queryOffset   offset for align if there are multiple queries
1383  * @param align          information about the alignment between query and subject
1384  * @param shouldTestIdentical did alignment pass a preliminary test in
1385  *                       redo_alignment.c that indicates the sequence
1386  *                        pieces may be near identical
1387  * 
1388  * @returns   always 0 (posts a fatal error on failure rather than
1389  *            returning an error code.)
1390  */
1391 static int
1392 Kappa_SequenceGetRange(
1393   const BlastCompo_MatchingSequence * self,
1394   const BlastCompo_SequenceRange * range,
1395   BlastCompo_SequenceData * seqData,
1396   const BlastCompo_SequenceData * queryData,
1397   const int queryOffset,
1398   const BlastCompo_Alignment *align,
1399   const Boolean shouldTestIdentical)
1400 {
1401   Kappa_SequenceLocalData * local_data = self->local_data;
1402   if(local_data->prog_number ==  blast_type_tblastn) {
1403     /* The sequence must be translated. */
1404     Kappa_SequenceGetTranslatedRange(self, range, seqData, queryData, 
1405                                      queryOffset,
1406                                      align, shouldTestIdentical);
1407   } else {
1408     /* The sequence does not need to be translated. */
1409     /* Obtain the entire sequence (necessary for SEG filtering.) */
1410     if(local_data->rdfp != NULL) {
1411       Uint1Ptr origData;        /* data obtained from readdb_get_sequence;
1412                                  * this data cannot be modified, so we copy
1413                                  * it. */
1414       int       idx;
1415       seqData->length    = readdb_get_sequence(local_data->rdfp, self->index,
1416                                                (Uint1Ptr PNTR) & origData );
1417       seqData->buffer    = MemNew((seqData->length + 2) * sizeof(Uint1));
1418       seqData->buffer[0] = '\0';
1419       seqData->data      = &seqData->buffer[1];
1420       for( idx = 0; idx < seqData->length; idx++ ) {
1421         /* Copy the sequence data, replacing occurrences of amino acid
1422          * number 24 (Selenocysteine) with number 21 (Undetermined or
1423          * atypical). */
1424         if(origData[idx] != eSelenocysteine) {
1425             seqData->data[idx] = origData[idx];
1426         } else {
1427           seqData->data[idx] = eXchar;
1428           fprintf(stderr, "Selenocysteine (U) at position %ld"
1429                   " replaced by X\n",
1430                   (long) idx + 1);
1431         }
1432       }
1433       seqData->data[seqData->length] = '\0';
1434     } else { /* self->rdfp is NULL */
1435       SeqPortPtr spp = NULL;      /* a SeqPort used to read the
1436                                      sequence data */
1437       Uint1      residue;         /* an individual residue */
1438       int       idx;
1439 
1440       seqData->length    = local_data->bsp_db->length;
1441       seqData->buffer    = MemNew((seqData->length + 2) * sizeof(Uint1));
1442       seqData->buffer[0] = '\0';
1443       seqData->data      = seqData->buffer + 1;
1444 
1445       spp =
1446         SeqPortNew(local_data->bsp_db, FIRST_RESIDUE, LAST_RESIDUE,
1447                    Seq_strand_unknown, Seq_code_ncbistdaa);
1448 
1449       idx = 0;
1450       while((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF) {
1451         if(IS_residue(residue)) {
1452           /* Replace occurrences of amino acid number 24
1453              (Selenocysteine) with number 21 (Undetermined or
1454              atypical). */
1455           if(residue == eSelenocysteine) {
1456             residue = eXchar;
1457             fprintf(stderr, "Selenocysteine (U) at position %ld"
1458                     " replaced by X\n",
1459                     (long) idx + 1);
1460           }
1461           seqData->data[idx++] = residue;
1462         }
1463       }
1464       seqData->data[idx] = 0;    /* terminate the string */
1465       spp = SeqPortFree(spp);
1466     } /* end else self->rdfp is NULL */
1467 
1468 #ifndef KAPPA_BLASTP_NO_SEG_SEQUENCE
1469     {
1470       SeqLocPtr seg_slp;  /*pointer to structure for SEG filtering*/
1471 
1472       seg_slp =
1473         BlastBioseqFilter(local_data->bsp_db, BLASTP_MASK_INSTRUCTIONS);
1474       if (seg_slp) {
1475       if ((!shouldTestIdentical) || 
1476             (shouldTestIdentical && 
1477              (!testNearIdentical(seqData, queryData, queryOffset, align, 0))))
1478             BlastMaskTheResidues(seqData->data, seqData->length,
1479                                  BLASTP_MASK_RESIDUE, seg_slp, FALSE, 0);
1480         seg_slp = SeqLocSetFree(seg_slp);
1481       }
1482     }
1483 #endif
1484     /* Fit the data to the range. */
1485     seqData ->data    = &seqData->data[range->begin - 1];
1486     *seqData->data++  = '\0';
1487     seqData ->length  = range->end - range->begin;
1488   } /* end else the sequence does not need to be translated */
1489   return 0;
1490 }
1491 
1492 
1493 /**
1494  * Converts an HSP, obtained from a blast search, to a GapAlignBlk that
1495  * is in a state in which it has all information necessary to redo the
1496  * computation of a traceback.
1497  *
1498  * @param gap_align     the GapAlignBlk to be modified  [out]
1499  * @param search        general information about the search [in]
1500  * @param hsp           the HSP to be converted [in]
1501  * @param queryOrigin   origin of the query participating in this
1502  *                      alignment within the concatenated query [in]
1503  * @param subject_range,       the subject_range used to compute the traceback,
1504  *                             in amino acid coordinates [in]
1505  * @param query,        the query data [in]
1506  * @param subject       the subject data [in]
1507  */
1508 static void
1509 Kappa_HitToGapAlign(
1510   GapAlignBlkPtr gap_align,
1511   BlastSearchBlkPtr search,
1512   BLAST_HSPPtr       hsp,
1513   int queryOrigin,
1514   BlastCompo_SequenceRange * subject_range,
1515   BlastCompo_SequenceData * query,
1516   BlastCompo_SequenceData * subject)
1517 {
1518   gap_align->query           = query->data;
1519   gap_align->query_length    = query->length;
1520   gap_align->query_frame     = 0;
1521 
1522   gap_align->subject         = subject->data;
1523   gap_align->subject_length  = subject->length;
1524   gap_align->subject_frame   = hsp->subject.frame;
1525 
1526   /* Shift the hsp to use query/subject local coordinates. */
1527   hsp->query.offset         -= queryOrigin;
1528   hsp->query.gapped_start   -= queryOrigin;
1529   hsp->query.end            -= queryOrigin;
1530   hsp->subject.offset       -= subject_range->begin;
1531   hsp->subject.gapped_start -= subject_range->begin;
1532   hsp->subject.end          -= subject_range->begin;
1533   if(CheckStartForGappedAlignment(search, hsp, gap_align->query,
1534                                    gap_align->subject, search->sbp->matrix)) {
1535     /* We may use the starting point supplied by the HSP. */
1536     gap_align->q_start = hsp->query.gapped_start;
1537     gap_align->s_start = hsp->subject.gapped_start;
1538   } else {
1539     /* We must recompute the start for the gapped alignment, as the
1540        one in the HSP was unacceptable.*/
1541     gap_align->q_start =
1542       GetStartForGappedAlignment(search, hsp, gap_align->query,
1543                                  gap_align->subject, search->sbp->matrix);
1544     gap_align->s_start =
1545       (hsp->subject.offset - hsp->query.offset) + gap_align->q_start;
1546   }
1547 }
1548 
1549 
1550 /**
1551  * Reads a GapAlignBlk that has been used to compute a traceback, and
1552  * return a BlastCompo_Alignment representing the alignment.
1553  *
1554  * @param gap_align         the GapAlignBlk
1555  * @param matrix_adjust_rule    the rule used to compute the scoring matrix
1556  * @param queryRange        range of the query data in the
1557  *                          concatenated query
1558  * @param ccat_query_length length of the concatenated query
1559  * @param subjectRange      range of the subject data in the full
1560  *                          database sequence, expressed in amino acid
1561  *                          coordinates
1562  * @param subjectLength     original. untranslated length of the
1563  *                          subject sequence
1564  */
1565 static BlastCompo_Alignment *
1566 Kappa_NewAlignFromGapAlign(
1567   GapAlignBlkPtr gap_align,
1568   EMatrixAdjustRule matrix_adjust_rule,
1569   BlastCompo_SequenceRange * query_range,
1570   int ccat_query_length,
1571   BlastCompo_SequenceRange * subject_range,
1572   int subjectLength)
1573 {
1574   int query_index, translation_frame;
1575   BlastCompo_Alignment * obj; /* the new alignment */
1576 
1577   /* The gap_align is in coordinates that are relative to the
1578      query/subject subject_range.  Shift to global coordinates. */
1579   int queryStart = gap_align->query_start   + query_range->begin;
1580   int queryEnd   = gap_align->query_stop    + query_range->begin   + 1;
1581   int matchStart = gap_align->subject_start + subject_range->begin;
1582   int matchEnd   = gap_align->subject_stop  + subject_range->begin + 1;
1583 
1584   if(gap_align->edit_block != NULL) {
1585     gap_align->edit_block->start1           += query_range->begin;
1586     gap_align->edit_block->length1          += query_range->begin;
1587     gap_align->edit_block->start2           += subject_range->begin;
1588     gap_align->edit_block->length2          += subject_range->begin;
1589     gap_align->edit_block->original_length1  = ccat_query_length;
1590     gap_align->edit_block->original_length2  = subjectLength;
1591   }
1592   query_index = query_range->context;
1593   translation_frame = subject_range->context;
1594   obj = BlastCompo_AlignmentNew(gap_align->score, matrix_adjust_rule,
1595                                    queryStart, queryEnd, query_index,
1596                                    matchStart, matchEnd,
1597                                    translation_frame,
1598                                    gap_align->edit_block);
1599   if (obj == NULL) {
1600       ErrPostEx(SEV_FATAL, E_NoMemory, 0,
1601                 "Failed to allocate a new alignment");
1602   }
1603   gap_align->edit_block = NULL; /* set to NULL to avoid aliasing */
1604   
1605   return obj;
1606 }
1607 
1608 
1609 /** A callback used when performing SmithWaterman alignments:
1610  * Calculate the traceback for one alignment by performing an x-drop
1611  * alignment in the forward direction, possibly increasing the x-drop
1612  * parameter until the desired score is attained.  
1613  *
1614  * The start, end and score of the alignment should be obtained
1615  * using the Smith-Waterman algorithm before this routine is called.
1616  *
1617  * @param *palign          the new alignment
1618  * @param *pqueryEnd       on entry, the end of the alignment in the
1619  *                         query, as computed by the Smith-Waterman
1620  *                         algorithm.  On exit, the end as computed by
1621  *                         the x-drop algorithm
1622  * @param *pmatchEnd       like as *pqueryEnd, but for the subject
1623  *                         sequence
1624  * @param queryStart       the starting point in the query
1625  * @param matchStart       the starting point in the subject
1626  * @param score            the score of the alignment, as computed by
1627  *                         the Smith-Waterman algorithm
1628  * @param query            query sequence data
1629  * @param query_range      range of this query in the concatenated
1630  *                         query
1631  * @param ccat_query_length   total length of the concatenated query
1632  * @param subject          subject sequence data
1633  * @param subject_range    range of subject_data in the translated
1634  *                         query, in amino acid coordinates
1635  * @param full_subject_length   length of the full subject sequence
1636  * @param gapping_params        parameters used to compute gapped
1637  *                              alignments
1638  * @param matrix_adjust_rule    the rule used to compute the scoring matrix
1639  * @returns 0   (posts a fatal error if it fails)
1640  * @sa new_xdrop_align_type
1641  */
1642 static int
1643 Kappa_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,
1644                              int * pqueryEnd, int *pmatchEnd,
1645                              int queryStart, int matchStart, int score,
1646                              BlastCompo_SequenceData * query,
1647                              BlastCompo_SequenceRange * query_range,
1648                              int queryLength,
1649                              BlastCompo_SequenceData * subject,
1650                              BlastCompo_SequenceRange * subject_range,
1651                              int subjectLength,
1652                              BlastCompo_GappingParams * gapping_params,
1653                              EMatrixAdjustRule matrix_adjust_rule)
1654 { 
1655   /* General search parameters */
1656   BlastSearchBlkPtr search = gapping_params->context;
1657   /* Specific parameters used for gapped alignments */
1658   GapAlignBlkPtr gap_align = search->gap_align;
1659   /* the translation frame */
1660   int frame = subject_range->context;
1661   
1662   gap_align->x_parameter = gapping_params->x_dropoff;
1663 
1664   Kappa_SWFindFinalEndsUsingXdrop(query,   queryStart, *pqueryEnd,
1665                                   subject, matchStart, *pmatchEnd,
1666                                   frame, gap_align, score);
1667   *pqueryEnd = gap_align->query_stop + 1;
1668   *pmatchEnd = gap_align->subject_stop + 1;
1669   *pnewAlign = Kappa_NewAlignFromGapAlign(gap_align, matrix_adjust_rule,
1670                                           query_range, queryLength,
1671                                           subject_range, subjectLength);
1672   gap_align->x_parameter = gapping_params->x_dropoff;
1673 
1674   return 0;
1675 }
1676 
1677 
1678 /**
1679  * A callback: calculate the traceback for one alignment by
1680  * performing an x-drop alignment in both directions
1681  *
1682  * @param in_align         the existing alignment, without traceback
1683  * @param matrix_adjust_rule    the rule used to compute the scoring matrix
1684  * @param query_data       query sequence data
1685  * @param query_range      range of this query in the concatenated
1686  *                         query
1687  * @param ccat_query_length   total length of the concatenated query
1688  * @param subject_data     subject sequence data
1689  * @param subject_range    range of subject_data in the translated
1690  *                         query, in amino acid coordinates
1691  * @param full_subject_length   length of the full subject sequence
1692  * @param gapping_params        parameters used to compute gapped
1693  *                              alignments
1694  * @sa redo_one_alignment_type
1695  */
1696 static BlastCompo_Alignment *
1697 Kappa_RedoOneAlignment(BlastCompo_Alignment * in_align,
1698                        EMatrixAdjustRule matrix_adjust_rule,
1699                        BlastCompo_SequenceData * query_data,
1700                        BlastCompo_SequenceRange * query_range,
1701                        int ccat_query_length,
1702                        BlastCompo_SequenceData * subject_data,
1703                        BlastCompo_SequenceRange * subject_range,
1704                        int full_subject_length,
1705                        BlastCompo_GappingParams * gapping_params)
1706 {
1707   /* New alignment to be returned */
1708   BlastCompo_Alignment * newAlign;
1709   /* The HSP to be redone */
1710   BLASTResultHspPtr hsp = in_align->context;
1711   /* A copy of hsp as a BLAST_HSP; needed to fix a type mismatch */
1712   BLAST_HSPPtr temp_hsp = MemNew(sizeof(BLAST_HSP));
1713   /* General search information */
1714   BlastSearchBlkPtr search = gapping_params->context;
1715   
1716   CopyResultHspToHSP(hsp, temp_hsp);
1717   Kappa_HitToGapAlign(search->gap_align, search, temp_hsp, query_range->begin, 
1718                       subject_range, query_data, subject_data);
1719   search->gap_align->x_parameter = gapping_params->x_dropoff;
1720 
1721   PerformGappedAlignmentWithTraceback(search->gap_align);
1722 
1723   newAlign = 
1724       Kappa_NewAlignFromGapAlign(search->gap_align, matrix_adjust_rule,
1725                                  query_range, ccat_query_length,
1726                                  subject_range, full_subject_length);
1727   MemFree(temp_hsp);
1728   
1729   return newAlign;
1730 }
1731 
1732 
1733 /**
1734  * A Kappa_SearchParameters represents the data needed by
1735  * RedoAlignmentCore to adjust the parameters of a search, including
1736  * the original value of these parameters
1737  */
1738 typedef struct Kappa_SearchParameters {
1739   int          gap_open;        /**< a penalty for the existence of a gap */
1740   int          gapExtend;      /**< a penalty for each residue (or
1741                                       nucleotide) in the gap */
1742   int          gapDecline;     /**< a penalty for declining to align a pair
1743                                      of residues */
1744   int           gap_x_dropoff_final;  /**< value of the x-drop parameter
1745                                            for the final gapped alignment 
1746                                            with traceback */
1747   BLAST_Score **origMatrix;     /**< The original matrix values */
1748   /** a matrix.  Each row represents a query and each column
1749        represents the probabilities of a particular residue */
1750   GapAlignBlkPtr orig_gap_align;
1751   Nlm_FloatHi original_expect_value;    /**< expect value on entry */
1752   BLAST_KarlinBlkPtr kbp_gap_orig;  /**< copy of the original gapped
1753                                       Karlin-Altschul block corresponding to
1754                                       the first context */
1755   BLAST_KarlinBlkPtr * orig_kbp_gap_array;  /**< pointer to the array of gapped
1756                                               Karlin-Altschul block for all
1757                                               contexts; needed to restore
1758                                               the search to its original 
1759                                               configuration.  */
1760   Boolean    use_mq_flag;     /**< the initial value of the
1761                                    search->mult_queries->use_mq flag;
1762                                    or false if search->mult_queries ==
1763                                    NULL */
1764 } Kappa_SearchParameters;
1765 
1766 
1767 /**
1768  * Release the data associated with a Kappa_SearchParameters and
1769  * delete the object
1770  * @param searchParams the object to be deleted [in][out]
1771  */
1772 static void
1773 Kappa_SearchParametersFree(Kappa_SearchParameters ** searchParams)
1774 {
1775   /* for convenience, remove one level of indirection from searchParams */
1776   Kappa_SearchParameters *sp = *searchParams;
1777 
1778   if(sp->kbp_gap_orig) BlastKarlinBlkDestruct(sp->kbp_gap_orig);
1779 
1780   Nlm_Int4MatrixFree(&sp->origMatrix);
1781 
1782   Nlm_Free(*searchParams);
1783   *searchParams = NULL;
1784 }
1785 
1786 
1787 /**
1788  * Create a new instance of Kappa_SearchParameters
1789  *
1790  * @param rows              number of rows in the scoring matrix
1791  * @param compo_adjust_mode  if >0, use composition-based statistics
1792  * @param numQueries        the number of queries in the concatenated
1793  *                          query
1794  * @param positionBased     if true, the search is position-based
1795  */
1796 static Kappa_SearchParameters *
1797 Kappa_SearchParametersNew(
1798   int rows,
1799   ECompoAdjustModes compo_adjust_mode,
1800   Boolean positionBased)
1801 {
1802   Kappa_SearchParameters *sp;   /* the new object */
1803   sp = MemNew(sizeof(Kappa_SearchParameters));
1804 
1805   sp->orig_kbp_gap_array = NULL;
1806   sp->kbp_gap_orig       = NULL;
1807   sp->origMatrix         = NULL;
1808   
1809   sp->kbp_gap_orig = BlastKarlinBlkCreate();
1810   if (compo_adjust_mode != eNoCompositionBasedStats) {
1811     if (positionBased) {
1812       sp->origMatrix = Nlm_Int4MatrixNew(rows, PROTEIN_ALPHABET);
1813     } else {
1814       sp->origMatrix = Nlm_Int4MatrixNew(PROTEIN_ALPHABET, PROTEIN_ALPHABET);  
1815     }
1816     if (sp->origMatrix == NULL) {
1817       ErrPostEx(SEV_FATAL, E_NoMemory, 0, 
1818                 "Failed to allocate a scoring matrix");
1819     }
1820   }
1821   /* end if(compo_adjust_mode) */
1822 
1823   return sp;
1824 }
1825 
1826 
1827 /**
1828  * Record the initial value of the search parameters that are to be
1829  * adjusted.
1830  *
1831  * @param searchParams       holds the recorded values [out]
1832  * @param search             the search parameters [in]
1833  * @param compo_adjust_mode   mode of composition adjustment [in]
1834  */
1835 static void
1836 Kappa_RecordInitialSearch(Kappa_SearchParameters * searchParams,
1837                           BlastSearchBlkPtr search,
1838                           ECompoAdjustModes compo_adjust_mode)
1839 {
1840   BLAST_KarlinBlkPtr kbp;     /* statistical parameters used to evaluate a
1841                                * query-subject pair */
1842   BLAST_Score **matrix;       /* matrix used to score a local
1843                                  query-subject alignment */
1844 
1845   searchParams->gap_x_dropoff_final   = search->pbp->gap_x_dropoff_final;
1846   searchParams->original_expect_value = search->pbp->cutoff_e;
1847 
1848   if (search->mult_queries != NULL) {
1849     searchParams->use_mq_flag = search->mult_queries->use_mq;
1850   } else {
1851     searchParams->use_mq_flag = 0;
1852   }
1853   searchParams->gap_open   = search->pbp->gap_open;
1854   searchParams->gapExtend  = search->pbp->gap_extend;
1855   searchParams->gapDecline = search->pbp->decline_align;
1856 
1857   searchParams->orig_kbp_gap_array   = search->sbp->kbp_gap;
1858   if(search->positionBased) {
1859     kbp    = search->sbp->kbp_gap_psi[0];
1860     matrix = search->sbp->posMatrix;
1861   } else {
1862     kbp    = search->sbp->kbp_gap_std[0];
1863     matrix = search->sbp->matrix;
1864   }
1865   searchParams->kbp_gap_orig->Lambda = kbp->Lambda;
1866   searchParams->kbp_gap_orig->K      = kbp->K;
1867   searchParams->kbp_gap_orig->logK   = kbp->logK;
1868   searchParams->kbp_gap_orig->H      = kbp->H;
1869 
1870   searchParams->orig_gap_align = search->gap_align;
1871   search->gap_align = NULL; /* break aliasing */
1872   
1873   if(compo_adjust_mode != eNoCompositionBasedStats) {
1874     int i, j;                  /* iteration indices */
1875     int rows;
1876     if (search->positionBased) {
1877       rows = search->context[0].query->length;
1878     } else {
1879       rows = PROTEIN_ALPHABET;
1880     }
1881     for(i = 0; i < rows; i++) {
1882       for(j = 0; j < PROTEIN_ALPHABET; j++) {
1883         searchParams->origMatrix[i][j] = matrix[i][j];
1884       }
1885     }
1886   }
1887 }
1888 
1889 
1890 /**
1891  * Rescale the search parameters in the search object to obtain more
1892  * precision.
1893  *
1894  * @param search               the search block to be rescaled
1895  * @param localScalingFactor   the scale factor
1896  * @param options              certain command-line options to BLAST
1897  */
1898 static void
1899 Kappa_RescaleSearch(BlastSearchBlkPtr search,
1900                 double localScalingFactor,
1901                 BLAST_OptionsBlkPtr options)
1902 {
1903   BLAST_KarlinBlkPtr kbp;     /* the statistical parameters used to
1904                                * evaluate alignments of a
1905                                * query-subject pair */
1906   if(search->positionBased) {
1907     kbp = search->sbp->kbp_gap_psi[0];
1908   } else {
1909     kbp = search->sbp->kbp_gap_std[0];
1910   }
1911   kbp->Lambda /= localScalingFactor;
1912   kbp->logK = log(kbp->K);
1913   
1914   if (search->mult_queries != NULL) {
1915     search->mult_queries->use_mq = 0;
1916   }
1917   search->pbp->cutoff_e = options->kappa_expect_value;
1918   search->pbp->gap_open =
1919     Nlm_Nint(search->pbp->gap_open * localScalingFactor);
1920   search->pbp->gap_extend =
1921     Nlm_Nint(search->pbp->gap_extend * localScalingFactor);
1922   if(search->pbp->decline_align < INT2_MAX) {
1923     search->pbp->decline_align =
1924       Nlm_Nint(search->pbp->decline_align * localScalingFactor);
1925   }
1926   search->gap_align                = GapAlignBlkNew(1, 1);
1927   search->gap_align->matrix        = search->sbp->matrix;
1928   search->gap_align->positionBased = search->positionBased;
1929   if(search->positionBased) {
1930     search->gap_align->posMatrix   = search->sbp->posMatrix;
1931   }
1932   search->gap_align->gap_open      = search->pbp->gap_open;
1933   search->gap_align->gap_extend    = search->pbp->gap_extend;
1934   search->gap_align->decline_align = search->pbp->decline_align;
1935   search->gap_align->translate2 = 
1936       (Boolean) (search->prog_number == blast_type_tblastn);
1937 
1938   search->gap_align->x_parameter   =
1939     (int) (options->gap_x_dropoff_final * NCBIMATH_LN2 / kbp->Lambda);
1940 }
1941 
1942 
1943 /**
1944  * Restore the parameters that were adjusted to their original values
1945  * @param search            the search to be restored [out]
1946  * @param matrix            the scoring matrix to be restored [out]
1947  * @param searchParams      a record of the original values [in]
1948  * @param SmithWaterman     if true, we have performed a Smith-Waterman
1949  *                          alignment with these search parameters [in]
1950  */
1951 static void
1952 Kappa_RestoreSearch(
1953   BlastSearchBlkPtr search,
1954   BLAST_Score ** matrix,
1955   Kappa_SearchParameters * searchParams,
1956   ECompoAdjustModes compo_adjust_mode)
1957 {
1958   BLAST_KarlinBlkPtr kbp;     /* statistical parameters used to
1959                                  evaluate the significance of
1960                                  alignment of a query-subject
1961                                    pair */
1962   int i, j;                  /* iteration indices */
1963   search->pbp->gap_x_dropoff_final = searchParams->gap_x_dropoff_final;
1964   search->pbp->cutoff_e      = searchParams->original_expect_value;
1965   search->pbp->gap_open      = searchParams->gap_open;
1966   search->pbp->gap_extend    = searchParams->gapExtend;
1967   search->pbp->decline_align = searchParams->gapDecline;
1968   if (search->mult_queries != NULL) {
1969       search->mult_queries->use_mq = searchParams->use_mq_flag;
1970   }
1971   GapAlignBlkDelete(search->gap_align);
1972   search->gap_align    = searchParams->orig_gap_align;
1973   search->sbp->kbp_gap = searchParams->orig_kbp_gap_array;
1974   
1975   if(search->positionBased) {
1976     kbp = search->sbp->kbp_gap_psi[0];
1977   } else {
1978     kbp = search->sbp->kbp_gap_std[0];
1979   }
1980   kbp->Lambda = searchParams->kbp_gap_orig->Lambda;
1981   kbp->K      = searchParams->kbp_gap_orig->K;
1982   kbp->logK   = searchParams->kbp_gap_orig->logK;
1983   kbp->H      = searchParams->kbp_gap_orig->H;
1984   
1985   if(compo_adjust_mode != eNoCompositionBasedStats) {
1986     int rows;    /* the number of rows in matrix */
1987     if (search->positionBased) {
1988       rows = search->context[0].query->length;
1989     } else {
1990       rows = PROTEIN_ALPHABET;
1991     }
1992     for(i = 0; i < rows; i++) {
1993       for(j = 0; j < PROTEIN_ALPHABET; j++) {
1994         matrix[i][j] = searchParams->origMatrix[i][j];
1995       }
1996     }
1997   }
1998 }
1999 
2000 
2001 /** Initialize an object of type Blast_MatrixInfo.
2002  * @param self                 object to initialize
2003  * @param search               search data
2004  * @param localScalingFactor   amount by which this search is scaled
2005  * @param matrixName           name of the matrix */
2006 static void
2007 Kappa_MatrixInfoInit(Blast_MatrixInfo * self,
2008                      double localScalingFactor,
2009                      BlastSearchBlkPtr search,
2010                      const char * matrixName)
2011 {
2012   Uint1Ptr query;             /* the query sequence */
2013   int queryLength;           /* the length of the query sequence */
2014   Nlm_FloatHi initialUngappedLambda;
2015 
2016   query       = search->context[0].query->sequence;
2017   queryLength = search->context[0].query->length;
2018 
2019   if(self->positionBased) {
2020     if(search->sbp->posFreqs == NULL) {
2021       search->sbp->posFreqs =
2022         allocatePosFreqs(queryLength, PROTEIN_ALPHABET);
2023     }
2024     Kappa_GetStartFreqRatios(self->startFreqRatios, search, query, matrixName,
2025                              search->sbp->posFreqs, queryLength, TRUE);
2026     Kappa_ScalePosMatrix(self->startMatrix, search->sbp->matrix, matrixName,
2027                          search->sbp->posFreqs, query, queryLength,
2028                          search->sbp, localScalingFactor);
2029     initialUngappedLambda = search->sbp->kbp_psi[0]->Lambda;
2030   } else {
2031     Kappa_GetStartFreqRatios(self->startFreqRatios, search, query,
2032                              matrixName, NULL, PROTEIN_ALPHABET,
2033                              FALSE);
2034     initialUngappedLambda = search->sbp->kbp_ideal->Lambda;
2035   }
2036   self->ungappedLambda = initialUngappedLambda / localScalingFactor;
2037   if(!search->positionBased) {
2038     FreqRatios * freqRatios;  /* frequency ratios for the matrix */
2039 
2040     freqRatios = PSIMatrixFrequencyRatiosNew(matrixName);
2041     if (freqRatios == NULL) {
2042       ErrPostEx(SEV_FATAL, 1, 0, "blastpgp: Cannot adjust parameters "
2043                 "for matrix %s\n", matrixName);
2044     }
2045     Blast_Int4MatrixFromFreq(self->startMatrix, self->cols, freqRatios->data,
2046                              self->ungappedLambda);
2047     freqRatios = PSIMatrixFrequencyRatiosFree(freqRatios);
2048   }
2049   self->matrixName = strdup(matrixName);
2050 }
2051 
2052 
2053 /**
2054  * Record information about all queries in the concatenated query.
2055  *
2056  * @param *pquery       a new list of BlastCompo_QueryInfo objects
2057  * @param *pnumQueries  the length of *pquery
2058  * @param *maxLength    the length of the longest query
2059  * @param search        a search block, which is used to obtain the 
2060  *                      query information */
2061 static void
2062 Kappa_GetQueryInfo(BlastCompo_QueryInfo **pquery,
2063                    int * pnumQueries,
2064                    int *maxLength, BlastSearchBlkPtr search)
2065 {
2066   int query_index;
2067   int numQueries = search->mult_queries ? search->mult_queries->NumQueries : 1;
2068   BlastCompo_QueryInfo * query =
2069       Nlm_Calloc(numQueries, sizeof(BlastCompo_QueryInfo));
2070 
2071   *pnumQueries = numQueries;
2072   *maxLength = 0;
2073   *pquery = query;
2074   if (search->mult_queries) {
2075     for (query_index = 0;  query_index < numQueries;  query_index++) {
2076       query[query_index].eff_search_space =
2077         search->mult_queries->SearchSpEff[query_index];
2078     }
2079   } else {
2080     query[0].eff_search_space = search->searchsp_eff;
2081   }
2082   if (search->mult_queries != NULL) {
2083     /* Query concatenation is in use; find each individual query
2084        within the concatenated query */
2085     Uint1 * ccat_query = /* the concatenated query */
2086       search->context[0].query->sequence;
2087     int length;
2088 
2089     for (query_index = 0;  query_index < numQueries;  query_index++) {
2090       query[query_index].origin =
2091         search->mult_queries->QueryStarts[query_index];
2092       query[query_index].seq.data = &ccat_query[query[query_index].origin];
2093       length = search->mult_queries->QueryEnds[query_index] -
2094           query[query_index].origin + 1;
2095       query[query_index].seq.length = length;
2096       if (length > *maxLength) {
2097           *maxLength = length;
2098       }
2099     }
2100   } else {
2101     /* Query concatenation is not in use; just use the entire query */
2102     query[0].seq.data   = search->context[0].query->sequence;
2103     query[0].seq.length = search->context[0].query->length;
2104     query[0].origin = 0;
2105     *maxLength = query[0].seq.length;
2106   }
2107   for (query_index = 0;  query_index < numQueries;  query_index++) {
2108     Blast_ReadAaComposition(&query[query_index].composition,
2109                             PROTEIN_ALPHABET,
2110                             query[query_index].seq.data,
2111                             query[query_index].seq.length);
2112   }
2113 }
2114 
2115 
2116 /**
2117  * Initialize an object that contains the parameters needed to perform
2118  * a gapped alignment.
2119  *
2120  * @param gapping_params    the object to be initialized
2121  * @param search            holds search parameters
2122  * @param options           holds BLAST command-line options
2123  * @param Lambda            the statistical parameter Lambda;
2124  *                          gives the scale of the scoring system.
2125  */
2126 static void
2127 Kappa_GappingParamsInit(BlastCompo_GappingParams * gapping_params,
2128                         BlastSearchBlkPtr search)
2129 {  
2130   gapping_params->gap_open = search->gap_align->gap_open;
2131   gapping_params->gap_extend = search->gap_align->gap_extend;
2132   gapping_params->decline_align = search->gap_align->decline_align;
2133   gapping_params->x_dropoff = search->gap_align->x_parameter;
2134   gapping_params->context = search;
2135 }
2136 
2137 
2138 /** 
2139  * A callback: Free GapXEditBlock that is pointed to by a (void *).
2140  */
2141 static void Kappa_FreeEditBlock(void * traceback)
2142 {
2143     if (traceback != NULL)
2144         GapXEditBlockDelete((GapXEditBlockPtr)traceback);
2145 }
2146 
2147 
2148 /**
2149  * Callbacks used by Blast_RedoOneMatch and
2150  * Blast_RedoOneMatchSmithWaterman */
2151 static const Blast_RedoAlignCallbacks
2152 redo_align_callbacks = {
2153     Kappa_CalcLambda, Kappa_SequenceGetRange, Kappa_RedoOneAlignment,
2154     Kappa_NewAlignmentUsingXdrop, Kappa_FreeEditBlock
2155     };
2156 
2157 
2158 /** Get a set of alignment parameters, for use by Blast_RedoOneMatch or
2159  * Blast_RedoOneMatchSmithWaterman.
2160  * @param search               search parameters
2161  * @param options              BLAST command line options
2162  * @param compo_adjust_mode     composition adjustment mode
2163  * @param localScalingFactor   factor by which scores are scaled */
2164 static Blast_RedoAlignParams *
2165 Kappa_GetAlignParams(BlastSearchBlkPtr search,
2166                      BLAST_OptionsBlkPtr options,
2167                      ECompoAdjustModes compo_adjust_mode,
2168                      double localScalingFactor)
2169 {
2170   int ccat_query_length;
2171   int rows;
2172   int cutoff_s;
2173   double cutoff_e;
2174   BlastCompo_GappingParams * gapping_params = NULL;
2175   Blast_MatrixInfo * scaledMatrixInfo;
2176   Blast_RedoAlignParams * params;
2177   int subject_is_translated = search->prog_number == blast_type_tblastn;
2178   int do_link_hsps = search->pbp->do_sum_stats;
2179 
2180   if(do_link_hsps) {
2181     cutoff_s = (int) (search->pbp->cutoff_s1 * localScalingFactor);
2182   } else {
2183     /* There is no cutoff score; we consider e-values instead */
2184     cutoff_s = 0;
2185   }
2186   cutoff_e = search->pbp->cutoff_e;
2187   ccat_query_length = search->context[0].query->length;
2188   rows = search->positionBased ? ccat_query_length : PROTEIN_ALPHABET;
2189   if (compo_adjust_mode == eNoCompositionBasedStats) {
2190     /* We cannot compute a value for scaledMatrixInfo if
2191      * compo_adjust_mode is off.  There are side effects to the
2192      * computation (specifically a call to updateLambdaK that alters
2193      * the gapped K and ungapped lambda for position-based searches)
2194      * that are undesirable and deeply embedded in the code. */
2195     scaledMatrixInfo = NULL;
2196   } else {
2197     scaledMatrixInfo = Blast_MatrixInfoNew(rows, PROTEIN_ALPHABET,
2198                                            search->positionBased);
2199     if (scaledMatrixInfo == NULL) {
2200       ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2201     }
2202     Kappa_MatrixInfoInit(scaledMatrixInfo, localScalingFactor, search,
2203                          options->matrix);
2204   }
2205   gapping_params = malloc(sizeof(BlastCompo_GappingParams));
2206   Kappa_GappingParamsInit(gapping_params, search);
2207   params =
2208     Blast_RedoAlignParamsNew(&scaledMatrixInfo, &gapping_params,
2209                              compo_adjust_mode, search->positionBased,
2210                              subject_is_translated, ccat_query_length,
2211                              cutoff_s, cutoff_e, do_link_hsps,
2212                              &redo_align_callbacks);
2213   if (params == NULL) {
2214       ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2215   }
2216   return params;
2217 }
2218 
2219 
2220 /**
2221  * Convert a Blast_CompoHeap to a flat list of SeqAligns. Note that there may
2222  * be more than one alignment per element in the heap.  The new list
2223  * preserves the order of the SeqAligns associated with each
2224  * HeapRecord.
2225  *
2226  * @param self           a Blast_CompoHeap
2227  */
2228 static SeqAlignPtr
2229 Kappa_CompoHeapToFlatList(BlastCompo_Heap * self)
2230 {
2231     SeqAlignPtr list = NULL;
2232     SeqAlignPtr result;
2233 
2234     while (NULL != (result = BlastCompo_HeapPop(self))) {
2235         SeqAlignPtr oldList = list;
2236         list = result;
2237         while (NULL != result->next) {
2238             result = result->next;
2239         }
2240         result->next = oldList;
2241     }
2242     return list;
2243 }
2244 
2245 
2246 /**
2247  *  Top level routine to recompute alignments for each
2248  *  match found by the gapped BLAST algorithm
2249  *
2250  *  @param search           is the structure with all the information about
2251  *                          the search
2252  *  @param options          is used to pass certain command line options
2253  *                          taken in by BLAST
2254  *  @param hitlist_count    is the number of old matches
2255  *  @param compo_adjust_mode determines whether we are to adjust the
2256  *                          Karlin-Altschul parameters and score matrix
2257  *  @param SmithWaterman    determines whether the new local alignments
2258  *                          should be computed by the optimal Smith-Waterman
2259  *                          algorithm; SmithWaterman false means that
2260  *                          alignments will be recomputed by the current
2261  *                          X-drop algorithm as implemented in the procedure
2262  *                          ALIGN.
2263  *  @return                 a array of lists of SeqAlign; each element
2264  *                          in the array is a list of SeqAligns for
2265  *                          one query in the concatenated query.
2266  *  It is assumed that at least one of compo_adjust_mode and
2267  *  SmithWaterman is >0 or true when this procedure is called A linked list
2268  *  of alignments is returned; the alignments are sorted according to
2269  *  the lowest E-value of the best alignment for each matching
2270  *  sequence; alignments for the same matching sequence are in the
2271  *  list consecutively regardless of the E-value of the secondary
2272  *  alignments. Ties in sorted order are much rarer than for the
2273  *  standard BLAST method, but are broken deterministically based on
2274  *  the index of the matching sequences in the database.
2275  */
2276 SeqAlignPtr *
2277 RedoAlignmentCore(BlastSearchBlkPtr search,
2278                   BLAST_OptionsBlkPtr options,
2279                   int hitlist_count,
2280                   int adjustParameters,
2281                   Boolean SmithWaterman)
2282 {
2283   int status = 0;               /* error status returned by routines */
2284   int match_index;             /* index over matches */
2285   SeqAlignPtr * results = NULL; /* an array of lists of SeqAligns to
2286                                    return */
2287   Nlm_FloatHi localScalingFactor;       /* the factor by which to
2288                                          * scale the scoring system in
2289                                          * order to obtain greater
2290                                          * precision */
2291   BLAST_Score      **matrix;    /* score matrix */
2292   Kappa_SearchParameters *searchParams; /* the values of the search
2293                                          * parameters that will be
2294                                          * recorded, altered in the
2295                                          * search structure in this
2296                                          * routine, and then restored
2297                                          * before the routine
2298                                          * exits. */
2299   Blast_ForbiddenRanges  forbidden;     /* forbidden ranges for each
2300                                          * database position (used in
2301                                          * Smith-Waterman alignments)
2302                                          */
2303   BlastCompo_Heap * significantMatches; /* a collection of alignments for each
2304                                 * query sequence with sequences from
2305                                 * the database */
2306   Blast_CompositionWorkspace
2307       *NRrecord = NULL;        /* stores all fields needed for
2308                                 * computing a compositionally adjusted
2309                                 * score matrix using Newton's method */
2310   int query_index;            /* loop index */
2311   int numQueries;             /* number of queries in the
2312                                   concatenated query */
2313   int ccat_query_length;      /* length of the concatenated query, or
2314                                   of the sole query if query
2315                                   concatenation is not in use */
2316   int maxQueryLength;         /* the greatest length among all queries */
2317   BlastCompo_Alignment * incoming_aligns;  /* existing alignments for a match */
2318   BlastCompo_QueryInfo * query_info = NULL;
2319   Blast_RedoAlignParams * redo_align_params;
2320   double Lambda, logK;
2321   double pvalueForThisPair = (-1); /*p-value for this match
2322                                      for composition; -1 == no adjustment*/
2323   double LambdaRatio; /*lambda ratio*/
2324   int compositionTestIndex = options->unified_p;
2325                               /*which test function do we use to see if
2326                                a composition-adjusted p-value is desired*/
2327 
2328   /* the composition adjustment mode, obtained from adjustParameters
2329      (we don't make adjustParameters itself an enum so that only
2330      kappa.c depends on the header that defines ECompoAdjustModes) */
2331   ECompoAdjustModes compo_adjust_mode;  
2332 
2333   if (search->positionBased) {
2334     /* Can't use a compositional P-value with position based searches */
2335     compositionTestIndex = 0;
2336   }
2337   /**** Validate parameters *************/
2338   if (0 > adjustParameters || eNumCompoAdjustModes <= adjustParameters) {
2339       /* Unknown composition adjustment mode */
2340       return NULL;
2341   } else {
2342       compo_adjust_mode = (ECompoAdjustModes) adjustParameters;
2343   }
2344   if(0 == strcmp(options->matrix, "BLOSUM62_20") &&
2345      eNoCompositionBasedStats == compo_adjust_mode) {
2346     /* BLOSUM62_20 only makes sense if compo_adjust_mode is on */
2347     return NULL;
2348   }
2349   if (search->positionBased && search->sbp->posMatrix == NULL) {
2350     Char* msg = "Cannot perform position-specific search without a PSSM";
2351     BlastConstructErrorMessage("RedoAlignmentCore", msg, 3,
2352                                &(search->error_return));
2353     return NULL;
2354   }
2355   if (search->positionBased && compo_adjust_mode > 1) {
2356     /* Only mode 1 (or 0) works with position-based searches, silently 
2357      * change the value. */
2358     compo_adjust_mode = eCompositionBasedStats;
2359   }
2360   if (compo_adjust_mode > 1 &&
2361       !Blast_FrequencyDataIsAvailable(options->matrix)) {
2362     Char* msg = "Unsupported matrix for composition-based"
2363       " matrix adjustment";
2364     BlastConstructErrorMessage("RedoAlignmentCore", msg, 3,
2365                                &(search->error_return));
2366     return NULL;
2367   }
2368   /**** End validate parameters *************/
2369 
2370   ccat_query_length = search->context[0].query->length;
2371   searchParams =
2372       Kappa_SearchParametersNew(ccat_query_length, compo_adjust_mode, 
2373                                 search->positionBased);
2374   Kappa_RecordInitialSearch(searchParams, search, compo_adjust_mode);
2375   if (compo_adjust_mode == eNoCompositionBasedStats) {
2376       localScalingFactor = 1.0;
2377   } else {
2378       localScalingFactor = SCALING_FACTOR;
2379   }
2380   if((0 == strcmp(options->matrix, "BLOSUM62_20"))) {
2381     localScalingFactor /= 10;
2382   }
2383   Kappa_RescaleSearch(search, localScalingFactor, options);
2384   if(search->positionBased) {
2385     matrix = search->sbp->posMatrix;
2386   } else {
2387     matrix = search->sbp->matrix;
2388   }
2389   redo_align_params = Kappa_GetAlignParams(search, options, compo_adjust_mode,
2390                                            localScalingFactor);
2391   {
2392       /* Get appropriate values for Lambda and logK */
2393       BLAST_KarlinBlkPtr kbp;
2394       if(search->positionBased) {
2395           kbp    = search->sbp->kbp_gap_psi[0];
2396       } else {
2397           kbp    = search->sbp->kbp_gap_std[0];
2398       }
2399       Lambda = kbp->Lambda;  
2400       logK   = kbp->logK;
2401   }
2402   Kappa_GetQueryInfo(&query_info, &numQueries, &maxQueryLength, search);
2403   if(SmithWaterman) {
2404       status = Blast_ForbiddenRangesInitialize(&forbidden, maxQueryLength);
2405       if (status != 0) 
2406           ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2407   }
2408   significantMatches = Nlm_Calloc(numQueries, sizeof(BlastCompo_Heap));
2409   ASSERT(options->ethresh != 0.0);
2410   for (query_index = 0;  query_index < numQueries;  query_index++) {
2411     status =
2412       BlastCompo_HeapInitialize(&significantMatches[query_index],
2413                                 options->hitlist_size, options->ethresh);
2414     if (status != 0) {
2415         ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2416     }
2417   }
2418   if (compo_adjust_mode != eNoCompositionBasedStats) {
2419     NRrecord = Blast_CompositionWorkspaceNew();
2420     if (NRrecord == NULL) {
2421       ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2422     }
2423     /* We already checked that the frequency data is available for
2424      * options->matrix, so this call can't fail */
2425     (void) Blast_CompositionWorkspaceInit(NRrecord, options->matrix);
2426   }
2427   for(match_index = 0; match_index < hitlist_count; match_index++) {
2428     /* for all matching sequences */
2429     BlastCompo_MatchingSequence matchingSeq; /* the data for a matching
2430                                          * database sequence */
2431     BLASTResultHitlistPtr thisMatch;    /* alignment data for the
2432                                          * current query-subject
2433                                          * match */
2434     BlastCompo_Alignment ** alignments; /* array of lists of
2435                                             * alignments for each
2436                                             * query to this subject */
2437     alignments = Nlm_Calloc(numQueries, sizeof(BlastCompo_Alignment *));
2438 
2439     thisMatch = search->result_struct->results[match_index];
2440     if(thisMatch->hsp_array == NULL) {
2441       continue;
2442     }
2443     if (BlastCompo_EarlyTermination(thisMatch->best_evalue,
2444                                     significantMatches, numQueries)) {
2445         break;
2446     }
2447     /* Get the sequence for this match */
2448     Kappa_MatchingSequenceInitialize(&matchingSeq, search,
2449                                      thisMatch->subject_id);
2450     incoming_aligns =
2451       Kappa_ResultHspToDistinctAlign(search, thisMatch->hsp_array,
2452                                      thisMatch->hspcnt, localScalingFactor);
2453     if (SmithWaterman) {
2454       status =
2455         Blast_RedoOneMatchSmithWaterman(alignments, redo_align_params,
2456                                         incoming_aligns, thisMatch->hspcnt,
2457                                         Lambda, logK,
2458                                         &matchingSeq, query_info, numQueries,
2459                                         matrix, PROTEIN_ALPHABET,
2460                                         NRrecord, &forbidden,
2461                                         significantMatches,
2462                                         &pvalueForThisPair,
2463                                         compositionTestIndex, &LambdaRatio);
2464     } else {
2465       status =
2466         Blast_RedoOneMatch(alignments, redo_align_params, incoming_aligns,
2467                            thisMatch->hspcnt, Lambda, &matchingSeq,
2468                            ccat_query_length, query_info, numQueries,
2469                            matrix, PROTEIN_ALPHABET,
2470                            NRrecord, &pvalueForThisPair,
2471                            compositionTestIndex, &LambdaRatio);
2472     }
2473     if (status != 0) {
2474       ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2475     }
2476     for (query_index = 0;  query_index < numQueries;  query_index++) {
2477       /* Loop over queries */
2478       if( alignments[query_index] != NULL) { /* alignments were found */
2479         Nlm_FloatHi bestEvalue;   /* best e-value among alignments in the
2480                                      hitlist */
2481         int bestScore;           /* best score among alignments in the
2482                                      hitlist */        
2483         BLAST_HitListPtr hitlist; /* a hitlist containing the newly-computed
2484                                    * alignments */
2485         Kappa_SortedHitlistFromAligns(search, &alignments[query_index]);
2486         hitlist = search->current_hitlist;
2487         if (hitlist->hspcnt > 1 &&
2488             !(SmithWaterman && search->prog_number == blast_type_blastp)) {
2489           /* Eliminate HSPs that are contained in a higher-scoring
2490            * HSP.  With blastp and SmithWaterman alignments, the
2491            * forbidden ranges rule does not allow one alignment to be
2492            * contained in another. */
2493           BLASTCheckHSPInclusion(hitlist->hsp_array, hitlist->hspcnt,
2494                                  FALSE);
2495           hitlist->hspcnt =
2496             HspArrayPurge(hitlist->hsp_array, hitlist->hspcnt, FALSE);
2497         }
2498         Kappa_HitlistEvaluateAndPurge(&bestScore, &bestEvalue,
2499                                       matchingSeq.length,
2500                                       search, redo_align_params->do_link_hsps,
2501                                       pvalueForThisPair, LambdaRatio,
2502                                       thisMatch->subject_id);
2503         if (bestEvalue <= search->pbp->cutoff_e &&
2504             BlastCompo_HeapWouldInsert(&significantMatches[query_index],
2505                                        bestEvalue, bestScore,
2506                                        thisMatch->subject_id)) {
2507           /* If the best alignment is significant, then create and save
2508            * a list of SeqAligns. */
2509           /* SeqAligns for this query-subject pair */
2510           SeqAlignPtr aligns;
2511           void * discardedAligns;     
2512           SeqIdPtr query_id;
2513           Kappa_SequenceLocalData * local_data = matchingSeq.local_data;
2514 
2515           if (search->mult_queries) {
2516             query_id = search->mult_queries->FakeBsps[query_index]->id;
2517           } else {
2518             query_id = search->query_id;
2519           }
2520           aligns =
2521             Kappa_SeqAlignsFromHitlist(hitlist, local_data->bsp_db->id,
2522                                        query_id,
2523                                        query_info[query_index].origin,
2524                                        query_info[query_index].seq.length,
2525                                        Lambda, logK, localScalingFactor);
2526           status =
2527             BlastCompo_HeapInsert(&significantMatches[query_index],
2528                                   aligns, bestEvalue, bestScore,
2529                                   thisMatch->subject_id,
2530                                   &discardedAligns);
2531           if (status != 0) {
2532               ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2533           }
2534           if (discardedAligns != NULL) {
2535             SeqAlignSetFree(discardedAligns);
2536           }
2537         } /* end if the best alignment is significant */
2538       } /* end if any alignments were found */
2539     } /* end loop over queries */
2540     Kappa_MatchingSequenceRelease(&matchingSeq);
2541     MemFree(alignments);
2542     BlastCompo_AlignmentsFree(&incoming_aligns, NULL);
2543   }
2544   /* end for all matching sequences */
2545   results = Nlm_Calloc(numQueries, sizeof(SeqAlignPtr));
2546   for (query_index = 0;  query_index < numQueries;  query_index++) {
2547     results[query_index] =
2548       Kappa_CompoHeapToFlatList(&significantMatches[query_index]);
2549   }
2550   /* Clean up */
2551   free(query_info);
2552   Blast_RedoAlignParamsFree(&redo_align_params);
2553   if (search->current_hitlist) {
2554     search->current_hitlist->hspcnt_max = search->current_hitlist->hspcnt;
2555     search->current_hitlist = BlastHitListDestruct(search->current_hitlist);
2556   }
2557   for (query_index = 0;  query_index < numQueries;  query_index++) {
2558     BlastCompo_HeapRelease(&significantMatches[query_index]);
2559   }
2560   MemFree(significantMatches); significantMatches = NULL;
2561   if(SmithWaterman) {
2562       Blast_ForbiddenRangesRelease(&forbidden);
2563   }
2564   Kappa_RestoreSearch(search, matrix, searchParams, compo_adjust_mode);
2565   Kappa_SearchParametersFree(&searchParams);
2566   if (NULL != NRrecord) {
2567     Blast_CompositionWorkspaceFree(&NRrecord);
2568   }
2569   return (results);
2570 }
2571 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.