|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross Reference: C/tools/blastutl.c |
source navigation diff markup identifier search freetext search file search |
1 static char const rcsid[] = "$Id: blastutl.c,v 6.471 2007/05/08 19:03:33 kans Exp $";
2
3 /* ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================*/
26
27 /*****************************************************************************
28
29 File name: blastutl.c
30
31 Author: Tom Madden
32
33 Contents: Utilities for BLAST
34
35 $Revision: 6.471 $
36
37 ******************************************************************************/
38 /*
39 *
40 * $Log: blastutl.c,v $
41 * Revision 6.471 2007/05/08 19:03:33 kans
42 * in FilterWithSeg added SeqDataPtr and ByteStorePtr casts for seq_data
43 *
44 * Revision 6.470 2007/03/13 20:40:24 madden
45 * - In s_ComputeAverageLength, compute the floating point value retval
46 * using floating point division.
47 *
48 * - In BioseqBlastEngineCore, call blast_set_parameters for rounds > 1
49 * of PSI-BLAST.
50 *
51 * - In GetDbSubjRatio, use floating point operations to compute the
52 * floating point value db_subj_ratio.
53 * [from Mike Gertz]
54 *
55 * Revision 6.469 2007/03/05 14:51:24 camacho
56 * - Make s_ComputeAverageLength static.
57 *
58 * Revision 6.468 2007/01/23 15:25:44 madden
59 * Use SeqLocDustEx rather than SeqLocDust
60 *
61 * Revision 6.467 2007/01/17 15:46:00 madden
62 * remove FilterDNA
63 *
64 * Revision 6.466 2006/08/10 17:34:38 merezhuk
65 * Fix for reading -z advanced option by StringToInt8; RT # 15187990
66 *
67 * Revision 6.465 2006/02/15 18:23:47 madden
68 * Made changes so that CheckStartForGappedAlignment by default
69 * checks ungapped alignments of length 11, rather than length 10.
70 * Made changes to the rules used when the starting point is close to
71 * the edge of the preliminary gapped alignment.
72 * (from Mike Gertz)
73 *
74 * Revision 6.464 2005/12/01 15:10:23 madden
75 * Gave BLASTCheckHSPInclusion external linkage (i.e. removed the static specifier).
76 *
77 * Revision 6.463 2005/10/13 15:59:06 camacho
78 * Add code to fix cutoff scores in PSI-BLAST.
79 *
80 * Revision 6.462 2005/07/28 14:57:09 coulouri
81 * remove dead code
82 *
83 * Revision 6.461 2005/07/27 15:51:54 coulouri
84 * remove unused queue_callback
85 *
86 * Revision 6.460 2005/05/02 16:03:14 coulouri
87 * refactor code to set db_chunk_size
88 *
89 * Revision 6.459 2005/04/25 14:16:36 coulouri
90 * set db_chunk_size adaptively
91 *
92 * Revision 6.458 2005/04/04 20:44:27 camacho
93 * Do not overwrite the effective search space in Pssm2Sequences if specified in the options structure
94 *
95 * Revision 6.457 2005/02/07 15:30:08 dondosha
96 * Removed restriction on the value of longest intron option
97 *
98 * Revision 6.456 2005/01/24 20:37:37 camacho
99 * Added conditional compilation to structs needed for BLAST_CLUSTER_HITS
100 *
101 * Revision 6.455 2005/01/18 14:54:13 camacho
102 * Change in tie-breakers for score comparison, suggestion by Mike Gertz
103 *
104 * Revision 6.454 2004/12/20 15:22:16 camacho
105 * Calculate kbp_ideal values rather than loading them from pre-computed values
106 *
107 * Revision 6.453 2004/12/01 17:24:15 coulouri
108 * do not dereference null pointer
109 *
110 * Revision 6.452 2004/11/22 16:10:11 dondosha
111 * Minor fix to make sure that "evalue" score type is always used when hsp is not part of a linked set
112 *
113 * Revision 6.451 2004/11/04 15:51:55 bealer
114 * - bl2seq should use dblen as average length if database is not available.
115 *
116 * Revision 6.450 2004/11/01 14:07:56 madden
117 * From Mike Gertz:
118 *
119 * - In query_offset_compare_hsp and query_end_compare_hsp, use the
120 * subject query/offset as a tie-breaker. Without this tie-breaker
121 * CheckGappedAlignmentsForOverlap won't work properly.
122 *
123 * - In CheckGappedAlignmentsForOverlap check that hsp_array, rather
124 * than *hsp_array, is not nil.
125 *
126 * - In BlastSaveCurrentHsp, rewrote the binary search to use
127 * score_compare_hsps, so that the answers are consistent with the
128 * heap code used in the algo/blast/core code.
129 *
130 * - In BlastGappedScoreInternal delete gapped extensions that don't
131 * reach the cutoff score (cutoff_s1).
132 *
133 * Revision 6.449 2004/10/25 18:36:17 papadopo
134 * From Michael Gertz: remove unneeded decrement of alignment offsets in BlastNtSaveCurrentHsp
135 *
136 * Revision 6.448 2004/10/19 19:42:17 dondosha
137 * Optimized algorithm in BlastPruneSeqAlignByGiList to make it up to 25 times faster; Added new function BlastPruneSeqAlignBySortedGiList
138 *
139 * Revision 6.447 2004/10/18 13:02:41 madden
140 * Changes from Mike Gertz:
141 * - In score_compare_hsps, query_offset_compare_hsp and
142 * query_end_compare_hsp, change the comparison tests so that
143 * nil HSPs are less than any non-nil HSP. Previously, these
144 * comparison functions would return 0 if either HSP was nil,
145 * which would result in sort routines terminating before the
146 * non-nil HSPs in the list were fully sorted.
147 *
148 * - In score_compare_hsps, copied the set of tie-breakers from
149 * the corresponding routine in algo/blast/core/blast_hits.c.
150 *
151 * - In RealBlastGetGappedAlignmentTraceback, the HSP list must
152 * be sorted before BLASTCheckHSPInclusion is invoked.
153 *
154 * Revision 6.446 2004/09/28 16:05:32 papadopo
155 * From Michael Gertz: In BlastGappedScoreInternal, changed a
156 * reference to the sumscore field of an HSP to a reference to the
157 * xsum field of an HSP.
158 *
159 * Revision 6.445 2004/08/23 17:05:42 papadopo
160 * From Michael Gertz: make CopyResultHspToHSP public
161 *
162 * Revision 6.444 2004/08/16 19:37:26 dondosha
163 * Enabled uneven gap HSP linking for blastx
164 *
165 * Revision 6.443 2004/08/05 21:52:28 camacho
166 * Gracefully handle inability to calculate ungapped lambda for PSSM in psiblast2sequences
167 *
168 * Revision 6.442 2004/07/24 18:55:29 camacho
169 * Fix to GetSequenceWithDenseSeg when sequence cannot be found
170 *
171 * Revision 6.441 2004/07/19 17:05:36 papadopo
172 * specify (unused) 'output-to-scoremat' parameter
173 *
174 * Revision 6.440 2004/06/30 12:29:39 madden
175 * Moved some functions to blfmtutl.c
176 *
177 * Revision 6.439 2004/06/22 14:16:55 camacho
178 * Changed invocation of posFreqsToMatrix to conform with new signature
179 *
180 * Revision 6.438 2004/06/01 20:34:06 dondosha
181 * Fix in previous change; memory leak fix
182 *
183 * Revision 6.437 2004/05/27 17:36:24 dondosha
184 * Minor fix for previous 2 changes
185 *
186 * Revision 6.436 2004/05/25 21:42:47 dondosha
187 * Fix in previous change: in some cases edit block should not be freed when BLAST_HSP is freed
188 *
189 * Revision 6.435 2004/05/21 13:53:04 dondosha
190 * Use BLAST_HSPFree to free BLAST_HSP structures, hence no need to call GapXEditBlockDelete in multiple places
191 *
192 * Revision 6.434 2004/04/22 16:40:32 dondosha
193 * Set search->subject_id to correct ordinal id, needed for finding splice junctions in HSP links at traceback stage
194 *
195 * Revision 6.433 2004/03/22 22:10:38 dondosha
196 * Use kbp_gap instead of kbp pointers in megablast traceback
197 *
198 * Revision 6.432 2004/02/26 15:52:30 papadopo
199 * Mike Gertz' modifications to unify handling of gapped Karlin blocks between protein and nucleotide searches
200 *
201 * Revision 6.431 2004/02/04 15:35:03 camacho
202 * Rollback to fix problems in release 2.2.7
203 *
204 * Revision 6.429 2004/01/30 16:54:45 dondosha
205 * Check if HSP needs to be deleted after reevaluation with ambiguities, after greedy traceback
206 *
207 * Revision 6.428 2004/01/28 16:54:03 dondosha
208 * Restored the code that shifts subject coordinates for blastn traceback with long subject sequences
209 *
210 * Revision 6.427 2004/01/25 05:06:21 dondosha
211 * Translate only relevant parts of long subject sequences for tblastn traceback
212 *
213 * Revision 6.426 2004/01/16 23:43:44 dondosha
214 * No more need for special argument for partial search: it is set in options
215 *
216 * Revision 6.425 2004/01/14 17:01:06 dondosha
217 * Gapped alignment is position based only if posMatrix exists
218 *
219 * Revision 6.424 2004/01/09 18:13:24 dondosha
220 * In [Get,Check]StartForGappedAlignment: if posMatrix not available, use square matrix for calculations
221 *
222 * Revision 6.423 2004/01/06 22:37:40 dondosha
223 * Use BLAST_HSPfree function; in particular fixes a bug with wrong memory being freed
224 *
225 * Revision 6.422 2003/12/11 23:46:28 dondosha
226 * Correction in setting hit ranges after repeats filtering
227 *
228 * Revision 6.421 2003/12/10 17:05:28 dondosha
229 * Added function ReevaluateScoreWithAmbiguities to reevaluate score for one HSP; use it after greedy traceback
230 *
231 * Revision 6.420 2003/11/24 22:06:41 madden
232 * Tblastn optimization, only fetch part of sequence needed
233 *
234 * Revision 6.419 2003/10/30 18:37:19 dondosha
235 * Fix for megablast with non-greedy traceback
236 *
237 * Revision 6.418 2003/10/29 17:46:59 dondosha
238 * Allow 2-stage greedy extension in megablast
239 *
240 * Revision 6.417 2003/08/20 22:14:08 dondosha
241 * Little correction in call to OOFBlastHSPGetNumIdentical
242 *
243 * Revision 6.416 2003/08/04 16:19:16 dondosha
244 * Added effective HSP length (length adjustment) to other returns, so it can be reported in XML output
245 *
246 * Revision 6.415 2003/05/30 17:25:36 coulouri
247 * add rcsid
248 *
249 * Revision 6.414 2003/05/23 22:12:11 camacho
250 * Fix memory leak in PsiBlast2Sequences
251 *
252 * Revision 6.413 2003/04/22 21:52:13 dondosha
253 * Added function OOFBlastHSPGetNumIdentical
254 *
255 * Revision 6.412 2003/04/10 19:21:16 dondosha
256 * Memory leak fix for megablast with limited number of HSPs per hit
257 *
258 * Revision 6.411 2003/03/24 19:42:14 madden
259 * Changes to support query concatenation for blastn and tblastn
260 *
261 * Revision 6.410 2003/03/11 14:33:48 madden
262 * Sort HSPs after array is no longer reallocated
263 *
264 * Revision 6.409 2003/02/21 02:52:16 madden
265 * Ensure stable sorting in score_compare_hsp (change from Morgulis)
266 *
267 * Revision 6.408 2003/01/24 22:26:03 camacho
268 * RPSInit is deprecated, use RPSInitEx instead
269 *
270 * Revision 6.407 2002/12/09 17:22:16 dondosha
271 * When alignment jumps beyond a strand boundary, keep the part of it where initial word is
272 *
273 * Revision 6.406 2002/12/04 23:32:50 camacho
274 * Do not set use_this_gi with nucleotide dbs (redundant)
275 *
276 * Revision 6.405 2002/12/04 18:42:22 camacho
277 * Minor change to previous commit
278 *
279 * Revision 6.404 2002/12/04 18:38:58 camacho
280 * Use correct effective search space in B2SPssmMultipleQueries
281 *
282 * Revision 6.403 2002/12/04 17:08:33 camacho
283 * Minor change to B2SPssmCleanUpSearch
284 *
285 * Revision 6.402 2002/11/27 15:41:51 dondosha
286 * Added -t, -g and -n megablast options to parse_blast_options
287 *
288 * Revision 6.401 2002/11/26 23:02:07 madden
289 * Add w option to parse_blast_options (OOF for blastx)
290 *
291 * Revision 6.400 2002/11/25 19:57:30 dondosha
292 * Further fix to the HSP limit (-H) megablast option
293 *
294 * Revision 6.399 2002/11/22 23:31:43 dondosha
295 * 1. Use array of structures instead of array of pointers for initial offset pairs;
296 * 2. Sort the HSP array when maximal number of HSPs is reached for a sequence
297 *
298 * Revision 6.398 2002/11/13 23:23:53 dondosha
299 * Correction for getting number of identities in tblastn
300 *
301 * Revision 6.397 2002/11/07 22:25:34 dondosha
302 * Correction in calculating number of identities for very long database sequences
303 *
304 * Revision 6.396 2002/11/04 23:00:54 dondosha
305 * Calculate number of identities while computing the traceback, and save it in the seqalign
306 *
307 * Revision 6.395 2002/10/22 21:03:42 camacho
308 * Calculate the effective search space correctly for rpsblast in BlastOtherReturnsPrepare
309 *
310 * Revision 6.394 2002/10/22 17:57:48 camacho
311 * Changes to B2SPssmMultipleQueries
312 *
313 * Revision 6.393 2002/10/22 15:28:45 kans
314 * SeqAlignCompare takes LIBCALLBACK
315 *
316 * Revision 6.392 2002/10/21 23:13:36 camacho
317 * Added B2SPssmOnTheFly functions
318 *
319 * Revision 6.391 2002/10/18 15:08:28 dondosha
320 * Correction in SaveCurrentHsp functions when maximal number of HSPs is reached
321 *
322 * Revision 6.390 2002/10/17 14:33:12 dondosha
323 * Correction for the maximal number of HSPs option
324 *
325 * Revision 6.389 2002/09/19 22:22:18 camacho
326 * Sanity checks in BlastTwoSequencesByLocWithCallback
327 *
328 * Revision 6.388 2002/09/16 15:54:59 camacho
329 * Turn off RedoAlignmentCore from psi-bl2seq
330 *
331 * Revision 6.387 2002/09/13 20:05:43 camacho
332 * Set the dbseq_num to 1 in BlastTwoSequencesByLocWithCallback
333 *
334 * Revision 6.386 2002/09/11 20:46:25 camacho
335 * Removed deprecated BlastSeqIdListPtr code
336 *
337 * Revision 6.385 2002/09/03 14:22:45 camacho
338 * Changes to pacify mac compiler
339 *
340 * Revision 6.384 2002/09/02 21:54:41 camacho
341 * Correction to previous revision
342 *
343 * Revision 6.383 2002/09/02 20:44:56 camacho
344 * Allow pssm rescaling if scalingFactor is non-zero
345 *
346 * Revision 6.382 2002/08/30 15:42:49 dondosha
347 * In blastn, use ewp structure only for the first context
348 *
349 * Revision 6.381 2002/08/29 19:22:20 camacho
350 * Save karlinK parameter when rescaling pssm
351 *
352 * Revision 6.380 2002/08/29 16:23:42 camacho
353 * Removed debugging code
354 *
355 * Revision 6.379 2002/08/29 15:49:56 camacho
356 * Added matrix rescaling code for psi-blast2sequences
357 *
358 * Revision 6.378 2002/08/26 16:55:52 madden
359 * Fix for scaling with translated searches
360 *
361 * Revision 6.376 2002/08/05 20:07:37 dondosha
362 * Correction for bl2seq with megablast option: convert gap info to seqalign after search
363 *
364 * Revision 6.375 2002/08/02 21:49:56 vakatov
365 * + LIBCALL
366 *
367 * Revision 6.374 2002/08/01 21:33:12 madden
368 * Do not put p-value and small_gap into SeqAlign
369 *
370 * Revision 6.373 2002/08/01 20:45:34 dondosha
371 * Changed prototype of the BLASTPostSearchLogic function to make it
372 * more convenient
373 *
374 * Revision 6.372 2002/07/18 19:40:45 dondosha
375 * Added an option to restrict number of HSPs per database sequence
376 *
377 * Revision 6.371 2002/07/11 22:31:54 camacho
378 * Added sanity check to BlastTwoSequencesByLocWithCallback with PSSM
379 *
380 * Revision 6.370 2002/07/02 17:08:01 dondosha
381 * Reverse previous change - not needed
382 *
383 * Revision 6.369 2002/07/02 01:41:31 dondosha
384 * Typo fix
385 *
386 * Revision 6.368 2002/07/02 01:36:40 dondosha
387 * For megablast use larger window in CheckStartForGappedAlignment
388 *
389 * Revision 6.367 2002/06/21 21:43:01 camacho
390 * Removed obsolete BlastSeqIdList structure and functions
391 *
392 * Revision 6.366 2002/06/13 16:51:41 madden
393 * BlastTwoSequencesCore and BlastTwoSequencesCoreEx return status instead of SearchBlk
394 *
395 * Revision 6.365 2002/06/12 20:34:50 coulouri
396 * Don't dereference possibly NULL pointer
397 *
398 * Revision 6.364 2002/06/11 20:40:05 dondosha
399 * Correction to previous change
400 *
401 * Revision 6.363 2002/06/11 14:44:46 dondosha
402 * Return status from some functions instead of search block pointer
403 *
404 * Revision 6.362 2002/05/31 16:06:20 kans
405 * changed MemSet (..., NULL, ...) to MemSet (..., 0, ...) for Mac compiler
406 *
407 * Revision 6.361 2002/05/29 17:14:49 dondosha
408 * Check whether an id found by SeqIdFindBest is indeed a gi
409 *
410 * Revision 6.360 2002/05/28 22:00:12 camacho
411 * *** empty log message ***
412 *
413 * Revision 6.359 2002/05/13 13:51:32 dondosha
414 * Made two functions public
415 *
416 * Revision 6.358 2002/05/08 22:51:11 dondosha
417 * Do the starting positions check for final gapped alignment in Mega BLAST case as well
418 *
419 * Revision 6.357 2002/04/23 20:41:21 dondosha
420 * In case of non-affine extension in megablast, check percent identity cutoff after the traceback is obtained
421 *
422 * Revision 6.356 2002/04/19 17:26:07 madden
423 * Fix for last update
424 *
425 * Revision 6.355 2002/04/18 20:16:52 madden
426 * Fix problem with FUM for SeqLoc
427 *
428 * Revision 6.354 2002/04/17 20:42:23 madden
429 * Fix typo for mask1
430 *
431 * Revision 6.353 2002/04/04 21:19:15 dondosha
432 * Corrections for megablast with non-greedy extensions
433 *
434 * Revision 6.352 2002/03/28 18:51:39 madden
435 * All threads get access to (query) masking seqloc, merge overlapping segments for seg
436 *
437 * Revision 6.351 2002/03/26 23:18:00 dondosha
438 * Duplicate mb_endpoint_results structure on all threads
439 *
440 * Revision 6.350 2002/03/26 16:49:33 madden
441 * Use scaled up/down Lambda
442 *
443 * Revision 6.349 2002/03/14 16:11:40 camacho
444 * Extended BlastTwoSequences to allow comparison between sequence and PSSM
445 *
446 * Revision 6.348 2002/03/05 17:58:56 dondosha
447 * Set same offsets for the traceback as for preliminary extension for megablast with non-greedy extensions
448 *
449 * Revision 6.347 2002/02/15 23:36:22 dondosha
450 * Correction for megablast with non-greedy extensions
451 *
452 * Revision 6.346 2002/01/11 20:14:28 madden
453 * Put the use_this_gi into the SeqAlign
454 *
455 * Revision 6.345 2002/01/07 23:16:00 dondosha
456 * Fixed several memory leaks and allocation/freeing bugs in multithreaded megablast
457 *
458 * Revision 6.344 2001/12/28 20:38:40 dondosha
459 * Moved Mega BLAST related parameters into a separate structure
460 *
461 * Revision 6.343 2001/12/13 16:06:54 dondosha
462 * Use separate mb_endpoint_results list for each of multiple threads
463 *
464 * Revision 6.342 2001/11/26 20:19:25 madden
465 * Add call to BLASTOptionValidateEx to BlastTwoSequencesWithCallback
466 *
467 * Revision 6.341 2001/11/16 15:44:26 dondosha
468 * In BlastPruneSeqAlignByGiList: retrieve bioseq only if seqid in seqalign is not a gi
469 *
470 * Revision 6.340 2001/11/14 00:31:44 camacho
471 * Updated BlastGetAllowedGis and BlastGetFirstGiofSubset functions
472 * to return the correct seqid's when dealing with the new database
473 * format and mask (subset) databases.
474 *
475 * Revision 6.339 2001/11/13 18:20:33 dondosha
476 * Use GapxEditScript structure instead of edit_script_t in higher level function calls
477 *
478 * Revision 6.338 2001/10/12 16:10:07 dondosha
479 * 1. Made BLASTResultFreeHsp public
480 * 2. Added BioseqBlastEngineCoreEx with partial search option
481 *
482 * Revision 6.337 2001/10/05 18:10:29 madden
483 * Add threshold_second to parse_blast_options
484 *
485 * Revision 6.336 2001/09/19 17:24:17 kans
486 * removed extra parameter from BioseqMegaBlastEngineCore
487 *
488 * Revision 6.335 2001/09/07 14:46:43 dondosha
489 * Roll back removal of threshold_first from functions and structures
490 *
491 * Revision 6.334 2001/09/06 20:24:33 dondosha
492 * Removed threshold_first
493 *
494 * Revision 6.333 2001/07/27 20:04:09 dondosha
495 * Small correction in passing effective db length for two sequences engine
496 *
497 * Revision 6.332 2001/07/26 18:19:03 dondosha
498 * Added a few more letter options in parse_blast_options
499 *
500 * Revision 6.331 2001/07/20 18:55:58 dondosha
501 * 1. Use effective db length option in 2 sequences engine
502 * 2. Create diagonal array for megablast when needed
503 *
504 * Revision 6.330 2001/07/09 14:17:24 madden
505 * Fix PC-lint complaints from R. Williams
506 *
507 * Revision 6.329 2001/07/09 13:12:03 madden
508 * Removed unused variables
509 *
510 * Revision 6.328 2001/06/25 18:30:24 madden
511 * Add define for NLM_GENERATED_CODE_PROTO to get prototypes in fdlobj.h
512 *
513 * Revision 6.327 2001/06/25 16:03:31 madden
514 * Comment out CheckGappedAlignmentsForOverlap
515 *
516 * Revision 6.326 2001/06/12 19:48:55 madden
517 * Introduce total_hsp_limit, check before making SeqAlign
518 *
519 * Revision 6.325 2001/06/04 21:29:42 dondosha
520 * Add message about deleted hits with e-value below the low threshold
521 *
522 * Revision 6.324 2001/05/07 13:18:24 madden
523 * Fix to really remove deleted HSPs from (culling) heap
524 *
525 * Revision 6.323 2001/05/04 19:50:45 dondosha
526 * Improved error message when all queries are shorter than word size
527 *
528 * Revision 6.322 2001/05/03 21:48:28 dondosha
529 * Handle some cases when memory allocation fails
530 *
531 * Revision 6.321 2001/04/16 21:28:11 dondosha
532 * Added function BlastPruneSeqAlignByEvalueRange
533 *
534 * Revision 6.320 2001/04/12 21:34:50 dondosha
535 * Added function BlastPruneSeqAlignByGiList
536 *
537 * Revision 6.319 2001/04/12 17:17:15 madden
538 * Fixes core-dump for small query
539 *
540 * Revision 6.318 2001/04/12 15:01:25 madden
541 * change repeat filtering db
542 *
543 * Revision 6.317 2001/04/11 20:56:06 madden
544 * Added scalingFactor for rpsblast
545 *
546 * Revision 6.316 2001/04/11 18:22:13 dondosha
547 * Copy query_slp in BlastSearchBlkDuplicate for all programs
548 *
549 * Revision 6.315 2001/04/03 21:59:49 dondosha
550 * Implemented tabulated output for non-megablast bl2seq
551 *
552 * Revision 6.314 2001/03/28 21:05:23 dondosha
553 * Set dbinfo->is_protein in other returns
554 *
555 * Revision 6.313 2001/03/27 21:27:01 madden
556 * Minor efficiency in how lookup table is made
557 *
558 * Revision 6.312 2001/03/27 21:13:56 dondosha
559 * Do not print error if OID list exists without CommonIndex
560 *
561 * Revision 6.311 2001/03/27 20:35:10 dondosha
562 * Small bug fix
563 *
564 * Revision 6.310 2001/03/26 15:03:25 madden
565 * Fix a number of warnings and two bugs found by PC compiler
566 *
567 * Revision 6.309 2001/03/21 15:46:32 dondosha
568 * Added missing parentheses in previous change
569 *
570 * Revision 6.308 2001/03/20 20:06:13 dondosha
571 * Added protection from crossing strand boundary for blastn
572 *
573 * Revision 6.307 2001/03/19 18:51:39 madden
574 * HitRangeToSeqLoc returns values appropriate for subsequences
575 *
576 * Revision 6.306 2001/03/12 14:53:46 dondosha
577 * Uninitialized variable corrections
578 *
579 * Revision 6.305 2001/03/08 22:05:48 dondosha
580 * Split very long database sequences in all BLAST programs
581 *
582 * Revision 6.304 2001/02/16 18:45:39 dondosha
583 * Fixed minor purify errors
584 *
585 * Revision 6.303 2001/02/08 20:41:16 dondosha
586 * Implemented tabulated output for all translated programs
587 *
588 * Revision 6.302 2001/02/07 21:12:05 dondosha
589 * 1. Added Blast Engine functions with callback argument
590 * 2. Pass output stream from options block to search
591 *
592 * Revision 6.301 2001/01/29 22:23:00 madden
593 * Do not recreate hsp_array
594 *
595 * Revision 6.300 2001/01/26 17:43:09 madden
596 * Comment out unneeded memset
597 *
598 * Revision 6.299 2001/01/23 20:25:43 dondosha
599 * 1. Renamed BlastParceInputString to BlastParseInputString
600 * 2. Recognize a double quoted string as an option value in
601 * BlastParseInputString
602 *
603 * Revision 6.298 2001/01/23 18:23:57 madden
604 * Fix memory leak
605 *
606 * Revision 6.297 2001/01/19 16:49:37 madden
607 * Added helper array to BlastNtGappedScoreInternal
608 *
609 * Revision 6.296 2001/01/16 23:16:51 dondosha
610 * Added 2 arguments and several options to parse_blast_options
611 *
612 * Revision 6.295 2001/01/16 20:32:46 kans
613 * included simutil.h to suppress Mac error
614 *
615 * Revision 6.294 2001/01/12 17:10:04 dondosha
616 * If subject SeqLoc is on a single strand and query on both, swap the strands
617 *
618 * Revision 6.293 2001/01/11 18:34:20 dondosha
619 * Changed error level for nonexistent database from ERROR to FATAL
620 *
621 * Revision 6.292 2001/01/09 20:16:27 dondosha
622 * Implemented from-to location options for both sequences in bl2seq
623 *
624 * Revision 6.291 2001/01/05 17:12:48 dondosha
625 * Correction in previous memory leak fix
626 *
627 * Revision 6.290 2001/01/04 15:01:25 dondosha
628 * Fix for tblastx in blast two sequences engine
629 *
630 * Revision 6.289 2001/01/03 21:45:30 dondosha
631 * Fixed a memory leak - some edit blocks not freed in megablast
632 *
633 * Revision 6.288 2000/12/28 18:23:05 madden
634 * Add -P and -A to parse_blast_options
635 *
636 * Revision 6.287 2000/12/19 15:52:47 dondosha
637 * Forbid reversing query and subject for two sequences megablast
638 *
639 * Revision 6.286 2000/12/19 14:52:59 dondosha
640 * Previous change wrong
641 *
642 * Revision 6.285 2000/12/15 15:38:38 dondosha
643 * Call AdjustOffSetsInSeqAlign with correct query and subject SeqLocs
644 *
645 * Revision 6.284 2000/12/15 14:25:41 madden
646 * Optimization to BlastTranslateUnambiguousSequence
647 *
648 * Revision 6.283 2000/12/15 14:23:34 madden
649 * Use readdb_get_sequence_ex to get sequence faster
650 *
651 * Revision 6.282 2000/12/13 22:26:44 dondosha
652 * Free the ncbi4na-encoded subject sequence after search in two sequences megablast engine
653 *
654 * Revision 6.281 2000/12/13 13:51:35 madden
655 * Free SeqLocPtr in BlastSequencesOnTheFly
656 *
657 * Revision 6.280 2000/12/07 17:46:56 dondosha
658 * Call AdjustOffSetsInSeqAlign for megablast too
659 *
660 * Revision 6.279 2000/12/04 18:51:24 madden
661 * Fix memory leaks
662 *
663 * Revision 6.278 2000/11/29 23:05:00 dondosha
664 * Keep ncbi4na-encoded subject sequence in search->subject for megablast
665 *
666 * Revision 6.277 2000/11/16 19:15:31 dondosha
667 * Pass back endpoint results in other_returns for Mega BLAST with no traceback
668 *
669 * Revision 6.276 2000/11/09 17:28:35 dondosha
670 * Set block_width to 0 for Mega BLAST in BlastTwoSequences engine
671 *
672 * Revision 6.275 2000/11/08 22:21:33 dondosha
673 * Enabled new tblastn by adding a longest_intron option
674 *
675 * Revision 6.274 2000/11/08 20:20:31 dondosha
676 * Do not free subject in BlastTwoSequencesCore for new tblastn - done elsewhere
677 *
678 * Revision 6.273 2000/11/07 16:30:27 madden
679 * Introduce intermediate score (before linking of HSPs) for blastx and tblastn
680 *
681 * Revision 6.272 2000/11/03 20:15:19 dondosha
682 * Pass the subject sequence to new_link_hsps from two sequences engine
683 *
684 * Revision 6.271 2000/11/02 20:15:38 dondosha
685 * Added functions BlastTwoSequencesByLocWithCallback and BlastTwoSequencesWithCallback
686 *
687 * Revision 6.270 2000/11/02 16:36:12 madden
688 * Fixed another minor problem from merge
689 *
690 * Revision 6.269 2000/11/02 16:12:37 madden
691 * fix Errors during merge of code
692 *
693 * Revision 6.268 2000/11/01 16:25:57 madden
694 * Changes from Futamura for psitblastn
695 *
696 * Revision 6.267 2000/10/31 17:51:44 dondosha
697 * Copy the necessary search block data for multi-threaded megablast
698 *
699 * Revision 6.266 2000/10/23 22:17:54 shavirin
700 * Added creation of "no database found" message in case the database is
701 * not found.
702 *
703 * Revision 6.265 2000/10/18 19:46:29 dondosha
704 * Fixed bug in BlastTwoSequencesCore for partial subject sequence search
705 *
706 * Revision 6.264 2000/10/16 19:34:16 shavirin
707 * Added possibility to run RPS Blast search from function BioseqBlastEngineByLocEx().
708 *
709 * Revision 6.263 2000/10/13 17:32:50 shavirin
710 * Adjusted calls to readdb_get_header for ASN.1 structured deflines.
711 *
712 * Revision 6.262 2000/10/13 16:05:44 shavirin
713 * Fixed minor bug with reporting database name.
714 *
715 * Revision 6.261 2000/10/12 14:45:34 madden
716 * Break out of loop if hsp is freed
717 *
718 * Revision 6.260 2000/10/11 17:14:02 dondosha
719 * For tblastn traceback convert subject sequence to ncbi4na encoding in BlastTwoSequencesCore
720 *
721 * Revision 6.259 2000/10/10 16:11:15 shavirin
722 * Added check for NULL in the function BLASTCheckHSPInclusion().
723 *
724 * Revision 6.258 2000/10/06 19:32:02 shavirin
725 * Added call to SeqMgrAddToBioseqIndex() for created fake Bioseq.
726 *
727 * Revision 6.257 2000/10/05 22:43:10 dondosha
728 * Use mb_result_struct for Mega BLAST results in two sequences functions
729 *
730 * Revision 6.256 2000/10/05 19:57:08 dondosha
731 * In Mega BLAST, results are saved in and freed from mb_result_struct, not result_struct
732 *
733 * Revision 6.255 2000/10/03 21:28:54 shavirin
734 * Added check for search->pbp for not NULL in BlastSearchBlkDestruct().
735 *
736 * Revision 6.254 2000/09/29 21:14:47 shavirin
737 * Added additional check for inclusion of HSPs after traceback for
738 * OOF gapped alignment case.
739 *
740 * Revision 6.253 2000/09/28 14:57:50 dondosha
741 * Initialize exact match array for megablast in BlastHitListNew
742 *
743 * Revision 6.252 2000/09/25 15:43:36 madden
744 * Fix for rpsblast, too high expect values getting through
745 *
746 * Revision 6.251 2000/09/14 15:05:46 dondosha
747 * For new tblastn, reset evalues to individual ones before relinking HSPs
748 *
749 * Revision 6.250 2000/09/07 13:41:42 madden
750 * Fix if first start is -1 in DenseSeg
751 *
752 * Revision 6.249 2000/09/01 18:29:12 dondosha
753 * Removed calls to ReadDBFreeSharedInfo and ReadDBCloseMHdrAndSeqFiles
754 *
755 * Revision 6.248 2000/08/31 18:37:21 shavirin
756 * Added check for NULL in BlastMakeCopyQueryDNAP().
757 *
758 * Revision 6.247 2000/08/31 16:55:17 shavirin
759 * Fixed problem with OOF alignment of negative strand HSPs.
760 *
761 * Revision 6.246 2000/08/28 21:53:12 shavirin
762 * Added function BlastOtherReturnsFree(). Cleaned memory in case of
763 * tweak_parameters = TRUE. (Freed SeqAlign calculated before RedoAlignmentCore.)
764 *
765 * Revision 6.245 2000/08/22 20:02:27 dondosha
766 * Previous change not quite right: use real subject length for all programs
767 *
768 * Revision 6.244 2000/08/22 19:42:25 dondosha
769 * Divide search->subject->length by 3 for tblastn in RealBlastGetGappedAlignmentTraceback
770 *
771 * Revision 6.243 2000/08/18 21:27:59 madden
772 * undo change 6.240 when smith_waterman is not set, the extra alignment is needed when only tweak_parameters is set
773 *
774 * Revision 6.242 2000/08/18 20:12:29 dondosha
775 * Do not use search->query_id in megablast, use only qid_array
776 *
777 * Revision 6.241 2000/08/08 21:43:35 shavirin
778 * Initialized GapAlignBlkPtr for the value of discontinuous parameters.
779 *
780 * Revision 6.240 2000/08/03 22:25:36 shavirin
781 * Removed redundant gapped Traceback in case when tweak_parameters or
782 * smith_waterman is set.
783 *
784 * Revision 6.239 2000/07/31 23:08:13 dondosha
785 * Do not go over the end of the HSP in subject sequence when computing start for gapped alignment
786 *
787 * Revision 6.238 2000/07/25 18:12:03 shavirin
788 * WARNING: This is a no-turning-back change related to S&W Blast from
789 * Alejandro Schaffer
790 *
791 * Revision 6.237 2000/07/25 16:54:26 shavirin
792 * Corrected functions initializing gap_align in case of OOF gapping.
793 *
794 * Revision 6.236 2000/07/18 22:33:02 shavirin
795 * Adjusted start for gapped alignment in OOF case.
796 *
797 * Revision 6.235 2000/07/17 14:26:08 shavirin
798 * Added support for Out of frame gapping.
799 *
800 * Revision 6.234 2000/07/13 18:33:28 madden
801 * Fix for exploded hits with pdb
802 *
803 * Revision 6.233 2000/07/11 18:38:02 madden
804 * decreased size of helper array, added prefetch to BlastGappedScoreInternal
805 *
806 * Revision 6.232 2000/07/10 15:23:30 dondosha
807 * Moved check query_invalid from BlastTwoSequencesCoreEx to BlastTwoSequencesCore
808 *
809 * Revision 6.231 2000/07/10 15:06:23 madden
810 * Use helper array in BlastGappedScoreInternal to reduce cache misses
811 *
812 * Revision 6.230 2000/06/30 17:52:44 madden
813 * Move AWAKE_THR_MIN_SIZE to blastdef.h
814 *
815 * Revision 6.229 2000/06/29 21:27:02 dondosha
816 * Fixed memory leaks in culling by similarity
817 *
818 * Revision 6.228 2000/06/29 19:19:39 madden
819 * Fix minus strand offset in BlastConvertDNASeqLoc
820 *
821 * Revision 6.227 2000/06/26 20:15:34 shavirin
822 * Fixed coordinates transfer in the function BlastConvertDNASeqLoc().
823 *
824 * Revision 6.226 2000/06/23 20:17:42 madden
825 * Optimization for CheckGappedAlignmentsForOverlap (remove n-squared hsp check)
826 *
827 * Revision 6.225 2000/06/23 15:22:43 madden
828 * Fix problem with removing translated hits with different frames
829 *
830 * Revision 6.224 2000/06/21 18:02:25 dondosha
831 * In BlastSaveCurrentHspGapped no need to allocate new memory for hsp_array
832 *
833 * Revision 6.223 2000/06/21 15:10:27 madden
834 * efficiency in BlastGappedScoreInternal
835 *
836 * Revision 6.222 2000/06/21 12:53:22 madden
837 * Do each frame separately in CheckGappedScoreInternal for efficiency
838 *
839 * Revision 6.221 2000/06/20 16:45:36 dondosha
840 * Fixed a minor bug in revision 6.219
841 *
842 * Revision 6.220 2000/06/19 20:07:19 madden
843 * Skip transferring sequence to blastna format
844 *
845 * Revision 6.219 2000/06/19 19:16:19 dondosha
846 * Optimized reallocation of hsp array when it is overflowing
847 *
848 * Revision 6.218 2000/06/15 15:31:26 dondosha
849 * Added two sequences BLAST functions returning SearchBlk instead of SeqAlign; added code to cluster hits and keep only one hit per cluster - disabled so far; enabled two sequences BLAST for tblastn
850 *
851 * Revision 6.217 2000/06/13 20:54:38 shavirin
852 * Added return of EFF_SEARCH_SPACE in the function BlastOtherReturnsPrepare
853 *
854 * Revision 6.216 2000/06/08 20:34:15 madden
855 * add explode_seqids option to show all ids in a defline
856 *
857 * Revision 6.215 2000/05/24 20:53:48 dondosha
858 * Fixed a bug in previous change
859 *
860 * Revision 6.214 2000/05/24 19:49:07 dondosha
861 * Create qid_array for the new search in BlastSearchDuplicate, if megablast
862 *
863 * Revision 6.213 2000/05/22 19:49:35 dondosha
864 * Initialize vnp to NULL in BlastSeqLocFilterEx
865 *
866 * Revision 6.212 2000/05/16 20:00:02 madden
867 * fix for formatting db names
868 *
869 * Revision 6.211 2000/05/12 19:41:54 dondosha
870 * Free qid_array in BlastSearchBlkDestruct
871 *
872 * Revision 6.210 2000/05/05 20:10:22 madden
873 * Add vecscreen filtering capability
874 *
875 * Revision 6.209 2000/04/29 18:55:53 wheelan
876 * temporary fix for BlastTwoSequences NULL return problem
877 *
878 * Revision 6.208 2000/04/28 16:52:31 madden
879 * Fix for ungapped search of subset databases
880 *
881 * Revision 6.207 2000/04/10 17:26:28 madden
882 * Add BLASTResultFreeHsp to free memory as it is no longer needed
883 *
884 * Revision 6.206 2000/04/10 15:24:49 dondosha
885 * Enabled use of MegaBlast for BlastTwoSequences
886 *
887 * Revision 6.205 2000/04/07 16:57:45 shavirin
888 * Transferred queue parameters in BlastSearchBlkDuplicate() function.
889 *
890 * Revision 6.204 2000/04/06 17:33:57 madden
891 * Check if pointer is NULL in BlastGetAllowedGis
892 *
893 * Revision 6.203 2000/04/03 21:23:18 dondosha
894 * Do not construct ewp_params and ewp for MegaBlast search
895 *
896 * Revision 6.202 2000/04/03 20:05:27 madden
897 * Free lh_helper on tmp_hitlist, fixes leak
898 *
899 * Revision 6.201 2000/03/31 19:11:06 dondosha
900 * Changed some names related to MegaBlast
901 *
902 * Revision 6.200 2000/03/31 16:45:43 dondosha
903 * Enabled blastx for BlastTwoSequences search
904 *
905 * Revision 6.199 2000/03/30 21:44:22 madden
906 * Add BLASTResultHitlistFreeEx that checks Heap integrity
907 *
908 * Revision 6.198 2000/03/29 22:18:02 dondosha
909 * Moved adjustment of offsets in blastn to BlastSaveCurrentHitlist, added gap info processing for MegaBlast
910 *
911 * Revision 6.197 2000/03/22 17:58:54 dondosha
912 * Duplicate entire list of query_ids in BlastSearchBlkDuplicate
913 *
914 * Revision 6.196 2000/03/08 20:34:30 madden
915 * Add BlastGetFirstGiofSubset, BlastGetAllowedGis returns primary SeqId
916 *
917 * Revision 6.195 2000/03/03 18:15:52 dondosha
918 * Fixed bugs and memory leaks in MegaBlast related code
919 *
920 * Revision 6.194 2000/03/03 17:58:23 shavirin
921 * Added new function BlastConvertDNASeqLoc()
922 *
923 * Revision 6.193 2000/03/01 14:37:45 dondosha
924 * Adjust query offsets after search for all 3 versions of blastn
925 *
926 * Revision 6.192 2000/02/29 18:06:07 dondosha
927 * In case of MegaBlast save correct query ids in seqaligns
928 *
929 * Revision 6.191 2000/02/24 23:21:27 dondosha
930 * Adjust context offsets before gapped alignment to avoid strand crossover
931 *
932 * Revision 6.190 2000/02/23 20:51:05 dondosha
933 * Modifications for blastn to concatenate strands - handling of query offsets
934 *
935 * Revision 6.189 2000/02/17 21:23:10 shavirin
936 * Added parameter is_rps_blast.
937 *
938 * Revision 6.188 2000/02/17 19:02:09 shavirin
939 * Removed all references to obsolete theCacheSize variable.
940 *
941 * Revision 6.187 2000/02/17 18:30:56 shavirin
942 * Added translated DNA filtering for RPS Blast
943 *
944 * Revision 6.186 2000/02/17 14:38:27 madden
945 * Duplicate filter_string for multiple threads
946 *
947 * Revision 6.185 2000/02/16 21:49:16 shavirin
948 * Fixed some memory leaks.
949 *
950 * Revision 6.184 2000/02/15 19:16:26 shavirin
951 * MemFree(pbp->filter_string) in BlastSearchBlkDestruct
952 *
953 * Revision 6.183 2000/02/14 16:15:50 madden
954 * Revert to 6.179
955 *
956 * Revision 6.182 2000/02/11 22:03:03 shavirin
957 * Returned back previous change.
958 *
959 * Revision 6.181 2000/02/11 21:25:58 shavirin
960 * Removed call to BlastLinkHsps() function for tblastn program.
961 *
962 * Revision 6.180 2000/02/11 20:45:54 dondosha
963 * Adjust the second strand offsets after blastn search
964 *
965 * Revision 6.179 2000/02/11 16:40:53 egorov
966 * The parse_blast_options is made public.
967 *
968 * Revision 6.178 2000/02/04 22:31:38 kans
969 * test subject_bsp for NULL before dereferencing in BlastTwoSequencesByLocEx
970 *
971 * Revision 6.177 2000/02/04 16:13:15 shavirin
972 * Returned changes done in Revision 6.172.
973 *
974 * Revision 6.176 2000/02/02 18:22:05 madden
975 * Free memory for LinkHelpStruct
976 *
977 * Revision 6.175 2000/02/01 22:13:26 dondosha
978 * Added code related to greedy basic gapped alignment
979 *
980 * Revision 6.174 2000/01/28 16:45:53 madden
981 * HitRangeToSeqLoc called with combine TRUE
982 *
983 * Revision 6.173 2000/01/26 22:01:56 madden
984 * Add function BlastGetProgramName
985 *
986 * Revision 6.172 2000/01/14 18:28:11 shavirin
987 * Some WordExtention* functions made external.
988 *
989 * Revision 6.171 2000/01/12 21:46:19 dondosha
990 * Minor memory leak clean-up (routine BlastSeqLocFilterEx)
991 *
992 * Revision 6.170 2000/01/12 18:54:44 madden
993 * Do not free bestid to fix problem
994 *
995 * Revision 6.169 2000/01/11 17:12:51 shavirin
996 * Added handling of the new parameter theCacheSize.
997 *
998 * Revision 6.168 2000/01/11 15:32:47 dondosha
999 * Fixed memory leaks in opening shared header and sequence file memory maps
1000 *
1001 * Revision 6.167 2000/01/04 21:56:59 madden
1002 * Add NULLB to both ends of db sequence before gap extend, use dynamic buffer for blast options in repeat filtering
1003 *
1004 * Revision 6.166 2000/01/03 17:38:33 shavirin
1005 * Added check for rdfp in BlastGetAllowedGis() function.
1006 *
1007 * Revision 6.165 1999/12/31 14:23:20 egorov
1008 * Add support for using a mixture of real and mask databases with gi-list files:
1009 * 1. Change logic of creating rdfp list.
1010 * 2. BlastGetDbChunk gets real databases first, then masks.
1011 * 3. Proper calculation of database sizes using alias files.
1012 * 4. Change to CommonIndex to support using of mask databases.
1013 * 5. Use correct gis in formated output (BlastGetAllowedGis()).
1014 * 6. Other small changes
1015 *
1016 * Revision 6.164 1999/12/22 22:00:35 dondosha
1017 * Destruct the header and sequence memory maps separately before destructing the search structure
1018 *
1019 * Revision 6.163 1999/12/22 21:08:36 shavirin
1020 * Rewritten function BlastNewFindWords() added function BlastNewFindWordsEx()
1021 *
1022 * Revision 6.160 1999/12/21 20:02:45 egorov
1023 * Fix memory leak.
1024 *
1025 * Revision 6.159 1999/12/17 22:22:57 madden
1026 * New masking parameters from Wojtek
1027 *
1028 * Revision 6.158 1999/12/16 19:08:36 egorov
1029 * Check rdfp for NULL before using. Bug reported by Patrick and Sergei Sh.
1030 *
1031 * Revision 6.157 1999/12/15 17:42:26 egorov
1032 * Change BlastGetAllowedGis() to handle gi's belonging to a database alias.
1033 *
1034 * Revision 6.156 1999/12/13 21:53:02 madden
1035 * Some fixes for repeat masking
1036 *
1037 * Revision 6.155 1999/11/26 22:11:26 madden
1038 * Added BlastNT functions for nucl. extensions
1039 *
1040 * Revision 6.154 1999/11/24 15:21:38 egorov
1041 * Avoid GCC warning
1042 *
1043 * Revision 6.153 1999/11/09 14:14:12 madden
1044 * Start alive thread for masking only if query is above min size
1045 *
1046 * Revision 6.152 1999/11/02 15:32:36 madden
1047 * Allow setting of repeat filtering options and database
1048 *
1049 * Revision 6.151 1999/11/01 20:18:22 egorov
1050 * New format of filter_string
1051 *
1052 * Revision 6.150 1999/10/27 21:33:02 madden
1053 * Use housekeeping threads only for larger sequences
1054 *
1055 * Revision 6.149 1999/10/18 20:06:52 shavirin
1056 * evalue_compare_hits() : In case of equal scores and E-values order
1057 * will be determined by subject id
1058 *
1059 * Revision 6.148 1999/10/18 16:15:04 egorov
1060 * Bug fixed
1061 *
1062 * Revision 6.147 1999/10/15 20:52:10 shavirin
1063 * Fixed bug with seq_id_list initialization
1064 *
1065 * Revision 6.146 1999/10/12 21:50:47 shavirin
1066 * Added initialization of db_chunk_size in BlastThrInfoNew().
1067 *
1068 * Revision 6.145 1999/10/05 17:42:55 shavirin
1069 * Removed global variables from blast.c
1070 *
1071 * Revision 6.144 1999/10/01 18:26:56 madden
1072 * Check for search->rdfp before search->rdfp->oidlist
1073 *
1074 * Revision 6.143 1999/09/28 20:14:33 madden
1075 * Joerg changes to minimize cache misses
1076 *
1077 * Revision 6.142 1999/09/22 20:58:49 egorov
1078 * OID list change
1079 *
1080 * Revision 6.141 1999/09/16 16:55:12 madden
1081 * Changes for long words in blastn
1082 *
1083 * Revision 6.140 1999/09/03 17:23:25 madden
1084 * Fixed bug in CheckStartForGappedAlignment
1085 *
1086 * Revision 6.139 1999/09/01 19:21:06 shavirin
1087 * Added propagation of the score for discontinuous alignment in
1088 * functions: RealBlastGetGappedAlignmentTraceback() and BioseqBlastEngineCore()
1089 *
1090 * Revision 6.138 1999/08/27 18:07:34 shavirin
1091 * Passed parameter decline_align from top to the engine.
1092 *
1093 * Revision 6.137 1999/08/20 20:54:12 madden
1094 * place sentinel byte at beginning of nt sequence for ALIGN
1095 *
1096 * Revision 6.136 1999/08/20 19:48:13 madden
1097 * Changed call to BlastSearchBlkNew(Extra), removed use of version array
1098 *
1099 * Revision 6.135 1999/08/20 16:35:25 shavirin
1100 * Added protection against invalid program name in BlastGetTypes().
1101 *
1102 * Revision 6.134 1999/08/06 18:53:57 madden
1103 * Added calls to lookup_position_aux_destruct
1104 *
1105 * Revision 6.133 1999/08/05 19:01:29 madden
1106 * Add check for NULL search or invalid query in BlastTwoSequencesCore
1107 *
1108 * Revision 6.132 1999/07/01 13:03:24 sicotte
1109 * Updated for DenseDiag and Moved seqalign_reverse_strand from blastutl.c(blast.h) to SeqAlignListReverseStrand in salpedit.ch and fixed call in salutil.c
1110 *
1111 * Revision 6.131 1999/06/24 17:24:12 madden
1112 * Fix bug in GetSeqAlignCount when SeqAlignPtr is NULL
1113 *
1114 * Revision 6.130 1999/06/18 21:17:58 madden
1115 * Check that an exact match gives a positive value when making words for blast2seqs
1116 *
1117 * Revision 6.129 1999/06/14 15:20:26 madden
1118 * Produce temporary BLAST_HitList to fix blastx core-dump
1119 *
1120 * Revision 6.128 1999/05/27 17:33:05 madden
1121 * Fixed Int2 (should have been Int4) problem
1122 *
1123 * Revision 6.127 1999/05/25 13:37:49 madden
1124 * Make smallest float 1.0e-180
1125 *
1126 * Revision 6.126 1999/05/19 12:44:00 madden
1127 * Change in longest_db_seq for multiple db search
1128 *
1129 * Revision 6.125 1999/05/13 13:48:11 madden
1130 * Only filter out hits if on same strand
1131 *
1132 * Revision 6.124 1999/04/15 13:24:35 madden
1133 * Fix for sum stats problems
1134 *
1135 * Revision 6.123 1999/04/13 19:16:47 madden
1136 * Check that two HSPs are on same strand before deleting one
1137 *
1138 * Revision 6.122 1999/04/12 20:24:54 egorov
1139 * Fix MT problem
1140 *
1141 * Revision 6.121 1999/04/01 21:42:46 madden
1142 * Fix memory leaks when gi list is used
1143 *
1144 * Revision 6.120 1999/04/01 14:18:58 madden
1145 * Fixed memory leaks with gi_list
1146 *
1147 * Revision 6.119 1999/03/31 15:46:52 madden
1148 * Removed unused code and variables
1149 *
1150 * Revision 6.118 1999/03/17 13:21:06 madden
1151 * Fix comment in comment problem
1152 *
1153 * Revision 6.117 1999/03/16 19:27:36 egorov
1154 * More type castings
1155 *
1156 * Revision 6.116 1999/03/12 17:19:59 egorov
1157 * More type casting fixes
1158 *
1159 * Revision 6.115 1999/03/12 15:03:45 egorov
1160 * Add proper Int4-long type casting
1161 *
1162 * Revision 6.114 1999/03/04 14:18:09 egorov
1163 * Do correct filter masking when query is seqloc
1164 * Only the BlastMaskTheResidues() function is changed.
1165 *
1166 * Revision 6.113 1999/02/22 21:59:05 madden
1167 * binary search in GetAllowedGis function
1168 *
1169 * Revision 6.112 1999/02/22 17:32:46 madden
1170 * Fix memory leak
1171 *
1172 * Revision 6.111 1999/02/18 21:18:23 madden
1173 * Optimization
1174 *
1175 * Revision 6.110 1999/02/17 13:23:01 madden
1176 * Added hsp_num_max
1177 *
1178 * Revision 6.109 1999/02/11 13:53:46 madden
1179 * Added combine Boolean to HitRangeToSeqLoc, fixed mem leak
1180 *
1181 * Revision 6.108 1999/01/28 17:20:57 madden
1182 * Check do_sum_stats for linking, Int2 to Int4, UMR
1183 *
1184 * Revision 6.107 1999/01/28 16:05:49 madden
1185 * HspArrayPurge change, HSPs saved more efficiently
1186 *
1187 * Revision 6.106 1999/01/26 18:27:23 madden
1188 * handle delta sequences correctly
1189 *
1190 * Revision 6.105 1999/01/26 17:59:26 madden
1191 * ContextToFrame no longer static
1192 *
1193 * Revision 6.104 1999/01/25 21:31:25 madden
1194 * Check for illegal chars when nucl. query is translated
1195 *
1196 * Revision 6.103 1999/01/25 19:04:37 madden
1197 * prevent core-dump when query is empty
1198 *
1199 * Revision 6.102 1999/01/20 21:05:33 madden
1200 * Look for repeats on both strands
1201 *
1202 * Revision 6.101 1999/01/19 13:29:24 madden
1203 * Change to HspArrayPurge
1204 *
1205 * Revision 6.100 1998/12/31 18:17:08 madden
1206 * Added strand option
1207 *
1208 * Revision 6.99 1998/12/31 15:36:07 victorov
1209 * filtering internals is now based on SeqLoc instead of Bioseq
1210 *
1211 * Revision 6.98 1998/12/18 16:20:18 madden
1212 * efficiencies
1213 *
1214 * Revision 6.97 1998/12/15 14:11:29 madden
1215 * Change to permit an arbitrary number of HSPs
1216 *
1217 * Revision 6.96 1998/11/30 15:58:20 madden
1218 * Added CheckStartForGappedAlignment
1219 *
1220 * Revision 6.95 1998/11/27 15:24:12 madden
1221 * Duplicated handle_results and query_id if SearchBlk duplicated
1222 *
1223 * Revision 6.94 1998/11/16 17:39:23 kans
1224 * added FALSE for new parameter to FilterCC
1225 *
1226 * Revision 6.93 1998/11/06 14:13:01 madden
1227 * Added call to AdjustOffSetsInSeqAlign in BioseqBlastEngineByLocEx
1228 *
1229 * Revision 6.92 1998/10/21 13:44:16 madden
1230 * Fixed UMR found by purify
1231 *
1232 * Revision 6.91 1998/10/20 19:57:21 madden
1233 * Run dust if filtering is selected for nt
1234 *
1235 * Revision 6.90 1998/10/13 20:37:53 madden
1236 * Use IS_residue after call to SeqPortGetResidue
1237 *
1238 * Revision 6.89 1998/09/24 15:26:38 egorov
1239 * Fix lint complaints
1240 *
1241 * Revision 6.88 1998/09/16 19:00:16 madden
1242 * Added subset Boolean
1243 *
1244 * Revision 6.87 1998/09/15 13:12:29 madden
1245 * Fixed memory leak
1246 *
1247 * Revision 6.86 1998/09/14 15:11:18 egorov
1248 * Add support for Int8 length databases; remove unused variables
1249 *
1250 * Revision 6.85 1998/09/04 20:48:48 madden
1251 * typo fix (= instead of ==)
1252 *
1253 * Revision 6.84 1998/09/03 20:23:42 madden
1254 * Copied seq_ext and seq_ext_type in MakeFakeBioseq
1255 *
1256 * Revision 6.83 1998/09/03 19:41:09 madden
1257 * do not switch sequences for Blast2Sequences if filtering is performed
1258 *
1259 * Revision 6.82 1998/08/24 14:59:59 madden
1260 * readdb_get_sequence_ex function
1261 *
1262 * Revision 6.81 1998/07/30 19:00:56 madden
1263 * Fix memory leak
1264 *
1265 * Revision 6.80 1998/07/29 21:29:45 madden
1266 * Fixed UMR with longest_db_seq that showed up in Blast 2 sequences
1267 *
1268 * Revision 6.79 1998/07/28 21:18:35 madden
1269 * Change to BLAST_ExtendWordParamsNew saves memory
1270 *
1271 * Revision 6.78 1998/07/24 14:58:53 madden
1272 * Jinqhuis call to SeqLocRevCmp put back
1273 *
1274 * Revision 6.77 1998/07/22 20:31:51 madden
1275 * Replaced cutvalue of 1000000 with INT4_MAX
1276 *
1277 * Revision 6.76 1998/07/22 12:17:03 madden
1278 * Added BioseqHitRange call for repeat filtering
1279 *
1280 * Revision 6.75 1998/07/21 20:58:10 madden
1281 * Changes to allow masking at hash only
1282 *
1283 * Revision 6.74 1998/07/20 15:51:28 zjing
1284 * add a check for plus-minus before SeqLocRevCmp
1285 *
1286 * Revision 6.73 1998/07/17 15:39:59 madden
1287 * Changes for Effective search space.
1288 *
1289 * Revision 6.72 1998/07/14 21:31:43 madden
1290 * Fix for incorrectly sorted HSP bug and speed-up of CheckHspOverlap
1291 *
1292 * Revision 6.71 1998/07/06 13:39:04 madden
1293 * Fixed improper use of Int4 in parse_seg_options
1294 *
1295 * Revision 6.70 1998/07/02 21:00:39 egorov
1296 * Remove memory leak in threaded version
1297 *
1298 * Revision 6.69 1998/06/12 22:09:14 madden
1299 * Added call to SegParamsFree
1300 *
1301 * Revision 6.68 1998/06/12 16:08:51 madden
1302 * BlastHitRange stuff
1303 *
1304 * Revision 6.67 1998/06/08 15:07:32 madden
1305 * Fixed bug in BlastConvertProteinSeqLoc
1306 *
1307 * Revision 6.66 1998/06/04 16:23:17 madden
1308 * Use new seg
1309 *
1310 * Revision 6.65 1998/05/28 19:59:58 madden
1311 * Zhengs new culling code
1312 *
1313 * Revision 6.64 1998/05/22 20:20:38 madden
1314 * Added BlastTwoSequencesByLocEx and BlastTwoSequencesEx
1315 *
1316 * Revision 6.63 1998/05/18 17:58:31 madden
1317 * fixed parsing of coil-coil options, added parsing of dust options
1318 *
1319 * Revision 6.62 1998/05/17 16:28:41 madden
1320 * Allow changes to filter options and cc filtering.
1321 *
1322 * Revision 6.61 1998/05/05 14:05:35 madden
1323 * Added functions BlastStartAwakeThread and BlastStopAwakeThread
1324 *
1325 * Revision 6.60 1998/04/28 21:04:19 madden
1326 * Reset number of HSPs to zero if relinking
1327 *
1328 * Revision 6.59 1998/04/24 21:52:09 madden
1329 * Protection against NULL pointers
1330 *
1331 * Revision 6.58 1998/04/24 19:10:59 egorov
1332 * Fix bug when if wordsize == 2 blastall produces extra alignments
1333 *
1334 * Revision 6.57 1998/04/23 21:15:09 egorov
1335 * Show exact matching even if score is below threshold (case of two sequences)
1336 *
1337 * Revision 6.56 1998/04/15 20:24:54 madden
1338 * BlastMaskTheResidues optimized
1339 *
1340 * Revision 6.55 1998/04/10 17:46:58 madden
1341 * Changed FALSE to NULL in BioseqSeg
1342 *
1343 * Revision 6.54 1998/04/02 21:12:55 madden
1344 * Properly set value for linking HSPs in blastx and tblastn
1345 *
1346 * Revision 6.53 1998/04/01 22:47:35 madden
1347 * Check for query_invalid flag
1348 *
1349 * Revision 6.52 1998/03/26 14:20:20 madden
1350 * Changed GetScoreSetFromBlastResultHsp1 from static to LIBCALL
1351 *
1352 * Revision 6.51 1998/03/25 22:28:16 madden
1353 * Changes to allow random access BLAST by gi
1354 *
1355 * Revision 6.50 1998/03/24 15:38:25 madden
1356 * Use BlastDoubleInt4Ptr to keep track of gis and ordinal_ids
1357 *
1358 * Revision 6.49 1998/03/19 22:16:24 madden
1359 * Changes to allow blasting by gi list
1360 *
1361 * Revision 6.48 1998/03/18 14:14:11 madden
1362 * Support random access by gi list
1363 *
1364 * Revision 6.47 1998/03/16 17:41:59 madden
1365 * Fixed leaks
1366 *
1367 * Revision 6.46 1998/03/14 18:28:10 madden
1368 * Added BioseqBlastEngineEx
1369 *
1370 * Revision 6.45 1998/03/09 16:35:10 madden
1371 * Fixed bug with tblastn and blastx gapped searches
1372 *
1373 * Revision 6.44 1998/02/27 14:32:33 madden
1374 * Functions moved to blastool.c
1375 *
1376 * Revision 6.43 1998/02/26 22:34:27 madden
1377 * Changes for 16 bit windows
1378 *
1379 * Revision 6.42 1998/02/26 19:12:39 madden
1380 * Removed AdjustOffSetsInSeqAlign, added BlastNtFindWords BlastPopulateAllWordArrays BlastFindWords and BlastNewFindWords
1381 *
1382 * Revision 6.41 1998/02/24 22:47:06 madden
1383 * Fixed problem with Option validation
1384 *
1385 * Revision 6.40 1998/02/23 16:09:57 madden
1386 * Corrected from offset for subject in tblastx search
1387 *
1388 * Revision 6.39 1998/02/19 17:17:05 madden
1389 * Use of Int4 rather than Int2 when pruning SeqAlign
1390 *
1391 * Revision 6.38 1998/02/12 21:50:39 madden
1392 * protection against NULL hitlist in blastx and tblastn
1393 *
1394 * Revision 6.37 1998/02/11 17:18:19 madden
1395 * Made BlastGetGappedAlignmentTraceback functions to BlastGetGapAlgnTbck (shorter than 32 chars)
1396 *
1397 * Revision 6.36 1998/01/31 21:34:09 madden
1398 * Fix to SeqAlign pruning
1399 *
1400 * Revision 6.35 1998/01/06 18:26:22 madden
1401 * Use SeqLocLen rather than bsp->length, wordsize done properly for nucl
1402 *
1403 * Revision 6.34 1998/01/05 22:41:40 madden
1404 * Added seqalign_reverse_strand
1405 *
1406 * Revision 6.33 1998/01/05 20:53:16 madden
1407 * Added ability to align minus-minus or plus-minus in BlastTwoSeqsByLoc
1408 *
1409 * Revision 6.32 1998/01/05 16:46:55 madden
1410 * One or both strands can be searched, as opposed to only both, changes to number of contexts
1411 *
1412 * Revision 6.31 1997/12/31 17:52:09 madden
1413 * Change to BLAST_WordFinderNew
1414 *
1415 * Revision 6.30 1997/12/23 19:16:52 madden
1416 * Minor efficiency in ExtendWordExit
1417 *
1418 * Revision 6.29 1997/12/23 18:12:34 madden
1419 * Changes for range-dependent blast
1420 *
1421 * Revision 6.28 1997/12/12 20:38:55 madden
1422 * ContextToFrame lost last parameter, fix to sprintf
1423 *
1424 * Revision 6.27 1997/12/11 22:22:24 madden
1425 * Proper casting of variables
1426 *
1427 * Revision 6.26 1997/12/10 22:43:09 madden
1428 * proper casting
1429 *
1430 * Revision 6.25 1997/12/01 22:07:10 madden
1431 * Changed call to BLASTOptionValidateEx
1432 *
1433 * Revision 6.24 1997/11/28 18:19:33 madden
1434 * Changes to TxDfDbInfoNew
1435 *
1436 * Revision 6.23 1997/11/18 22:23:20 madden
1437 * Added BLASTOptionSetGapParams
1438 *
1439 * Revision 6.22 1997/11/14 17:15:29 madden
1440 * Realign matches when they contain ambiguities in blastx/tblastn
1441 *
1442 * Revision 6.21 1997/11/07 00:49:02 madden
1443 * Added call to BLAST_MatrixFill
1444 *
1445 * Revision 6.20 1997/10/29 22:11:13 madden
1446 * ABS value of frames
1447 *
1448 * Revision 6.19 1997/10/24 20:44:52 madden
1449 * Removed BlastSetReadDB and BlastGetReadDB_ID
1450 *
1451 * Revision 6.18 1997/10/22 21:46:34 madden
1452 * Changed default values
1453 *
1454 * Revision 6.17 1997/10/21 20:39:18 madden
1455 * Fix for more alignments than descriptions.
1456 *
1457 * Revision 6.16 1997/10/21 19:50:00 madden
1458 * Fix for no valid query sequence and hitlist_max of 1
1459 *
1460 * Revision 6.15 1997/10/03 21:27:28 madden
1461 * Added BlastGetTypes
1462 *
1463 * Revision 6.14 1997/10/02 17:29:29 madden
1464 * Added PrintDbInformationBasic
1465 *
1466 * Revision 6.13 1997/10/01 13:35:31 madden
1467 * Changed BLAST_VERSION to BLAST_ENGINE_VERSION
1468 *
1469 * Revision 6.12 1997/09/30 20:03:07 madden
1470 * Saved db filename in dbinfo
1471 *
1472 * Revision 6.11 1997/09/24 22:36:35 madden
1473 * Fixes for MT multidb searches
1474 *
1475 * Revision 6.10 1997/09/23 16:43:41 madden
1476 * removed unneeded DenseSegPtr
1477 *
1478 * Revision 6.9 1997/09/22 18:18:35 madden
1479 * Added umlaut to Schaffer in reference
1480 *
1481 * Revision 6.8 1997/09/18 22:22:03 madden
1482 * Added prune functions
1483 *
1484 * Revision 6.7 1997/09/16 16:54:09 kans
1485 * return FALSE instead of NULL for Boolean value
1486 *
1487 * Revision 6.6 1997/09/16 16:31:28 madden
1488 * More changes for multiple db runs
1489 *
1490 * Revision 6.5 1997/09/11 18:49:31 madden
1491 * Changes to enable searches against multiple databases.
1492 *
1493 * Revision 6.4 1997/09/10 21:28:00 madden
1494 * Changes to set CPU limits
1495 *
1496 * Revision 6.3 1997/09/08 16:25:32 madden
1497 * Fixed bug that did not mask low-complexity regions at the end of a query
1498 *
1499 * Revision 6.2 1997/08/27 14:46:51 madden
1500 * Changes to enable multiple DB searches
1501 *
1502 * Revision 6.1 1997/08/26 15:05:26 madden
1503 * Fix for negative effective search space
1504 *
1505 * Revision 6.0 1997/08/25 18:52:49 madden
1506 * Revision changed to 6.0
1507 *
1508 * Revision 1.105 1997/08/22 18:37:43 madden
1509 * Added function BlastOtherReturnsPrepare
1510 *
1511 * Revision 1.104 1997/08/20 21:43:34 madden
1512 * Added page numbers
1513 *
1514 * Revision 1.103 1997/08/14 21:07:08 madden
1515 * ignored gapped for tblastx
1516 *
1517 * Revision 1.102 1997/08/14 14:30:35 madden
1518 * BlastNewFindWords called with range set for ranged blast
1519 *
1520 * Revision 1.101 1997/07/31 21:18:11 madden
1521 * Removed left-over file from seg
1522 *
1523 * Revision 1.100 1997/07/30 16:39:30 madden
1524 * Print gap existence and extension parameters for blastn
1525 *
1526 * Revision 1.99 1997/07/30 16:31:37 madden
1527 * tblastx prepares StdSeg
1528 *
1529 * Revision 1.98 1997/07/29 17:07:27 madden
1530 * better tblastx error messages.
1531 *
1532 * Revision 1.97 1997/07/25 15:39:49 madden
1533 * Corrected citation
1534 *
1535 * Revision 1.96 1997/07/25 13:47:46 madden
1536 * Made buffer longer to avoid ABR
1537 *
1538 * Revision 1.95 1997/07/23 20:59:02 madden
1539 * Changed blastn defaults for gap opening and extension
1540 *
1541 * Revision 1.94 1997/07/22 17:22:41 madden
1542 * Added NULL arg (for index callback) to BLASTSetUpSearch funcs
1543 *
1544 * Revision 1.93 1997/07/21 17:36:42 madden
1545 * Added BlastGetReleaseDate
1546 *
1547 * Revision 1.92 1997/07/18 20:57:02 madden
1548 * Added functions BlastGetVersionNumber and BlastGetReference
1549 *
1550 * Revision 1.91 1997/07/18 14:26:20 madden
1551 * call to AcknowledgeBlastQuery changed, SeqId no longer deleted there.
1552 *
1553 * Revision 1.90 1997/07/16 20:34:35 madden
1554 * Added function BlastConvertProteinSeqLoc
1555 *
1556 * Revision 1.89 1997/07/15 20:36:14 madden
1557 * Added BioseqSeg and SeqLocSeg
1558 *
1559 * Revision 1.88 1997/07/14 20:11:10 madden
1560 * Removed unused variables
1561 *
1562 * Revision 1.87 1997/07/14 16:15:41 madden
1563 * call to BLASTOptionValidateEx in BlastBioseqEngine
1564 *
1565 * Revision 1.86 1997/07/14 15:31:49 madden
1566 * Added BlastErrorMessage functions
1567 *
1568 * Revision 1.85 1997/07/11 19:29:37 madden
1569 * Added function BioseqBlastEngineByLoc
1570 *
1571 * Revision 1.84 1997/07/10 20:35:43 madden
1572 * Changed parameter output
1573 *
1574 * Revision 1.83 1997/07/02 20:18:39 madden
1575 * Made continuous SeqAlign the default
1576 *
1577 * Revision 1.82 1997/07/02 18:31:39 madden
1578 * changed defaults
1579 *
1580 * Revision 1.81 1997/07/01 19:15:44 madden
1581 * More changes to FormatBlastParameters
1582 *
1583 * Revision 1.80 1997/07/01 17:51:36 madden
1584 * changed gap_decay rate, gap_prob
1585 *
1586 * Revision 1.79 1997/07/01 15:44:44 madden
1587 * Changes to FormatBlastParameters per S. Altschul
1588 *
1589 * Revision 1.78 1997/06/30 15:50:06 madden
1590 * Changes to FormatBlastParameters
1591 *
1592 * Revision 1.77 1997/06/27 22:18:51 madden
1593 * Updated default parameters
1594 *
1595 * Revision 1.76 1997/06/27 14:31:08 madden
1596 * Added functions BlastAddSeqIdToList and BlastSeqIdListDestruct
1597 *
1598 * Revision 1.75 1997/06/24 13:51:27 madden
1599 * Fixed SeqLoc leak
1600 *
1601 * Revision 1.74 1997/06/23 20:49:31 madden
1602 * BLASTOptionValidate checks for proper gapping parameters
1603 *
1604 * Revision 1.73 1997/06/20 13:11:33 madden
1605 * Made AdjustOffSetsInSeqAlign non-static, Fixed purify error
1606 *
1607 * Revision 1.72 1997/06/06 21:29:48 madden
1608 * Added Boolean html to AcknowledgeBlastQuery and PrintDbInformation
1609 *
1610 * Revision 1.71 1997/06/06 19:49:46 madden
1611 * Added BlastMakeFakeBioseq and BlastDeleteFakeBioseq
1612 *
1613 * Revision 1.70 1997/05/30 21:05:59 madden
1614 * corrected call to readdb_new
1615 *
1616 * Revision 1.69 1997/05/27 20:20:02 madden
1617 * Added function BlastMaskTheResidues
1618 *
1619 * Revision 1.68 1997/05/22 21:24:55 madden
1620 * Added support for final gapX dropoff value
1621 *
1622 * Revision 1.67 1997/05/20 17:52:58 madden
1623 * Added functions BlastTwoSequencesByLoc and BlastSequencesOnTheFlyByLoc
1624 *
1625 * Revision 1.66 1997/05/12 21:34:16 madden
1626 * readdb_new allows indeterminate database type
1627 *
1628 * Revision 1.65 1997/05/06 22:17:59 madden
1629 * Duplicate dblen_eff, dbseq_num, and length_adjustment
1630 *
1631 * Revision 1.64 1997/05/01 15:53:19 madden
1632 * Addition of extra KarlinBlk's for psi-blast
1633 *
1634 * Revision 1.63 1997/04/29 14:07:45 madden
1635 * Fixed problem with hits failing PreliminaryGapping; fixed UMR.
1636 *
1637 * Revision 1.62 1997/04/25 20:23:06 madden
1638 * Freed SeqPort to clear mem leak.
1639 *
1640 * Revision 1.61 1997/04/24 14:43:07 madden
1641 * Fix for minus strand (ungapped) tblastn runs.
1642 *
1643 * Revision 1.60 1997/04/23 21:56:07 madden
1644 * Changes in BlastGetGappedAlignmentTraceback for in-frame gapping tblastn.
1645 *
1646 * Revision 1.59 1997/04/22 14:00:14 madden
1647 * Removed unused variables.
1648 *
1649 * Revision 1.58 1997/04/22 13:04:19 madden
1650 * Changes for in-frame blastx gapping.
1651 *
1652 * Revision 1.57 1997/04/21 15:35:26 madden
1653 * Fixes for 'gapped' StdSegs.
1654 *
1655 * Revision 1.56 1997/04/18 17:08:35 madden
1656 * Corrected printing of threshold values.
1657 *
1658 * Revision 1.55 1997/04/17 22:12:43 madden
1659 * Fix for offset in GetStartForGappedAlignment.
1660 *
1661 * Revision 1.54 1997/04/17 22:07:48 madden
1662 * Changes to allow in-frame gapped tblastn.
1663 *
1664 * Revision 1.53 1997/04/15 22:02:59 madden
1665 * Set original_length1 for translating searches.
1666 *
1667 * Revision 1.52 1997/04/14 21:31:58 madden
1668 * Checking for NULL pointer.
1669 *
1670 * Revision 1.51 1997/04/14 15:59:47 madden
1671 * Changes for ungapped psi-blast.
1672 *
1673 * Revision 1.50 1997/04/11 21:18:45 madden
1674 * Added GetSequenceWithDenseSeg.
1675 *
1676 * Revision 1.49 1997/04/11 19:02:49 madden
1677 * Changes for in-frame blastx, tblastn gapping.
1678 *
1679 * Revision 1.48 1997/04/09 20:01:53 madden
1680 * Copied seqid_list from search structure to duplicate, for use on threads.
1681 *
1682 * Revision 1.47 1997/04/08 16:27:28 madden
1683 * Fixed leaks; fix for blastn formatting of parameters.
1684 *
1685 * Revision 1.46 1997/04/07 21:42:56 madden
1686 * Freed SeqLocPtr used for dust.
1687 *
1688 * Revision 1.45 1997/04/07 18:17:09 madden
1689 * Formatted parameters for Stephen.
1690 *
1691 * Revision 1.44 1997/04/04 20:44:09 madden
1692 * Added check for NULL return.
1693 *
1694 * Revision 1.43 1997/04/04 20:42:35 madden
1695 * Added function BioseqBlastEngineCore.
1696 *
1697 * Revision 1.42 1997/04/03 19:50:56 madden
1698 * Changes to use effective database length instead of the length of each
1699 * sequence in statistical calculations.
1700 *
1701 * Revision 1.41 1997/03/27 22:30:51 madden
1702 * Correctly checked for overlapping HSP's.
1703 *
1704 * Revision 1.40 1997/03/20 22:56:24 madden
1705 * Added gap_info to hsp.
1706 *
1707 * Revision 1.39 1997/03/20 21:52:10 madden
1708 * Fix for segmented query BioseqPtr when gapped alignment is performed.
1709 *
1710 * Revision 1.39 1997/03/20 21:52:10 madden
1711 * Fix for segmented query BioseqPtr when gapped alignment is performed.
1712 *
1713 * Revision 1.38 1997/03/14 22:06:11 madden
1714 * fixed MT bug in BlastReevaluateWithAmbiguities.
1715 *
1716 * Revision 1.37 1997/03/14 15:57:23 madden
1717 * Removed superfluous call to SeqAlignNew
1718 *
1719 * Revision 1.36 1997/03/14 15:22:11 madden
1720 * Fixed UMR of seqalign in BlastTwoSequencesCore.
1721 *
1722 * Revision 1.35 1997/03/11 14:38:40 madden
1723 * Added BlastSequencesOnTheFly and BlastTwoSequencesCore.
1724 *
1725 * Revision 1.34 1997/03/07 22:35:54 madden
1726 * Fix for BLASTOptionNew.
1727 *
1728 * Revision 1.33 1997/03/07 21:58:36 madden
1729 * Added Boolean gapped argument to BLASTOptionNew.
1730 *
1731 * Revision 1.32 1997/03/07 21:11:22 madden
1732 * Added in check for blastn on gapped calculations.
1733 *
1734 * Revision 1.31 1997/03/06 21:47:27 madden
1735 * Made FormatBlastParameters non-static.
1736 *
1737 * Revision 1.30 1997/03/05 18:16:16 madden
1738 * SeqIdFree replaced by SeqIdSetFree, fixed memory leak.
1739 *
1740 * Revision 1.29 1997/03/05 14:29:46 madden
1741 * Moved BlastSaveCurrentHsp from blast.c; Added function CheckHspOverlap.
1742 *
1743 * Revision 1.28 1997/03/04 21:34:59 madden
1744 * Added in HspArrayPurge.
1745 *
1746 * Revision 1.27 1997/03/04 20:08:19 madden
1747 * Moved gapped alignment code from blast.c to blastutl.c
1748 *
1749 * Revision 1.26 1997/03/03 22:39:45 madden
1750 * Moved code from blast.c to blastutl.c.
1751 *
1752 * Revision 1.25 1997/03/03 21:47:22 madden
1753 * Moved functions from blast.c to blastutl.c for 16-bit windows.
1754 *
1755 * Revision 1.24 1997/03/03 20:58:09 madden
1756 * Fixed offsets for minus strands.
1757 *
1758 * Revision 1.23 1997/03/03 17:30:21 madden
1759 * Set SeqAlignPtr to NULL in BlastTwoSequences and BlastBioseqEngine, possible UMR.
1760 *
1761 * Revision 1.22 1997/03/01 18:25:33 madden
1762 * reverse flag added to BlastGetGappedAlignmentTraceback functions.
1763 *
1764 * Revision 1.21 1997/02/27 22:47:07 madden
1765 * Replaced tblastx with tblastn in BioseqBlastEngine.
1766 *
1767 * Revision 1.20 1997/02/26 23:39:54 madden
1768 * Added Txdfline stuff.
1769 *
1770 * Revision 1.19 1997/02/26 20:37:31 madden
1771 * Added *error_returns to BioseqBlastEngine.
1772 *
1773 * Revision 1.18 1997/02/25 19:17:05 madden
1774 * Changes to BioseqBlastEngine.
1775 *
1776 * Revision 1.17 1997/02/20 23:00:34 madden
1777 * Checked for NULL return in BlastTwoSequences.
1778 *
1779 * Revision 1.16 1997/02/20 18:38:34 madden
1780 * Set Default db_length to zero in Options.
1781 *
1782 * Revision 1.15 1997/02/19 16:25:22 madden
1783 * Reset gapped_calculation for blastn; returned proper SeqAlign for blastx, tblastn
1784 * in BioseqBlastEngine.
1785 *
1786 * Revision 1.14 1997/02/19 13:45:13 madden
1787 * replaced zero in call to BlastGetGappedAlignmentTraceback with FALSE.
1788 *
1789 * Revision 1.13 1997/02/18 22:09:02 madden
1790 * Removed unused variable.
1791 *
1792 * Revision 1.12 1997/02/18 21:03:00 madden
1793 * Changes to BioseqBlastEngine for gapped calculations.
1794 *
1795 * Revision 1.11 1997/02/18 18:31:34 madden
1796 * Used SeqIdFindBest in BlastTwoSequences.
1797 *
1798 * Revision 1.10 1997/02/18 17:58:52 madden
1799 * Added BioseqBlastEngine.
1800 *
1801 * Revision 1.9 1997/02/14 17:17:59 madden
1802 * Changes to default options and BlastTwoSequences for nucl.
1803 * sequences with ambiguities.
1804 *
1805 * Revision 1.8 1997/02/13 18:23:56 madden
1806 * Fixed ID type from BlastTwoSequences.
1807 *
1808 * Revision 1.7 1997/02/11 19:30:54 madden
1809 * Changes to BlastTwoSequences for gapped alignments.
1810 *
1811 * Revision 1.6 1997/02/10 20:03:58 madden
1812 * BlastTwoSequences indexes only the subject.
1813 *
1814 * Revision 1.5 1997/02/10 15:24:26 madden
1815 * Removed unused variable.
1816 *
1817 * Revision 1.4 1997/02/07 22:43:03 madden
1818 * Moved BLAST_WordFinderNew and Destruct from blast.c to blastutl.c, made
1819 * non-static.
1820 *
1821 * Revision 1.3 1997/02/07 22:32:40 madden
1822 * Changed prototypes for BlastGetSubjectId and GetSeqAlignForResultHitList.
1823 *
1824 * Revision 1.2 1997/02/05 13:36:48 madden
1825 * Removed Unused variable.
1826 *
1827 * Revision 1.1 1997/02/04 18:23:58 madden
1828 * Initial revision
1829 *
1830 */
1831
1832 #define NLM_GENERATED_CODE_PROTO
1833 #include <ncbi.h>
1834 #include <blast.h>
1835 #include <blastpri.h>
1836 #include <objcode.h>
1837 #include <objseq.h>
1838 #include <sequtil.h>
1839 #include <tofasta.h>
1840 #include <seqport.h>
1841 #include <readdb.h>
1842 #include <ncbithr.h>
1843 #include <blast_dust.h>
1844 #include <urkpcc.h>
1845 #include <txalign.h>
1846 #include <seg.h>
1847 #include <salpedit.h>
1848 #include <mbalign.h>
1849 #include <mblast.h>
1850 #include <vecscrn.h>
1851 #include <rpsutil.h>
1852 #include <simutil.h>
1853 #include <blfmtutl.h>
1854
1855 typedef struct _pgp_blast_options {
1856 BLAST_OptionsBlkPtr options;
1857 CharPtr blast_database;
1858 BioseqPtr query_bsp, fake_bsp;
1859 Int4 number_of_descriptions, number_of_alignments;
1860 FILE *infp, *outfp;
1861 AsnIoPtr aip_out;
1862 Boolean html;
1863 Boolean believe_query;
1864 Uint4 align_options, print_options;
1865 /* PHI-PSI Blast variables */
1866 Uint1 featureOrder[FEATDEF_ANY];
1867 Uint1 groupOrder[FEATDEF_ANY];
1868 Int4 program_flag;
1869 CharPtr patfile;
1870 FILE *patfp;
1871 seedSearchItems *seedSearch;
1872 } PGPBlastOptions, PNTR PGPBlastOptionsPtr;
1873
/* Size of the window used when scanning an HSP for its highest-scoring
   region; that region is where gapped extension starts. */
#define HSP_MAX_WINDOW 11

/* Directory path for BLAST filter resources.
   NOTE(review): not referenced in this part of the file -- confirm its
   users before changing it. */
#define BLASTFILTER_DIR "/usr/ncbi/blast/filter"
1879
1880 static SeqIdPtr
1881 BlastGetFirstGiofSubset(ReadDBFILEPtr rdfp, Int4 ordinal_id, Int2 aliasfilebit)
1882 {
1883 Boolean not_done = TRUE;
1884 SeqIdPtr bestid = NULL, tmp_seqid, seqid=NULL;
1885 Uint4 header_index = 0;
1886 Int4 gi = 0;
1887 Int4 alias_mask;
1888 BlastDefLinePtr bdfp;
1889
1890 if (!rdfp->cih && rdfp->formatdb_ver < FORMATDB_VER) {
1891 /* FORMATDB_VER_TEXT version requires the common index
1892 * to determine the subset databases */
1893 ErrPostEx(SEV_ERROR, 0, 0, "Database mask cannot be used without CommonIndex");
1894 return NULL;
1895 }
1896
1897 alias_mask = (0x1 << rdfp->aliasfilebit);
1898
1899 bdfp = NULL;
1900 if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
1901 bdfp = FDReadDeflineAsn(rdfp, ordinal_id);
1902 if(bdfp == NULL) {
1903 ErrPostEx(SEV_ERROR, 0, 0, "Failure to read defline ASN for %d",
1904 ordinal_id);
1905 return NULL;
1906 }
1907
1908 bestid = SeqIdFindBest(bdfp->seqid, SEQID_GI);
1909 if (bestid->choice == SEQID_GI) {
1910 gi = bestid->data.intvalue;
1911 ValNodeAddInt(&seqid, SEQID_GI, gi);
1912 }
1913 bdfp = BlastDefLineSetFree(bdfp);
1914
1915 return seqid;
1916 }
1917
1918 while (not_done) {
1919 CommonIndexPtr cigi;
1920
1921 /* get seqid from database headers file */
1922 not_done = readdb_get_header (rdfp, ordinal_id, &header_index, &tmp_seqid, NULL);
1923
1924 if (not_done == FALSE)
1925 break;
1926
1927 if (not_done) {
1928 /* get gi number */
1929 bestid = SeqIdFindBest(tmp_seqid, SEQID_GI);
1930 if (bestid->choice != SEQID_GI) {
1931 tmp_seqid = SeqIdSetFree(tmp_seqid);
1932 break;
1933 }
1934 gi = bestid->data.intvalue;
1935
1936 /* get database commonindex mask */
1937 cigi = rdfp->cih->ci + gi;
1938 if (alias_mask & SwapUint4(cigi->dbmask)) {
1939 ValNodeAddInt(&seqid, SEQID_GI, gi);
1940 break;
1941 }
1942 tmp_seqid = SeqIdSetFree(tmp_seqid);
1943 }
1944 }
1945 tmp_seqid = SeqIdSetFree(tmp_seqid);
1946
1947 return seqid;
1948 }
1949
1950 #define BLAST_ITER_MAX 30
1951
1952 /*
1953 Goes through the list of gi's/ordinal id's looking for matches
1954 to the ordinal ID. Returns those acceptable gi's as SeqIdPtr's.
1955 */
1956 SeqIdPtr
1957 BlastGetAllowedGis (BlastSearchBlkPtr search, Int4 ordinal_id, SeqIdPtr PNTR seqid)
1958 {
1959 BlastGiListPtr blast_gi_list;
1960 Boolean found=FALSE;
1961 BlastDoubleInt4Ptr *gi_list_pointer;
1962 Int4 index, total, first, last, current;
1963 ValNodePtr gi_list=NULL;
1964
1965 if (seqid)
1966 *seqid = NULL;
1967 gi_list = NULL;
1968 if (search->thr_info->blast_gi_list) {
1969 blast_gi_list = search->thr_info->blast_gi_list;
1970 total = blast_gi_list->total;
1971 found = FALSE;
1972 gi_list_pointer = blast_gi_list->gi_list_pointer;
1973 first = 0;
1974 last = total;
1975 for (index=0; index<BLAST_ITER_MAX; index++) {
1976 current = (first+last)/2;
1977 if (ordinal_id < gi_list_pointer[current]->ordinal_id)
1978 last = current;
1979 else if (ordinal_id > gi_list_pointer[current]->ordinal_id)
1980 first = current;
1981 else { /* back up looking for all gi's associated with this oid. */
1982 while (current > 0 &&
1983 ordinal_id == gi_list_pointer[current-1]->ordinal_id)
1984 current--;
1985 found = TRUE;
1986 break;
1987 }
1988 }
1989
1990 if (found) {
1991 while (current < total) {
1992 if (ordinal_id == gi_list_pointer[current]->ordinal_id) {
1993 ValNodeAddInt(&gi_list, SEQID_GI, blast_gi_list->gi_list_pointer[current]->gi);
1994 } else {
1995 break;
1996 }
1997 current++;
1998 }
1999 }
2000
2001 if (seqid && search->rdfp && search->rdfp->aliasfilebit != 0) {
2002 *seqid = BlastGetFirstGiofSubset(search->rdfp, ordinal_id, search->rdfp->aliasfilebit);
2003 }
2004 return (SeqIdPtr) gi_list;
2005 } else if (search->rdfp != NULL && search->rdfp->oidlist != NULL) {
2006 /* if we have at least one mask, then we need print only those gis, which
2007 are in the database list (reals and masks) */
2008
2009 Boolean not_done = TRUE;
2010 SeqIdPtr bestid = NULL, tmp_seqid = NULL;
2011 Uint4 header_index = 0;
2012 Int4 gi = 0;
2013 Int4 mask;
2014 Int2 firstpos, curfirstpos;
2015 ReadDBFILEPtr rdfp = search->rdfp, tmprdfp;
2016 BlastDefLinePtr bdfp, bdfp_head;
2017
2018 if (!rdfp->cih && rdfp->formatdb_ver < FORMATDB_VER) {
2019 /* FORMATDB_VER_TEXT version requires the common index
2020 * to determine the subset databases */
2021 /*ErrPostEx(SEV_ERROR, 0, 0, "Database mask cannot be used without CommonIndex");*/
2022 return NULL;
2023 }
2024
2025 /* kludge: only protein databases are non-redundant */
2026 if (readdb_is_prot(search->rdfp) == FALSE)
2027 return NULL;
2028
2029 bdfp = NULL; bdfp_head = NULL;
2030 if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
2031 /* just chain the seqid's returned, as they are filtered in
2032 * FDReadDeflineAsn according to the membership_bit in the
2033 * rdfp */
2034 bdfp = FDReadDeflineAsn(rdfp, ordinal_id);
2035 if(bdfp == NULL) {
2036 ErrPostEx(SEV_ERROR, 0, 0, "Failure to read defline ASN for %d", ordinal_id);
2037 return NULL;
2038 }
2039 for (bdfp_head = bdfp; bdfp; bdfp = bdfp->next) {
2040 bestid = SeqIdFindBest(bdfp->seqid, SEQID_GI);
2041 if (bestid->choice == SEQID_GI) {
2042 gi = bestid->data.intvalue;
2043 ValNodeAddInt(&gi_list, SEQID_GI, gi);
2044 }
2045 }
2046
2047 BlastDefLineSetFree(bdfp_head);
2048
2049 } else {
2050
2051 while (not_done) {
2052 CommonIndexPtr cigi;
2053
2054 /* get seqid from database headers file */
2055 not_done = readdb_get_header (search->rdfp, ordinal_id, &header_index, &tmp_seqid, NULL);
2056
2057 if (not_done == FALSE)
2058 break;
2059
2060 if (not_done) {
2061 /* get gi number */
2062 bestid = SeqIdFindBest(tmp_seqid, SEQID_GI);
2063 if (bestid->choice != SEQID_GI) {
2064 tmp_seqid = SeqIdSetFree(tmp_seqid);
2065 break;
2066 }
2067 gi = bestid->data.intvalue;
2068
2069 /* get database commonindex mask */
2070 cigi = search->rdfp->cih->ci + gi;
2071 mask = SwapUint4(cigi->dbmask);
2072
2073 firstpos = 0;
2074 while (((curfirstpos = bit_engine_firstbit(mask)) != -1)) {
2075 CharPtr dbname;
2076
2077 firstpos += curfirstpos;
2078
2079 dbname = DBName(search->rdfp->cih->num_of_DBs,
2080 search->rdfp->cih->dbids, firstpos);
2081
2082 /* search in rdfp list this database */
2083 tmprdfp = search->rdfp;
2084 while (tmprdfp) {
2085 if (tmprdfp->aliasfilename) {
2086 /* use mask name, if exists */
2087 if (!StrCmp(dbname, tmprdfp->aliasfilename)) {
2088 ValNodeAddInt(&gi_list, SEQID_GI, gi);
2089 }
2090 } else {
2091 /* use real file name */
2092 if (!StrCmp(dbname, tmprdfp->filename)) {
2093 ValNodeAddInt(&gi_list, SEQID_GI, gi);
2094 }
2095 }
2096 tmprdfp = tmprdfp->next;
2097 }
2098 mask >>= (curfirstpos + 1);
2099 firstpos++;
2100 }
2101 }
2102
2103 if (tmp_seqid) {
2104 tmp_seqid = SeqIdSetFree(tmp_seqid);
2105 }
2106 }
2107 }
2108 if (seqid)
2109 *seqid = BlastGetFirstGiofSubset(search->rdfp, ordinal_id, search->rdfp->aliasfilebit);
2110
2111 return (SeqIdPtr) gi_list;
2112 }
2113
2114 return NULL;
2115 }
2116
2117 /*
2118 SOME FUNCTIONS TO PRODUCE A SeqAlign from the BLAST results.
2119 */
2120
2121 /*****************************************************************************
2122
2123 Finds the best SeqId for the SeqAlign. Looks for the GI, then takes
2124 anything if that's not found and makes up a local ID if no ID is
2125 found at all.
2126 *****************************************************************************/
2127
2128 SeqIdPtr
2129 GetTheSeqAlignID(SeqIdPtr seq_id)
2130 {
2131 SeqIdPtr new_id, ret_id;
2132 ObjectIdPtr obidp;
2133
2134 ret_id = NULL;
2135 if (seq_id)
2136 {
2137 /* Get the gi from the chain, if it's there. */
2138 new_id = SeqIdFindBest(seq_id, SEQID_GI);
2139 if (new_id)
2140 {
2141 ret_id = SeqIdDup(new_id);
2142 }
2143 else
2144 { /* No Gi was found, use any ID. */
2145 ret_id = SeqIdDup(seq_id);
2146 }
2147 }
2148
2149 if (ret_id == NULL)
2150 { /* make up an ID. */
2151 obidp = ObjectIdNew();
2152 obidp->str = StringSave("lcl|unknown");
2153 ValNodeAddPointer(&ret_id, SEQID_LOCAL, obidp);
2154 }
2155
2156 return ret_id;
2157 }
2158 static SeqAlignPtr
2159 FillInSegsInfo(SeqAlignPtr sap_head, StdSegPtr ssp_head, DenseDiagPtr ddp_head)
2160
2161 {
2162 SeqAlignPtr sap;
2163
2164 if (ddp_head || ssp_head)
2165 {
2166 if (sap_head)
2167 {
2168 sap = sap_head;
2169 while (sap->next)
2170 sap = sap->next;
2171 sap->next = SeqAlignNew();
2172 sap = sap->next;
2173 }
2174 else
2175 {
2176 sap_head = sap = SeqAlignNew();
2177 }
2178
2179 if (ddp_head)
2180 {
2181 sap->type = 2;
2182 sap->segs = ddp_head;
2183 sap->segtype = 1;
2184 }
2185 else if (ssp_head)
2186 {
2187 sap->type = 2;
2188 sap->segs = ssp_head;
2189 sap->segtype = 3;
2190 }
2191 }
2192 return sap_head;
2193 }
2194
2195
2196 /*************************************************************************
2197 *
2198 * This function fills in the DenseDiag Information from the variable
2199 * hsp. On the first call to this function *old should be
2200 * NULL, after that pass in the head of the DenseDiagPtr chain.
2201 * The newest DenseDiagPtr is returned.
2202 *
2203 ************************************************************************/
2204
2205 static DenseDiagPtr
2206 FillInDenseDiagInfo(DenseDiagPtr PNTR old, BLASTResultHspPtr hsp, Boolean reverse, Int4 query_length, Int4 subject_length, SeqIdPtr gi_list)
2207
2208 {
2209 DenseDiagPtr ddp, new;
2210
2211 new = DenseDiagNew();
2212
2213 new->dim = 2; /* Only 2 is supported in spec. */
2214 new->len = hsp->query_length;
2215 new->starts = (Int4Ptr) MemNew(2 * sizeof(Int4));
2216 new->strands = (Uint1Ptr) MemNew(2 * sizeof(Uint1));
2217 if (reverse)
2218 {
2219 if (hsp->subject_frame >= 0)
2220 {
2221 new->strands[0] = Seq_strand_plus;
2222 new->starts[0] = hsp->subject_offset;
2223 }
2224 else
2225 {
2226 new->strands[0] = Seq_strand_minus;
2227 new->starts[0] = subject_length - hsp->subject_offset - hsp->subject_length;
2228 }
2229 if (hsp->query_frame >= 0)
2230 {
2231 new->strands[1] = Seq_strand_plus;
2232 new->starts[1] = hsp->query_offset;
2233 }
2234 else
2235 {
2236 new->strands[1] = Seq_strand_minus;
2237 new->starts[1] = query_length - hsp->query_offset - hsp->query_length;
2238 }
2239 }
2240 else
2241 {
2242 if (hsp->query_frame >= 0)
2243 {
2244 new->strands[0] = Seq_strand_plus;
2245 new->starts[0] = hsp->query_offset;
2246 }
2247 else
2248 {
2249 new->strands[0] = Seq_strand_minus;
2250 new->starts[0] = query_length - hsp->query_offset - hsp->query_length;
2251 }
2252 if (hsp->subject_frame >= 0)
2253 {
2254 new->strands[1] = Seq_strand_plus;
2255 new->starts[1] = hsp->subject_offset;
2256 }
2257 else
2258 {
2259 new->strands[1] = Seq_strand_minus;
2260 new->starts[1] = subject_length - hsp->subject_offset - hsp->subject_length;
2261 }
2262 }
2263 new->scores = GetScoreSetFromBlastResultHsp(hsp, gi_list);
2264
2265 /* Go to the end of the chain, and then attach "new" */
2266 if (*old)
2267 {
2268 ddp = *old;
2269 while (ddp->next)
2270 ddp = ddp->next;
2271 ddp->next = new;
2272 }
2273 else
2274 {
2275 *old = new;
2276 }
2277
2278 new->next = NULL;
2279
2280 return new;
2281 }
2282
2283 /*************************************************************************
2284 *
2285 * This function fills in the StdSeg Information from the variable
2286 * hsp. On the first call to this function *old should be
2287 * NULL, after that pass in the head of the DenseDiagPtr chain.
2288 * The newest StdSegPtr is returned.
2289 *
2290 ************************************************************************/
2291 static StdSegPtr
2292 FillInStdSegInfo(BlastSearchBlkPtr search, Int4 subject_id, Int4 length, StdSegPtr PNTR old, BLASTResultHspPtr hsp, SeqIdPtr sip, Boolean reverse, SeqIdPtr gi_list)
2293
2294 {
2295 Int4 subject_length;
2296 StdSegPtr ssp, new;
2297 SeqIdPtr query_sip, subject_sip;
2298 SeqIntPtr seq_int1, seq_int2;
2299 SeqLocPtr slp=NULL;
2300
2301 new = StdSegNew();
2302 /* Duplicate the id and split it up into query and subject parts */
2303 query_sip = SeqIdDup(sip);
2304 subject_sip = SeqIdDup(sip->next);
2305
2306 new->dim = 2; /* Only 2 is supported in spec. */
2307 seq_int1 = SeqIntNew();
2308 if (hsp->query_frame == 0)
2309 {
2310 seq_int1->from = hsp->query_offset;
2311 seq_int1->to = hsp->query_offset + hsp->query_length - 1;
2312 seq_int1->strand = Seq_strand_unknown;
2313 }
2314 else if (hsp->query_frame < 0)
2315 {
2316 seq_int1->to = search->context[hsp->context].query->original_length - CODON_LENGTH*hsp->query_offset + hsp->query_frame;
2317 seq_int1->from = search->context[hsp->context].query->original_length - CODON_LENGTH*(hsp->query_offset+hsp->query_length) + hsp->query_frame + 1;
2318 seq_int1->strand = Seq_strand_minus;
2319 }
2320 else if (hsp->query_frame > 0)
2321 {
2322 seq_int1->from = CODON_LENGTH*(hsp->query_offset) + hsp->query_frame - 1;
2323 seq_int1->to = CODON_LENGTH*(hsp->query_offset+hsp->query_length) + hsp->query_frame - 2;
2324 seq_int1->strand = Seq_strand_plus;
2325 }
2326 seq_int1->id = query_sip;
2327 seq_int2 = SeqIntNew();
2328 if (hsp->subject_frame == 0)
2329 {
2330 seq_int2->from = hsp->subject_offset;
2331 seq_int2->to = hsp->subject_offset + hsp->subject_length - 1;
2332 seq_int2->strand = Seq_strand_unknown;
2333 }
2334 else if (hsp->subject_frame < 0)
2335 {
2336 if (search->rdfp)
2337 subject_length = readdb_get_sequence_length(search->rdfp, subject_id);
2338 else
2339 subject_length = length;
2340
2341 seq_int2->from = subject_length - CODON_LENGTH*(hsp->subject_offset + hsp->subject_length) + hsp->subject_frame + 1;
2342 seq_int2->to = subject_length - CODON_LENGTH*(hsp->subject_offset) + hsp->subject_frame;
2343 seq_int2->strand = Seq_strand_minus;
2344 }
2345 else if (hsp->subject_frame > 0)
2346 {
2347 seq_int2->from = CODON_LENGTH*(hsp->subject_offset) + hsp->subject_frame - 1;
2348 seq_int2->to = CODON_LENGTH*(hsp->subject_offset + hsp->subject_length) + hsp->subject_frame - 2;
2349 seq_int2->strand = Seq_strand_plus;
2350 }
2351 seq_int2->id = subject_sip;
2352
2353 if (reverse)
2354 {
2355 ValNodeAddPointer(&slp, SEQLOC_INT, seq_int2);
2356 ValNodeAddPointer(&slp, SEQLOC_INT, seq_int1);
2357 }
2358 else
2359 {
2360 ValNodeAddPointer(&slp, SEQLOC_INT, seq_int1);
2361 ValNodeAddPointer(&slp, SEQLOC_INT, seq_int2);
2362 }
2363 new->loc = slp;
2364
2365 search->subject->sequence = MemFree(search->subject->sequence);
2366 new->scores = GetScoreSetFromBlastResultHsp(hsp, gi_list);
2367
2368 /* Go to the end of the chain, and then attach "new" */
2369 if (*old)
2370 {
2371 ssp = *old;
2372 while (ssp->next)
2373 ssp = ssp->next;
2374 ssp->next = new;
2375 }
2376 else
2377 {
2378 *old = new;
2379 }
2380
2381 new->next = NULL;
2382
2383 return new;
2384 }
2385
2386 /************************************************************************
2387 *
2388 * This function assembles all the components of the Seq-align from
2389 * a "sparse" BLAST HitList. "sparse" means that the hitlist
2390 * may contain no sequence and not even a descriptor. It is only
2391 * required to contain the sequence_number that readdb refers to
2392 * and scoring/alignment information.
2393 *
2394 * If dbname is non-NULL, then only a general ("gnl") ID is
2395 * issued, with the ordinal number of the subject sequence in
2396 * the ObjectIdPtr.
2397 *
2398 * Boolean reverse: reverse the query and db order in SeqAlign.
2399 *
2400 ************************************************************************/
2401 SeqAlignPtr LIBCALL
2402 GetSeqAlignForResultHitList(BlastSearchBlkPtr search, Boolean getdensediag, Boolean ordinal_number, Boolean discontinuous, Boolean reverse, Boolean get_redundant_seqs)
2403
2404 {
2405 BLASTResultHspPtr hsp;
2406 BLASTResultHitlistPtr results;
2407 BLASTResultsStructPtr result_struct;
2408 DenseDiagPtr ddp_head=NULL, ddp;
2409 SeqIdPtr gi_list=NULL, sip, sip_subject,
2410 sip_subject_start, query_id, new_sip;
2411 StdSegPtr ssp_head=NULL, ssp;
2412 SeqAlignPtr last, seqalign_head, seqalign, sap_head;
2413 Int4 hsp_cnt, index, index2, hspset_cnt_old, i;
2414 Int4 hitlist_count;
2415 Int4 subject_length;
2416 ValNodePtr vnp, vnp_start;
2417
2418 ddp_head = NULL;
2419 ssp_head = NULL;
2420 sap_head = NULL;
2421 seqalign_head = NULL;
2422
2423 /* discontinuous = FALSE; */
2424 result_struct = search->result_struct;
2425 hitlist_count = result_struct->hitlist_count;
2426
2427 last = NULL;
2428 sip = NULL;
2429 sip_subject_start = NULL;
2430 for (index=0; index<hitlist_count; index++)
2431 {
2432 results = result_struct->results[index];
2433 sip_subject_start = NULL;
2434 if (get_redundant_seqs)
2435 {
2436 vnp = NULL;
2437 sip = BlastGetSubjectId(search, index, ordinal_number, &vnp);
2438 vnp_start = vnp;
2439 while (vnp)
2440 {
2441 sip = GetTheSeqAlignID(vnp->data.ptrvalue);
2442 SeqIdFree(vnp->data.ptrvalue);
2443 if (sip_subject_start == NULL)
2444 {
2445 sip_subject_start = sip;
2446 }
2447 else
2448 {
2449 sip_subject = sip_subject_start;
2450 while (sip_subject->next)
2451 sip_subject = sip_subject->next;
2452 sip_subject->next = sip;
2453 }
2454 vnp = vnp->next;
2455 }
2456 vnp_start = vnp = ValNodeFree(vnp_start);
2457 }
2458 else
2459 {
2460 sip = BlastGetSubjectId(search, index, ordinal_number, NULL);
2461 sip_subject_start = sip_subject = GetTheSeqAlignID(sip);
2462 sip = SeqIdSetFree(sip);
2463 }
2464
2465 results = result_struct->results[index];
2466 if (search->rdfp)
2467 subject_length = readdb_get_sequence_length(search->rdfp, results->subject_id);
2468 else if (results->subject_info)
2469 subject_length = results->subject_info->length;
2470 else
2471 subject_length = 0;
2472
2473 gi_list = BlastGetAllowedGis(search, results->subject_id, &new_sip);
2474 /* right now sip_subject should only contain one ID. At some
2475 point it will contain multiple ID's for identical sequences. */
2476 if (new_sip != NULL)
2477 sip_subject = new_sip;
2478 else
2479 sip_subject = sip_subject_start;
2480 while (sip_subject)
2481 {
2482 seqalign = SeqAlignNew();
2483 seqalign->type = 2; /* alignment is diags */
2484 if (last == NULL) /* First sequence. */
2485 seqalign_head = seqalign;
2486 else
2487 last->next = seqalign;
2488
2489 last = seqalign;
2490
2491 hspset_cnt_old = -1;
2492 hsp_cnt = results->hspcnt;
2493 for (index2=0; index2<hsp_cnt; index2++)
2494 {
2495 hsp = &(results->hsp_array[index2]);
2496 if (discontinuous && hspset_cnt_old != hsp->hspset_cnt)
2497 {
2498 hspset_cnt_old = hsp->hspset_cnt;
2499 if (index2 != 0)
2500 { /* nothing to save on first pass. */
2501 if (getdensediag)
2502 {
2503 sap_head = FillInSegsInfo(sap_head, NULL, ddp_head);
2504 ddp_head = NULL;
2505 }
2506 else
2507 {
2508 sap_head = FillInSegsInfo(sap_head, ssp_head, NULL);
2509 ssp_head = NULL;
2510 }
2511 }
2512 }
2513
2514 query_id = search->query_id;
2515 if (search->prog_number==blast_type_blastn) {
2516 for (i=0; i<hsp->context/2; i++)
2517 query_id = query_id->next;
2518 }
2519 if (reverse)
2520 {
2521 sip = SeqIdDup(sip_subject);
2522 sip->next = GetTheSeqAlignID(query_id);
2523 }
2524 else
2525 {
2526 sip = GetTheSeqAlignID(query_id);
2527 sip->next = SeqIdDup(sip_subject);
2528 }
2529
2530 if (getdensediag)
2531 {
2532 ddp = FillInDenseDiagInfo(&ddp_head, hsp, reverse, search->context[hsp->context].query->length, subject_length, gi_list);
2533 ddp->id = sip;
2534 }
2535 else
2536 {
2537 Int4 length = 0;
2538
2539 if (results->subject_info)
2540 length = results->subject_info->length;
2541
2542 ssp = FillInStdSegInfo(search, results->subject_id, length, &ssp_head, hsp, sip, reverse, gi_list);
2543 ssp->ids = sip;
2544 }
2545 sip = NULL; /* This SeqIdPtr is now on the SeqAlign. */
2546 }
2547
2548 if (discontinuous)
2549 {
2550 if (getdensediag)
2551 {
2552 sap_head = FillInSegsInfo(sap_head, NULL, ddp_head);
2553 ddp_head = NULL;
2554 }
2555 else
2556 {
2557 sap_head = FillInSegsInfo(sap_head, ssp_head, NULL);
2558 ssp_head = NULL;
2559 }
2560 seqalign->segs = sap_head;
2561 seqalign->segtype = 5; /* Discontinuous */
2562 }
2563 else
2564 {
2565 if (getdensediag)
2566 {
2567 seqalign->segs = ddp_head;
2568 seqalign->segtype = 1; /* DenseDiag */
2569 ddp_head = NULL;
2570 }
2571 else
2572 {
2573 seqalign->segs = ssp_head;
2574 seqalign->segtype = 3; /* StdSeg */
2575 ssp_head = NULL;
2576 }
2577 }
2578
2579 sap_head = NULL;
2580
2581 sip_subject = sip_subject->next;
2582 }
2583 if (sip_subject_start)
2584 sip_subject_start = SeqIdFree(sip_subject_start);
2585 if (new_sip)
2586 new_sip = SeqIdFree(new_sip);
2587 gi_list = SeqIdSetFree(gi_list);
2588 }
2589
2590 return seqalign_head;
2591 }
2592
2593 /*
2594 "Core" function to compare two sequences, for use by
2595 BlastTwoSequences and BlastSequencesOnTheFly.
2596
2597 The subject_bsp is redundant with the subject_seq_start and
2598 subject_length (or visa-versa), but the subject must be
2599 extracted from the subject_bsp for BlastTwoSequences anyway, while
2600 the title and ID are needed from subject_bsp.
2601 */
2602 static Int2
2603 BlastTwoSequencesCoreEx (BlastSearchBlkPtr search, BioseqPtr subject_bsp, Uint1Ptr subject_seq, Int4 subject_length)
2604 {
2605 Int2 status=0;
2606
2607 search->subject_info = BLASTSubjectInfoDestruct(search->subject_info);
2608 if (!search->handle_results)
2609 search->subject_info = BLASTSubjectInfoNew(SeqIdDup(SeqIdFindBest(subject_bsp->id, SEQID_GI)), StringSave(BioseqGetTitle(subject_bsp)), subject_length);
2610 else
2611 search->subject_info = BLASTSubjectInfoNew(SeqIdSetDup(subject_bsp->id), StringSave(BioseqGetTitle(subject_bsp)), subject_length);
2612
2613 /*CC: is search->sbp->posMatrix, we're comparing a pssm with a subject
2614 * sequence, thus we need to do some set up */
2615 if (search->sbp->posMatrix && search->prog_number == blast_type_blastp) {
2616 Int4 hitlist_max;
2617 BLAST_ScoreBlkPtr sbp = search->sbp;
2618 BLAST_ParameterBlkPtr pbp = search->pbp;
2619
2620 search->positionBased = TRUE;
2621 sbp->kbp = sbp->kbp_psi;
2622 sbp->kbp_gap = sbp->kbp_gap_psi;
2623 hitlist_max = search->result_struct->hitlist_max;
2624 search->result_struct =
2625 BLASTResultsStructDelete(search->result_struct);
2626 search->result_struct = BLASTResultsStructNew(hitlist_max,
2627 pbp->max_pieces, pbp->hsp_range_max);
2628
2629 if (search->allocated & BLAST_SEARCH_ALLOC_WFP_FIRST) {
2630 search->wfp_first = BLAST_WordFinderDestruct(search->wfp_first);
2631 search->wfp_first = BLAST_WordFinderNew(sbp->alphabet_size,
2632 search->all_words->wordsize, 1, FALSE);
2633 }
2634
2635 if (search->allocated & BLAST_SEARCH_ALLOC_WFP_SECOND) {
2636 search->wfp_second = BLAST_WordFinderDestruct(search->wfp_second);
2637 search->wfp_second = BLAST_WordFinderNew(sbp->alphabet_size,
2638 search->all_words->wordsize, 1, FALSE);
2639 }
2640
2641 /* threshold_first is defunct ! */
2642 search->wfp = search->wfp_first;
2643 if (search->whole_query == TRUE)
2644 BlastNewFindWords(search, 0, search->context[0].query->length,
2645 pbp->threshold_second, (Uint1) 0);
2646 else
2647 BlastNewFindWords(search, search->required_start,
2648 search->required_end, pbp->threshold_second, (Uint1) 0);
2649 lookup_position_aux_destruct(search->wfp->lookup);
2650 search->wfp_second = search->wfp_first;
2651 }
2652 status = BLASTPerformSearch(search, subject_length, subject_seq);
2653
2654 if (status) {
2655 BlastConstructErrorMessage("BlastTwoSequencesCoreEx", "non-zero status", 2, &(search->error_return));
2656 return status;
2657 }
2658
2659 if (search->prog_number == blast_type_tblastn &&
2660 search->pbp->longest_intron > 0) {
2661 Uint1 rem;
2662 Uint1Ptr seq_4na, seq_2na, subject;
2663 Int4 i;
2664 /* Need to convert from ncbi2na to ncbi4na encoding */
2665 subject = (Uint1Ptr) MemNew(subject_length + 1);
2666 seq_4na = subject;
2667 seq_2na = subject_seq;
2668 rem = 3;
2669 for (i=0; i<subject_length; i++) {
2670 *seq_4na = (Uint1) (1 << READDB_UNPACK_BASE_N(*seq_2na, rem));
2671 seq_4na++;
2672 if (rem>0) rem--;
2673 else {
2674 rem = 3;
2675 seq_2na++;
2676 }
2677 }
2678 BlastSequenceAddSequence(search->subject, NULL, subject-1, subject_length, subject_length, 0);
2679 status = BlastLinkHsps(search);
2680 }
2681
2682 if (StringCmp(search->prog_name, "blastn") == 0 || search->pbp->gapped_calculation == FALSE)
2683 {
2684 if (search->pbp->do_sum_stats == TRUE &&
2685 !search->pbp->mb_params)
2686 status = BlastLinkHsps(search);
2687 else
2688 status = BlastGetNonSumStatsEvalue(search);
2689 }
2690 if (search->pbp->mb_params) {
2691 search->subject->sequence = subject_seq;
2692 MegaBlastReevaluateWithAmbiguities(search);
2693 }
2694 status = BlastReapHitlistByEvalue(search);
2695
2696 if (search->handle_results)
2697 search->handle_results((VoidPtr) search);
2698 else if (!search->pbp->mb_params)
2699 BlastSaveCurrentHitlist(search);
2700 else
2701 MegaBlastSaveCurrentHitlist(search);
2702 if (search->pbp->mb_params)
2703 /* Free the ncbi4na-encoded sequence */
2704 search->subject->sequence_start = (Uint1Ptr)
2705 MemFree(search->subject->sequence_start);
2706
2707 search->subject->sequence = NULL;
2708 search->subject->sequence_start = NULL;
2709 if (search->prog_number==blast_type_blastn) {
2710 /* Unconcatenate the strands by adjusting the query offsets in
2711 all hsps */
2712 search->context[search->first_context].query->length =
2713 search->query_context_offsets[search->first_context+1] - 1;
2714 }
2715
2716 return status;
2717 }
2718
2719 static BLAST_ScorePtr *RPS2SeqImpalaStatCorrections
2720 (BlastSearchBlkPtr search, Uint1Ptr subject_seq, Int4 subject_length)
2721 {
2722 BLAST_ScorePtr *retval = NULL;
2723 Nlm_FloatHi *scoreArray; /*array of score probabilities*/
2724 Nlm_FloatHi *resProb; /*array of probabilities for each residue*/
2725 BLAST_ScoreFreqPtr this_sfp, return_sfp; /*score frequency pointers to compute lambda*/
2726 BLAST_ScorePtr *posMatrix; /* position-specific matrix. */
2727 Nlm_FloatHi initialUngappedLambda, scaledInitialUngappedLambda,
2728 correctUngappedLambda, scalingFactor, lambdaRatio;
2729 Nlm_FloatHi temp1; /*intermediate variable for adjusting matrix*/
2730 Int4 temp2; /*intermediate variable for adjusting matrix*/
2731 Int4 seqlength; /* length of posMatrix (or target sequence). */
2732 Int4 i, j; /* loop indices */
2733
2734 if (search == NULL)
2735 return retval;
2736
2737 posMatrix = search->sbp->posMatrix;
2738 scalingFactor = search->pbp->scalingFactor;
2739
2740 resProb = (Nlm_FloatHi *) MemNew (PRO_ALPHABET_SIZE * sizeof(Nlm_FloatHi));
2741 scoreArray = (Nlm_FloatHi *) MemNew(scoreRange * sizeof(Nlm_FloatHi));
2742 return_sfp = (BLAST_ScoreFreqPtr) MemNew(1 * sizeof(BLAST_ScoreFreq));
2743
2744 seqlength = search->sbp->query_length;
2745
2746 IMPALAfillResidueProbability(subject_seq, subject_length, resProb);
2747 this_sfp = IMPALAfillSfp(posMatrix, seqlength, resProb, scoreArray,
2748 return_sfp, scoreRange);
2749 initialUngappedLambda = IMPALAfindUngappedLambda(search->sbp->name);
2750 scaledInitialUngappedLambda = initialUngappedLambda/scalingFactor;
2751 correctUngappedLambda = impalaKarlinLambdaNR(this_sfp, scaledInitialUngappedLambda);
2752 if(correctUngappedLambda == -1.0) {
2753 ErrPostEx(SEV_ERROR, 0, 0,
2754 "RPS2SeqImpalaStatCorrections: Could not calculate ungapped "
2755 "lambda for PSSM");
2756 MemFree(resProb);
2757 MemFree(scoreArray);
2758 MemFree(return_sfp);
2759 return retval;
2760 }
2761
2762 lambdaRatio = correctUngappedLambda/scaledInitialUngappedLambda;
2763
2764 retval = (BLAST_Score **) MemNew((seqlength+1) * sizeof(BLAST_Score *));
2765 for (i = 0; i < seqlength+1; i++)
2766 retval[i] = (BLAST_Score *)MemNew(PRO_ALPHABET_SIZE *
2767 sizeof(BLAST_Score));
2768
2769 for (i = 0; i < seqlength+1; i++) {
2770 for (j = 0; j < PRO_ALPHABET_SIZE; j++) {
2771 if ((posMatrix[i][j] == BLAST_SCORE_MIN) || (Xchar == j))
2772 retval[i][j] = posMatrix[i][j];
2773 else {
2774 temp1 = ((Nlm_FloatHi) (posMatrix[i][j]));
2775 temp1 = temp1 * (lambdaRatio);
2776 temp2 = Nlm_Nint(temp1);
2777 retval[i][j] = temp2;
2778 }
2779 }
2780 }
2781
2782 resProb = MemFree(resProb);
2783 scoreArray = MemFree(scoreArray);
2784 return_sfp = MemFree(return_sfp);
2785
2786 return retval;
2787 }
2788
2789 static SeqAlignPtr
2790 BlastTwoSequencesCore (BlastSearchBlkPtr search, SeqLocPtr slp, Uint1Ptr subject_seq, Int4 subject_length, Boolean reverse)
2791
2792 {
2793 BLASTResultsStructPtr result_struct;
2794 BioseqPtr subject_bsp;
2795 Int2 status;
2796 Int4 index, hitlist_count, rev_subject_length=0;
2797 SeqAlignPtr seqalign=NULL;
2798 SeqPortPtr spp;
2799 Uint1 residue;
2800 Uint1Ptr sequence, sequence_start, rev_subject=NULL;
2801 SeqIdPtr sip;
2802 BLAST_ScorePtr *scaledMatrix = NULL, *copyMatrix = NULL;
2803
2804 if (search == NULL || search->query_invalid)
2805 return NULL;
2806
2807 sip = SeqLocId(slp);
2808 subject_bsp = BioseqLockById(sip);
2809
2810 /* Save subject sequence location for tabulated output */
2811 if (search->handle_results && SeqLocLen(slp) < subject_bsp->length)
2812 search->query_slp->next = slp;
2813
2814 status = BlastTwoSequencesCoreEx(search, subject_bsp, subject_seq,
2815 subject_length);
2816
2817 if (status == 0) {
2818 /*CC: if we're emulating rpsblast, do the impala style matrix
2819 * rescaling */
2820 if (search->positionBased && search->pbp->scalingFactor != 0.0) {
2821 scaledMatrix = RPS2SeqImpalaStatCorrections(search, subject_seq,
2822 subject_length);
2823 if ( !scaledMatrix ) {
2824 BioseqUnlock(subject_bsp);
2825 return NULL;
2826 }
2827 copyMatrix = search->sbp->posMatrix;
2828 search->sbp->posMatrix = scaledMatrix;
2829
2830 if (search->sbp->karlinK != 0.0)
2831 search->sbp->kbp_gap[0]->K =
2832 PRO_K_MULTIPLIER*search->sbp->karlinK;
2833 search->sbp->kbp_gap[0]->logK = log(search->sbp->kbp_gap[0]->K);
2834 search->sbp->kbp_gap[0]->Lambda /= search->pbp->scalingFactor;
2835 }
2836 if (search->pbp->mb_params && !search->pbp->mb_params->no_traceback
2837 && !search->pbp->mb_params->use_dyn_prog) {
2838 seqalign = MegaBlastGapInfoToSeqAlign(search, 0, 0);
2839 } else if (StringCmp(search->prog_name, "blastn") == 0 &&
2840 search->pbp->gapped_calculation == TRUE) {
2841 result_struct = search->result_struct;
2842 hitlist_count = result_struct->hitlist_count;
2843 if (hitlist_count > 0)
2844 {
2845 spp = SeqPortNewByLoc(slp, Seq_code_ncbi4na);
2846 if (subject_bsp->repr == Seq_repr_delta)
2847 SeqPortSet_do_virtual(spp, TRUE);
2848
2849 /* make one longer to "protect" ALIGN. */
2850 sequence_start = MemNew((2+SeqLocLen(slp))*sizeof(Uint1));
2851 sequence_start[0] = ncbi4na_to_blastna[0];
2852 sequence = sequence_start+1;
2853 index=0;
2854 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
2855 {
2856 if (IS_residue(residue))
2857 {
2858 sequence[index] = ncbi4na_to_blastna[residue];
2859 index++;
2860 }
2861 }
2862 /* Gap character in last space. */
2863 sequence[index] = ncbi4na_to_blastna[0];
2864
2865 if (!search->pbp->mb_params) {
2866 /* Traditional Blastn */
2867 seqalign = SumBlastGetGappedAlignmentTraceback(
2868 search, 0, reverse, FALSE, sequence,
2869 SeqLocLen(slp));
2870 } else if (!search->pbp->mb_params->no_traceback) {
2871 /* Mega BLAST with non-greedy extension */
2872 SumBlastGetGappedAlignmentEx(search, 0, FALSE, FALSE,
2873 sequence, SeqLocLen(slp), TRUE, &seqalign, NULL, 0);
2874 }
2875
2876 sequence_start = MemFree(sequence_start);
2877 spp = SeqPortFree(spp);
2878 }
2879 }
2880 else if (search->pbp->gapped_calculation == TRUE)
2881 {
2882 result_struct = search->result_struct;
2883 hitlist_count = result_struct->hitlist_count;
2884 if (hitlist_count > 0) {
2885
2886 if (!StringCmp(search->prog_name, "tblastn")
2887 || !StringCmp(search->prog_name, "psitblastn")) {
2888 Uint1Ptr subject = NULL;
2889 SeqPortPtr rev_spp;
2890 if (slp->choice == SEQLOC_WHOLE) {
2891 spp = SeqPortNew(subject_bsp, 0, -1, Seq_strand_plus,
2892 Seq_code_ncbi4na);
2893 rev_spp = SeqPortNew(subject_bsp, 0, -1, Seq_strand_minus,
2894 Seq_code_ncbi4na);
2895 } else {
2896 spp = SeqPortNew(subject_bsp, SeqLocStart(slp),
2897 SeqLocStop(slp), Seq_strand_plus,
2898 Seq_code_ncbi4na);
2899 rev_spp = SeqPortNew(subject_bsp, SeqLocStart(slp),
2900 SeqLocStop(slp), Seq_strand_minus,
2901 Seq_code_ncbi4na);
2902 }
2903 /* make one longer to "protect" ALIGN. */
2904 subject = (Uint1Ptr) MemNew((1+subject_length)*sizeof(Uint1));
2905 rev_subject = (Uint1Ptr) MemNew((1+subject_length)*sizeof(Uint1));
2906 for (index=0; index<subject_length; index++) {
2907 subject[index] = SeqPortGetResidue(spp);
2908 rev_subject[index] = SeqPortGetResidue(rev_spp);
2909 }
2910 /* Gap character in last space. */
2911 subject[subject_length] = NULLB;
2912 rev_subject[subject_length] = NULLB;
2913 rev_subject_length = subject_length;
2914 spp = SeqPortFree(spp);
2915 rev_spp = SeqPortFree(rev_spp);
2916
2917
2918 seqalign = BlastGetGapAlgnTbck(search, 0, reverse,
2919 FALSE, subject, subject_length,
2920 rev_subject, rev_subject_length);
2921
2922 if (search->pbp->longest_intron <= 0)
2923 MemFree(subject);
2924 MemFree(rev_subject);
2925 } else {
2926 seqalign = BlastGetGapAlgnTbck(search, 0, reverse,
2927 FALSE, subject_seq, subject_length,
2928 rev_subject, rev_subject_length);
2929 result_struct->results[0]->seqalign = seqalign;
2930 }
2931 }
2932 }
2933 else /* Ungapped case, any program */
2934 {
2935 if (search->prog_number == blast_type_blastn ||
2936 search->prog_number == blast_type_blastp)
2937 seqalign = GetSeqAlignForResultHitList(search, TRUE, FALSE,
2938 search->pbp->discontinuous, reverse, FALSE);
2939 else
2940 seqalign = GetSeqAlignForResultHitList(search, FALSE, FALSE,
2941 search->pbp->discontinuous, reverse, FALSE);
2942 }
2943 /*CC: Revert changes done for psi-blast2sequences */
2944 if (search->positionBased && search->pbp->scalingFactor != 0.0) {
2945 if (scaledMatrix) {
2946 for (index = 0; index < search->sbp->query_length + 1; index++)
2947 MemFree(scaledMatrix[index]);
2948 MemFree(scaledMatrix);
2949 search->sbp->posMatrix = copyMatrix;
2950 }
2951 if (search->sbp->karlinK != 0.0)
2952 search->sbp->kbp_gap[0]->K = search->sbp->karlinK;
2953 search->sbp->kbp_gap[0]->logK = log(search->sbp->kbp_gap[0]->K);
2954 }
2955 }
2956 BioseqUnlock(subject_bsp);
2957
2958 return seqalign;
2959 }
2960
2961 BlastSearchBlkPtr LIBCALL
2962 BlastQuerySequenceSetUp(BioseqPtr bsp, CharPtr progname,
2963 BLAST_OptionsBlkPtr options)
2964 {
2965 BlastSearchBlkPtr search;
2966 SeqLocPtr slp=NULL;
2967
2968 if (bsp == NULL)
2969 return NULL;
2970
2971 ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
2972 if (progname == NULL && options == NULL)
2973 return NULL;
2974
2975 if (progname == NULL)
2976 progname = options->program_name;
2977
2978 if (!StringCmp(progname, "blastp") ||
2979 !StringCmp(progname, "blastx")) {
2980 if (options->gapped_calculation == TRUE) {
2981 options->two_pass_method = FALSE;
2982 options->multiple_hits_only = TRUE;
2983 }
2984 }
2985
2986 search = BLASTSetUpSearchByLoc(slp, progname, bsp->length, 0, NULL, options, NULL);
2987
2988 search->allocated += BLAST_SEARCH_ALLOC_QUERY_SLP;
2989
2990 if (search == NULL)
2991 return NULL;
2992
2993 return search;
2994 }
2995
2996 /*
2997 Runs blast between two sequences
2998 */
2999 SeqAlignPtr LIBCALL
3000 BlastTwoSequencesByLocEx(SeqLocPtr slp1, SeqLocPtr slp2, CharPtr progname, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns)
3001 {
3002 return BlastTwoSequencesByLocWithCallback(slp1, slp2, progname, options,
3003 other_returns, error_returns, NULL, NULL);
3004 }
3005
3006 /************************************************************************/
3007 /* PSIBLAST2Sequences API */
3008 /************************************************************************/
3009
3010 static BLAST_ScorePtr *B2SAllocateScoreMatrix(Int4 rows, Int4 cols)
3011 {
3012 BLAST_ScorePtr *matrix = NULL;
3013 Int4 i;
3014
3015 if (!(matrix = (BLAST_ScorePtr *) MemNew(rows*sizeof(BLAST_ScorePtr)))) {
3016 return NULL;
3017 }
3018
3019 for (i = 0; i < rows; i++) {
3020 matrix[i] = (BLAST_ScorePtr) MemNew(cols*sizeof(BLAST_Score));
3021 if (matrix[i] == NULL) {
3022 while (--i >= 0)
3023 MemFree(matrix[i]);
3024 MemFree(matrix);
3025 return NULL;
3026 }
3027 }
3028 return matrix;
3029 }
3030
3031 /* Convert a set of residue frequencies into a scaled PSSM (using
3032 * scalingFactor). */
3033 static BLAST_ScorePtr *B2SCalculateScaledPSSM(BlastSearchBlkPtr search,
3034 Nlm_FloatHiPtr *posFreqs, compactSearchItems *compactSearch,
3035 Nlm_FloatHiPtr karlinK)
3036 {
3037 BLAST_ScorePtr *retval = NULL;
3038 posSearchItems *posSearch = NULL;
3039 Int4 qlen, alphabet_sz, rv;
3040 Nlm_FloatHi scalingFactor = search->pbp->scalingFactor;
3041 BLAST_ScoreBlkPtr sbp = NULL;
3042 ValNodePtr error_return;
3043 Int4 i, gap_open, gap_extend;
3044
3045 if (!search || !compactSearch || !posFreqs)
3046 return NULL;
3047
3048 if (!(posSearch = (posSearchItems *)MemNew(sizeof(posSearchItems)))) {
3049 ErrPostEx(SEV_ERROR, 0, 0, "B2SCalculateScaledPSSM: Out of memory");
3050 return NULL;
3051 }
3052
3053 qlen = compactSearch->qlength;
3054 alphabet_sz = compactSearch->alphabetSize;
3055 gap_open = search->pbp->gap_open / scalingFactor;
3056 gap_extend = search->pbp->gap_extend / scalingFactor;
3057
3058 if (!(sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa, 1))) {
3059 ErrPostEx(SEV_ERROR, 0, 0, "B2SCalculateScaledPSSM: Out of memory");
3060 MemFree(posSearch);
3061 return NULL;
3062 }
3063 sbp->read_in_matrix = TRUE;
3064 sbp->protein_alphabet = TRUE;
3065 sbp->posMatrix = NULL;
3066 sbp->number_of_contexts = 1;
3067 BlastScoreBlkMatFill(sbp, search->sbp->name);
3068 compactSearch->matrix = sbp->matrix;
3069 compactSearch->gapped_calculation = TRUE;
3070 compactSearch->pseudoCountConst = search->pbp->pseudoCountConst;
3071 compactSearch->ethresh = 0.001;
3072 BlastScoreBlkFill(sbp, (CharPtr) compactSearch->query, qlen, 0);
3073
3074 sbp->kbp_gap_std[0] = BlastKarlinBlkCreate();
3075 rv = BlastKarlinBlkGappedCalc(sbp->kbp_gap_std[0], gap_open, gap_extend,
3076 sbp->name, &error_return);
3077 if (rv == 1) {
3078 BlastErrorPrint(error_return);
3079 BLAST_ScoreBlkDestruct(sbp);
3080 MemFree(posSearch);
3081 return NULL;
3082 }
3083 sbp->kbp_gap_psi[0] = BlastKarlinBlkCreate();
3084 rv = BlastKarlinBlkGappedCalc(sbp->kbp_gap_psi[0], gap_open, gap_extend,
3085 sbp->name, &error_return);
3086 if (rv == 1) {
3087 BlastErrorPrint(error_return);
3088 BLAST_ScoreBlkDestruct(sbp);
3089 MemFree(posSearch);
3090 return NULL;
3091 }
3092
3093 if (sbp->kbp_ideal == NULL)
3094 sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(sbp);
3095 compactSearch->lambda = sbp->kbp_gap_std[0]->Lambda;
3096 compactSearch->kbp_std = sbp->kbp_std;
3097 compactSearch->kbp_psi = sbp->kbp_psi;
3098 compactSearch->kbp_gap_psi = sbp->kbp_gap_psi;
3099 compactSearch->kbp_gap_std = sbp->kbp_gap_std;
3100 compactSearch->lambda_ideal = sbp->kbp_ideal->Lambda;
3101 compactSearch->K_ideal = sbp->kbp_ideal->K;
3102
3103 /* Initialize the posSearch structure */
3104 posSearch->posFreqs = posFreqs;
3105 posSearch->posMatrix = B2SAllocateScoreMatrix(qlen+1, alphabet_sz);
3106 posSearch->posPrivateMatrix = B2SAllocateScoreMatrix(qlen+1, alphabet_sz);
3107 if (!posSearch->posMatrix || !posSearch->posPrivateMatrix) {
3108 ErrPostEx(SEV_ERROR, 0, 0, "B2SCalculateScaledPSSM: Out of memory");
3109 BLAST_ScoreBlkDestruct(sbp);
3110 MemFree(posSearch->posMatrix); MemFree(posSearch->posPrivateMatrix);
3111 MemFree(posSearch);
3112 return NULL;
3113 }
3114
3115 posFreqsToMatrix(posSearch, compactSearch);
3116 impalaScaling(posSearch, compactSearch, scalingFactor, TRUE);
3117 if (karlinK)
3118 *karlinK = compactSearch->kbp_gap_psi[0]->K;
3119
3120 for (i = 0; i <= qlen; i++)
3121 MemFree(posSearch->posMatrix[i]);
3122 MemFree(posSearch->posMatrix);
3123 BLAST_ScoreBlkDestruct(sbp);
3124 retval = posSearch->posPrivateMatrix;
3125 MemFree(posSearch);
3126
3127 return retval;
3128 }
3129
3130 /* Calculates the PSSM for a given SeqLocPtr */
3131 static BLAST_ScorePtr *B2SCalculatePSSM(SeqLocPtr slp, BlastSearchBlkPtr search,
3132 BLAST_MatrixPtr matrix, Nlm_FloatHiPtr karlinK)
3133 {
3134 BLAST_ScorePtr *posMatrix = NULL;
3135 compactSearchItems *compactSearch = NULL;
3136 Boolean replaced_sequence = FALSE;
3137 Int4 query_length, full_query_length;
3138 SeqLocPtr filter_slp = NULL, full_slp = NULL;
3139 Uint1Ptr sequence = NULL;
3140 BlastSequenceBlk bseq;
3141 Nlm_FloatHi scalingFactor = search->pbp->scalingFactor;
3142
3143 query_length = SeqLocLen(slp);
3144
3145 /* if the slp is not the whole sequence, retrieve the whole sequence and
3146 * use it to compute the pssm */
3147 if (matrix->rows != (query_length+1)) {
3148 SeqPortPtr spp = NULL;
3149 SeqIdPtr sip = NULL;
3150 Uint1 residue;
3151 BioseqPtr bsp = NULL;
3152 Char tmp[256];
3153 Int4 index = 0;
3154
3155 sip = SeqLocId(slp);
3156 if ((bsp = BioseqLockById(SeqIdFindBest(sip, SEQID_GI))) == NULL) {
3157 SeqIdWrite(SeqLocId(slp),tmp,PRINTID_FASTA_LONG,
3158 sizeof(tmp));
3159
3160 ErrPostEx(SEV_ERROR,0,0,"Could not retrieve full bioseq "
3161 "for %s",tmp);
3162 BioseqUnlock(bsp);
3163 return NULL;
3164 }
3165
3166 /* get full sequence to be used in WposComputation */
3167 spp = SeqPortNew(bsp, FIRST_RESIDUE, LAST_RESIDUE, Seq_strand_unknown,
3168 Seq_code_ncbistdaa);
3169
3170 full_query_length = bsp->length;
3171 sequence = (Uint1Ptr) MemNew(2*((bsp->length)+2)*sizeof(Char));
3172 BioseqUnlock(bsp);
3173
3174 sequence[index++] = NULLB;
3175 while ((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF) {
3176 if (IS_residue(residue)) {
3177 if (residue == 24) { /* change selenocysteine to X */
3178 residue = 21;
3179 ErrPostEx(SEV_WARNING,0,0, "Selenocysteine (U) at "
3180 "position %ld replaced by X", (long) index+1);
3181 }
3182 sequence[index++] = residue;
3183 }
3184 }
3185 sequence[index] = NULLB;
3186 spp = SeqPortFree(spp);
3187
3188 /* Filter the sequence if necessary */
3189 ValNodeAddPointer(&full_slp, SEQLOC_WHOLE, SeqIdDup(SeqLocId(slp)));
3190 filter_slp = BlastSeqLocFilter(full_slp, search->pbp->filter_string);
3191 if(search->pbp->query_lcase_mask != NULL)
3192 filter_slp = blastMergeFilterLocs(filter_slp,
3193 search->pbp->query_lcase_mask, FALSE, 0, 0);
3194
3195 BlastMaskTheResidues(sequence+1, full_query_length, 21, filter_slp,
3196 FALSE, SeqLocStart(full_slp));
3197
3198 /* Save the current query sequence */
3199 MemCpy(&bseq, search->context[0].query, sizeof(BlastSequenceBlk));
3200
3201 BlastSequenceAddSequence(search->context[0].query, NULL, sequence,
3202 full_query_length, full_query_length, 0);
3203
3204 SeqLocSetFree(full_slp);
3205 SeqLocSetFree(filter_slp);
3206 replaced_sequence = TRUE;
3207 }
3208
3209 compactSearch = compactSearchNew(compactSearch);
3210 copySearchItems(compactSearch, search, search->sbp->name);
3211 compactSearch->pseudoCountConst = search->pbp->pseudoCountConst;
3212 if (scalingFactor != 0.0 && scalingFactor != 1.0) {
3213 /* build pssm {make,copy}mat/rpsblast style */
3214 posMatrix = B2SCalculateScaledPSSM(search, search->sbp->posFreqs,
3215 compactSearch, karlinK);
3216 } else {
3217 /* build pssm psiblast style */
3218 posMatrix = WposComputation(compactSearch, NULL, search->sbp->posFreqs);
3219 }
3220 compactSearchDestruct(compactSearch);
3221
3222 if (replaced_sequence) {
3223 MemCpy(search->context[0].query, &bseq, sizeof(BlastSequenceBlk));
3224 MemFree(sequence);
3225 }
3226
3227 return posMatrix;
3228 }
3229
3230 /* Checks if the dimensions of the pssm attached to the search->sbp are
3231 * consistent with the length of the master query (slp), and trims the matrix
3232 * if necessary */
3233 static Boolean B2SVerifyPSSM(SeqLocPtr slp, BlastSearchBlkPtr search,
3234 BLAST_MatrixPtr matrix)
3235 {
3236 Int4 i, query_length = SeqLocLen(slp);
3237
3238 if ((query_length+1) > matrix->rows) {
3239 ErrPostEx(SEV_WARNING,0,0,"Ignoring PSSM because it seems not to "
3240 "correspond to query sequence (query length = %ld, PSSM's "
3241 "number of rows = %ld)", query_length+1, matrix->rows);
3242 search->positionBased = FALSE;
3243
3244 if (matrix->matrix == NULL) {
3245 BLAST_ScorePtr *posMatrix = search->sbp->posMatrix;
3246
3247 for (i = 0; i < matrix->rows; i++)
3248 posMatrix[i] = MemFree(posMatrix[i]);
3249 posMatrix = MemFree(posMatrix);
3250 }
3251 search->sbp->posMatrix = NULL;
3252 search->sbp->posFreqs = NULL;
3253 return FALSE;
3254 } else if ((query_length+1) < matrix->rows) {
3255 /* Assume BLAST_Matrix corresponds to the entire sequence, so trim
3256 * it */
3257 Int4 from, to, i, j, alphabet_sz;
3258 BLAST_ScorePtr *pssm = NULL;
3259
3260 if (slp->choice != SEQLOC_INT) {
3261 ErrPostEx(SEV_ERROR,0,0,"B2SVerifyPSSM: SeqLocPtr is not a "
3262 "SEQLOC_INT, cannot trim matrix");
3263 return FALSE;
3264 }
3265
3266 from = SeqLocStart(slp);
3267 to = SeqLocStop(slp);
3268 alphabet_sz = matrix->columns;
3269
3270 /* Adjust the pssm */
3271 pssm = (BLAST_ScorePtr *)MemNew(sizeof(BLAST_ScorePtr) *
3272 (query_length+1));
3273 for (i = 0; i <= query_length; i++) {
3274 pssm[i] = (BLAST_ScorePtr)MemNew(sizeof(BLAST_Score) *
3275 alphabet_sz);
3276 }
3277
3278 for (i = from; i <= to; i++) {
3279 for (j = 0; j < alphabet_sz; j++)
3280 pssm[(i-from)][j] = search->sbp->posMatrix[i][j];
3281 }
3282 for (j = 0; j < alphabet_sz; j++)
3283 pssm[query_length][j] = BLAST_SCORE_MIN;
3284
3285 if (matrix->matrix == NULL) {
3286 /* Free the matrix we calculated originally */
3287 BLAST_ScorePtr *posMatrix = search->sbp->posMatrix;
3288
3289 for (i = 0; i < matrix->rows; i++)
3290 posMatrix[i] = MemFree(posMatrix[i]);
3291 posMatrix = MemFree(posMatrix);
3292 }
3293 search->sbp->posMatrix = pssm;
3294
3295 }
3296 return TRUE;
3297 }
3298
3299 /* psi-blast2sequences setup: matrix must contain at least the residue
3300 * frequencies to calculate the PSSM. Otherwise, if the PSSM is given, that
3301 * will be used. */
3302 Boolean LIBCALL B2SPssmSetupSearch(BlastSearchBlkPtr search,
3303 SeqLocPtr pssm_slp, BLAST_MatrixPtr matrix)
3304 {
3305 Nlm_FloatHi karlinK = 0.0;
3306 Int4 npos, alphabet_size;
3307
3308 if (!search || !matrix)
3309 return FALSE;
3310
3311 if (search->prog_number != blast_type_blastp) {
3312 ErrPostEx(SEV_ERROR, 0, 0, "B2SPssmSetupSearch: only blastp is "
3313 "supported");
3314 return FALSE;
3315 }
3316
3317 search->positionBased = TRUE;
3318 npos = SeqLocLen(pssm_slp);
3319 alphabet_size = search->sbp->alphabet_size;
3320
3321 if (npos <= 0) {
3322 ErrPostEx(SEV_ERROR, 0, 0, "B2SPssmSetupSearch: length of pssm_slp "
3323 "must be positive");
3324 return FALSE;
3325 }
3326
3327 /* save the residue frequencies, we might need them later */
3328 if (matrix->posFreqs) {
3329 search->sbp->posFreqs = allocatePosFreqs(npos, alphabet_size);
3330 copyPosFreqs(matrix->posFreqs, search->sbp->posFreqs, npos,
3331 alphabet_size);
3332 }
3333
3334 if (matrix->posFreqs && !matrix->matrix) {
3335 search->sbp->posMatrix = B2SCalculatePSSM(pssm_slp, search, matrix,
3336 &karlinK);
3337 /* if we calculated the pssm, and use did not provide one, save it*/
3338 if (matrix->karlinK == 0.0 && karlinK != 0.0)
3339 matrix->karlinK = karlinK;
3340 } else {
3341 search->sbp->posMatrix = matrix->matrix;
3342 }
3343
3344 search->sbp->mat_dim1 = search->sbp->query_length + 1;
3345 search->sbp->mat_dim2 = search->sbp->alphabet_size;
3346
3347 /* Sanity check */
3348 if (!search->sbp->posMatrix) {
3349 ErrPostEx(SEV_ERROR, 0, 0, "B2SPssmSetupSearch: "
3350 "Could not create or obtain PSSM! Please verify "
3351 "BLAST_Matrix parameter");
3352 search->positionBased = FALSE;
3353 return FALSE;
3354 }
3355
3356 /* Make sure the BLAST_Matrix number of rows is consistent with
3357 * pssm_slp */
3358 B2SVerifyPSSM(pssm_slp, search, matrix);
3359
3360 if (matrix->karlinK != 0.0) {
3361 search->sbp->karlinK = matrix->karlinK;
3362 search->sbp->kbp_gap_psi[0]->K = matrix->karlinK;
3363 search->sbp->kbp_gap_psi[0]->logK = log(matrix->karlinK);
3364 }
3365
3366 return TRUE;
3367 }
3368
3369 /* clean up psi-blast2sequences */
3370 Boolean LIBCALL B2SPssmCleanUpSearch(BlastSearchBlkPtr search,
3371 BLAST_MatrixPtr matrix)
3372 {
3373 Int4 i, rows = search->sbp->query_length + 1;
3374 BLAST_ScorePtr *posMatrix = search->sbp->posMatrix;
3375 Nlm_FloatHiPtr *posFreqs = search->sbp->posFreqs;
3376
3377 if (!matrix)
3378 return FALSE;
3379
3380 if ((matrix->matrix == NULL) || /* B2SPssmSetupSearch created PSSM */
3381 (posMatrix != matrix->matrix)) { /* B2SVerifyPSSM trimmed PSSM */
3382 for (i = 0; i < rows; i++)
3383 posMatrix[i] = MemFree(posMatrix[i]);
3384 posMatrix = MemFree(posMatrix);
3385 }
3386 if (matrix->posFreqs) {
3387 for (i = 0; i < rows; i++)
3388 posFreqs[i] = MemFree(posFreqs[i]);
3389 posFreqs = MemFree(posFreqs);
3390 }
3391 search->sbp->posMatrix = NULL;
3392 search->sbp->posFreqs = NULL;
3393 search->positionBased = FALSE;
3394 return TRUE;
3395 }
3396
3397 SeqAlignPtr LIBCALL B2SPssmOnTheFlyByLoc(BlastSearchBlkPtr search,
3398 SeqLocPtr subj_slp)
3399 {
3400 Int4 index, subject_length;
3401 SeqAlignPtr seqalign = NULL;
3402 Uint1Ptr subject_seq = NULL, subject_seq_start = NULL;
3403 SeqPortPtr spp;
3404 Uint1 residue;
3405
3406 if (!search || search->query_invalid || !subj_slp)
3407 return NULL;
3408
3409 if (search->result_struct)
3410 search->result_struct = BLASTResultsStructDelete(search->result_struct);
3411 search->result_struct = BLASTResultsStructNew(search->result_size,
3412 search->pbp->max_pieces, search->pbp->hsp_range_max);
3413 BlastHitListPurge(search->current_hitlist);
3414
3415 subject_length = SeqLocLen(subj_slp);
3416
3417 if (search->prog_number == blast_type_blastp) {
3418 subject_seq_start = (Uint1Ptr) MemNew(
3419 ((subject_length)+2)*sizeof(Uint1));
3420 /* The first residue is the sentinel. */
3421 subject_seq_start[0] = NULLB;
3422 subject_seq = subject_seq_start+1;
3423 index = 0;
3424 spp = SeqPortNewByLoc(subj_slp, Seq_code_ncbistdaa);
3425 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
3426 if (IS_residue(residue))
3427 subject_seq[index++] = residue;
3428 }
3429 subject_seq[index] = NULLB;
3430 spp = SeqPortFree(spp);
3431 } else {
3432 return NULL;
3433 }
3434
3435 seqalign = BlastTwoSequencesCore(search, subj_slp, subject_seq,
3436 subject_length, FALSE);
3437
3438 MemFree(subject_seq_start);
3439 AdjustOffSetsInSeqAlign(seqalign, search->query_slp, subj_slp);
3440
3441 return seqalign;
3442 }
3443
3444 SeqAlignPtr LIBCALL B2SPssmOnTheFly(BlastSearchBlkPtr search,
3445 BioseqPtr subj_bsp)
3446 {
3447 SeqAlignPtr salp = NULL;
3448 SeqLocPtr slp = NULL;
3449
3450 if (!search || search->query_invalid || !subj_bsp)
3451 return NULL;
3452
3453 ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(subj_bsp->id,
3454 SEQID_GI)));
3455 salp = B2SPssmOnTheFlyByLoc(search, slp);
3456 SeqLocFree(slp);
3457 return salp;
3458 }
3459
3460 SeqAlignPtr * LIBCALL B2SPssmMultipleQueries(SeqLocPtr pssm_slp,
3461 BLAST_MatrixPtr matrix, SeqLocPtr *target_seqs, Int4 ntargets,
3462 BLAST_OptionsBlkPtr options)
3463 {
3464 SeqAlignPtr *sa_array = NULL;
3465 BlastSearchBlkPtr search = NULL;
3466 Int4 i;
3467
3468 if (!matrix || !pssm_slp || !target_seqs || ntargets <= 0 || !options)
3469 return NULL;
3470
3471 /* Set up search structure */
3472 search = BLASTSetUpSearchByLoc(pssm_slp, options->program_name,
3473 SeqLocLen(pssm_slp), 0, NULL, options, NULL);
3474 B2SPssmSetupSearch(search, pssm_slp, matrix);
3475
3476 /* Allocate memory for return value */
3477 if (!(sa_array = (SeqAlignPtr*)MemNew(sizeof(SeqAlignPtr)*ntargets))) {
3478 ErrPostEx(SEV_ERROR, 0, 0, "B2SPssmMultipleQueries: Out of memory");
3479 BlastSearchBlkDestruct(search);
3480 return NULL;
3481 }
3482
3483
3484 /* Iterate over seqlocs in target_seqs, using effective search space in
3485 * rpsblast style */
3486 for (i = 0; i < ntargets; i++) {
3487 Int8 dblen = (options->db_length != 0) ?
3488 options->db_length : SeqLocLen(pssm_slp);
3489 Int4 nseqs = (options->dbseq_num != 0) ? options->dbseq_num : 1;
3490
3491 /* If search space has been specified in the options structure, the it
3492 * must have been set in BLASTSetUpSearchEx, so don't overwrite it */
3493 if ( ! (options->searchsp_eff > 0) ) {
3494 search->searchsp_eff = BLASTCalculateSearchSpace(options, nseqs,
3495 dblen, SeqLocLen(target_seqs[i]));
3496 }
3497 sa_array[i] = B2SPssmOnTheFlyByLoc(search, target_seqs[i]);
3498 }
3499
3500 /* Clean up */
3501 B2SPssmCleanUpSearch(search, matrix);
3502 BlastSearchBlkDestruct(search);
3503
3504 return sa_array;
3505 }
3506
3507 /************************************************************************/
3508 /* END PSIBLAST2Sequences API */
3509 /************************************************************************/
3510
3511 /* Note that the matrix parameter should correspond to the full master
3512 * sequence */
3513 SeqAlignPtr LIBCALL
3514 BlastTwoSequencesByLocWithCallback(SeqLocPtr slp1, SeqLocPtr slp2, CharPtr
3515 progname, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns,
3516 ValNodePtr *error_returns, int (LIBCALLBACK
3517 *handle_results)PROTO((VoidPtr srch)), BLAST_MatrixPtr matrix)
3518 {
3519 BlastAllWordPtr all_words;
3520 BlastSearchBlkPtr search;
3521 BioseqPtr subject_bsp;
3522 Boolean complement=FALSE, reverse, reverse_forbidden, options_alloc;
3523 Int2 status;
3524 Int4 index, subject_length, num_of_cols;
3525 SeqAlignPtr seqalign=NULL;
3526 SeqLocPtr query_slp, subject_slp;
3527 SeqPortPtr spp;
3528 SPCompressPtr spc=NULL;
3529 Uint1 residue;
3530 Uint1Ptr subject_seq, subject_seq_start;
3531 Uint1Ptr *array;
3532
3533 if (slp1 == NULL || slp2 == NULL)
3534 return NULL;
3535
3536 if (error_returns)
3537 {
3538 *error_returns = NULL;
3539 }
3540
3541 if (other_returns)
3542 {
3543 *other_returns = NULL;
3544 }
3545
3546 if (progname == NULL && options == NULL)
3547 return NULL;
3548
3549 /* If filtering is performed, do not reverse the sequence.
3550 In this case the wrong sequence would be filtered. */
3551 reverse_forbidden = FALSE;
3552 if ((options && ((options->filter_string &&
3553 StringCmp(options->filter_string, "F")) ||
3554 options->is_megablast_search)) ||
3555 matrix != NULL)
3556 {
3557 reverse_forbidden = TRUE;
3558 }
3559
3560 /* Select the shorter sequence as the query, provided they are
3561 of the same type. */
3562 if ((StringCmp(progname, "blastn") && StringCmp(progname, "blastp")) ||
3563 (reverse_forbidden || SeqLocLen(slp1) < SeqLocLen(slp2)))
3564 {
3565 query_slp = slp1;
3566 subject_slp = slp2;
3567 reverse = FALSE;
3568 }
3569 else
3570 {
3571 query_slp = slp2;
3572 subject_slp = slp1;
3573 reverse = TRUE;
3574 }
3575
3576 /* Make sure strands are handled correctly */
3577 if (!StringCmp(progname, "blastn") &&
3578 SeqLocStrand(query_slp) != Seq_strand_both &&
3579 SeqLocStrand(subject_slp) == Seq_strand_both) {
3580 Change_Loc_Strand(subject_slp, SeqLocStrand(query_slp));
3581 Change_Loc_Strand(query_slp, Seq_strand_both);
3582 }
3583
3584 if (progname == NULL)
3585 {
3586 progname = options->program_name;
3587 }
3588
3589 /* If the subject strand is minus, turn it into plus for blastn. */
3590 /* Complement the other strand to keep things straight. */
3591 if (StringCmp(progname, "blastn") == 0 && SeqLocStrand(subject_slp) == Seq_strand_minus)
3592 {
3593 complement = TRUE;
3594 if(SeqLocStrand(query_slp) == Seq_strand_plus ||
3595 SeqLocStrand(query_slp) == Seq_strand_minus)
3596 SeqLocRevCmp(query_slp);
3597 SeqLocRevCmp(subject_slp);
3598 }
3599
3600 subject_seq_start = subject_seq = NULL;
3601
3602 /* Allocate default options if none are allocated yet. */
3603 options_alloc = FALSE;
3604 if (options == NULL)
3605 {
3606 options = BLASTOptionNew(progname, FALSE);
3607 options_alloc = TRUE;
3608 }
3609
3610 status = BLASTOptionValidateEx(options, progname, error_returns);
3611 if (status != 0)
3612 { /* error messages in other_returns? */
3613 return NULL;
3614 }
3615
3616 all_words = NULL;
3617
3618 subject_length = SeqLocLen(subject_slp);
3619
3620 if (!StringCmp(progname, "blastp") ||
3621 !StringCmp(progname, "blastx"))
3622 {
3623 subject_seq_start = (Uint1Ptr) MemNew(((subject_length)+2)*sizeof(Uint1));
3624 /* The first residue is the sentinel. */
3625 subject_seq_start[0] = NULLB;
3626 subject_seq = subject_seq_start+1;
3627 index = 0;
3628 spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbistdaa);
3629 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
3630 {
3631 if (IS_residue(residue))
3632 {
3633 subject_seq[index] = residue;
3634 index++;
3635 }
3636 }
3637 subject_seq[index] = NULLB;
3638
3639 num_of_cols = subject_length+1-options->wordsize;
3640 all_words = BlastAllWordNew(num_of_cols, options->wordsize, FALSE, TRUE);
3641 array = (Uint1Ptr *) MemNew(num_of_cols*sizeof(Uint1Ptr));
3642 for (index=0; index<num_of_cols; index++)
3643 {
3644 array[index] = subject_seq+index;
3645 }
3646 all_words->array = array;
3647 spp = SeqPortFree(spp);
3648 if (options->gapped_calculation == TRUE)
3649 {
3650 options->two_pass_method = FALSE;
3651 options->multiple_hits_only = TRUE;
3652 }
3653 }
3654 else if (!StringCmp(progname, "blastn") ||
3655 !StringCmp(progname, "tblastn") ||
3656 !StringCmp(progname, "psitblastn") ||
3657 !StringCmp(progname, "tblastx"))
3658 {
3659 spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbi4na);
3660 subject_bsp = BioseqFindCore(SeqLocId(subject_slp));
3661 if (subject_bsp != NULL && subject_bsp->repr == Seq_repr_delta)
3662 SeqPortSet_do_virtual(spp, TRUE);
3663 spc = SPCompressDNA(spp);
3664 if (spc == NULL)
3665 return NULL;
3666 subject_seq_start = subject_seq = spc->buffer;
3667 spp = SeqPortFree(spp);
3668 }
3669 else /* Impossible! */
3670 {
3671 return NULL;
3672 }
3673
3674 if (options->is_megablast_search)
3675 /* This has a different meaning in Mega BLAST and must be 0 */
3676 options->block_width = 0;
3677
3678 if (options->db_length == 0)
3679 options->db_length = subject_length;
3680
3681 options->dbseq_num = 1;
3682
3683 search = BLASTSetUpSearchByLoc(query_slp, progname, SeqLocLen(query_slp), subject_length, all_words, options, NULL);
3684
3685 if (search == NULL)
3686 return NULL;
3687
3688 if (search->query_invalid) {
3689 search = BlastSearchBlkDestruct(search);
3690 return NULL;
3691 }
3692
3693 if (!StringCmp(progname, "tblastn") ||
3694 !StringCmp(progname, "tblastx") ||
3695 !StringCmp(progname, "psitblastn")) {
3696 MemFree(search->translation_buffer);
3697 search->translation_buffer = MemNew((3+(subject_length/3))*sizeof(Uint1));
3698 search->translation_buffer_size = 1+(subject_length/3);
3699 }
3700
3701 B2SPssmSetupSearch(search, slp1, matrix);
3702
3703 search->handle_results = handle_results;
3704 search->output = options->output;
3705
3706 seqalign = BlastTwoSequencesCore(search, subject_slp, subject_seq, subject_length, reverse);
3707
3708 if (complement)
3709 {
3710 seqalign = SeqAlignListReverseStrand(seqalign);
3711 SeqLocRevCmp(query_slp);
3712 SeqLocRevCmp(subject_slp);
3713 }
3714
3715 if (spc)
3716 {
3717 SPCompressFree(spc);
3718 spc = NULL;
3719 }
3720 else
3721 {
3722 subject_seq_start = MemFree(subject_seq_start);
3723 }
3724
3725 if (search->error_return)
3726 {
3727 ValNodeLink(error_returns, search->error_return);
3728 search->error_return = NULL;
3729 }
3730
3731 if (other_returns)
3732 { /* format dbinfo etc. */
3733 *other_returns = BlastOtherReturnsPrepare(search);
3734 }
3735
3736 if (options_alloc)
3737 options = BLASTOptionDelete(options);
3738
3739 AdjustOffSetsInSeqAlign(seqalign, slp1, slp2);
3740
3741 B2SPssmCleanUpSearch(search, matrix);
3742
3743 search = BlastSearchBlkDestruct(search);
3744
3745 return seqalign;
3746 }
3747
3748 SeqAlignPtr LIBCALL
3749 BlastTwoSequencesByLoc(SeqLocPtr slp1, SeqLocPtr slp2, CharPtr progname, BLAST_OptionsBlkPtr options)
3750 {
3751 return BlastTwoSequencesByLocEx(slp1, slp2, progname, options, NULL, NULL);
3752 }
3753
3754 SeqAlignPtr LIBCALL
3755 BlastTwoSequencesEx(BioseqPtr bsp1, BioseqPtr bsp2, CharPtr progname, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns)
3756 {
3757 return BlastTwoSequencesWithCallback(bsp1, bsp2, progname, options,
3758 other_returns, error_returns, NULL);
3759 }
3760
3761 SeqAlignPtr LIBCALL
3762 BlastTwoSequencesWithCallback(BioseqPtr bsp1, BioseqPtr bsp2, CharPtr progname, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *handle_results)PROTO((VoidPtr search)))
3763 {
3764 SeqAlignPtr seqalign;
3765 SeqLocPtr slp1=NULL, slp2=NULL;
3766
3767 if (bsp1 == NULL || bsp2 == NULL)
3768 return NULL;
3769
3770 slp1 = NULL;
3771 slp2 = NULL;
3772 if (!handle_results) {
3773 ValNodeAddPointer(&slp1, SEQLOC_WHOLE,
3774 SeqIdDup(SeqIdFindBest(bsp1->id, SEQID_GI)));
3775 ValNodeAddPointer(&slp2, SEQLOC_WHOLE,
3776 SeqIdDup(SeqIdFindBest(bsp2->id, SEQID_GI)));
3777 } else {
3778 ValNodeAddPointer(&slp1, SEQLOC_WHOLE,
3779 SeqIdDup(SeqIdFindBestAccession(bsp1->id)));
3780 ValNodeAddPointer(&slp2, SEQLOC_WHOLE,
3781 SeqIdDup(SeqIdFindBestAccession(bsp2->id)));
3782 }
3783 seqalign = BlastTwoSequencesByLocWithCallback(slp1, slp2, progname,
3784 options, other_returns, error_returns, handle_results, NULL);
3785
3786 slp1 = SeqLocFree(slp1);
3787 slp2 = SeqLocFree(slp2);
3788
3789 return seqalign;
3790 }
3791
3792 SeqAlignPtr LIBCALL
3793 BlastTwoSequences(BioseqPtr bsp1, BioseqPtr bsp2, CharPtr progname, BLAST_OptionsBlkPtr options)
3794 {
3795 return BlastTwoSequencesEx(bsp1, bsp2, progname, options, NULL, NULL);
3796 }
3797
3798 /*
3799 Runs blast on the fly between the query BioseqPtr (specified with a
3800 call to BLASTSetUpSearch) and the subject BioseqPtr.
3801 */
3802
3803
3804 BlastSearchBlkPtr LIBCALL
3805 BlastSequencesOnTheFlyEx(BlastSearchBlkPtr search, BioseqPtr subject_bsp)
3806 {
3807 Int4 index, subject_length;
3808 SeqPortPtr spp;
3809 SPCompressPtr spc=NULL;
3810 Uint1Ptr subject_seq, subject_seq_start;
3811 Uint1 residue;
3812
3813 if (subject_bsp == NULL)
3814 return NULL;
3815
3816 if (search == NULL || search->query_invalid)
3817 return NULL;
3818
3819 if (!search->pbp->mb_params) {
3820 if (search->result_struct)
3821 search->result_struct =
3822 BLASTResultsStructDelete(search->result_struct);
3823 search->result_struct =
3824 BLASTResultsStructNew(search->result_size,
3825 search->pbp->max_pieces, search->pbp->hsp_range_max);
3826 } else {
3827 if (search->mb_result_struct && search->mb_result_struct[0])
3828 search->mb_result_struct[0] =
3829 BLASTResultsStructDelete(search->mb_result_struct[0]);
3830 if (!search->mb_result_struct)
3831 search->mb_result_struct = (BLASTResultsStructPtr PNTR)
3832 MemNew(sizeof(BLASTResultsStructPtr));
3833 }
3834
3835 BlastHitListPurge(search->current_hitlist);
3836
3837 subject_seq_start = subject_seq = NULL;
3838
3839 subject_length = subject_bsp->length;
3840
3841 if (StringCmp(search->prog_name, "blastp") == 0)
3842 {
3843 subject_seq_start = (Uint1Ptr) MemNew(((subject_length)+2)*sizeof(Uint1));
3844 /* The first residue is the sentinel. */
3845 subject_seq_start[0] = NULLB;
3846 subject_seq = subject_seq_start+1;
3847 index = 0;
3848 spp = SeqPortNew(subject_bsp, FIRST_RESIDUE, LAST_RESIDUE,
3849 0, Seq_code_ncbistdaa);
3850 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
3851 {
3852 if (IS_residue(residue))
3853 {
3854 subject_seq[index] = residue;
3855 index++;
3856 }
3857 }
3858 subject_seq[index] = NULLB;
3859 spp = SeqPortFree(spp);
3860 }
3861 else if (StringCmp(search->prog_name, "blastn") == 0)
3862 {
3863 spp = SeqPortNew(subject_bsp, FIRST_RESIDUE, LAST_RESIDUE,
3864 0, Seq_code_ncbi4na);
3865 spc = SPCompressDNA(spp);
3866 subject_seq = spc->buffer;
3867 spp = SeqPortFree(spp);
3868 }
3869 else
3870 {
3871 return NULL;
3872 }
3873
3874 BlastTwoSequencesCoreEx(search, subject_bsp, subject_seq,
3875 subject_length);
3876
3877 if (spc)
3878 {
3879 SPCompressFree(spc);
3880 spc = NULL;
3881 }
3882 else
3883 {
3884 subject_seq_start = MemFree(subject_seq_start);
3885 }
3886
3887 return search;
3888 }
3889
3890 SeqAlignPtr LIBCALL
3891 BlastSequencesOnTheFlyByLoc(BlastSearchBlkPtr search, SeqLocPtr subject_slp)
3892 {
3893 Int4 index, subject_length;
3894 SeqAlignPtr seqalign=NULL;
3895 SeqPortPtr spp;
3896 SPCompressPtr spc=NULL;
3897 Uint1Ptr subject_seq, subject_seq_start;
3898 Uint1 residue;
3899
3900 if (subject_slp == NULL)
3901 return NULL;
3902
3903 if (search == NULL || search->query_invalid)
3904 return NULL;
3905
3906
3907 if (!search->pbp->mb_params) {
3908 if (search->result_struct)
3909 search->result_struct = BLASTResultsStructDelete(search->result_struct);
3910 search->result_struct =
3911 BLASTResultsStructNew(search->result_size,
3912 search->pbp->max_pieces, search->pbp->hsp_range_max);
3913 } else {
3914 if (search->mb_result_struct && search->mb_result_struct[0])
3915 search->mb_result_struct[0] =
3916 BLASTResultsStructDelete(search->mb_result_struct[0]);
3917 if (!search->mb_result_struct)
3918 search->mb_result_struct = (BLASTResultsStructPtr PNTR)
3919 MemNew(sizeof(BLASTResultsStructPtr));
3920 }
3921 BlastHitListPurge(search->current_hitlist);
3922
3923 subject_seq_start = subject_seq = NULL;
3924
3925 subject_length = SeqLocLen(subject_slp);
3926
3927 if (StringCmp(search->prog_name, "blastp") == 0)
3928 {
3929 subject_seq_start = (Uint1Ptr) MemNew(((subject_length)+2)*sizeof(Uint1));
3930 /* The first residue is the sentinel. */
3931 subject_seq_start[0] = NULLB;
3932 subject_seq = subject_seq_start+1;
3933 index = 0;
3934 spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbistdaa);
3935 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
3936 {
3937 if (IS_residue(residue))
3938 {
3939 subject_seq[index] = residue;
3940 index++;
3941 }
3942 }
3943 subject_seq[index] = NULLB;
3944 spp = SeqPortFree(spp);
3945 }
3946 else if (StringCmp(search->prog_name, "blastn") == 0)
3947 {
3948 spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbi4na);
3949 spc = SPCompressDNA(spp);
3950 subject_seq = spc->buffer;
3951 spp = SeqPortFree(spp);
3952 }
3953 else
3954 {
3955 return NULL;
3956 }
3957
3958 seqalign = BlastTwoSequencesCore(search, subject_slp, subject_seq, subject_length, FALSE);
3959
3960 if (spc)
3961 {
3962 SPCompressFree(spc);
3963 spc = NULL;
3964 }
3965 else
3966 {
3967 subject_seq_start = MemFree(subject_seq_start);
3968 }
3969
3970 AdjustOffSetsInSeqAlign(seqalign, search->query_slp, subject_slp);
3971
3972 return seqalign;
3973 }
3974
3975 SeqAlignPtr LIBCALL
3976 BlastSequencesOnTheFly(BlastSearchBlkPtr search, BioseqPtr subject_bsp)
3977 {
3978 SeqAlignPtr seqalign;
3979 SeqLocPtr slp;
3980
3981 slp = NULL;
3982 ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(subject_bsp->id, SEQID_GI)));
3983 seqalign = BlastSequencesOnTheFlyByLoc(search, slp);
3984 SeqLocFree(slp);
3985 return seqalign;
3986 }
3987 /*
3988 Translate a nucleotide sequence without ambiguity codes.
3989 This is used for the first-pass translation of the database.
3990
3991 BlastSearchBlkPtr search: overall BLAST structure.
3992 Int4 length: length of the nucl. sequence
3993 Uint1Ptr prot_seq: the (translated) protein sequence, with NULLB
3994 sentinels on either end. This array should be allocated
3995 with sufficient memory before the function is called.
3996 Uint1Ptr nt_seq: the original nucl. sequence.
3997
3998 The genetic code to be used is determined by the translation_table
3999 on the BlastSearchBlkPtr.
4000
4001 This function translates a packed (ncbi2na) nucl. alphabet. It
4002 views a basepair as being in one of four sets of 2-bits:
4003
4004 |0|1|2|3||0|1|2|3||0|1|2|3||...
4005
4006 1st byte | 2nd byte | 3rd byte...
4007
4008 A codon that starts at the beginning of the above sequence starts in
4009 state "0" and includes basepairs 0, 1, and 2. The next codon, in the
4010 same frame, after that starts in state "3" and includes 3, 0, and 1.
4011
4012 ** Optimization:
4013 changed the single main loop to
4014 - advance to state 0,
4015 - optimized inner loop does two (3 byte->4 codon) translation per iteration
4016 (loads are moved earlier so they can be done in advance.)
4017 - do remainder
4018 */
4019
/*
	Summary of the contract visible in the code below:

	- Returns 0 without writing anything if nt_seq or prot_seq is NULL, or
	  if (length - ABS(frame) + 1) is smaller than CODON_LENGTH.
	- Otherwise writes a NULLB sentinel to prot_seq[0], the translated
	  residues after it, and a closing NULLB sentinel.
	- frame > 0 walks nt_seq forward using search->translation_table;
	  any other frame walks nt_seq backward from its last byte using
	  search->translation_table_rc.
	- The return value is the number of residues written between the two
	  sentinels.

	"state" tracks which of the four 2-bit positions within the current
	byte the next codon begins at.
*/
4020 Int4 LIBCALL
4021 BlastTranslateUnambiguousSequence(BlastSearchBlkPtr search, Int4 length, Uint1Ptr prot_seq, Uint1Ptr nt_seq, Int2 frame)
4022
4023 {
4024 register int state;
4025 Int2 total_remainder;
4026 Int4 prot_length;
4027 register int byte_value, codon=0;
4028 Uint1 last_remainder, last_byte, remainder;
4029 register Uint1Ptr translation, nt_seq_end, nt_seq_start;
4030 Uint1Ptr prot_seq_start;
4031 int byte_value1,byte_value2,byte_value3,byte_value4,byte_value5;
4032
4033 prot_length=0;
4034 if (nt_seq == NULL || prot_seq == NULL || (length-ABS(frame)+1) < CODON_LENGTH)
4035 return prot_length;
4036
/* Leading sentinel; translated residues start at the next position. */
4037 *prot_seq = NULLB;
4038 prot_seq++;
4039
4040 /* record to determine protein length. */
4041 prot_seq_start = prot_seq;
4042
4043 if (frame > 0)
4044 translation = search->translation_table;
4045 else
4046 translation = search->translation_table_rc;
4047
/* Bases left over after the last complete group of four (one byte). */
4048 remainder = length%4;
4049
4050 if (frame > 0)
4051 {
/* nt_seq_end points at the last completely filled byte. */
4052 nt_seq_end = nt_seq + (length)/4 - 1;
4053 last_remainder = (4*(length/4) - frame + 1)%CODON_LENGTH;
4054 total_remainder = last_remainder+remainder;
4055
4056 state = frame-1;
4057 byte_value = *nt_seq;
4058
4059 /* If there's lots to do, advance to state 0, then enter fast loop */
4060 while (nt_seq < nt_seq_end)
4061 {
4062 switch (state)
4063 {
4064 case 0:
4065 codon = (byte_value >> 2);
4066 *prot_seq = translation[codon];
4067 prot_seq++;
4068 /* do state = 3 now, break is NOT missing. */
4069 case 3:
4070 codon = ((byte_value & 3) << 4);
4071 nt_seq++;
4072 byte_value = *nt_seq;
4073 codon += (byte_value >> 4);
4074 *prot_seq = translation[codon];
4075 prot_seq++;
4076 if (nt_seq >= nt_seq_end)
4077 {
4078 state = 2;
4079 break;
4080 }
4081 /* Go on to state = 2 if not at end. */
4082 case 2:
4083 codon = ((byte_value & 15) << 2);
4084 nt_seq++;
4085 byte_value = *nt_seq;
4086 codon += (byte_value >> 6);
4087 *prot_seq = translation[codon];
4088 prot_seq++;
4089 if (nt_seq >= nt_seq_end)
4090 {
4091 state = 1;
4092 break;
4093 }
4094 /* Go on to state = 1 if not at end. */
4095 case 1:
4096 codon = byte_value & 63;
4097 *prot_seq = translation[codon];
4098 prot_seq++;
4099 nt_seq++;
4100 byte_value = *nt_seq;
4101 state = 0;
4102 break;
4103 } /* end switch */
4104 /* switch ends at state 0, except when at end */
4105
4106
4107 /********************************************/
4108 /* optimized loop: start in state 0. continue til near end */
/* Each pass consumes six bytes and emits eight residues, leaving
   byte_value loaded with the next byte and state back at 0. */
4109 while (nt_seq < (nt_seq_end-10))
4110 {
4111 byte_value1 = *(++nt_seq);
4112 byte_value2 = *(++nt_seq);
4113 byte_value3 = *(++nt_seq);
4114 /* case 0: */
4115 codon = (byte_value >> 2);
4116 *prot_seq = translation[codon];
4117 prot_seq++;
4118
4119 /* case 3: */
4120 codon = ((byte_value & 3) << 4);
4121 codon += (byte_value1 >> 4);
4122 *prot_seq = translation[codon];
4123 prot_seq++;
4124
4125 byte_value4 = *(++nt_seq);
4126 /* case 2: */
4127 codon = ((byte_value1 & 15) << 2);
4128
4129 codon += (byte_value2 >> 6);
4130 *prot_seq = translation[codon];
4131 prot_seq++;
4132 /* case 1: */
4133 codon = byte_value2 & 63;
4134 byte_value5 = *(++nt_seq);
4135 *prot_seq = translation[codon];
4136 prot_seq++;
4137
4138 /* case 0: */
4139 codon = (byte_value3 >> 2);
4140 *prot_seq = translation[codon];
4141 prot_seq++;
4142 /* case 3: */
4143 byte_value = *(++nt_seq);
4144 codon = ((byte_value3 & 3) << 4);
4145 codon += (byte_value4 >> 4);
4146 *prot_seq = translation[codon];
4147 prot_seq++;
4148 /* case 2: */
4149 codon = ((byte_value4 & 15) << 2);
4150 codon += (byte_value5 >> 6);
4151 *prot_seq = translation[codon];
4152 prot_seq++;
4153 /* case 1: */
4154 codon = byte_value5 & 63;
4155 *prot_seq = translation[codon];
4156 prot_seq++;
4157 state=0;
4158 } /* end optimized while */
4159 /********************************************/
4160 } /* end while */
4161
4162
4163 if (state == 1)
4164 {
4165 /* This doesn't get done above, DON'T do the state = 0
4166 below if this is done. */
4167 byte_value = *nt_seq;
4168 codon = byte_value & 63;
4169 state = 0;
4170 *prot_seq = translation[codon];
4171 prot_seq++;
4172 }
4173 else if (state == 0)
4174 { /* This one doesn't get done above. */
4175 byte_value = *nt_seq;
4176 codon = ((byte_value) >> 2);
4177 state = 3;
4178 *prot_seq = translation[codon];
4179 prot_seq++;
4180 }
4181
/* One more codon is available when the leftover bases of the last full
   byte plus those of the trailing partial byte add up to a codon. */
4182 if (total_remainder >= CODON_LENGTH)
4183 {
4184 byte_value = *(nt_seq_end);
4185 last_byte = *(nt_seq_end+1);
4186 if (state == 0)
4187 {
4188 codon = (last_byte >> 2);
4189 }
4190 else if (state == 2)
4191 {
4192 codon = ((byte_value & 15) << 2);
4193 codon += (last_byte >> 6);
4194 }
4195 else if (state == 3)
4196 {
4197 codon = ((byte_value & 3) << 4);
4198 codon += (last_byte >> 4);
4199 }
4200 *prot_seq = translation[codon];
4201 prot_seq++;
4202 }
4203 *prot_seq = NULLB;
4204 }
4205 else
4206 {
/* Reverse direction: start at the last byte and walk toward nt_seq_start. */
4207 nt_seq_start = nt_seq;
4208 nt_seq += length/4;
4209 state = remainder+frame;
4210 /* Do we start in the last byte? This one has the lowest order
4211 bits set to represent the remainder, hence the odd coding here. */
4212 if (state >= 0)
4213 {
4214 last_byte = *nt_seq;
4215 nt_seq--;
4216 if (state == 0)
4217 {
4218 codon = (last_byte >> 6);
4219 byte_value = *nt_seq;
4220 codon += ((byte_value & 15) << 2);
4221 state = 1;
4222 }
4223 else if (state == 1)
4224 {
4225 codon = (last_byte >> 4);
4226 byte_value = *nt_seq;
4227 codon += ((byte_value & 3) << 4);
4228 state = 2;
4229 }
4230 else if (state == 2)
4231 {
4232 codon = (last_byte >> 2);
4233 state = 3;
4234 }
4235 *prot_seq = translation[codon];
4236 prot_seq++;
4237
4238 }
4239 else
4240 {
4241 state = 3 + (remainder + frame + 1);
4242 nt_seq--;
4243 }
4244
4245 byte_value = *nt_seq;
4246
4247 /* If there's lots to do, advance to state 3, then enter fast loop */
4248 while (nt_seq > nt_seq_start)
4249 {
4250 switch (state)
4251 {
4252 case 3:
4253 codon = (byte_value & 63);
4254 *prot_seq = translation[codon];
4255 prot_seq++;
4256 /* do state = 0 now, break is NOT missing. */
4257 case 0:
4258 codon = (byte_value >> 6);
4259 nt_seq--;
4260 byte_value = *nt_seq;
4261 codon += ((byte_value & 15) << 2);
4262 *prot_seq = translation[codon];
4263 prot_seq++;
4264 if (nt_seq <= nt_seq_start)
4265 {
4266 state = 1;
4267 break;
4268 }
4269 /* Go on to state = 1 if not at end. */
4270 case 1:
4271 codon = (byte_value >> 4);
4272 nt_seq--;
4273 byte_value = *nt_seq;
4274 codon += ((byte_value & 3) << 4);
4275 *prot_seq = translation[codon];
4276 prot_seq++;
4277 if (nt_seq <= nt_seq_start)
4278 {
4279 state = 2;
4280 break;
4281 }
4282 /* Go on to state = 2 if not at end. */
4283 case 2:
4284 codon = (byte_value >> 2);
4285 *prot_seq = translation[codon];
4286 prot_seq++;
4287 nt_seq--;
4288 byte_value = *nt_seq;
4289 state = 3;
4290 break;
4291 } /* end switch */
4292 /* switch ends at state 3, except when at end */
4293
4294
4295 /********************************************/
4296 /* optimized area: start in state 3. continue til near end */
/* Each pass consumes six bytes and emits eight residues, leaving
   byte_value loaded with the next byte; state is left unchanged (3). */
4297 while (nt_seq > (nt_seq_start+10))
4298 {
4299 byte_value1 = *(--nt_seq);
4300 byte_value2 = *(--nt_seq);
4301 byte_value3 = *(--nt_seq);
4302
4303 codon = (byte_value & 63);
4304 *prot_seq = translation[codon];
4305 prot_seq++;
4306 codon = (byte_value >> 6);
4307 codon += ((byte_value1 & 15) << 2);
4308 *prot_seq = translation[codon];
4309 prot_seq++;
4310 byte_value4 = *(--nt_seq);
4311 codon = (byte_value1 >> 4);
4312 codon += ((byte_value2 & 3) << 4);
4313 *prot_seq = translation[codon];
4314 prot_seq++;
4315 codon = (byte_value2 >> 2);
4316 *prot_seq = translation[codon];
4317 prot_seq++;
4318 byte_value5 = *(--nt_seq);
4319
4320 codon = (byte_value3 & 63);
4321 *prot_seq = translation[codon];
4322 prot_seq++;
4323 byte_value = *(--nt_seq);
4324 codon = (byte_value3 >> 6);
4325 codon += ((byte_value4 & 15) << 2);
4326 *prot_seq = translation[codon];
4327 prot_seq++;
4328 codon = (byte_value4 >> 4);
4329 codon += ((byte_value5 & 3) << 4);
4330 *prot_seq = translation[codon];
4331 prot_seq++;
4332 codon = (byte_value5 >> 2);
4333 *prot_seq = translation[codon];
4334 prot_seq++;
4335 } /* end optimized while */
4336 /********************************************/
4337
4338 } /* end while */
4339
/* A codon lying entirely within the first byte is not handled by the
   loop above; emit it here for the two states where one remains. */
4340 byte_value = *nt_seq;
4341 if (state == 3)
4342 {
4343 codon = (byte_value & 63);
4344 *prot_seq = translation[codon];
4345 prot_seq++;
4346 }
4347 else if (state == 2)
4348 {
4349 codon = (byte_value >> 2);
4350 *prot_seq = translation[codon];
4351 prot_seq++;
4352 }
4353 }
4354
/* Trailing sentinel; it is not counted in the returned length. */
4355 *prot_seq = NULLB;
4356
4357 return (prot_seq - prot_seq_start);
4358 } /* BlastTranslateUnambiguousSequence */
4359
4360
4361
4362 /*
4363 Gets an appropriate ID for the database (subject) sequence.
4364 Int4 hit_number is the index into the BLASTResultHitlistPtr,
4365 Boolean ordinal_number specifies whether an ordinal number (the
4366 db sequence number) or a real ID should be used.
4367 */
4368 SeqIdPtr LIBCALL
4369 BlastGetSubjectIdEx(BlastSearchBlkPtr search, Int4 hit_number, Boolean ordinal_number, ValNodePtr *vnpp, Int2 query_number)
4370 {
4371 BLASTResultHitlistPtr results;
4372 DbtagPtr dbtagptr;
4373 ObjectIdPtr obidp;
4374 SeqIdPtr subject_id=NULL, sip;
4375 Uint4 header;
4376 BLASTResultsStructPtr result_struct;
4377
4378 if (search->pbp->mb_params)
4379 result_struct = search->mb_result_struct[query_number];
4380 else
4381 result_struct = search->result_struct;
4382
4383 results = result_struct->results[hit_number];
4384 if (ordinal_number) {
4385
4386 obidp = ObjectIdNew();
4387 obidp->str = NULL;
4388 obidp->id = results->subject_id;
4389 dbtagptr = DbtagNew();
4390 if (search->rdfp) {
4391 dbtagptr->db = StringSave(search->rdfp->filename);
4392 }
4393 dbtagptr->tag = obidp;
4394 ValNodeAddPointer(&subject_id, SEQID_GENERAL, dbtagptr);
4395 } else if (search->rdfp) {
4396 if (vnpp == NULL) {
4397 readdb_get_descriptor(search->rdfp, results->subject_id, &subject_id, NULL);
4398 } else {
4399 header = 0;
4400 sip = NULL;
4401
4402 if(search->rdfp->formatdb_ver == FORMATDB_VER_TEXT) {
4403 while (readdb_get_header(search->rdfp, results->subject_id, &header, &sip, NULL) == TRUE)
4404 ValNodeAddPointer(vnpp, 0, sip);
4405 } else {
4406 BlastDefLinePtr bdfp, bdfp_head;
4407
4408 bdfp_head = FDReadDeflineAsn(search->rdfp, results->subject_id);
4409
4410 if(bdfp_head == NULL) {
4411 ErrPostEx(SEV_ERROR, 0, 0, "Failure to read defline ASN for %d", results->subject_id);
4412 return NULL;
4413 }
4414
4415 for(bdfp = bdfp_head; bdfp != NULL; bdfp = bdfp->next) {
4416 sip = SeqIdSetDup(bdfp->seqid);
4417 ValNodeAddPointer(vnpp, 0, sip);
4418 }
4419
4420 BlastDefLineSetFree(bdfp_head);
4421 }
4422 }
4423 } else {
4424 if (results->subject_info)
4425 subject_id = SeqIdDup(results->subject_info->sip);
4426 }
4427
4428 return subject_id;
4429 }
4430
4431 SeqIdPtr LIBCALL
4432 BlastGetSubjectId(BlastSearchBlkPtr search, Int4 hit_number, Boolean ordinal_number, ValNodePtr *vnpp)
4433 {
4434 return BlastGetSubjectIdEx(search, hit_number, ordinal_number, vnpp, 0);
4435 }
4436
4437 /*
4438 Used by HeapSort (in BioseqBlastEngine) to rank hitlists.
4439 */
4440
4441 int LIBCALLBACK
4442 evalue_compare_hits(VoidPtr v1, VoidPtr v2)
4443
4444 {
4445 BLASTResultHitlistPtr h1, h2;
4446 BLASTResultHitlistPtr *hp1, *hp2;
4447
4448 hp1 = (BLASTResultHitlistPtr *) v1;
4449 hp2 = (BLASTResultHitlistPtr *) v2;
4450 h1 = *hp1;
4451 h2 = *hp2;
4452
4453 /* Sort first by evalue, then by score in case all evalues are zero. */
4454
4455 if (h1->best_evalue < h2->best_evalue)
4456 return -1;
4457 if (h1->best_evalue > h2->best_evalue)
4458 return 1;
4459 if (h1->high_score > h2->high_score)
4460 return -1;
4461 if (h1->high_score < h2->high_score)
4462 return 1;
4463
4464 /* In case of equal scores and E-values order will be determined by
4465 subject id */
4466
4467 if (h1->subject_id > h2->subject_id)
4468 return -1;
4469 if (h1->subject_id < h2->subject_id)
4470 return 1;
4471
4472 return 0;
4473 }
4474
4475 /* Code in BLAST_CLUSTER_HITS is not currently in use */
4476
4477 #ifdef BLAST_CLUSTER_HITS
/* Pairs an HSP with its original position in the results: hitlist_index
   is the index of its hitlist in result_struct->results and hsp_index is
   its index within that hitlist's hsp_array. */
4478 typedef struct _blast_result_with_subject_id {
4479 BLASTResultHspPtr hsp;
4480 Int4 hitlist_index, hsp_index;
4481 } BlastResultHspWithId, PNTR BlastResultHspWithIdPtr;
4482
4483 static int LIBCALLBACK BLASTResultHspScoreCmp(VoidPtr v1, VoidPtr v2)
4484 {
4485 BLASTResultHspPtr h1, h2;
4486
4487 h1 = (*(BlastResultHspWithIdPtr PNTR) v1)->hsp;
4488 h2 = (*(BlastResultHspWithIdPtr PNTR) v2)->hsp;
4489
4490 if (h1->score < h2->score)
4491 return 1;
4492 else if (h1->score > h2->score)
4493 return -1;
4494 else return 0;
4495 }
4496
4497 static int LIBCALLBACK ResultHspWithIdIndexCmp(VoidPtr v1, VoidPtr v2)
4498 {
4499 BlastResultHspWithIdPtr h1, h2;
4500
4501 h1 = *(BlastResultHspWithIdPtr PNTR) v1;
4502 h2 = *(BlastResultHspWithIdPtr PNTR) v2;
4503
4504 if (h1->hitlist_index < h2->hitlist_index)
4505 return -1;
4506 else if (h1->hitlist_index > h2->hitlist_index)
4507 return 1;
4508 else if (h1->hsp_index < h2->hsp_index)
4509 return -1;
4510 else if (h1->hsp_index > h2->hsp_index)
4511 return 1;
4512 else /* Should never happen */
4513 return 0;
4514 }
4515 #endif
4516
/* Thresholds referenced only by the BLAST_CLUSTER_HITS code in
   BioseqBlastEngineCore:
   - CLUSTER_LENGTH_THRESH: a candidate is skipped when the length
     difference of the two subjects, divided by the shorter length,
     exceeds this value.
   - CLUSTER_OVERLAP_THRESH: a candidate is skipped when the query overlap
     of the two HSPs, divided by the longer query span, is below this value.
   - CLUSTER_SCORE_THRESH: a candidate is removed when the pairwise bit
     score exceeds this value times the longer subject length. */
4517 #define CLUSTER_LENGTH_THRESH 0.1
4518 #define CLUSTER_OVERLAP_THRESH 0.9
4519 #define CLUSTER_SCORE_THRESH 1.6
4520
4521 static Nlm_FloatHi
4522 s_ComputeAverageLength(const BlastSearchBlk* search)
4523 {
4524 Nlm_FloatHi retval = 0.0;
4525
4526 if (StringCmp(search->prog_name, "blastn") != 0) {
4527 retval = BLAST_AA_AVGLEN;
4528 } else {
4529 retval = BLAST_NT_AVGLEN;
4530 }
4531
4532 if (search->rdfp) {
4533 Int4 total_number = 0;
4534 Int8 total_length = 0;
4535
4536 readdb_get_totals(search->rdfp, &total_length, &total_number);
4537 if (total_number > 0)
4538 retval = ((Nlm_FloatHi) total_length)/total_number;
4539 } else if (search->dblen > 0 && search->dbseq_num == 1) {
4540 retval = search->dblen;
4541 }
4542
4543 return retval;
4544 }
4545
/*
	Runs the search described by 'search' and returns its alignments.

	search:      prepared search block; NULL or a block flagged
	             query_invalid makes the function return NULL immediately.
	options:     supplies wordsize, dropoff and window values when the
	             position-based setup below is performed, and the
	             no_traceback flag.
	pos_matrix:  if non-NULL, installed as search->sbp->posMatrix; the
	             search is then switched to the PSI Karlin blocks, the
	             result structure and word finders are rebuilt, and the
	             cutoffs are recalculated before searching.

	Returns the SeqAlign list filled in by BLASTPostSearchLogic, or NULL
	when options->no_traceback is set.
*/
4546 SeqAlignPtr LIBCALL
4547 BioseqBlastEngineCore(BlastSearchBlkPtr search, BLAST_OptionsBlkPtr options,
4548 Int4Ptr *pos_matrix)
4549 {
4550 Int4 hitlist_max;
4551 SeqAlignPtr head, seqalign;
4552 #ifdef BLAST_CLUSTER_HITS
4553 BLASTResultHspPtr hsp, hsp1;
4554 BlastResultHspWithIdPtr PNTR hspp;
4555 BLASTResultsStructPtr result_struct;
4556 BLASTResultHitlistPtr result_hitlist;
4557 Int4 hspcnt, index, index1, index2;
4558 Int4 q_overlap;
4559 BioseqPtr bsp1, bsp2, PNTR bspp;
4560 BlastSearchBlkPtr search1;
4561 BLAST_KarlinBlkPtr kbp;
4562 FloatHi bit_score;
4563 #endif
4564
4565 head = seqalign = NULL;
4566
4567 if (search == NULL || search->query_invalid)
4568 return NULL;
4569
4570 /* If pos_matrix is not NULL, then psi-blast iterations are being
4571 performed. The first psi-blast iteration should be with normal
4572 blast. */
4573 if (pos_matrix)
4574 {
4575 search->sbp->posMatrix = pos_matrix;
4576 search->positionBased = TRUE;
4577 search->sbp->kbp = search->sbp->kbp_psi;
4578 search->sbp->kbp_gap = search->sbp->kbp_gap_psi;
/* Replace the result structure with an empty one of the same capacity. */
4579 hitlist_max = search->result_struct->hitlist_max;
4580 search->result_struct = BLASTResultsStructDelete(search->result_struct);
4581 search->result_struct = BLASTResultsStructNew(hitlist_max, search->pbp->max_pieces, search->pbp->hsp_range_max);
4582 if (search->allocated & BLAST_SEARCH_ALLOC_WFP_FIRST)
4583 {
4584 search->wfp_first = BLAST_WordFinderDestruct(search->wfp_first);
4585 search->wfp_first = BLAST_WordFinderNew(search->sbp->alphabet_size,options->wordsize,1, FALSE);
4586 }
4587
4588 if (search->allocated & BLAST_SEARCH_ALLOC_WFP_SECOND)
4589 {
4590 search->wfp_second = BLAST_WordFinderDestruct(search->wfp_second);
4591 search->wfp_second = BLAST_WordFinderNew(search->sbp->alphabet_size,options->wordsize,1, FALSE);
4592 }
4593
4594
4595 /* Only find words once if thresholds are the same. */
4596 search->wfp = search->wfp_first;
4597 if (search->whole_query == TRUE) {
4598 BlastNewFindWords(search, 0, search->context[search->first_context].query->length, search->pbp->threshold_second, (Uint1) 0);
4599 } else {
4600 BlastNewFindWords(search, search->required_start, search->required_end, search->pbp->threshold_second, (Uint1) 0);
4601 }
4602 lookup_position_aux_destruct(search->wfp->lookup);
/* NOTE(review): a word finder created for wfp_second just above is
   overwritten by this assignment; confirm it is released elsewhere. */
4603 search->wfp_second = search->wfp_first;
4604
4605 /* Unless search->pbp->cutoff_s[2]_set is set, we wish to calculate
4606 cutoff_s[2] from cutoff_e[2], rather than the other way around.
4607 Setting cutoff_s[2] to zero, as was the case in the first call to
4608 blast_set_parameters, accomplishes this.
4609 */
4610 if (!search->pbp->cutoff_s_set) {
4611 search->pbp->cutoff_s = 0;
4612 }
4613 if (!search->pbp->cutoff_s2_set) {
4614 search->pbp->cutoff_s2 = 0;
4615 }
4616 /* recalculate the cutoff scores with the newly calculated
4617 Karlin-Altschul parameters. */
4618 blast_set_parameters(search,
4619 options->dropoff_1st_pass,
4620 options->dropoff_2nd_pass,
4621 s_ComputeAverageLength(search),
4622 search->searchsp_eff,
4623 options->window_size);
4624 }
4625
4626 /* Starting awake thread if multithreaded. */
4627 if (search->searchsp_eff > AWAKE_THR_MIN_SIZE)
4628 BlastStartAwakeThread(search->thr_info);
4629
4630 /* THE BLAST SEARCH IS HERE */
4631 do_the_blast_run(search);
4632
4633 #ifdef BLAST_CLUSTER_HITS
4634 if (!search->pbp->mb_params) {
4635 /* Cluster hits by region within the query */
4636 /* Assume that hits are already sorted in each hitlist by score */
4637 ValNodePtr mask;
4638 result_struct = search->result_struct;
4639 hspcnt = 0;
4640 /* Collect all HSPs in one array */
4641
4642 bspp = (BioseqPtr PNTR) Malloc(result_struct->hitlist_count*
4643 sizeof(BioseqPtr));
4644 for (index=0; index<result_struct->hitlist_count; index++) {
4645 hspcnt += result_struct->results[index]->hspcnt;
4646 bspp[index] = readdb_get_bioseq(search->rdfp,
4647 result_struct->results[index]->subject_id);
4648 }
4649
4650 hspp = (BlastResultHspWithIdPtr PNTR)
4651 Malloc(hspcnt*sizeof(BlastResultHspWithIdPtr));
4652 index2 = 0;
4653 for (index=0; index<result_struct->hitlist_count; index++) {
4654 result_hitlist = result_struct->results[index];
4655 for (index1=0; index1<result_hitlist->hspcnt; index1++) {
4656 hspp[index2] = (BlastResultHspWithIdPtr)
4657 Malloc(sizeof(BlastResultHspWithId));
4658 hspp[index2]->hitlist_index = index;
4659 hspp[index2]->hsp_index = index1;
4660 hspp[index2++]->hsp = &(result_hitlist->hsp_array[index1]);
4661 }
4662 }
4663 /* Sort by score */
4664 HeapSort((VoidPtr)hspp, hspcnt, sizeof(BLASTResultHspPtr),
4665 BLASTResultHspScoreCmp);
4666 index = 0;
/* Each surviving HSP in turn becomes the head of a cluster; later HSPs
   that belong to the same cluster are marked by setting hsp to NULL. */
4667 while (index<hspcnt) {
4668 hsp = hspp[index]->hsp;
4669 index2 = 0;
4670
4671 result_hitlist =
4672 search->result_struct->results[hspp[index]->hitlist_index];
4673 bsp1 = bspp[hspp[index]->hitlist_index];
4674
4675 search1 =
4676 BlastQuerySequenceSetUp(bsp1, search->prog_name,
4677 options);
4678 for (index1=index+1; index1<hspcnt; index1++) {
4679 /* Check if the next hit passes a simple test to be a
4680 candidate to belong to this cluster */
4681 if (hspp[index1]->hsp==NULL)
4682 continue;
4683 hsp1 = hspp[index1]->hsp;
4684 result_hitlist =
4685 search->result_struct->results[hspp[index1]->hitlist_index];
4686 bsp2 = bspp[hspp[index1]->hitlist_index];
4687 if (((FloatHi)ABS(bsp1->length - bsp2->length)) /
4688 MIN(bsp1->length, bsp2->length) > CLUSTER_LENGTH_THRESH)
4689 continue;
4690 q_overlap =
4691 MIN(hsp->query_offset+hsp->query_length,
4692 hsp1->query_offset+hsp1->query_length) -
4693 MAX(hsp->query_offset, hsp1->query_offset);
4694 if (((FloatHi)q_overlap) /
4695 MAX(hsp->query_length, hsp1->query_length) <
4696 CLUSTER_OVERLAP_THRESH)
4697 continue;
4698
4699 /* We have a candidate for attaching to the cluster */
4700 if (hspp[index]->hitlist_index == hspp[index1]->hitlist_index) {
4701 /* Almost identical hit from same subject in the same
4702 area of the query - remove! */
4703 result_hitlist =
4704 search->result_struct->results[hspp[index1]->hitlist_index];
4705 hspp[index1]->hsp = NULL;
/* NOTE(review): there is no 'continue' here, so the pairwise search
   below still runs for an HSP that has just been removed. */
4706 }
4707
4708 /* Do the two sequences search to determine whether this
4709 candidate in fact belongs to this cluster */
4710 search1 = BlastSequencesOnTheFlyEx(search1, bsp2);
4711
4712 if (search1 && search1->result_struct->results[0]) {
4713 if (search1->pbp->gapped_calculation)
4714 kbp = search1->sbp->kbp_gap[search1->first_context];
4715 else
4716 kbp = search1->sbp->kbp[search1->first_context];
4717 bit_score = ((search1->result_struct->results[0]->high_score *
4718 kbp->Lambda) - kbp->logK)/NCBIMATH_LN2;
4719 if (bit_score > CLUSTER_SCORE_THRESH *
4720 MAX(bsp1->length, bsp2->length)) {
4721 /* remove the respective hit */
4722 hspp[index1]->hsp = NULL;
4723 }
4724 }
4725 }
4726 mask = search1->mask;
4727 while (mask) {
4728 SeqLocSetFree(mask->data.ptrvalue);
4729 mask = mask->next;
4730 }
4731 ValNodeFree(search1->mask);
4732 search1 = BlastSearchBlkDestruct(search1);
/* Advance to the next HSP that has not been removed. */
4733 for (++index; index<hspcnt && hspp[index]->hsp==NULL; index++);
4734 }
4735
4736 for (index=0; index<result_struct->hitlist_count; index++)
4737 BioseqFree(bspp[index]);
4738 MemFree(bspp);
4739 /* Remove all NULLs from hspp array */
4740 for (index=0, index1=0; index<hspcnt; index++) {
4741 if (hspp[index]->hsp != NULL) {
4742 if (index != index1)
4743 hspp[index1] = hspp[index];
4744 index1++;
4745 } else
4746 hspp[index] = MemFree(hspp[index]);
4747 }
4748 hspcnt = index1;
4749 /* Sort according to original hitlist and hsp indices */
4750 HeapSort((VoidPtr)hspp, hspcnt, sizeof(BLASTResultHspPtr),
4751 ResultHspWithIdIndexCmp);
4752
4753 /* Rearrange the hsp_arrays for all hitlists */
4754 index = 0;
4755 for (index2=0; index2<result_struct->hitlist_count; index2++) {
4756 index1 = 0;
4757 while (index<hspcnt && hspp[index]->hitlist_index == index2) {
4758 result_struct->results[index2]->hsp_array[index1] =
4759 *(hspp[index]->hsp);
4760 index++;
4761 index1++;
4762 }
4763 result_struct->results[index2]->hspcnt = index1;
4764 }
4765
4766 for (index=0; index<hspcnt; index++)
4767 hspp[index] = MemFree(hspp[index]);
4768 hspp = MemFree(hspp);
4769 }
4770 #endif /* Clustering hits */
4771
/* Caller asked for no traceback: stop the awake thread and return no
   alignments. */
4772 if (options->no_traceback) {
4773 BlastStopAwakeThread(search->thr_info);
4774 return NULL;
4775 }
4776
4777 BLASTPostSearchLogic(search, options, &head, TRUE);
4778
4779 /* Stop the awake thread. */
4780 BlastStopAwakeThread(search->thr_info);
4781
4782 return head;
4783 }
4784
4785 /*
4786 Deallocates all memory involved with the BlastHitRangePtr.
4787 */
4788
4789 BlastHitRangePtr LIBCALL
4790 BlastHitRangeDestruct(BlastHitRangePtr old)
4791
4792 {
4793 if (old == NULL)
4794 return NULL;
4795
4796 MemFree(old->range_list);
4797 MemFree(old->range_list_pointer);
4798
4799 return MemFree(old);
4800 }
4801
4802 /*
4803 Allocates a a BlastHitRangePtr, with two 'total'
4804 BlastDoubleInt4Ptr's.
4805 */
4806
4807 BlastHitRangePtr LIBCALL
4808 BlastHitRangeNew(Int4 total)
4809
4810 {
4811 BlastHitRangePtr bhrp;
4812 Int4 index;
4813
4814 bhrp = MemNew(sizeof(BlastHitRange));
4815
4816 bhrp->range_list = (BlastDoubleInt4Ptr) MemNew(total*sizeof(BlastDoubleInt4));
4817 bhrp->range_list_pointer = (BlastDoubleInt4Ptr PNTR) MemNew(total*sizeof(BlastDoubleInt4Ptr));
4818 for (index=0; index<total; index++)
4819 {
4820 bhrp->range_list_pointer[index] = &(bhrp->range_list[index]);
4821 }
4822
4823 bhrp->current = 0;
4824 bhrp->total = total;
4825
4826 return bhrp;
4827 }
4828
4829 static int LIBCALLBACK
4830 bhrp_compare(VoidPtr v1, VoidPtr v2)
4831
4832 {
4833 BlastDoubleInt4Ptr h1, h2;
4834 BlastDoubleInt4Ptr *hp1, *hp2;
4835
4836 hp1 = (BlastDoubleInt4Ptr PNTR) v1;
4837 hp2 = (BlastDoubleInt4Ptr PNTR) v2;
4838 h1 = *hp1;
4839 h2 = *hp2;
4840
4841 if (h1->gi < h2->gi)
4842 return -1;
4843 if (h1->gi > h2->gi)
4844 return 1;
4845
4846 return 0;
4847 }
4848
4849 BlastHitRangePtr LIBCALL
4850 BioseqHitRangeEngineCore(BlastSearchBlkPtr search, BLAST_OptionsBlkPtr options)
4851
4852 {
4853 BlastHitRangePtr bhrp=NULL;
4854 BLASTResultsStructPtr result_struct;
4855 Int4 hitlist_count, index, total_hsps;
4856 Int4 sequence_length, length;
4857 Uint1Ptr sequence;
4858
4859 if (search == NULL || search->query_invalid)
4860 return NULL;
4861
4862 /* Starting awake thread if multithreaded. */
4863 if (search->searchsp_eff > AWAKE_THR_MIN_SIZE)
4864 BlastStartAwakeThread(search->thr_info);
4865
4866 do_the_blast_run(search);
4867
4868 if (search->prog_number==blast_type_blastn) {
4869 /* Unconcatenate the strands by adjusting the query offsets in
4870 all hsps */
4871 search->context[search->first_context].query->length =
4872 search->query_context_offsets[search->first_context+1] - 1;
4873 /*BlastAdjustHitOffsets(search);*/
4874 }
4875
4876 if (StringCmp(search->prog_name, "blastn") == 0 &&
4877 search->pbp->gapped_calculation)
4878 {
4879 search->pbp->gap_open = options->gap_open;
4880 search->pbp->gap_extend = options->gap_extend;
4881 /*
4882 search->pbp->gap_x_dropoff = (BLAST_Score) (options->gap_x_dropoff*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
4883 search->pbp->gap_x_dropoff_final = (BLAST_Score) (options->gap_x_dropoff_final*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
4884 */
4885
4886
4887 result_struct = search->result_struct;
4888 hitlist_count = result_struct->hitlist_count;
4889 total_hsps = 0;
4890 for (index=0; index<hitlist_count; index++)
4891 {
4892 total_hsps += result_struct->results[index]->hspcnt;
4893 }
4894 bhrp = BlastHitRangeNew(total_hsps);
4895 bhrp->query_id = search->query_id;
4896
4897 result_struct = search->result_struct;
4898 hitlist_count = result_struct->hitlist_count;
4899
4900 sequence=NULL;
4901 sequence_length=0;
4902
4903 for (index=0; index<hitlist_count; index++)
4904 {
4905 length = readdb_get_sequence_ex(search->rdfp, result_struct->results[index]->subject_id, &sequence, &sequence_length, TRUE);
4906 SumBlastGetGappedAlignmentEx(search, index, FALSE, FALSE, sequence+1, length, FALSE, NULL, bhrp, 0);
4907 }
4908 sequence = MemFree(sequence);
4909 }
4910 else
4911 {
4912 return NULL;
4913 }
4914
4915 HeapSort(bhrp->range_list_pointer, bhrp->current, sizeof(BlastHitRangePtr PNTR), bhrp_compare);
4916
4917 /* Stop the awake thread. */
4918 BlastStopAwakeThread(search->thr_info);
4919
4920 return bhrp;
4921 }
4922
4923 SeqAlignPtr LIBCALL
4924 BioseqBlastEngineEx(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total)
4925
4926 {
4927 SeqLocPtr slp;
4928 SeqAlignPtr seqalign;
4929
4930 slp = NULL;
4931 ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
4932 seqalign = BioseqBlastEngineByLocEx(slp, progname, database, options, other_returns, error_returns, callback, seqid_list, gi_list, gi_list_total);
4933 SeqLocFree(slp);
4934
4935 return seqalign;
4936 }
4937
4938 SeqAlignPtr LIBCALL
4939 BioseqBlastEngine(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)))
4940 {
4941 /* --KM added NULL mult_queries param to call */
4942 return BioseqBlastEngineWithCallbackMult(bsp, progname, database, options, other_returns, error_returns, callback, NULL, NULL);
4943 }
4944
4945 SeqAlignPtr LIBCALL
4946 BioseqBlastEngineWithCallback(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), int (LIBCALLBACK *handle_results)PROTO((VoidPtr srch)))
4947 {
4948 return BioseqBlastEngineWithCallbackMult(bsp, progname, database, options, other_returns, error_returns, callback, NULL, NULL);
4949 }
4950
4951 /* --KM added mult_queries parameter */
4952 SeqAlignPtr LIBCALL
4953 BioseqBlastEngineWithCallbackMult(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), int (LIBCALLBACK *handle_results)PROTO((VoidPtr srch)), QueriesPtr mult_queries)
4954 {
4955 SeqLocPtr slp;
4956 SeqAlignPtr seqalign;
4957
4958 slp = NULL;
4959 ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
4960 seqalign = BioseqBlastEngineByLocWithCallbackMult(slp, progname, database, options, other_returns, error_returns, callback, NULL, NULL, 0, handle_results, mult_queries);/* --KM pass mult_queries */
4961 SeqLocFree(slp);
4962
4963 return seqalign;
4964 }
4965
4966
4967
4968 SeqAlignPtr LIBCALL
4969 BioseqBlastEngineByLoc(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)))
4970
4971 {
4972 return BioseqBlastEngineByLocEx(slp, progname, database, options, other_returns, error_returns, callback, NULL, NULL, 0);
4973
4974 }
4975
4976 SeqAlignPtr LIBCALL
4977 BioseqBlastEngineByLocEx(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total)
4978
4979 {
4980 return BioseqBlastEngineByLocWithCallback(slp, progname, database, options, other_returns, error_returns, callback, seqid_list, gi_list, gi_list_total, NULL); /* --KM pass NULL mult_queries */
4981 }
4982
4983 SeqAlignPtr LIBCALL
4984 BioseqBlastEngineByLocWithCallback(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total, int (LIBCALLBACK *handle_results)PROTO((VoidPtr srch)))
4985 {
4986 return BioseqBlastEngineByLocWithCallbackMult(slp, progname, database, options, other_returns, error_returns, callback, seqid_list, gi_list, gi_list_total, handle_results, NULL);
4987 }
4988
4989 /* --KM added mult_queries param */
4990 SeqAlignPtr LIBCALL
4991 BioseqBlastEngineByLocWithCallbackMult(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total, int (LIBCALLBACK *handle_results)PROTO((VoidPtr srch)), QueriesPtr mult_queries)
4992 {
4993 Boolean options_allocated=FALSE;
4994 BlastSearchBlkPtr search;
4995 Int2 status;
4996 SeqAlignPtr head;
4997 SeqLocPtr whole_slp=NULL;
4998 /* Futamura */
4999 posSearchItems *posSearch;
5000 compactSearchItems *compactSearch = NULL;
5001 Boolean checkReturn = FALSE;
5002
5003 head = NULL;
5004
5005 if (error_returns)
5006 {
5007 *error_returns = NULL;
5008 }
5009
5010 if (other_returns)
5011 {
5012 *other_returns = NULL;
5013 }
5014
5015 if (progname == NULL)
5016 return NULL;
5017
5018 /* If no options, use default. */
5019 if (options == NULL)
5020 {
5021 options = BLASTOptionNew(progname, FALSE);
5022 options_allocated = TRUE;
5023 }
5024
5025 status = BLASTOptionValidateEx(options, progname, error_returns);
5026 if (status != 0)
5027 { /* error messages in other_returns? */
5028 return NULL;
5029 }
5030
5031 if (slp == NULL || database == NULL)
5032 return NULL;
5033
5034 if(options->is_rps_blast) {
5035 RPSInfoPtr rpsinfo;
5036 BioseqPtr bsp, fake_bsp;
5037 Boolean query_is_na;
5038
5039 if((bsp = BioseqLockById(SeqLocId(slp))) == NULL)
5040 return NULL;
5041
5042 /* RPS Blast discard program name and use specific RPS Blast
5043 logic for this */
5044
5045 if(bsp->mol == Seq_mol_aa) {
5046 query_is_na = FALSE;
5047 progname = "blastp";
5048 } else {
5049 query_is_na = TRUE;
5050 progname = "tblastn";
5051 }
5052 if((rpsinfo = RPSInitEx(database, !query_is_na, options)) == NULL) {
5053
5054 ErrPostEx(SEV_ERROR, 0, 0, "Failure to initialize RPS: %s %s",
5055 progname, database);
5056 return NULL;
5057 }
5058 /* Update size of the database in accordance with RPS Database size */
5059 RPSUpdateDbSize(options, rpsinfo, bsp->length);
5060
5061 if(!query_is_na)
5062 fake_bsp = bsp;
5063 else {
5064 options->db_genetic_code = options->genetic_code;
5065 fake_bsp = createFakeProtein();
5066 }
5067 search = BLASTSetUpSearch (fake_bsp, progname, fake_bsp->length, 0,
5068 NULL, options, NULL);
5069
5070 if (search == NULL)
5071 return NULL;
5072
5073 search->thr_info->tick_callback = NULL;
5074 search->thr_info->star_callback = NULL;
5075
5076 head = RPSBlastSearch(search, bsp, rpsinfo);
5077
5078 if(query_is_na)
5079 BioseqFree(fake_bsp);
5080 BioseqUnlock(bsp);
5081 RPSClose(rpsinfo);
5082 } else {
5083
5084 search = BLASTSetUpSearchByLocWithReadDbEx(slp, progname, SeqLocLen(slp), database, options, NULL, seqid_list, gi_list, gi_list_total, mult_queries);
5085 /* --KM pass mult_queries */
5086
5087 if (search == NULL) {
5088 /* We need to veryfy if database name is wrong and to set error
5089 returns correctly */
5090 Boolean is_prot;
5091 BlastErrorMsgPtr error_msg;
5092 CharPtr chptr;
5093 ReadDBFILEPtr rdfp=NULL;
5094
5095 if(!StringICmp(progname, "blastp") ||
5096 !StringICmp(progname, "blastx")) {
5097 is_prot = TRUE;
5098 } else {
5099 is_prot = FALSE;
5100 }
5101
5102 rdfp = readdb_new(database, is_prot);
5103 if(rdfp == NULL) {
5104 error_msg = MemNew(sizeof(BlastErrorMsg));
5105 chptr = MemNew(StringLen(database) + 256);
5106 sprintf(chptr, "Database %s was not found or does not exist",
5107 database);
5108 error_msg->msg = chptr;
5109 error_msg->level = 3; /* FATAL */
5110 ValNodeAddPointer(error_returns, 0, error_msg);
5111 }
5112
5113 readdb_destruct(rdfp);
5114 return NULL;
5115 }
5116
5117 search->thr_info->tick_callback = callback;
5118 search->thr_info->star_callback = callback;
5119 search->handle_results = handle_results;
5120 search->output = options->output;
5121
5122 /* Futamura psitblastn */
5123 if (options->recoverCheckpoint)
5124 search->positionBased = TRUE;
5125 else
5126 search->positionBased = FALSE;
5127
5128 if (options->recoverCheckpoint) {
5129 posSearch = (posSearchItems *) MemNew(1 * sizeof(posSearchItems));
5130 compactSearch = compactSearchNew(compactSearch);
5131 copySearchItems(compactSearch, search, options->matrix);
5132 posInitializeInformation(posSearch,search);
5133 /*AAS*/
5134
5135 checkReturn = posReadCheckpoint(posSearch, compactSearch,
5136 options->CheckpointFileName,
5137 NO_SCOREMAT_IO,
5138 &(search->error_return));
5139 /* Reading the checkpoint changes the statistical parameters
5140 kbp_psi and kbp_gap_psi. Recalculate the cutoffs by calling
5141 blast_set_parameters. */
5142
5143 /* Unless search->pbp->cutoff_s[2]_set is set, we wish to calculate
5144 cutoff_s[2] from cutoff_e[2], rather than the other way around.
5145 Setting cutoff_s[2] to zero, as was the case in the first call to
5146 blast_set_parameters, accomplishes this.
5147 */
5148 if (!search->pbp->cutoff_s_set) {
5149 search->pbp->cutoff_s = 0;
5150 }
5151 if (!search->pbp->cutoff_s2_set) {
5152 search->pbp->cutoff_s2 = 0;
5153 }
5154 search->sbp->kbp = search->sbp->kbp_psi;
5155 search->sbp->kbp_gap = search->sbp->kbp_gap_psi;
5156 blast_set_parameters(search,
5157 options->dropoff_1st_pass,
5158 options->dropoff_2nd_pass,
5159 s_ComputeAverageLength(search),
5160 search->searchsp_eff,
5161 options->window_size);
5162
5163 search->sbp->posMatrix = posSearch->posMatrix;
5164 if (NULL == search->sbp->posFreqs)
5165 search->sbp->posFreqs = allocatePosFreqs(compactSearch->qlength,
5166 compactSearch->alphabetSize);
5167 copyPosFreqs(posSearch->posFreqs,search->sbp->posFreqs,
5168 compactSearch->qlength, compactSearch->alphabetSize);
5169
5170 if (!checkReturn) {
5171 BlastConstructErrorMessage("BioseqBlastEngineByLocEx",
5172 "Error recovering from checkpoint", 3, error_returns);
5173 return NULL;
5174 }
5175 }
5176
5177 /* ----- Here is real BLAST search done ------- */
5178 if (search->positionBased)
5179 head = BioseqBlastEngineCore(search, options, search->sbp->posMatrix);
5180 else if (options->is_megablast_search) {
5181 SeqAlignPtr PNTR seqalignp;
5182 seqalignp = BioseqMegaBlastEngineCore(search, options);
5183 head = *seqalignp;
5184 } else
5185 head = BioseqBlastEngineCore(search, options, NULL);
5186 /* end Futamura */
5187
5188 }
5189
5190 if (search->error_return) {
5191 ValNodeLink(error_returns, search->error_return);
5192 search->error_return = NULL;
5193 }
5194
5195 if (other_returns) { /* format dbinfo etc. */
5196 *other_returns = BlastOtherReturnsPrepare(search);
5197 }
5198
5199 if (options_allocated) {
5200 options = BLASTOptionDelete(options);
5201 }
5202
5203 search = BlastSearchBlkDestruct(search);
5204
5205 if(!options->is_rps_blast) {
5206
5207 /* Adjsut the offset if the query does not cover the entire sequence. */
5208 if (slp->choice != SEQLOC_WHOLE) {
5209 ValNodeAddPointer(&whole_slp, SEQLOC_WHOLE, SeqIdFindBest(SeqLocId(slp), SEQID_GI));
5210 if (SeqLocAinB(whole_slp, slp) != 0) {
5211 AdjustOffSetsInSeqAlign(head, slp, NULL);
5212 }
5213 ValNodeFree(whole_slp);
5214 }
5215 }
5216
5217 return head;
5218 }
5219
5220 SeqLocPtr LIBCALL
5221 BioseqHitRangeEngine(BioseqPtr bsp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total)
5222
5223 {
5224 SeqLocPtr slp;
5225
5226 slp = NULL;
5227 ValNodeAddPointer(&slp, SEQLOC_WHOLE, SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
5228 return BioseqHitRangeEngineByLoc(slp, progname, database, options, other_returns, error_returns, callback, seqid_list, gi_list, gi_list_total);
5229 }
5230
5231 SeqLocPtr
5232 HitRangeToSeqLoc(BlastHitRangePtr bhrp, Int4 link_value, Boolean combine)
5233
5234 {
5235 Boolean make_seqloc, start=TRUE;
5236 Int4 index, total, start_pos=0, stop_pos, largest_stop_pos=0;
5237 SeqIntPtr sint;
5238 SeqLocPtr retval=NULL;
5239
5240 if (bhrp == NULL)
5241 return NULL;
5242
5243 total = bhrp->current;
5244 index=0;
5245 while (index < total)
5246 {
5247 if (combine)
5248 {
5249 if (start == TRUE)
5250 {
5251 start_pos = bhrp->range_list_pointer[index]->gi + bhrp->base_offset;
5252 start = FALSE;
5253 largest_stop_pos = 0;
5254 }
5255 else
5256 {
5257 /* Keep track of largest stop position. */
5258 largest_stop_pos = MAX(largest_stop_pos, bhrp->range_list_pointer[index]->ordinal_id + bhrp->base_offset);
5259 make_seqloc = FALSE;
5260 if (index == total-1) /* Last one. */
5261 {
5262 stop_pos = bhrp->range_list_pointer[index]->ordinal_id + bhrp->base_offset;
5263 start = TRUE;
5264 make_seqloc = TRUE;
5265 }
5266 else if (largest_stop_pos+link_value < bhrp->range_list_pointer[index+1]->gi + bhrp->base_offset)
5267 { /* Check overlap with next one. */
5268 stop_pos = bhrp->range_list_pointer[index]->ordinal_id + bhrp->base_offset;
5269 start = TRUE;
5270 make_seqloc = TRUE;
5271 }
5272
5273 if (make_seqloc)
5274 {
5275 sint = SeqIntNew();
5276 sint->from = start_pos;
5277 sint->to = MAX(largest_stop_pos, stop_pos);
5278 sint->strand = Seq_strand_plus;
5279 sint->id = SeqIdDup(SeqIdFindBest(bhrp->query_id, SEQID_GI));
5280 ValNodeAddPointer(&retval, SEQLOC_INT, sint);
5281 }
5282 index++;
5283 }
5284 }
5285 else
5286 {
5287 sint = SeqIntNew();
5288 sint->from = bhrp->range_list_pointer[index]->gi + bhrp->base_offset;
5289 sint->to = bhrp->range_list_pointer[index]->ordinal_id + bhrp->base_offset;
5290 sint->strand = Seq_strand_plus;
5291 sint->id = SeqIdDup(SeqIdFindBest(bhrp->query_id, SEQID_GI));
5292 ValNodeAddPointer(&retval, SEQLOC_INT, sint);
5293 index++;
5294 }
5295 }
5296
5297 return retval;
5298 }
5299
5300 #define HITRANGE_LINKVALUE 5
5301
5302 SeqLocPtr LIBCALL
5303 BioseqHitRangeEngineByLoc(SeqLocPtr slp, CharPtr progname, CharPtr database, BLAST_OptionsBlkPtr options, ValNodePtr *other_returns, ValNodePtr *error_returns, int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)), SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, Int4 gi_list_total)
5304
5305 {
5306 Boolean options_allocated=FALSE;
5307 BlastHitRangePtr bhrp;
5308 BlastSearchBlkPtr search;
5309 Int2 status;
5310 SeqLocPtr seqloc, whole_slp=NULL;
5311
5312 if (error_returns)
5313 {
5314 *error_returns = NULL;
5315 }
5316
5317 if (other_returns)
5318 {
5319 *other_returns = NULL;
5320 }
5321
5322 if (progname == NULL)
5323 return NULL;
5324
5325 /* If no options, use default. */
5326 if (options == NULL)
5327 {
5328 options = BLASTOptionNew(progname, FALSE);
5329 options_allocated = TRUE;
5330 }
5331
5332 status = BLASTOptionValidateEx(options, progname, error_returns);
5333 if (status != 0)
5334 { /* error messages in other_returns? */
5335 return NULL;
5336 }
5337
5338 if (slp == NULL || database == NULL)
5339 return NULL;
5340
5341 search = BLASTSetUpSearchByLocWithReadDbEx(slp, progname, SeqLocLen(slp), database, options, NULL, seqid_list, gi_list, gi_list_total, NULL); /* --KM pass NULL mult_queries */
5342
5343 if (search == NULL)
5344 {
5345 return NULL;
5346 }
5347
5348 search->thr_info->tick_callback = callback;
5349 search->thr_info->star_callback = callback;
5350
5351 bhrp = BioseqHitRangeEngineCore(search, options);
5352 if (bhrp == NULL) /* can happen for invalid queries. */
5353 return NULL;
5354
5355 if (slp->choice != SEQLOC_WHOLE) {
5356 ValNodeAddPointer(&whole_slp, SEQLOC_WHOLE, SeqIdFindBest(SeqLocId(slp), SEQID_GI));
5357 bhrp->base_offset = GetOffsetInLoc(slp, whole_slp, SEQLOC_START);
5358 ValNodeFree(whole_slp);
5359 }
5360
5361 seqloc = HitRangeToSeqLoc(bhrp, HITRANGE_LINKVALUE, TRUE);
5362 bhrp = BlastHitRangeDestruct(bhrp);
5363 if (search->error_return)
5364 {
5365 ValNodeLink(error_returns, search->error_return);
5366 search->error_return = NULL;
5367 }
5368
5369 if (other_returns)
5370 { /* format dbinfo etc. */
5371 *other_returns = BlastOtherReturnsPrepare(search);
5372 }
5373
5374 if (options_allocated)
5375 {
5376 options = BLASTOptionDelete(options);
5377 }
5378 search = BlastSearchBlkDestruct(search);
5379
5380 return seqloc;
5381 }
5382
5383 void LIBCALL BlastOtherReturnsFree(ValNodePtr other_returns)
5384 {
5385 BLAST_KarlinBlkPtr ka_params;
5386 BLAST_MatrixPtr matrix;
5387 CharPtr params_buffer;
5388 TxDfDbInfoPtr dbinfo;
5389 ValNodePtr mask_loc, mask_loc_start, vnp;
5390
5391 mask_loc = NULL;
5392
5393 for (vnp=other_returns; vnp; vnp = vnp->next) {
5394 switch (vnp->choice) {
5395 case TXDBINFO:
5396 dbinfo = vnp->data.ptrvalue;
5397 dbinfo = TxDfDbInfoDestruct(dbinfo);
5398 break;
5399 case TXKABLK_NOGAP:
5400 ka_params = vnp->data.ptrvalue;
5401 MemFree(ka_params);
5402 break;
5403 case TXKABLK_GAP:
5404 ka_params = vnp->data.ptrvalue;
5405 MemFree(ka_params);
5406 break;
5407 case TXPARAMETERS:
5408 params_buffer = vnp->data.ptrvalue;
5409 MemFree(params_buffer);
5410 break;
5411 case TXMATRIX:
5412 matrix = vnp->data.ptrvalue;
5413 matrix = BLAST_MatrixDestruct(matrix);
5414
5415 break;
5416 case SEQLOC_MASKING_NOTSET:
5417 case SEQLOC_MASKING_PLUS1:
5418 case SEQLOC_MASKING_PLUS2:
5419 case SEQLOC_MASKING_PLUS3:
5420 case SEQLOC_MASKING_MINUS1:
5421 case SEQLOC_MASKING_MINUS2:
5422 case SEQLOC_MASKING_MINUS3:
5423 ValNodeAddPointer(&mask_loc, vnp->choice, vnp->data.ptrvalue);
5424 break;
5425 default:
5426 break;
5427 }
5428 }
5429
5430 mask_loc_start = mask_loc;
5431 while (mask_loc) {
5432 SeqLocSetFree(mask_loc->data.ptrvalue);
5433 mask_loc = mask_loc->next;
5434 }
5435 ValNodeFree(mask_loc_start);
5436
5437 other_returns = ValNodeFree(other_returns);
5438
5439 return;
5440 }
5441
5442 ValNodePtr LIBCALL
5443 BlastOtherReturnsPrepare(BlastSearchBlkPtr search)
5444
5445 {
5446 BLAST_KarlinBlkPtr ka_params;
5447 BLAST_MatrixPtr blast_matrix;
5448 CharPtr parameters, chptr;
5449 ReadDBFILEPtr rdfp_var;
5450 TxDfDbInfoPtr dbinfo, head, dbinfo_var=NULL;
5451 ValNodePtr other_returns=NULL;
5452
5453 head = NULL;
5454 if (search->thr_info->blast_gi_list) {
5455 dbinfo = MemNew(sizeof(TxDfDbInfo));
5456 dbinfo->total_length = search->dblen;
5457 dbinfo->number_seqs = search->dbseq_num;
5458 dbinfo->subset = TRUE;
5459 head = dbinfo;
5460 dbinfo_var = dbinfo;
5461 }
5462
5463 rdfp_var = search->rdfp;
5464 while (rdfp_var) {
5465 dbinfo = MemNew(sizeof(TxDfDbInfo));
5466 dbinfo->name = StringSave(readdb_get_filename(rdfp_var));
5467
5468 if((chptr = readdb_get_title(rdfp_var)) == NULL)
5469 chptr = readdb_get_filename(rdfp_var);
5470 dbinfo->definition = StringSave(chptr);
5471
5472 dbinfo->date = StringSave(readdb_get_date(rdfp_var));
5473
5474 dbinfo->is_protein = readdb_is_prot(rdfp_var);
5475
5476 if (rdfp_var->aliaslen)
5477 dbinfo->total_length = rdfp_var->aliaslen;
5478 else
5479 dbinfo->total_length = readdb_get_dblen(rdfp_var);
5480 if (rdfp_var->aliasnseq)
5481 dbinfo->number_seqs = rdfp_var->aliasnseq;
5482 else
5483 dbinfo->number_seqs = readdb_get_num_entries(rdfp_var);
5484 if (head == NULL) {
5485 head = dbinfo;
5486 dbinfo_var = dbinfo;
5487 } else {
5488 dbinfo_var->next = dbinfo;
5489 dbinfo_var = dbinfo_var->next;
5490 }
5491 rdfp_var = rdfp_var->next;
5492 }
5493 if (head)
5494 ValNodeAddPointer (&other_returns, TXDBINFO, head);
5495
5496 if (search->sbp->kbp && search->sbp->kbp[search->first_context]) {
5497 ka_params = BlastKarlinBlkCreate();
5498 ka_params->Lambda = search->sbp->kbp[search->first_context]->Lambda;
5499 ka_params->K = search->sbp->kbp[search->first_context]->K;
5500 ka_params->H = search->sbp->kbp[search->first_context]->H;
5501 ValNodeAddPointer (&other_returns, TXKABLK_NOGAP, ka_params);
5502 }
5503
5504 if (search->pbp->gapped_calculation == TRUE) {
5505 if (search->sbp->kbp_gap && search->sbp->kbp_gap[search->first_context]) {
5506 ka_params = BlastKarlinBlkCreate();
5507 ka_params->Lambda = search->sbp->kbp_gap[search->first_context]->Lambda;
5508 ka_params->K = search->sbp->kbp_gap[search->first_context]->K;
5509 ka_params->H = search->sbp->kbp_gap[search->first_context]->H;
5510 ValNodeAddPointer (&other_returns, TXKABLK_GAP, ka_params);
5511 }
5512 }
5513
5514 if (search->query_invalid == FALSE) {
5515 parameters = FormatBlastParameters(search);
5516 ValNodeAddPointer (&other_returns, TXPARAMETERS, parameters);
5517 }
5518
5519 blast_matrix = BLAST_MatrixFill(search->sbp, search->positionBased);
5520 ValNodeAddPointer (&other_returns, TXMATRIX, blast_matrix);
5521
5522 if (search->mask)
5523 ValNodeLink(&other_returns, search->mask);
5524
5525 if (search->pbp->is_rps_blast) {
5526 ValNodeAddFloat(&other_returns, EFF_SEARCH_SPACE,
5527 ((Nlm_FloatHi) search->dblen_eff)*
5528 ((Nlm_FloatHi) (search->rps_qlen - search->length_adjustment)));
5529 } else {
5530 ValNodeAddFloat(&other_returns, EFF_SEARCH_SPACE,
5531 ((Nlm_FloatHi) search->dblen_eff)*
5532 ((Nlm_FloatHi) search->context[search->first_context].query->effective_length));
5533 }
5534 ValNodeAddInt(&other_returns, EFF_HSP_LENGTH, search->length_adjustment);
5535
5536 /* If Mega BLAST endpoint results, save them here */
5537 if (search->mb_endpoint_results && search->pbp->mb_params &&
5538 search->pbp->mb_params->no_traceback)
5539 /* Here 21 = BlastResponse_mbalign (see file objblst3.h) */
5540 ValNodeAddPointer(&other_returns, 21,
5541 search->mb_endpoint_results->data.ptrvalue);
5542
5543 return other_returns;
5544 }
5545
5546
5547 /*
5548 Deallocates memory for BLAST_ExtendWordParamsPtr
5549
5550 */
5551
5552 static BLAST_ExtendWordParamsPtr
5553 BLAST_ExtendWordParamsDestruct (BLAST_ExtendWordParamsPtr ewp_params)
5554
5555 {
5556 ewp_params = MemFree(ewp_params);
5557
5558 return ewp_params;
5559 }
5560
5561
5562 /*
5563 Allocates memory for the BLAST_ExtendWordParamsPtr.
5564
5565 This function also sets many of the parametes such as min_diag_length etc.
5566
5567 Int4 qlen: length of the query.
5568 Boolean multiple_hits: specifies whether multiple hits method is used.
5569 Int4 window_size: the max. distance between two hits that are extended.
5570 */
5571
5572 BLAST_ExtendWordParamsPtr
5573 BLAST_ExtendWordParamsNew (Int4 qlen, Boolean multiple_hits, Int4 window_size)
5574
5575 {
5576 BLAST_ExtendWordParamsPtr ewp_params;
5577 Int4 min_diag_length, bits_to_shift;
5578
5579 ewp_params= MemNew(sizeof(BLAST_ExtendWordParams));
5580
5581 if (ewp_params)
5582 {
5583 min_diag_length = 1;
5584 bits_to_shift = 0;
5585 /* What power of 2 is just longer than the query? */
5586 while (min_diag_length < (qlen+window_size))
5587 {
5588 min_diag_length = min_diag_length << 1;
5589 bits_to_shift++;
5590 }
5591 /* These are used in the word finders to shift and mask
5592 rather than dividing and taking the remainder. */
5593 ewp_params->bits_to_shift = bits_to_shift;
5594 ewp_params->min_diag_length = min_diag_length;
5595 ewp_params->min_diag_mask = min_diag_length-1;
5596 ewp_params->multiple_hits = multiple_hits;
5597 ewp_params->offset = window_size;
5598 ewp_params->window = window_size;
5599 }
5600 return ewp_params;
5601 }
5602
5603 /*
5604 Deallocates memory for the BLAST_ExtendWordPtr.
5605
5606 */
5607 BLAST_ExtendWordPtr LIBCALL
5608 BLAST_ExtendWordDestruct (BLAST_ExtendWordPtr ewp)
5609
5610 {
5611 if (ewp)
5612 {
5613 if (ewp->_buffer)
5614 ewp->_buffer = MemFree(ewp->_buffer);
5615
5616 ewp = MemFree(ewp);
5617 }
5618
5619 return ewp;
5620
5621 }
5622
5623 /*
5624 Allocates memory for the BLAST_ExtendWordPtr.
5625
5626 All of the memory for the arrays is allocated in one chunk
5627 called "_buffer". If multiple_hits is specified them room
5628 for "diag_level", "last_hit", and "version" is allocated and
5629 pointers into the array for these are set. If multiple_hits
5630 is not set, then only room for diag_level and version is allocated;
5631 last_hit is not needed.
5632
5633 Int4 qlen, dblen: length of the query and the LONGEST subject sequence.
5634 Boolean multiple_hits: specifies whether multiple hits method is used.
5635
5636 ** CFJ
5637 ** - previously buffer contained diag_level array, last_hit array, and version array
5638 ** change to contain array of struct {dl,lh,v}.
5639 **
5640 ** - Now that version is no longer used, combining the remaining 2 is probably not a big win.
5641
5642 */
5643 BLAST_ExtendWordPtr
5644 BLAST_ExtendWordNew (BLAST_ExtendWordParamsPtr ewp_params)
5645
5646 {
5647 BLAST_ExtendWordPtr ewp;
5648 int i;
5649
5650 ewp = MemNew(sizeof(BLAST_ExtendWord));
5651
5652 if (ewp)
5653 {
5654 /* Allocate the buffer to be used for Combo array. */
5655 ewp->_buffer = (Int4Ptr) MemNew(ewp_params->min_diag_length*sizeof(CfjModStruct));
5656
5657 if (ewp->_buffer == NULL)
5658 {
5659 ewp = BLAST_ExtendWordDestruct(ewp);
5660 return NULL;
5661 }
5662
5663 ewp->combo_array= (CfjModStruct *) ewp->_buffer;
5664 ewp_params->offset=0;
5665 for(i=0;i<ewp_params->min_diag_length;i++){
5666 ewp->combo_array[i].diag_level=0;
5667 ewp->combo_array[i].last_hit = -ewp_params->window;
5668 }
5669 }
5670
5671 return ewp;
5672 }
5673
5674 /*****************************************************************************
5675 *
5676 * Zeroe's out the memory in the array _buffer, if offset is greater than
5677 * INT4_MAX/2. The first "min_diag_length" spaces in the array are used
5678 * by the array "diag_level", the second "min_diag_length" spaces are used
5679 * by "last_hit". All of these are zeroed out. The last "min_diag_length"
5680 * spaces are used by "version"; these are not zeroed out.
5681 *
5682 * If offset is not greater than INT4_MAX/2, then the memory is not
5683 * zeroed out. Rather "offset" is used as a "zero-point" that is
5684 * always greater than the next possible value when the word finder
5685 * starts working on a new subject sequence.
5686 *
5687 ******************************************************************************/
5688 void LIBCALL
5689 BlastExtendWordExit(BlastSearchBlkPtr search)
5690
5691 {
5692 BLAST_ExtendWordPtr ewp;
5693 BLAST_ExtendWordParamsPtr ewp_params;
5694 Int2 index;
5695 Int4 i, min_diag_length;
5696
5697 ewp_params = search->ewp_params;
5698
5699 for (index=search->first_context; index<=search->last_context; index++)
5700 {
5701
5702 if (ewp_params->offset >= INT4_MAX/2)
5703 {
5704 ewp = search->context[index].ewp;
5705 if (ewp) {
5706 min_diag_length = ewp_params->min_diag_length;
5707 for(i=0;i<min_diag_length;i++)
5708 {
5709 ewp->combo_array[i].diag_level=0;
5710 ewp->combo_array[i].last_hit = -ewp_params->window;
5711 }
5712 }
5713 }
5714 }
5715
5716 if (ewp_params->offset < INT4_MAX/2)
5717 {
5718 ewp_params->offset += search->subject->length + ewp_params->window ;
5719 }
5720 else
5721 {
5722 ewp_params->offset = 0;
5723 }
5724 }
5725
5726
5727 BlastSequenceBlkPtr LIBCALL
5728 BlastSequenceBlkDestruct(BlastSequenceBlkPtr seq_blk)
5729
5730 {
5731
5732 if (seq_blk == NULL)
5733 return NULL;
5734
5735 /* Free from the start of sequence if it's filled in. */
5736 if (seq_blk->sequence_start != NULL)
5737 {
5738 seq_blk->sequence_start = MemFree(seq_blk->sequence_start);
5739 }
5740 else
5741 {
5742 seq_blk->sequence = MemFree(seq_blk->sequence);
5743 }
5744
5745 seq_blk = MemFree(seq_blk);
5746
5747 return seq_blk;
5748 }
5749
5750
5751
5752 static BLASTContextStructPtr
5753 BLASTContextFree(BLASTContextStructPtr context, Int2 number)
5754
5755 {
5756 Int2 index;
5757
5758 if (context == NULL)
5759 return NULL;
5760
5761 for (index=0; index<number; index++)
5762 {
5763 context[index].ewp = BLAST_ExtendWordDestruct(context[index].ewp);
5764 if (context[index].query_allocated == TRUE)
5765 {
5766 context[index].query = BlastSequenceBlkDestruct(context[index].query);
5767 }
5768 }
5769 context = MemFree(context);
5770
5771 return context;
5772 }
5773
5774 void BlastThrInfoFree(BlastThrInfoPtr thr_info)
5775 {
5776 VoidPtr status=NULL;
5777
5778 if (thr_info == NULL)
5779 return;
5780
5781 if (thr_info->index_thr)
5782 {
5783 NlmThreadJoin(thr_info->index_thr, &status);
5784 thr_info->index_thr = NULL;
5785 }
5786
5787 if (thr_info->awake_thr)
5788 {
5789 NlmThreadJoin(thr_info->awake_thr, &status);
5790 thr_info->awake_thr = NULL;
5791 if (thr_info->callback_mutex)
5792 {
5793 NlmMutexDestroy(thr_info->callback_mutex);
5794 thr_info->callback_mutex = NULL;
5795 }
5796 }
5797 BlastGiListDestruct(thr_info->blast_gi_list, TRUE);
5798
5799 NlmMutexDestroy(thr_info->db_mutex);
5800 NlmMutexDestroy(thr_info->results_mutex);
5801 NlmMutexDestroy(thr_info->callback_mutex);
5802
5803 MemFree(thr_info);
5804
5805 return;
5806 }
5807
5808 BlastThrInfoPtr BlastThrInfoNew(void)
5809 {
5810 BlastThrInfoPtr thr_info;
5811
5812 thr_info = MemNew(sizeof(BlastThrInfo));
5813
5814 return thr_info;
5815 }
5816
5817
5818 /*
5819 Allocates space for a copy of the BlastSearchBlk for use in
5820 multi-processing BLAST.
5821 */
5822
5823 BlastSearchBlkPtr LIBCALL
5824 BlastSearchBlkDuplicate (BlastSearchBlkPtr search)
5825
5826 {
5827
5828 BlastSearchBlkPtr new_search;
5829 Int2 index;
5830
5831 if (search == NULL)
5832 return NULL;
5833
5834 new_search = (BlastSearchBlkPtr) MemNew(sizeof(BlastSearchBlk));
5835 if (new_search == NULL)
5836 return NULL;
5837
5838 /* What's allocated here? */
5839 new_search->allocated = 0;
5840 new_search->allocated += BLAST_SEARCH_ALLOC_SUBJECT;
5841 new_search->allocated += BLAST_SEARCH_ALLOC_PBP;
5842 new_search->allocated += BLAST_SEARCH_ALLOC_CONTEXT;
5843 new_search->allocated += BLAST_SEARCH_ALLOC_READDB;
5844 new_search->allocated += BLAST_SEARCH_ALLOC_EWPPARAMS;
5845
5846 /* AM: Support for query multiplexing. */
5847 if( search->mult_queries )
5848 new_search->mult_queries = BlastDuplicateMultQueries( search->mult_queries );
5849
5850 /* Duplicate the rfdp struct, but not the contents. */
5851 new_search->rdfp = readdb_attach(search->rdfp);
5852 if (new_search->rdfp == NULL)
5853 {
5854 new_search = BlastSearchBlkDestruct(new_search);
5855 return NULL;
5856 }
5857
5858 new_search->positionBased = search->positionBased;
5859
5860 /* Changes, need to allocate. */
5861 new_search->pbp = MemDup(search->pbp, sizeof(BLAST_ParameterBlk));
5862 if (search->pbp->mb_params)
5863 new_search->pbp->mb_params =
5864 MemDup(search->pbp->mb_params, sizeof(MegaBlastParameterBlk));
5865 new_search->pbp->filter_string = StringSave(search->pbp->filter_string);
5866 new_search->sbp = search->sbp;
5867 new_search->wfp_first = search->wfp_first;
5868 if (search->prog_number==blast_type_blastn &&
5869 search->pbp->mb_params) {
5870 new_search->wfp_second =
5871 MemDup(search->wfp_second, sizeof(BLAST_WordFinder));
5872 new_search->wfp_second->lookup =
5873 MegaBlastLookupTableDup(search->wfp_second->lookup);
5874 new_search->wfp = new_search->wfp_second;
5875 } else
5876 new_search->wfp_second = search->wfp_second;
5877 new_search->prog_name = StringSave(search->prog_name);
5878 new_search->prog_number = search->prog_number;
5879 new_search->first_context = search->first_context;
5880 new_search->last_context = search->last_context;
5881 new_search->query_slp = search->query_slp;
5882 if (search->prog_number==blast_type_blastn) {
5883 new_search->query_context_offsets =
5884 MemDup(search->query_context_offsets,
5885 (search->last_context-search->first_context+2)*sizeof(Int4));
5886 }
5887 if (search->ewp_params)
5888 new_search->ewp_params = MemDup(search->ewp_params, sizeof(BLAST_ExtendWordParams));
5889 new_search->dblen = search->dblen;
5890 new_search->dblen_eff = search->dblen_eff;
5891 new_search->dblen_eff_real = search->dblen_eff_real;
5892 new_search->dbseq_num = search->dbseq_num;
5893 new_search->length_adjustment = search->length_adjustment;
5894 new_search->searchsp_eff = search->searchsp_eff;
5895
5896 /* Allocate last_context+1 elements, even if there are only last_context-first_context
5897 being used. */
5898 new_search->context = (BLASTContextStructPtr) MemNew((search->last_context+1)*sizeof(BLASTContextStruct));
5899 for (index=new_search->first_context; index<=new_search->last_context; index++)
5900 {
5901 if (new_search->ewp_params)
5902 new_search->context[index].ewp = BLAST_ExtendWordNew(new_search->ewp_params);
5903 new_search->context[index].query = search->context[index].query;
5904 new_search->context[index].query->frame = ContextToFrame(new_search, index);
5905 new_search->context[index].query_allocated = FALSE;
5906 }
5907
5908 new_search->context_factor = search->context_factor;
5909
5910 new_search->subject = (BlastSequenceBlkPtr) MemNew(sizeof(BlastSequenceBlk));
5911 /* 100 is the size limit in the present BLAST for hsp's. */
5912 new_search->hsp_array_size = search->hsp_array_size;
5913 /* The results are held here. */
5914 new_search->result_struct = search->result_struct;
5915 new_search->mb_result_struct = search->mb_result_struct;
5916 new_search->result_size = search->result_size;
5917 new_search->worst_evalue = DBL_MAX;
5918
5919 new_search->translation_table = search->translation_table;
5920 new_search->translation_table_rc = search->translation_table_rc;
5921 new_search->genetic_code = search->genetic_code;
5922 new_search->db_genetic_code = search->db_genetic_code;
5923
5924 if (search->translation_buffer_size > 0)
5925 { /* two extra for the NULLB's on end. */
5926 new_search->translation_buffer = MemNew((2+search->translation_buffer_size)*sizeof(Uint1));
5927 new_search->translation_buffer_size = search->translation_buffer_size;
5928 }
5929
5930 new_search->gap_align = NULL; /* Allocated automatically. */
5931
5932 new_search->whole_query = search->whole_query;
5933 new_search->required_start = search->required_start;
5934 new_search->required_end = search->required_end;
5935
5936 new_search->handle_results = search->handle_results;
5937 if (!search->pbp->mb_params)
5938 new_search->query_id = SeqIdSetDup(search->query_id);
5939 else {
5940 new_search->qid_array = (SeqIdPtr PNTR)
5941 Malloc((search->last_context/2 + 1)*sizeof(SeqIdPtr));
5942
5943 for (index=0; index<=search->last_context/2; index++)
5944 new_search->qid_array[index] = SeqIdSetDup(search->qid_array[index]);
5945 }
5946
5947 /* Duplicating DNAP sequence used in OOF search */
5948 if(search->pbp->is_ooframe)
5949 new_search->query_dnap = BlastMakeCopyQueryDNAP(search->query_dnap);
5950
5951 new_search->thr_info = search->thr_info;
5952 new_search->semid = search->semid;
5953
5954 #ifdef BLAST_COLLECT_STATS
5955 new_search->first_pass_hits = 0;
5956 new_search->second_pass_hits = 0;
5957 new_search->second_pass_trys = 0;
5958 new_search->first_pass_extends = 0;
5959 new_search->second_pass_extends = 0;
5960 new_search->first_pass_good_extends = 0;
5961 new_search->second_pass_good_extends = 0;
5962 new_search->number_of_seqs_better_E = 0;
5963 new_search->prelim_gap_no_contest = 0;
5964 new_search->prelim_gap_passed = 0;
5965 new_search->prelim_gap_attempts = 0;
5966 new_search->real_gap_number_of_hsps = 0;
5967 #endif
5968 new_search->output = search->output;
5969
5970 if (search->abmp) {
5971 new_search = GreedyAlignMemAlloc(new_search);
5972 if (new_search->abmp == NULL) {
5973 new_search = BlastSearchBlkDestruct(new_search);
5974 return NULL;
5975 }
5976 }
5977 if (search->mb_endpoint_results) {
5978 new_search->mb_endpoint_results = ValNodeNew(NULL);
5979 new_search->mb_endpoint_results->data.ptrvalue =
5980 search->mb_endpoint_results->data.ptrvalue;
5981 }
5982 new_search->mask1 = search->mask1;
5983
5984 return new_search;
5985 }
5986 /*
5987 Allocates space for the new BlastSearchBlk and some sturctures
5988 attached to it.
5989 */
5990
5991 BlastSearchBlkPtr LIBCALL
5992 BlastSearchBlkNew (Int2 wordsize, Int4 qlen, CharPtr dbname, Boolean multiple_hits, BLAST_Score threshold_first, BLAST_Score threshold_second, Int4 result_size, CharPtr prog_name, BlastAllWordPtr all_words, Int2 first_context, Int2 last_context, Int4 window_size)
5993
5994 {
5995 return BlastSearchBlkNewExtra(wordsize, qlen, dbname, multiple_hits, threshold_first, threshold_second, result_size, prog_name, all_words, first_context, last_context, NULL, window_size);
5996
5997 }
5998
5999 /*
6000 Allocates space for the new BlastSearchBlk and some sturctures
6001 attached to it.
6002 */
6003
6004 BlastSearchBlkPtr LIBCALL
6005 BlastSearchBlkNewExtra (Int2 wordsize, Int4 qlen, CharPtr dbname, Boolean multiple_hits, BLAST_Score threshold_first, BLAST_Score threshold_second, Int4 result_size, CharPtr prog_name, BlastAllWordPtr all_words, Int2 first_context, Int2 last_context, ReadDBFILEPtr rdfp, Int4 window_size)
6006
6007 {
6008
6009 BlastSearchBlkPtr search;
6010 BLASTContextStructPtr context;
6011 Uint1 is_prot;
6012 Int2 index;
6013 Uint1 alphabet;
6014 Int4 longest_db_seq=INT4_MAX;
6015 ReadDBFILEPtr rdfp_var;
6016 Int4 last_ewp_index;
6017
6018 search = (BlastSearchBlkPtr) MemNew(sizeof(BlastSearchBlk));
6019
6020 if (search != NULL)
6021 {
6022 search->allocated = 0; /* everything's allocated here. */
6023 search->allocated += BLAST_SEARCH_ALLOC_QUERY;
6024 search->allocated += BLAST_SEARCH_ALLOC_SUBJECT;
6025 search->allocated += BLAST_SEARCH_ALLOC_PBP;
6026 search->allocated += BLAST_SEARCH_ALLOC_SBP;
6027 search->allocated += BLAST_SEARCH_ALLOC_EWPPARAMS;
6028 search->allocated += BLAST_SEARCH_ALLOC_CONTEXT;
6029 search->allocated += BLAST_SEARCH_ALLOC_RESULTS;
6030 search->allocated += BLAST_SEARCH_ALLOC_READDB;
6031 search->allocated += BLAST_SEARCH_ALLOC_ALL_WORDS;
6032 search->allocated += BLAST_SEARCH_ALLOC_THRINFO;
6033 search->allocated += BLAST_SEARCH_ALLOC_MASK1;
6034
6035 search->positionBased = FALSE;
6036
6037 if (StringCmp(prog_name, "blastn") == 0)
6038 {
6039 alphabet = BLASTNA_SEQ_CODE;
6040 }
6041 else
6042 {
6043 alphabet = Seq_code_ncbistdaa;
6044 }
6045
6046 if (dbname != NULL)
6047 {
6048
6049 if (rdfp == NULL)
6050 {
6051 if (StringCmp(prog_name, "blastp") == 0 || StringCmp(prog_name, "blastx") == 0)
6052 { /* Protein DB for blastp and blastx. */
6053 is_prot = READDB_DB_IS_PROT;
6054 }
6055 else
6056 {
6057 is_prot = READDB_DB_IS_NUC;
6058 }
6059
6060 if ((search->rdfp=readdb_new(dbname, is_prot)) == NULL)
6061 {
6062 return NULL;
6063 }
6064 }
6065 else
6066 { /* Attaches to the rdfp, rather than reallocating it. */
6067 search->rdfp = readdb_attach(rdfp);
6068 }
6069
6070 rdfp_var = search->rdfp;
6071 longest_db_seq = 0;
6072 while (rdfp_var)
6073 {
6074 longest_db_seq = MAX(longest_db_seq, readdb_get_maxlen(rdfp_var));
6075 rdfp_var = rdfp_var->next;
6076 }
6077 }
6078
6079 search->first_context = first_context;
6080 search->last_context = last_context;
6081
6082 search->pbp =
6083 (BLAST_ParameterBlkPtr) MemNew(sizeof(BLAST_ParameterBlk));
6084
6085 search->sbp = BLAST_ScoreBlkNew(alphabet, last_context+1);
6086
6087 /* Only allocate these if thresholds are above zero, i.e. they will be used. */
6088 if (StringCmp(prog_name, "blastn") != 0)
6089 {
6090 if (threshold_second > 0)
6091 {
6092 search->wfp_first = BLAST_WordFinderNew(search->sbp->alphabet_size, wordsize, 1, FALSE);
6093 search->allocated += BLAST_SEARCH_ALLOC_WFP_FIRST;
6094 /* Only allocate a new WFP if 2nd th differs from 1st. */
6095 search->wfp_second = search->wfp_first;
6096 }
6097 }
6098 else
6099 {
6100 if (multiple_hits)
6101 search->wfp_second = BLAST_WordFinderNew(256, wordsize, READDB_COMPRESSION_RATIO, FALSE);
6102 else
6103 search->wfp_second = BLAST_WordFinderNew(256, wordsize, READDB_COMPRESSION_RATIO, TRUE);
6104 search->allocated += BLAST_SEARCH_ALLOC_WFP_SECOND;
6105 }
6106
6107 search->prog_name = StringSave(prog_name);
6108 search->prog_number = BlastGetProgramNumber(prog_name);
6109 if (qlen > 0)
6110 search->ewp_params = BLAST_ExtendWordParamsNew(qlen, multiple_hits, window_size);
6111 else
6112 search->ewp_params = NULL;
6113 context = search->context = (BLASTContextStructPtr)
6114 MemNew((1+search->last_context)*sizeof(BLASTContextStruct));
6115 if (search->prog_number != blast_type_blastn)
6116 last_ewp_index = search->last_context;
6117 else /* All queries (Mega BLAST) and strands are concatenated
6118 in a single sequence */
6119 last_ewp_index = search->first_context;
6120
6121 for (index=search->first_context; index<=search->last_context; index++)
6122 {
6123 if (search->ewp_params && index <= last_ewp_index)
6124 context[index].ewp = BLAST_ExtendWordNew(search->ewp_params);
6125 context[index].query = (BlastSequenceBlkPtr) MemNew(sizeof(BlastSequenceBlk));
6126 context[index].query->frame = ContextToFrame(search, index);
6127 context[index].query_allocated = TRUE;
6128 }
6129
6130 search->subject = (BlastSequenceBlkPtr) MemNew(sizeof(BlastSequenceBlk));
6131 /* 100 is the size limit in the present BLAST for hsp's. */
6132 search->hsp_array_size = 100;
6133 /* The results are held here. */
6134 search->result_size = result_size;
6135 /*
6136 search->result_struct = BLASTResultsStructNew(result_size, search->pbp->max_pieces, search->pbp->hsp_range_max);
6137 */
6138
6139 search->worst_evalue = DBL_MAX;
6140
6141 search->whole_query = TRUE;
6142 search->required_start = 0;
6143 search->required_end = -1;
6144
6145 search->all_words = all_words;
6146
6147 search->thr_info = BlastThrInfoNew();
6148 #ifdef BLAST_COLLECT_STATS
6149 search->first_pass_hits = 0;
6150 search->second_pass_hits = 0;
6151 search->second_pass_trys = 0;
6152 search->first_pass_extends = 0;
6153 search->second_pass_extends = 0;
6154 search->first_pass_good_extends = 0;
6155 search->second_pass_good_extends = 0;
6156 search->number_of_seqs_better_E = 0;
6157 search->prelim_gap_no_contest = 0;
6158 search->prelim_gap_passed = 0;
6159 search->prelim_gap_attempts = 0;
6160 search->real_gap_number_of_hsps = 0;
6161 #endif
6162 }
6163
6164 return search;
6165 }
6166
6167 /*
6168 Deallocates memory associated with the BlastSearchBlkPtr.
6169 */
6170
6171 BlastSearchBlkPtr LIBCALL
6172 BlastSearchBlkDestruct (BlastSearchBlkPtr search)
6173
6174 {
6175
6176 if (search != NULL) {
6177 if (search->allocated & BLAST_SEARCH_ALLOC_QUERY)
6178 search->original_seq = MemFree(search->original_seq);
6179
6180 if (search->allocated & BLAST_SEARCH_ALLOC_SUBJECT)
6181 search->subject = BlastSequenceBlkDestruct(search->subject);
6182
6183 if (search->allocated & BLAST_SEARCH_ALLOC_SBP)
6184 search->sbp = BLAST_ScoreBlkDestruct(search->sbp);
6185
6186 if (search->allocated & BLAST_SEARCH_ALLOC_WFP_FIRST)
6187 search->wfp_first = BLAST_WordFinderDestruct(search->wfp_first);
6188
6189 if (search->allocated & BLAST_SEARCH_ALLOC_WFP_SECOND) {
6190 search->wfp_second = BLAST_WordFinderDestruct(search->wfp_second);
6191 } else if (search->prog_number==blast_type_blastn &&
6192 search->pbp->mb_params) {
6193 search->wfp_second =
6194 MegaBlastWordFinderDeallocate(search->wfp_second);
6195 }
6196
6197 /* Freeing DNAP sequence used in OOF */
6198
6199 if(search->pbp != NULL && search->pbp->is_ooframe) {
6200 BlastFreeQueryDNAP(search->query_dnap);
6201 search->query_dnap = NULL;
6202 }
6203
6204 if (search->allocated & BLAST_SEARCH_ALLOC_EWPPARAMS) {
6205 search->ewp_params = BLAST_ExtendWordParamsDestruct(search->ewp_params);
6206 }
6207
6208 if (search->allocated & BLAST_SEARCH_ALLOC_CONTEXT) {
6209 search->context = BLASTContextFree(search->context, 1+search->last_context);
6210 }
6211
6212 if (search->allocated & BLAST_SEARCH_ALLOC_RESULTS) {
6213 if (!search->pbp->mb_params)
6214 search->result_struct =
6215 BLASTResultsStructDelete(search->result_struct);
6216 else {
6217 Int2 index;
6218 for (index=0; index<=search->last_context/2; index++)
6219 search->mb_result_struct[index] =
6220 BLASTResultsStructDelete(search->mb_result_struct[index]);
6221 search->mb_result_struct = MemFree(search->mb_result_struct);
6222 }
6223 }
6224
6225 if (search->allocated & BLAST_SEARCH_ALLOC_PBP) {
6226 search->pbp->mb_params = MemFree(search->pbp->mb_params);
6227 MemFree(search->pbp->filter_string);
6228 search->pbp = MemFree(search->pbp);
6229 }
6230
6231 if (search->allocated & BLAST_SEARCH_ALLOC_READDB) {
6232 search->rdfp = readdb_destruct(search->rdfp);
6233 }
6234
6235 if (search->current_hitlist) {
6236 search->current_hitlist = BlastHitListDestruct(search->current_hitlist);
6237 }
6238 search->subject_info = BLASTSubjectInfoDestruct(search->subject_info);
6239
6240
6241 if (search->prog_name) {
6242 search->prog_name = MemFree(search->prog_name);
6243 }
6244
6245 if (search->query_id) {
6246 search->query_id = SeqIdSetFree(search->query_id);
6247 }
6248 if (search->qid_array) {
6249 Int4 index;
6250 for (index=0; index<=search->last_context/2; index++)
6251 SeqIdSetFree(search->qid_array[index]);
6252 search->qid_array = MemFree(search->qid_array);
6253 }
6254 if (search->translation_buffer_size > 0) {
6255 search->translation_buffer = MemFree(search->translation_buffer);
6256 }
6257
6258 if (search->allocated & BLAST_SEARCH_ALLOC_TRANS_INFO) {
6259
6260 if (search->translation_table) {
6261 search->translation_table = MemFree(search->translation_table);
6262 }
6263
6264 if (search->translation_table_rc) {
6265 search->translation_table_rc = MemFree(search->translation_table_rc);
6266 }
6267 }
6268
6269 if (search->allocated & BLAST_SEARCH_ALLOC_ALL_WORDS) {
6270 search->all_words = BlastAllWordDestruct(search->all_words);
6271 }
6272
6273 search->gap_align = GapAlignBlkDelete(search->gap_align);
6274
6275 if (search->allocated & BLAST_SEARCH_ALLOC_QUERY_SLP) {
6276 if (search->query_slp)
6277 search->query_slp = SeqLocFree(search->query_slp);
6278 }
6279
6280
6281 if(search->allocated & BLAST_SEARCH_ALLOC_THRINFO)
6282 BlastThrInfoFree(search->thr_info);
6283
6284 if (search->abmp)
6285 search->abmp = GreedyAlignMemFree(search->abmp);
6286
6287 search->query_context_offsets = MemFree(search->query_context_offsets);
6288
6289 MemFree(search->mb_endpoint_results);
6290
6291 if (search->allocated & BLAST_SEARCH_ALLOC_MASK1)
6292 {
6293 if (search->mask1)
6294 {
6295 SeqLocSetFree(search->mask1->data.ptrvalue);
6296 search->mask1 = ValNodeFree(search->mask1);
6297 }
6298 }
6299
6300 search = MemFree(search);
6301 }
6302
6303 return search;
6304 }
6305
6306
6307 /*
6308 Deallocates all the memory associated with the BlastAllWordPtr.
6309 */
6310
6311 BlastAllWordPtr LIBCALL
6312 BlastAllWordDestruct(BlastAllWordPtr all_words)
6313
6314 {
6315 if (all_words == NULL)
6316 return NULL;
6317
6318 if (all_words->array)
6319 {
6320 all_words->array = MemFree(all_words->array);
6321 }
6322
6323 if (all_words->rows_allocated && all_words->array_storage)
6324 {
6325 all_words->array_storage = MemFree(all_words->array_storage);
6326 }
6327
6328 MemFree(all_words);
6329
6330 return NULL;
6331 }
6332
6333 /*
6334 Allocates the BlastAllWordPtr and sets some flags.
6335 */
6336 BlastAllWordPtr LIBCALL
6337 BlastAllWordNew(Int4 num_of_cols, Int4 wordsize, Boolean rows_allocated, Boolean specific)
6338
6339 {
6340 BlastAllWordPtr all_words;
6341
6342 all_words = MemNew(sizeof(BlastAllWord));
6343 if (all_words)
6344 {
6345 all_words->rows_allocated = rows_allocated;
6346 all_words->specific = specific;
6347 all_words->num_of_cols = num_of_cols;
6348 all_words->wordsize = wordsize;
6349 }
6350
6351 return all_words;
6352 }
6353
6354 BLAST_HitListPtr LIBCALL
6355 BlastHitListDestruct(BLAST_HitListPtr hitlist)
6356 {
6357 BLAST_HSPPtr PNTR hsp_array;
6358 Int4 hspcnt_max, index;
6359
6360 if (hitlist == NULL)
6361 return NULL;
6362
6363 hspcnt_max = hitlist->hspcnt_max;
6364 hsp_array = hitlist->hsp_array;
6365
6366 for (index=0; index<hspcnt_max; index++)
6367 {
6368 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
6369 }
6370
6371 hitlist->hsp_array = MemFree(hsp_array);
6372 hitlist->lh_helper = MemFree(hitlist->lh_helper);
6373
6374 MemFree(hitlist->exact_match_array);
6375
6376 hitlist = MemFree(hitlist);
6377
6378 return hitlist;
6379 }
6380
6381 /****************************************************************
6382
6383 Functions to allocate and destroy the BLAST_HitList.
6384
6385 ***************************************************************/
6386 BLAST_HitListPtr LIBCALL
6387 BlastHitListNew(BlastSearchBlkPtr search)
6388 {
6389 BLAST_HitListPtr hitlist;
6390
6391 hitlist = (BLAST_HitListPtr) MemNew(sizeof(BLAST_HitList));
6392
6393 if (hitlist == NULL)
6394 return hitlist;
6395
6396 hitlist->hspmax = search->hsp_array_size;
6397 hitlist->hsp_array = (BLAST_HSPPtr PNTR) MemNew(hitlist->hspmax*sizeof
6398 (BLAST_HSPPtr));
6399
6400 if (hitlist->hsp_array == NULL)
6401 {
6402 hitlist = BlastHitListDestruct(hitlist);
6403 return NULL;
6404 }
6405
6406 if (search->pbp->mb_params) {
6407 hitlist->exact_match_array = (MegaBlastExactMatchPtr)
6408 MemNew(hitlist->hspmax*sizeof(MegaBlastExactMatch));
6409 hitlist->exact_match_max = hitlist->hspmax;
6410 }
6411
6412 return hitlist;
6413 }
6414
6415
6416 /*
6417 This function translates the context number of a context into
6418 the frame of the sequence.
6419
6420 Arguments:
6421
6422 BlastSearchBlkPtr search: search structure,
6423 Int2 context_number: context number used by BLASTContextStruct array
6424 Boolean is_query: if TRUE, refers to query, otherwise the subject.
6425 */
6426
6427 Int2
6428 ContextToFrame(BlastSearchBlkPtr search, Int2 context_number)
6429
6430 {
6431 Int2 frame=255;
6432 Uint1 prog_number = search->prog_number;
6433
6434 if (prog_number == blast_type_blastn)
6435 {
6436 if (context_number % 2 == 0)
6437 frame = 1;
6438 else
6439 frame = -1;
6440 }
6441 else if (prog_number == blast_type_blastp ||
6442 prog_number == blast_type_tblastn ||
6443 prog_number == blast_type_psitblastn)
6444 { /* Query and subject are protein, no frame. */
6445 frame = 0;
6446 }
6447 else if (prog_number == blast_type_blastx || prog_number == blast_type_tblastx)
6448 {
6449 frame = context_number < 3 ? context_number+1 : -context_number+2;
6450 }
6451
6452 return frame;
6453 }
6454
6455 /*
6456 Allocates and fills in the BLASTSubjectInfo structure.
6457 */
6458
6459 BLASTSubjectInfoPtr LIBCALL
6460 BLASTSubjectInfoNew(SeqIdPtr sip, CharPtr defline, Int4 length)
6461
6462 {
6463 BLASTSubjectInfoPtr subject_info;
6464
6465 subject_info = (BLASTSubjectInfoPtr) MemNew(sizeof(BLASTSubjectInfo));
6466
6467 if (subject_info == NULL)
6468 return NULL;
6469
6470 subject_info->sip = sip;
6471 subject_info->defline = defline;
6472 subject_info->length = length;
6473
6474 return subject_info;
6475 }
6476
6477 /*
6478 Deallocates the BLASTSubjectInfo structure and the
6479 SeqIdPtr, as well as the defline.
6480 */
6481
6482 BLASTSubjectInfoPtr LIBCALL
6483 BLASTSubjectInfoDestruct(BLASTSubjectInfoPtr subject_info)
6484
6485 {
6486
6487 if (subject_info == NULL)
6488 return NULL;
6489
6490 SeqIdFree(subject_info->sip);
6491 MemFree(subject_info->defline);
6492 subject_info = MemFree(subject_info);
6493
6494 return subject_info;
6495 }
6496
6497
6498
6499 /*
6500 Destroys BLASTResultsStructure and associated memory.
6501 */
6502
6503 BLASTResultsStructPtr LIBCALL
6504 BLASTResultsStructDelete(BLASTResultsStructPtr result_struct)
6505
6506 {
6507 Int4 index;
6508 BLASTResultHitlistPtr PNTR results;
6509 BLASTHeapPtr hp, hpt;
6510
6511 if (result_struct == NULL)
6512 return NULL;
6513
6514 results = result_struct->results;
6515 for (index=0; index<result_struct->hitlist_max; index++)
6516 {
6517 if (results[index])
6518 {
6519 results[index] = BLASTResultHitlistFree(results[index]);
6520 }
6521 }
6522
6523
6524 for (hp = result_struct->heap_ptr; hp; )
6525 {
6526 hpt = hp->next;
6527 hp->heap = MemFree(hp->heap);
6528 hp = MemFree(hp);
6529 hp = hpt;
6530 }
6531 result_struct->results = MemFree(result_struct->results);
6532 result_struct = MemFree(result_struct);
6533
6534 return result_struct;
6535 }
6536
6537 /*
6538 returns BLASTResultsStruct.
6539 */
6540
6541 BLASTResultsStructPtr
6542 BLASTResultsStructNew(Int4 results_size, Int4 max_pieces, Int4 range_max)
6543
6544 {
6545 BLASTResultsStructPtr new;
6546 Int4 index;
6547
6548 new = MemNew(sizeof(BLASTResultsStruct));
6549 new->results = (BLASTResultHitlistPtr PNTR) MemNew(results_size*sizeof(BLASTResultHitlistPtr));
6550
6551 for (index=0; index<results_size; index++)
6552 new->results[index] = NULL;
6553
6554 new->hitlist_max = results_size;
6555 new->hitlist_count = 0;
6556 new->max_pieces = max_pieces;
6557 if (range_max > 0) {
6558 new->heap_ptr = (BLASTHeapPtr) MemNew(sizeof(BLASTHeapStruct));
6559 new->heap_ptr->cutvalue = INT4_MAX;
6560 new->heap_ptr->num_in_heap = new->heap_ptr->num_of_ref = 0;
6561 new->heap_ptr->prev = new->heap_ptr->next = NULL;
6562 new->heap_ptr->heap = (BLASTResultHspPtr PNTR) MemNew(sizeof(BLASTResultHspPtr)*range_max);
6563 }
6564 return new;
6565 }
6566
6567
6568 Uint1 AAForCodon (Uint1Ptr codon, CharPtr codes);
6569
6570 /*
6571 GetTranslation to get the translation of the nucl. sequence in the
6572 appropriate frame and with the appropriate GeneticCode.
6573
6574 The function return an allocated CharPtr, the caller must delete this.
6575 The first and last spaces of this CharPtr contain NULLB's.
6576 */
6577
6578 Uint1Ptr LIBCALL
6579 GetTranslation(Uint1Ptr query_seq, Int4 nt_length, Int2 frame, Int4Ptr length, CharPtr genetic_code)
6580 {
6581 Uint1 codon[CODON_LENGTH];
6582 Int4 index, index_prot;
6583 SeqMapTablePtr smtp;
6584 Uint1 residue, new_residue;
6585 Uint1Ptr prot_seq;
6586
6587 smtp = SeqMapTableFind(Seq_code_ncbistdaa, Seq_code_ncbieaa);
6588
6589 /* Allocate two extra spaces for NULLB's at beginning and end of seq. */
6590 prot_seq = (Uint1Ptr) MemNew((2+(nt_length+2)/CODON_LENGTH)*sizeof(Uint1));
6591
6592 /* The first character in the protein is the NULLB sentinel. */
6593 prot_seq[0] = NULLB;
6594 index_prot = 1;
6595 for (index=ABS(frame)-1; index<nt_length-2; index += CODON_LENGTH)
6596 {
6597 codon[0] = query_seq[index];
6598 codon[1] = query_seq[index+1];
6599 codon[2] = query_seq[index+2];
6600 residue = AAForCodon(codon, genetic_code);
6601 new_residue = SeqMapTableConvert(smtp, residue);
6602 if (IS_residue(new_residue))
6603 {
6604 prot_seq[index_prot] = new_residue;
6605 }
6606 index_prot++;
6607 }
6608 prot_seq[index_prot] = NULLB;
6609 *length = index_prot-1;
6610
6611 return prot_seq;
6612 }
6613
6614
6615 /*************************************************************************
6616 *
6617 * MaskTheResidues masks up to max_length residues in buffer.
6618 * The residue to be used for masking (generally 'N' for nucleotides
6619 * and 'X' for proteins) is mask_residue. offset tells how far
6620 * along the sequence the first residue in buffer is. mask_slp
6621 * specifies which parts of the sequence to mask. 'max_length is
6622 * the total length of the sequence.
6623 *
6624 *************************************************************************/
6625
6626 void
6627 BlastMaskTheResidues(Uint1Ptr buffer, Int4 max_length, Uint1 mask_residue, SeqLocPtr mask_slp, Boolean reverse, Int4 offset)
6628
6629 {
6630 SeqLocPtr slp=NULL;
6631 Int4 index, start, stop;
6632
6633 while (mask_slp)
6634 {
6635 slp=NULL;
6636 while((slp = SeqLocFindNext(mask_slp, slp))!=NULL)
6637 {
6638 if (reverse)
6639 {
6640 start = max_length - 1 - SeqLocStop(slp);
6641 stop = max_length - 1 - SeqLocStart(slp);
6642 }
6643 else
6644 {
6645 start = SeqLocStart(slp);
6646 stop = SeqLocStop(slp);
6647 }
6648
6649 start -= offset;
6650 stop -= offset;
6651
6652 for (index=start; index<=stop; index++)
6653 {
6654 buffer[index] = mask_residue;
6655 }
6656 }
6657 mask_slp = mask_slp->next;
6658 }
6659
6660 }
6661
6662 /*
6663 COnverts a protein (translated) SeqLocPtr from the protein
6664 coordinates to the nucl. coordinates.
6665
6666 Only works on a SeqLocPtr of type SeqIntPtr right now.
6667 */
6668
6669 Boolean
6670 BlastConvertProteinSeqLoc(SeqLocPtr slp, Int2 frame, Int4 full_length)
6671
6672 {
6673 SeqIntPtr seq_int;
6674 Int4 from, to;
6675
6676 if (slp == NULL)
6677 return TRUE;
6678
6679 if (slp->choice == SEQLOC_PACKED_INT)
6680 slp = slp->data.ptrvalue;
6681
6682 while (slp)
6683 {
6684 if (slp->choice != SEQLOC_INT)
6685 return FALSE;
6686
6687 seq_int = slp->data.ptrvalue;
6688 from = seq_int->from;
6689 to = seq_int->to;
6690
6691 if (frame < 0)
6692 {
6693 seq_int->to = full_length - CODON_LENGTH*from + frame;
6694 seq_int->from = full_length - CODON_LENGTH*to + frame + 1;
6695 seq_int->strand = Seq_strand_minus;
6696 }
6697 else
6698 {
6699 seq_int->from = CODON_LENGTH*from + frame - 1;
6700 seq_int->to = CODON_LENGTH*to + frame - 1;
6701 seq_int->strand = Seq_strand_plus;
6702 }
6703 slp = slp->next;
6704 }
6705
6706 return TRUE;
6707 }
6708
6709 /*
6710 COnverts a DNA SeqLocPtr from the nucl. coordinates to
6711 the protein (translated) coordinates.
6712 Only works on a SeqLocPtr of type SEQLOC_INT or SEQLOC_PACKED_INT right now.
6713 */
6714
6715 Boolean
6716 BlastConvertDNASeqLoc(SeqLocPtr slp, Int2 frame, Int4 full_length)
6717 {
6718 SeqIntPtr seq_int;
6719 Int4 from, to;
6720
6721 if (slp == NULL)
6722 return TRUE;
6723
6724 if (slp->choice == SEQLOC_PACKED_INT)
6725 slp = slp->data.ptrvalue;
6726
6727 while (slp) {
6728 if (slp->choice != SEQLOC_INT)
6729 return FALSE;
6730
6731 seq_int = slp->data.ptrvalue;
6732 from = seq_int->from;
6733 to = seq_int->to;
6734
6735 if (frame < 0) {
6736 seq_int->from = (full_length + frame - to)/CODON_LENGTH;
6737 seq_int->to = (full_length + frame - from)/CODON_LENGTH;
6738 seq_int->strand = Seq_strand_minus;
6739 } else {
6740 seq_int->from = (from - frame + 1)/CODON_LENGTH;
6741 seq_int->to = (to-frame + 1)/CODON_LENGTH;
6742 seq_int->strand = Seq_strand_plus;
6743 }
6744 slp = slp->next;
6745 }
6746
6747 return TRUE;
6748 }
6749
6750 SeqLocPtr
6751 BioseqSegEx(BioseqPtr bsp_unfilter, CharPtr options)
6752
6753 {
6754 BioseqPtr bsp_filter;
6755 Boolean mask_state;
6756 Char cmd_buf[2*PATH_MAX], temp_file[PATH_MAX];
6757 CharPtr filter_dir;
6758 Int4 index, mask_begin=0;
6759 SeqEntryPtr sep;
6760 SeqLocPtr slp_mask;
6761 SeqPortPtr spp_filter, spp_unfilter;
6762 Uint1 res_filter, res_unfilter;
6763 FILE *fp;
6764
6765
6766 if (bsp_unfilter == NULL)
6767 return NULL;
6768
6769 #ifdef OS_UNIX
6770
6771 TmpNam(temp_file);
6772 fp = FileOpen(temp_file, "w");
6773 if (BioseqToFasta(bsp_unfilter, fp, FALSE) == FALSE)
6774 {
6775 BioseqUnlock(bsp_unfilter);
6776 FileClose(fp);
6777 return NULL;
6778 }
6779 FileClose(fp);
6780
6781 filter_dir = getenv("BLASTFILTER");
6782 if (filter_dir == NULL)
6783 filter_dir = BLASTFILTER_DIR;
6784
6785 if (options != NULL)
6786 sprintf(cmd_buf, "%s%s%s%s %s%s", filter_dir, DIRDELIMSTR, "seg ", temp_file, options, " -x");
6787 else
6788 sprintf(cmd_buf, "%s%s%s%s%s", filter_dir, DIRDELIMSTR, "seg ", temp_file, " -x");
6789
6790 fp = popen(cmd_buf, "r");
6791 if (fp == NULL)
6792 {
6793 ErrPostEx(SEV_WARNING, 0, 0, "Call to seg failed.");
6794 return NULL;
6795 }
6796
6797 sep = FastaToSeqEntry(fp, FALSE);
6798 FileClose(fp);
6799 if (sep == NULL)
6800 {
6801 ErrPostEx(SEV_WARNING, 0, 0, "Call to seg failed.");
6802 return NULL;
6803 }
6804 bsp_filter = sep->data.ptrvalue;
6805
6806 spp_filter = SeqPortNew(bsp_filter, 0, -1, Seq_strand_plus, Seq_code_ncbistdaa);
6807 spp_unfilter = SeqPortNew(bsp_unfilter, 0, -1, Seq_strand_plus, Seq_code_ncbistdaa);
6808
6809 mask_state = FALSE;
6810 index = 0;
6811 slp_mask = NULL;
6812 while ((res_filter=SeqPortGetResidue(spp_filter)) != SEQPORT_EOF)
6813 {
6814 res_unfilter=SeqPortGetResidue(spp_unfilter);
6815 if (res_filter != res_unfilter)
6816 {
6817 if (mask_state == FALSE)
6818 {
6819 mask_begin = index;
6820 mask_state = TRUE;
6821 }
6822 }
6823 else if (mask_state == TRUE)
6824 {
6825 ValNodeLink(&slp_mask, SeqLocIntNew(mask_begin, index-1, Seq_strand_plus, bsp_filter->id));
6826 mask_state = FALSE;
6827 }
6828 index++;
6829 }
6830
6831 /* If the last portion of the sequence was masked. */
6832 if (mask_state == TRUE)
6833 {
6834 ValNodeLink(&slp_mask, SeqLocIntNew(mask_begin, index-1, Seq_strand_plus, bsp_filter->id));
6835 }
6836
6837 sep = SeqEntryFree(sep);
6838 SeqPortFree(spp_filter);
6839 SeqPortFree(spp_unfilter);
6840
6841 pclose(fp);
6842 FileRemove(temp_file);
6843
6844 return slp_mask;
6845 #else
6846 return NULL;
6847 #endif
6848 }
6849
6850 /*
6851 Runs seg and obtains a SeqLocPtr from it.
6852 */
6853 static SeqLocPtr
6854 SeqLocSegEx(SeqLocPtr slp, CharPtr instructions)
6855
6856 {
6857 BioseqPtr bsp_unfilter;
6858 SeqLocPtr slp_mask;
6859 SeqIdPtr sip;
6860
6861
6862 if (slp == NULL)
6863 return NULL;
6864
6865 sip = SeqIdFindBest(SeqLocId(slp), SEQID_GI);
6866 bsp_unfilter = BioseqLockById(sip);
6867 slp_mask = BioseqSegEx(bsp_unfilter, instructions);
6868
6869 BioseqUnlock(bsp_unfilter);
6870
6871 return slp_mask;
6872 }
6873
6874 SeqLocPtr
6875 SeqLocSeg(SeqLocPtr slp)
6876
6877 {
6878 return SeqLocSegEx(slp, NULL);
6879 }
6880
6881 SeqLocPtr
6882 MyBioseqSeg(BioseqPtr bsp_unfilter)
6883
6884 {
6885 return BioseqSegEx(bsp_unfilter, NULL);
6886 }
6887
6888 #define BLASTSEQLOC_BUFFER_SIZE 128
6889
6890 Boolean
6891 parse_blast_options(BLAST_OptionsBlkPtr options, CharPtr string_options,
6892 CharPtr PNTR error_message, CharPtr PNTR database,
6893 Int4Ptr descriptions, Int4Ptr alignments)
6894 {
6895 CharPtr opt_str = "GErqeWdyXZPAIvbYzcFsSpfwtgn", *values;
6896 Int4 index;
6897
6898 if (options == NULL)
6899 return FALSE;
6900
6901 if(!BlastParseInputString(string_options, opt_str, &values, error_message))
6902 {
6903 return FALSE;
6904 }
6905
6906 /* -G gap open cost */
6907
6908 index = BlastGetLetterIndex(opt_str, 'G');
6909 if(values[index] != NULL) {
6910 options->gap_open = atoi(values[index]);
6911 }
6912
6913 /* -E gap extend cost */
6914
6915 index = BlastGetLetterIndex(opt_str, 'E');
6916 if(values[index] != NULL) {
6917 options->gap_extend = atoi(values[index]);
6918 }
6919
6920 /* -q penalty for nucleotide mismatch. */
6921
6922 index = BlastGetLetterIndex(opt_str, 'q');
6923 if(values[index] != NULL) {
6924 options->penalty = atoi(values[index]);
6925 }
6926
6927 /* -r reward for nucleotide match. */
6928
6929 index = BlastGetLetterIndex(opt_str, 'r');
6930 if(values[index] != NULL) {
6931 options->reward = atoi(values[index]);
6932 }
6933
6934 /* -e expect value. */
6935
6936 index = BlastGetLetterIndex(opt_str, 'e');
6937 if(values[index] != NULL) {
6938 options->expect_value = atof(values[index]);
6939 }
6940
6941 /* -W wordsize. */
6942
6943 index = BlastGetLetterIndex(opt_str, 'W');
6944 if(values[index] != NULL) {
6945 options->wordsize = atoi(values[index]);
6946 }
6947
6948 /* -d database. */
6949 if (database) {
6950 index = BlastGetLetterIndex(opt_str, 'd');
6951 if(values[index] != NULL) {
6952 *database = values[index];
6953 values[index] = NULL;
6954 }
6955 }
6956
6957 /* -y Dropoff (X) for blast extensions in bits (default if zero) */
6958
6959 index = BlastGetLetterIndex(opt_str, 'y');
6960 if(values[index] != NULL) {
6961 options->dropoff_2nd_pass = atof(values[index]);
6962 }
6963
6964 /* -X X dropoff value for gapped alignment (in bits) */
6965
6966 index = BlastGetLetterIndex(opt_str, 'X');
6967 if(values[index] != NULL) {
6968 options->gap_x_dropoff = atof(values[index]);
6969 }
6970
6971 /* -Z final X dropoff value for gapped alignment (in bits) */
6972
6973 index = BlastGetLetterIndex(opt_str, 'Z');
6974 if(values[index] != NULL) {
6975 options->gap_x_dropoff_final = atof(values[index]);
6976 }
6977
6978 /* -P multiple hits/two-pass. */
6979
6980 index = BlastGetLetterIndex(opt_str, 'P');
6981 if(values[index] != NULL) {
6982 if (atoi(values[index]) == 0)
6983 {
6984 options->two_pass_method = FALSE;
6985 options->multiple_hits_only = TRUE;
6986 }
6987 else if (atoi(values[index]) == 1)
6988 {
6989 options->two_pass_method = FALSE;
6990 options->multiple_hits_only = FALSE;
6991 }
6992 else
6993 {
6994 options->two_pass_method = TRUE;
6995 options->multiple_hits_only = FALSE;
6996 }
6997 }
6998
6999 /* -A window size. */
7000
7001 index = BlastGetLetterIndex(opt_str, 'A');
7002 if(values[index] != NULL) {
7003 options->window_size = atoi(values[index]);
7004 }
7005
7006 /* -I Hitlist size */
7007 index = BlastGetLetterIndex(opt_str, 'I');
7008 if (values[index] != NULL)
7009 options->hitlist_size = atoi(values[index]);
7010
7011 /* -v Number of descriptions */
7012 if (descriptions) {
7013 *descriptions = -1;
7014 index = BlastGetLetterIndex(opt_str, 'v');
7015 if (values[index] != NULL) {
7016 *descriptions = atoi(values[index]);
7017 options->hitlist_size =
7018 MAX(options->hitlist_size, *descriptions);
7019 }
7020 }
7021
7022 /* -b Number of alignments */
7023 if (alignments) {
7024 *alignments = -1;
7025 index = BlastGetLetterIndex(opt_str, 'b');
7026 if (values[index] != NULL) {
7027 *alignments = atoi(values[index]);
7028 options->hitlist_size =
7029 MAX(options->hitlist_size, *alignments);
7030 }
7031 }
7032
7033 /* -Y Effective search space */
7034 index = BlastGetLetterIndex(opt_str, 'Y');
7035 if (values[index] != NULL)
7036 options->searchsp_eff = atof(values[index]);
7037
7038 /* -z Effective database length */
7039 index = BlastGetLetterIndex(opt_str, 'z');
7040 if (values[index] != NULL) {
7041 const char *dummy=NULL;
7042 options->db_length = StringToInt8(values[index], &dummy);
7043 }
7044
7045 /* -c Constant in pseudocounts for multipass version */
7046 index = BlastGetLetterIndex(opt_str, 'c');
7047 if (values[index] != NULL)
7048 options->pseudoCountConst = atoi(values[index]);
7049
7050 /* -F Filter string */
7051 index = BlastGetLetterIndex(opt_str, 'F');
7052 if (values[index] != NULL)
7053 options->filter_string = values[index];
7054
7055 /* -s Score cut off for megablast */
7056 index = BlastGetLetterIndex(opt_str, 's');
7057 if (values[index] != NULL)
7058 options->cutoff_s2 = atoi(values[index]);
7059
7060 /* -S Strand option */
7061 index = BlastGetLetterIndex(opt_str, 'S');
7062 if (values[index] != NULL)
7063 options->strand_option = (Uint1) atoi(values[index]);
7064
7065 /* -p Percentage of identity cut-off */
7066 index = BlastGetLetterIndex(opt_str, 'p');
7067 if (values[index] != NULL)
7068 options->perc_identity = (FloatLo) atof(values[index]);
7069
7070 /* -f threshold for hits */
7071
7072 index = BlastGetLetterIndex(opt_str, 'f');
7073 if(values[index] != NULL) {
7074 options->threshold_second = atoi(values[index]);
7075 }
7076
7077 /* -w Frame shift penalty (OOF algorithm for blastx) */
7078
7079 index = BlastGetLetterIndex(opt_str, 'w');
7080 if(values[index] != NULL) {
7081 options->shift_pen = atoi(values[index]);
7082 options->is_ooframe = TRUE;
7083 }
7084
7085 /* -t Discontiguous word template length for megablast;
7086 Longest intron length for sum statistics in tblastn */
7087
7088 index = BlastGetLetterIndex(opt_str, 't');
7089 if(values[index] != NULL) {
7090 if (options->is_megablast_search)
7091 options->mb_template_length = atoi(values[index]);
7092 else
7093 options->longest_intron = atoi(values[index]);
7094 }
7095
7096 /* -g Scan every base of the database for megablast */
7097
7098 index = BlastGetLetterIndex(opt_str, 'g');
7099 if(values[index] != NULL) {
7100 options->mb_one_base_step = (TO_UPPER(*values[index]) == 'T');
7101 }
7102
7103 /* -n Use dynamic programming algorithm in megablast for gapped
7104 extensions instead of greedy algorithm */
7105
7106 index = BlastGetLetterIndex(opt_str, 'n');
7107 if(values[index] != NULL) {
7108 options->mb_use_dyn_prog = (TO_UPPER(*values[index]) == 'T');
7109 }
7110
7111 values = MemFree(values);
7112
7113 return TRUE;
7114 }
7115
7116 static Boolean
7117 parse_dust_options(CharPtr ptr, Int4Ptr level, Int4Ptr window, Int4Ptr cutoff, Int4Ptr linker)
7118
7119 {
7120 Char buffer[BLASTSEQLOC_BUFFER_SIZE];
7121 Int4 arg, index, index1, window_pri=-1, linker_pri=-1, level_pri=-1, cutoff_pri=-1;
7122 long tmplong;
7123
7124 arg = 0;
7125 index1 = 0;
7126 for (index=0; index<BLASTSEQLOC_BUFFER_SIZE; index++)
7127 {
7128 if (*ptr == ' ' || *ptr == NULLB)
7129 {
7130 buffer[index1] = NULLB;
7131 index1 = 0;
7132 switch(arg) {
7133 case 0:
7134 sscanf(buffer, "%ld", &tmplong);
7135 level_pri = tmplong;
7136 break;
7137 case 1:
7138 sscanf(buffer, "%ld", &tmplong);
7139 window_pri = tmplong;
7140 break;
7141 case 2:
7142 sscanf(buffer, "%ld", &tmplong);
7143 cutoff_pri = tmplong;
7144 break;
7145 case 3:
7146 sscanf(buffer, "%ld", &tmplong);
7147 linker_pri = tmplong;
7148 break;
7149 default:
7150 break;
7151 }
7152
7153 arg++;
7154 while (*ptr == ' ')
7155 ptr++;
7156
7157 /* end of the buffer. */
7158 if (*ptr == NULLB)
7159 break;
7160 }
7161 else
7162 {
7163 buffer[index1] = *ptr; ptr++;
7164 index1++;
7165 }
7166 }
7167
7168 *level = level_pri;
7169 *window = window_pri;
7170 *cutoff = cutoff_pri;
7171 *linker = linker_pri;
7172
7173 return TRUE;
7174 }
7175
7176
7177 static Boolean
7178 parse_seg_options(CharPtr ptr, Int4Ptr window, FloatHiPtr locut, FloatHiPtr hicut)
7179
7180 {
7181 Char buffer[BLASTSEQLOC_BUFFER_SIZE];
7182 Int4 arg, index, index1;
7183 long tmplong;
7184 FloatHi tmpdouble;
7185
7186 arg = 0;
7187 index1 = 0;
7188 for (index=0; index<BLASTSEQLOC_BUFFER_SIZE; index++)
7189 {
7190 if (*ptr == ' ' || *ptr == NULLB)
7191 {
7192 buffer[index1] = NULLB;
7193 index1 = 0;
7194 switch(arg) {
7195 case 0:
7196 sscanf(buffer, "%ld", &tmplong);
7197 *window = tmplong;
7198 break;
7199 case 1:
7200 sscanf(buffer, "%le", &tmpdouble);
7201 *locut = tmpdouble;
7202 break;
7203 case 2:
7204 sscanf(buffer, "%le", &tmpdouble);
7205 *hicut = tmpdouble;
7206 break;
7207 default:
7208 break;
7209 }
7210
7211 arg++;
7212 while (*ptr == ' ')
7213 ptr++;
7214
7215 /* end of the buffer. */
7216 if (*ptr == NULLB)
7217 break;
7218 }
7219 else
7220 {
7221 buffer[index1] = *ptr; ptr++;
7222 index1++;
7223 }
7224 }
7225
7226 return TRUE;
7227 }
7228
7229 static Boolean
7230 parse_cc_options(CharPtr ptr, Int4Ptr window, FloatHiPtr cutoff, Int4Ptr linker)
7231
7232 {
7233 Char buffer[BLASTSEQLOC_BUFFER_SIZE];
7234 Int4 arg, index, index1;
7235 long tmplong;
7236 FloatHi tmpdouble;
7237
7238 arg = 0;
7239 index1 = 0;
7240 for (index=0; index<BLASTSEQLOC_BUFFER_SIZE; index++)
7241 {
7242 if (*ptr == ' ' || *ptr == NULLB)
7243 {
7244 buffer[index1] = NULLB;
7245 index1 = 0;
7246 switch(arg) {
7247 case 0:
7248 sscanf(buffer, "%ld", &tmplong);
7249 *window = tmplong;
7250 break;
7251 case 1:
7252 sscanf(buffer, "%le", &tmpdouble);
7253 *cutoff = tmpdouble;
7254 break;
7255 case 2:
7256 sscanf(buffer, "%ld", &tmplong);
7257 *linker = tmplong;
7258 break;
7259 default:
7260 break;
7261 }
7262
7263 arg++;
7264 while (*ptr == ' ')
7265 ptr++;
7266
7267 /* end of the buffer. */
7268 if (*ptr == NULLB)
7269 break;
7270 }
7271 else
7272 {
7273 buffer[index1] = *ptr; ptr++;
7274 index1++;
7275 }
7276 }
7277
7278 return TRUE;
7279 }
7280
7281 CharPtr
7282 load_options_to_buffer(CharPtr instructions, CharPtr buffer)
7283 {
7284 Boolean not_started=TRUE;
7285 CharPtr buffer_ptr, ptr;
7286 Int4 index;
7287
7288 ptr = instructions;
7289 buffer_ptr = buffer;
7290 for (index=0; index<BLASTSEQLOC_BUFFER_SIZE && *ptr != NULLB; index++)
7291 {
7292 if (*ptr == ';')
7293 {
7294 ptr++;
7295 break;
7296 }
7297 /* Remove blanks at the beginning. */
7298 if (not_started && *ptr == ' ')
7299 {
7300 ptr++;
7301 }
7302 else
7303 {
7304 not_started = FALSE;
7305 *buffer_ptr = *ptr;
7306 buffer_ptr++; ptr++;
7307 }
7308 }
7309
7310 *buffer_ptr = NULLB;
7311
7312 if (not_started == FALSE)
7313 { /* Remove trailing blanks. */
7314 buffer_ptr--;
7315 while (*buffer_ptr == ' ' && buffer_ptr > buffer)
7316 {
7317 *buffer_ptr = NULLB;
7318 buffer_ptr--;
7319 }
7320 }
7321
7322 return ptr;
7323 }
7324
7325 #define CC_WINDOW 22
7326 #define CC_CUTOFF 40.0
7327 #define CC_LINKER 32
7328
7329 /*
7330 This function parses the 'instructions' string and then calls the appopriate
7331 filtering functions.
7332 */
7333 SeqLocPtr
7334 BlastBioseqFilter(BioseqPtr bsp, CharPtr instructions)
7335
7336 {
7337 return BlastBioseqFilterEx(bsp, instructions, NULL);
7338 }
7339
7340 SeqLocPtr
7341 BlastBioseqFilterEx(BioseqPtr bsp, CharPtr instructions, BoolPtr mask_at_hash)
7342
7343 {
7344 SeqLocPtr slp = NULL;
7345 SeqLocPtr slp_mask;
7346
7347 ValNodeAddPointer(&slp, SEQLOC_WHOLE,
7348 SeqIdDup(SeqIdFindBest(bsp->id, SEQID_GI)));
7349 slp_mask = BlastSeqLocFilterEx(slp, instructions, mask_at_hash);
7350 slp = SeqLocFree(slp);
7351 return slp_mask;
7352 }
7353
7354 SeqLocPtr
7355 BlastSeqLocFilter(SeqLocPtr slp, CharPtr instructions)
7356
7357 {
7358 return BlastSeqLocFilterEx(slp, instructions, NULL);
7359 }
7360
7361 SeqLocPtr
7362 BlastSeqLocFilterEx(SeqLocPtr slp, CharPtr instructions, BoolPtr mask_at_hash)
7363
7364 {
7365 BioseqPtr bsp;
7366 BLAST_OptionsBlkPtr repeat_options, vs_options;
7367 Boolean do_all=FALSE, do_seg=FALSE, do_coil_coil=FALSE, do_dust=FALSE, do_repeats=FALSE, do_vecscreen=FALSE;
7368 Boolean myslp_allocated;
7369 CharPtr buffer=NULL;
7370 CharPtr ptr, repeat_database=NULL, vs_database=NULL, error_msg;
7371 Int2 seqloc_num;
7372 Int4 window_cc, linker_cc, window_dust, level_dust, minwin_dust, linker_dust;
7373 SeqLocPtr cc_slp=NULL, dust_slp=NULL, seg_slp=NULL, seqloc_head=NULL, repeat_slp=NULL, vs_slp=NULL;
7374 PccDatPtr pccp;
7375 Nlm_FloatHiPtr scores;
7376 Nlm_FloatHi cutoff_cc;
7377 SegParamsPtr sparamsp=NULL;
7378 SeqAlignPtr seqalign;
7379 SeqIdPtr sip;
7380 SeqLocPtr myslp, seqloc_var, seqloc_tmp;
7381 ValNodePtr vnp=NULL, vnp_var;
7382
7383 cutoff_cc = CC_CUTOFF;
7384
7385 if (instructions == NULL || StringICmp(instructions, "F") == 0)
7386 return NULL;
7387
7388 /* FALSE is the default right now. */
7389 if (mask_at_hash)
7390 *mask_at_hash = FALSE;
7391
7392 /* parameters for dust. */
7393 /* -1 indicates defaults. */
7394 level_dust = -1;
7395 window_dust = -1;
7396 minwin_dust = -1;
7397 linker_dust = -1;
7398 if (StringICmp(instructions, "T") == 0)
7399 { /* do_all actually means seg for proteins and dust for nt. */
7400 do_all = TRUE;
7401 }
7402 else
7403 {
7404 buffer = MemNew(StringLen(instructions)*sizeof(Char));
7405 ptr = instructions;
7406 /* allow old-style filters when m cannot be followed by the ';' */
7407 if (*ptr == 'm' && ptr[1] == ' ')
7408 {
7409 if (mask_at_hash)
7410 *mask_at_hash = TRUE;
7411 ptr += 2;
7412 }
7413 while (*ptr != NULLB)
7414 {
7415 if (*ptr == 'S')
7416 {
7417 sparamsp = SegParamsNewAa();
7418 sparamsp->overlaps = TRUE; /* merge overlapping segments. */
7419 ptr = load_options_to_buffer(ptr+1, buffer);
7420 if (buffer[0] != NULLB)
7421 {
7422 parse_seg_options(buffer, &sparamsp->window, &sparamsp->locut, &sparamsp->hicut);
7423 }
7424 do_seg = TRUE;
7425 }
7426 else if (*ptr == 'C')
7427 {
7428 ptr = load_options_to_buffer(ptr+1, buffer);
7429 window_cc = CC_WINDOW;
7430 cutoff_cc = CC_CUTOFF;
7431 linker_cc = CC_LINKER;
7432 if (buffer[0] != NULLB)
7433 parse_cc_options(buffer, &window_cc, &cutoff_cc, &linker_cc);
7434 do_coil_coil = TRUE;
7435 }
7436 else if (*ptr == 'D')
7437 {
7438 ptr = load_options_to_buffer(ptr+1, buffer);
7439 if (buffer[0] != NULLB)
7440 parse_dust_options(buffer, &level_dust, &window_dust, &minwin_dust, &linker_dust);
7441 do_dust = TRUE;
7442 }
7443 else if (*ptr == 'R')
7444 {
7445 repeat_options = BLASTOptionNew("blastn", TRUE);
7446 repeat_options->expect_value = 0.1;
7447 repeat_options->penalty = -1;
7448 repeat_options->wordsize = 11;
7449 repeat_options->gap_x_dropoff_final = 90;
7450 repeat_options->dropoff_2nd_pass = 40;
7451 repeat_options->gap_open = 2;
7452 repeat_options->gap_extend = 1;
7453 ptr = load_options_to_buffer(ptr+1, buffer);
7454 if (buffer[0] != NULLB)
7455 parse_blast_options(repeat_options,
7456 buffer, &error_msg, &repeat_database,
7457 NULL, NULL);
7458 if (repeat_database == NULL)
7459 repeat_database = StringSave("humlines.lib humsines.lib retrovir.lib");
7460 do_repeats = TRUE;
7461 }
7462 else if (*ptr == 'V')
7463 {
7464 vs_options = VSBlastOptionNew();
7465 ptr = load_options_to_buffer(ptr+1, buffer);
7466 if (buffer[0] != NULLB)
7467 parse_blast_options(vs_options, buffer,
7468 &error_msg, &vs_database, NULL, NULL);
7469 vs_options = BLASTOptionDelete(vs_options);
7470 if (vs_database == NULL)
7471 vs_database = StringSave("UniVec_Core");
7472 do_vecscreen = TRUE;
7473 }
7474 else if (*ptr == 'L')
7475 { /* do low-complexity filtering; dust for blastn, otherwise seg.*/
7476 do_all = TRUE;
7477 ptr++;
7478 }
7479 else if (*ptr == 'm')
7480 {
7481 if (mask_at_hash)
7482 *mask_at_hash = TRUE;
7483 ptr++;
7484 }
7485 else
7486 { /* Nothing applied. */
7487 ptr++;
7488 }
7489 }
7490 buffer = MemFree(buffer);
7491 }
7492
7493 seqloc_num = 0;
7494 seqloc_head = NULL;
7495 sip = SeqLocId(slp);
7496 bsp = BioseqLockById(SeqIdFindBest(sip, SEQID_GI));
7497 if (ISA_aa(bsp->mol))
7498 {
7499 if (do_all || do_seg)
7500 {
7501 seg_slp = SeqlocSegAa(slp, sparamsp);
7502 SegParamsFree(sparamsp);
7503 sparamsp = NULL;
7504 seqloc_num++;
7505 }
7506 if (do_coil_coil)
7507 {
7508 pccp = PccDatNew ();
7509 pccp->window = window_cc;
7510 ReadPccData (pccp);
7511 /*scores = PredictCCBioseq(bsp, 0, bsp->length-1, pccp);*/
7512 scores = PredictCCSeqLoc(slp, pccp);
7513 cc_slp = FilterCC(scores, cutoff_cc, SeqLocLen(slp), linker_cc, SeqIdDup(sip), FALSE);
7514 MemFree(scores);
7515 PccDatFree (pccp);
7516 seqloc_num++;
7517 }
7518 }
7519 else
7520 {
7521 if (do_all || do_dust)
7522 {
7523 dust_slp = SeqLocDustEx(slp, level_dust, window_dust, linker_dust);
7524 seqloc_num++;
7525 }
7526 if (do_repeats)
7527 {
7528 /* Either the SeqLocPtr is SEQLOC_WHOLE (both strands) or SEQLOC_INT (probably
7529 one strand). In that case we make up a double-stranded one as we wish to look at both strands. */
7530 myslp_allocated = FALSE;
7531 if (slp->choice == SEQLOC_INT)
7532 {
7533 myslp = SeqLocIntNew(SeqLocStart(slp), SeqLocStop(slp), Seq_strand_both, SeqLocId(slp));
7534 myslp_allocated = TRUE;
7535 }
7536 else
7537 {
7538 myslp = slp;
7539 }
7540 start_timer;
7541 repeat_slp = BioseqHitRangeEngineByLoc(myslp, "blastn", repeat_database, repeat_options, NULL, NULL, NULL, NULL, NULL, 0);
7542 stop_timer("after repeat filtering");
7543 repeat_options = BLASTOptionDelete(repeat_options);
7544 repeat_database = MemFree(repeat_database);
7545 if (myslp_allocated)
7546 SeqLocFree(myslp);
7547 seqloc_num++;
7548 }
7549 if (do_vecscreen)
7550 {
7551 /* Either the SeqLocPtr is SEQLOC_WHOLE (both strands) or SEQLOC_INT (probably
7552 one strand). In that case we make up a double-stranded one as we wish to look at both strands. */
7553 myslp_allocated = FALSE;
7554 if (slp->choice == SEQLOC_INT)
7555 {
7556 myslp = SeqLocIntNew(SeqLocStart(slp), SeqLocStop(slp), Seq_strand_both, SeqLocId(slp));
7557 myslp_allocated = TRUE;
7558 }
7559 else
7560 {
7561 myslp = slp;
7562 }
7563 VSScreenSequenceByLoc(myslp, NULL, vs_database, &seqalign, &vnp, NULL, NULL);
7564 vnp_var = vnp;
7565 while (vnp_var)
7566 {
7567 seqloc_tmp = vnp_var->data.ptrvalue;
7568 if (vs_slp == NULL)
7569 {
7570 vs_slp = seqloc_tmp;
7571 }
7572 else
7573 {
7574 seqloc_var = vs_slp;
7575 while (seqloc_var->next)
7576 seqloc_var = seqloc_var->next;
7577 seqloc_var->next = seqloc_tmp;
7578 }
7579 vnp_var->data.ptrvalue = NULL;
7580 vnp_var = vnp_var->next;
7581 }
7582 vnp = ValNodeFree(vnp);
7583 seqalign = SeqAlignSetFree(seqalign);
7584 vs_database = MemFree(vs_database);
7585 if (myslp_allocated)
7586 SeqLocFree(myslp);
7587 seqloc_num++;
7588 }
7589 }
7590
7591 if (seqloc_num == 0)
7592 { /* nothing. */
7593 ;
7594 }
7595 else if (seqloc_num == 1)
7596 {
7597 if (seg_slp)
7598 seqloc_head = seg_slp;
7599 if (cc_slp)
7600 seqloc_head = cc_slp;
7601 if (dust_slp)
7602 seqloc_head = dust_slp;
7603 if (repeat_slp)
7604 seqloc_head = repeat_slp;
7605 if (vs_slp)
7606 seqloc_head = vs_slp;
7607 }
7608 else
7609 {
7610 if (seg_slp)
7611 ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, seg_slp);
7612 if (cc_slp)
7613 ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, cc_slp);
7614 if (dust_slp)
7615 ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, dust_slp);
7616 if (repeat_slp)
7617 ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, repeat_slp);
7618 if (vs_slp)
7619 ValNodeAddPointer(&seqloc_head, SEQLOC_MIX, vs_slp);
7620 }
7621
7622 BioseqUnlock(bsp);
7623 return seqloc_head;
7624 }
7625
7626 /*
7627 Program to run seg on a sequence. Note that this program only
7628 really works in UNIX systems.
7629 */
7630 Boolean LIBCALL
7631 FilterWithSeg (Uint1Ptr sequence, Int4 length, Uint1 alphabet)
7632
7633 {
7634
7635 #ifdef OS_UNIX
7636
7637 BioseqPtr bsp;
7638 Char cmd_buf[2*PATH_MAX], temp_file[PATH_MAX];
7639 CharPtr filter_dir;
7640 FILE PNTR fp;
7641 Int4 byte_store_length;
7642 Nlm_ByteStorePtr byte_store;
7643 SeqEntryPtr sep;
7644
7645 if (sequence == NULL || length == 0)
7646 return FALSE;
7647
7648 byte_store = Nlm_BSNew(length);
7649
7650 byte_store_length = Nlm_BSWrite(byte_store, (VoidPtr) sequence, length);
7651 if (length != byte_store_length)
7652 {
7653 Nlm_BSDelete(byte_store, length);
7654 return FALSE;
7655 }
7656
7657 bsp = BioseqNew();
7658 bsp->seq_data = (SeqDataPtr) byte_store;
7659 bsp->length = length;
7660 bsp->seq_data_type = alphabet;
7661 bsp->mol = Seq_mol_aa;
7662 bsp->repr = Seq_repr_raw;
7663
7664 TmpNam(temp_file);
7665 fp = FileOpen(temp_file, "w");
7666 if (BioseqToFasta(bsp, fp, FALSE) == FALSE)
7667 {
7668 bsp = BioseqFree(bsp);
7669 return FALSE;
7670 }
7671 FileClose(fp);
7672
7673 bsp = BioseqFree(bsp);
7674
7675 filter_dir = getenv("BLASTFILTER");
7676 if (filter_dir != NULL)
7677 sprintf(cmd_buf, "%s%s%s%s%s", filter_dir, DIRDELIMSTR, "seg ", temp_file, " -x");
7678 else
7679 sprintf(cmd_buf, "%s%s%s%s%s", BLASTFILTER_DIR, DIRDELIMSTR, "seg ", temp_file, " -x");
7680
7681 fp = popen(cmd_buf, "r");
7682 if (fp == NULL)
7683 {
7684 ErrPostEx(SEV_WARNING, 0, 0, "Call to seg failed.");
7685 return FALSE;
7686 }
7687
7688 sep = FastaToSeqEntry(fp, FALSE);
7689 if (sep == NULL)
7690 {
7691 ErrPostEx(SEV_WARNING, 0, 0, "Call to seg failed.");
7692 return FALSE;
7693 }
7694
7695 pclose(fp);
7696
7697 bsp = sep->data.ptrvalue;
7698 BioseqRawConvert(bsp, Seq_code_ncbistdaa);
7699
7700 BSSeek((ByteStorePtr) bsp->seq_data, 0, SEEK_SET);
7701 Nlm_BSRead((ByteStorePtr) bsp->seq_data, (VoidPtr) sequence, length);
7702
7703 SeqEntryFree(sep);
7704
7705 FileRemove(temp_file);
7706
7707 return TRUE;
7708 #else
7709 return FALSE;
7710 #endif
7711 }
7712
7713
7714 BLAST_HSPPtr BLAST_HSPFree(BLAST_HSPPtr hsp)
7715 {
7716 if (hsp)
7717 hsp->gap_info = GapXEditBlockDelete(hsp->gap_info);
7718
7719 return (BLAST_HSPPtr) MemFree(hsp);
7720 }
7721
7722 /*
7723 Frees memory used for HSP's on the ResultHitlist.
7724 Should be called as the SeqAlignPtr for a hitlist
7725 is produced to save memory.
7726 */
7727
7728 void
7729 BLASTResultFreeHsp(BLASTResultHitlistPtr result)
7730
7731 {
7732 BLASTResultHspPtr hsp;
7733 Int4 index;
7734
7735 if (result == NULL || result->hsp_array == NULL)
7736 return;
7737
7738 for(index=0; index < result->hspcnt; index++) {
7739 hsp = &result->hsp_array[index];
7740 if (hsp)
7741 hsp->gap_info = GapXEditBlockDelete(hsp->gap_info);
7742 }
7743
7744 if (result->hspcnt != 0)
7745 result->hsp_array = MemFree(result->hsp_array);
7746
7747 result->hspcnt = 0;
7748
7749 return;
7750 }
7751
7752 /*
7753 Free's the hitlist without performing a check
7754 on the integrity of the heap (used for culling).
7755 */
7756 BLASTResultHitlistPtr LIBCALL
7757 BLASTResultHitlistFree(BLASTResultHitlistPtr result)
7758
7759 {
7760 return BLASTResultHitlistFreeEx(NULL, result);
7761
7762 }
7763
7764
7765 BLASTResultHitlistPtr LIBCALL
7766 BLASTResultHitlistFreeEx(BlastSearchBlkPtr search, BLASTResultHitlistPtr result)
7767
7768 {
7769 BLASTHeapPtr hp;
7770 Int4 index;
7771 register Int4 subject_id;
7772
7773 if (result == NULL)
7774 return NULL;
7775
7776
7777 /*
7778 Check the integrity of the heap used for culling. Occassionally
7779 HSP's that have been saved (in the heap before the start of
7780 the HSP) are missed.
7781 Only do this if the BlastSearchBlkPtr was provided.
7782 */
7783 if (search && search->pbp->perform_culling == TRUE && result->num_ref > 0)
7784 {
7785 subject_id = result->subject_id;
7786
7787 /* result->num_ref can change in the loop. */
7788 for (hp = search->result_struct->heap_ptr; hp && result->num_ref>0; hp = hp->next)
7789 {
7790 index=0; /* Note that hp->num_in_heap can change in the loop */
7791 while (index < hp->num_in_heap)
7792 {
7793 if (hp->heap[index]->point_back->subject_id == subject_id)
7794 {
7795 BlastDeleteHeap(hp, index);
7796 }
7797 else
7798 index++;
7799 }
7800 }
7801 }
7802
7803 /* In case it was not freed before. */
7804 BLASTResultFreeHsp(result);
7805
7806 BLASTSubjectInfoDestruct(result->subject_info);
7807
7808 result = MemFree(result);
7809
7810 return result;
7811 }
7812
7813 /*
7814 Creates a new BLASTResultHitlist, with the an hsp-array of length hspcnt. If the
7815 allocation fails, then NULL is returned.
7816 */
7817
7818 BLASTResultHitlistPtr LIBCALL
7819 BLASTResultHitlistNew(Int4 hspcnt)
7820
7821 {
7822
7823 BLASTResultHitlistPtr new;
7824
7825 new = (BLASTResultHitlistPtr) MemNew(sizeof(BLASTResultHitlist));
7826 if (new == NULL)
7827 return NULL;
7828
7829 new->hsp_array = (BLASTResultHspPtr) MemNew(hspcnt*sizeof(BLASTResultHsp));
7830 if (new->hsp_array == NULL)
7831 {
7832 new = BLASTResultHitlistFree(new);
7833 return NULL;
7834 }
7835 new->hspcnt = hspcnt;
7836
7837 return new;
7838 }
7839
7840
7841 static Boolean
7842 CopyHSPToResultHsp(BLAST_KarlinBlkPtr kbp, BLAST_HSPPtr hsp, BLASTResultHspPtr result_hsp)
7843 {
7844 if (result_hsp == NULL || hsp == NULL)
7845 return FALSE;
7846
7847 result_hsp->ordering_method = hsp->ordering_method;
7848 result_hsp->number = hsp->num;
7849 result_hsp->score = hsp->score;
7850 result_hsp->bit_score = ((hsp->score*kbp->Lambda) - kbp->logK)/NCBIMATH_LN2;
7851 result_hsp->e_value = hsp->evalue;
7852 result_hsp->num_ident = hsp->num_ident;
7853 result_hsp->query_offset = hsp->query.offset;
7854 result_hsp->query_length = hsp->query.length;
7855 result_hsp->query_frame = hsp->query.frame;
7856 result_hsp->query_gapped_start = hsp->query.gapped_start;
7857 result_hsp->subject_offset = hsp->subject.offset;
7858 result_hsp->subject_length = hsp->subject.length;
7859 result_hsp->subject_frame = hsp->subject.frame;
7860 result_hsp->subject_gapped_start = hsp->subject.gapped_start;
7861 result_hsp->context = hsp->context;
7862 result_hsp->gap_info = hsp->gap_info;
7863 /* Not set in the other type of HSP? */
7864 result_hsp->hspset_cnt = 0;
7865
7866 return TRUE;
7867 }
7868
7869 Boolean LIBCALL
7870 CopyResultHspToHSP(BLASTResultHspPtr result_hsp, BLAST_HSPPtr hsp)
7871 {
7872 if (result_hsp == NULL || hsp == NULL)
7873 return FALSE;
7874
7875 hsp->ordering_method = result_hsp->ordering_method;
7876 hsp->num = result_hsp->number;
7877 hsp->score = result_hsp->score;
7878 hsp->evalue = result_hsp->e_value;
7879 hsp->num_ident = result_hsp->num_ident;
7880 hsp->query.offset = result_hsp->query_offset;
7881 hsp->query.length = result_hsp->query_length;
7882 hsp->query.end = result_hsp->query_offset + result_hsp->query_length;
7883 hsp->query.frame = result_hsp->query_frame;
7884 hsp->query.gapped_start = result_hsp->query_gapped_start;
7885 hsp->subject.offset = result_hsp->subject_offset;
7886 hsp->subject.length = result_hsp->subject_length;
7887 hsp->subject.end = result_hsp->subject_offset + result_hsp->subject_length;
7888 hsp->subject.frame = result_hsp->subject_frame;
7889 hsp->subject.gapped_start = result_hsp->subject_gapped_start;
7890 hsp->context = result_hsp->context;
7891
7892 return TRUE;
7893 }
7894
7895 /* Same as FillInStdSegInfo, only taking BLAST_HSPPtr argument instead of
7896 BlastResultHspPtr */
7897 StdSegPtr
7898 BLASTHspToStdSeg(BlastSearchBlkPtr search, Int4 subject_length, BLAST_HSPPtr hsp, SeqIdPtr sip, Boolean reverse, SeqIdPtr gi_list)
7899 {
7900 StdSegPtr ssp = NULL;
7901 BLASTResultHspPtr result_hsp =
7902 (BLASTResultHspPtr) Malloc(sizeof(BLASTResultHsp));
7903
7904 CopyHSPToResultHsp(search->sbp->kbp[search->first_context],
7905 hsp, result_hsp);
7906 ssp = FillInStdSegInfo(search, search->subject_id, subject_length, &ssp,
7907 result_hsp, sip, reverse, gi_list);
7908 MemFree(result_hsp);
7909 return ssp;
7910 }
7911
7912 /*
7913 Sort the HSP's by score.
7914 */
7915
7916 int LIBCALLBACK
7917 score_compare_hsps(VoidPtr v1, VoidPtr v2)
7918
7919 {
7920 BLAST_HSPPtr hsp1, hsp2; /* the HSPs to be compared */
7921 int result = 0; /* the result of the comparison */
7922
7923 hsp1 = *((BLAST_HSPPtr PNTR) v1);
7924 hsp2 = *((BLAST_HSPPtr PNTR) v2);
7925
7926 /* Null HSPs are "greater" than any non-null ones, so they go to the end
7927 of a sorted list. */
7928 if (!hsp1 && !hsp2)
7929 return 0;
7930 else if (!hsp1)
7931 return 1;
7932 else if (!hsp2)
7933 return -1;
7934
7935 if (0 == (result = BLAST_CMP(hsp2->score, hsp1->score)) &&
7936 0 == (result = BLAST_CMP(hsp1->subject.offset, hsp2->subject.offset)) &&
7937 0 == (result = BLAST_CMP(hsp2->subject.end, hsp1->subject.end)) &&
7938 0 == (result = BLAST_CMP(hsp1->query .offset, hsp2->query .offset))) {
7939 /* if all other test can't distinguish the HSPs, then the final
7940 test is the result */
7941 result = BLAST_CMP(hsp2->query.end, hsp1->query.end);
7942 }
7943 return result;
7944 }
7945
7946 /*
7947 Function to look for the highest scoring window (of size HSP_MAX_WINDOW)
7948 in an HSP and return the middle of this. Used by the gapped-alignment
7949 functions to start the gapped alignments.
7950 */
7951
7952 Int4 GetStartForGappedAlignment (BlastSearchBlkPtr search, BLAST_HSPPtr hsp, Uint1Ptr query, Uint1Ptr subject, Int4Ptr PNTR matrix)
7953 {
7954 Int4 index1, max_offset, score, max_score, hsp_end;
7955 Uint1Ptr query_var, subject_var;
7956 Boolean positionBased = (search->positionBased && search->sbp->posMatrix);
7957
7958 if (hsp->query.length <= HSP_MAX_WINDOW) {
7959 max_offset = hsp->query.offset + hsp->query.length/2;
7960 return max_offset;
7961 }
7962
7963 hsp_end = hsp->query.offset + HSP_MAX_WINDOW;
7964 query_var = query + hsp->query.offset;
7965 subject_var = subject + hsp->subject.offset;
7966 score=0;
7967 if (!positionBased) {
7968 for (index1=hsp->query.offset; index1<hsp_end; index1++) {
7969 score += matrix[*query_var][*subject_var];
7970 query_var++; subject_var++;
7971 }
7972 } else {
7973 for (index1=hsp->query.offset; index1<hsp_end; index1++) {
7974 score += search->sbp->posMatrix[index1][*subject_var];
7975 query_var++; subject_var++;
7976 }
7977 }
7978 max_score = score;
7979 max_offset = hsp_end - 1;
7980 hsp_end = hsp->query.end -
7981 MAX(0, hsp->query.length - hsp->subject.length);
7982 for (index1=hsp->query.offset + HSP_MAX_WINDOW; index1<hsp_end; index1++) {
7983 if (!positionBased) {
7984 score -= matrix[*(query_var-HSP_MAX_WINDOW)][*(subject_var-HSP_MAX_WINDOW)];
7985 score += matrix[*query_var][*subject_var];
7986 } else {
7987 score -= search->sbp->posMatrix[index1-HSP_MAX_WINDOW][*(subject_var-HSP_MAX_WINDOW)];
7988 score += search->sbp->posMatrix[index1][*subject_var];
7989 }
7990 if (score > max_score) {
7991 max_score = score;
7992 max_offset = index1;
7993 }
7994 query_var++; subject_var++;
7995 }
7996 if (max_score > 0)
7997 max_offset -= HSP_MAX_WINDOW/2;
7998 else
7999 max_offset = hsp->query.offset;
8000
8001 return max_offset;
8002 }
8003
8004 /*
8005 Check whether the starting point for gapped alignment lies in
8006 region that has positive score. This routine is called after a
8007 preliminary gapped alignment has been computed, but before the
8008 traceback is computed. The score of the region containing the
8009 starting point may have changed due to the introduction of
8010 ambiguity characters, further filtering of the sequences or the
8011 application of composition based statistics.
8012
8013 Usually, we check an ungapped alignment of length 11 about the
8014 starting point: 5 characters to the left and 5 to the right.
8015 However, the actual region checked is occasionally shorter because
8016 we don't check characters before the start, or after the end, of
8017 the preliminarily aligned regions in the query or subject.
8018 */
8019 Boolean
8020 CheckStartForGappedAlignment (BlastSearchBlkPtr search, BLAST_HSPPtr hsp,
8021 Uint1Ptr query, Uint1Ptr subject,
8022 Int4Ptr PNTR matrix)
8023 {
8024 Int4 left, right; /* Number of aligned characters to the
8025 left and right of the starting point */
8026 Int4 score; /* Score of the word alignment */
8027 Uint1Ptr subject_var; /* Current character in the subject sequence */
8028 Uint1Ptr subject_right; /* last character to be considered in the subject
8029 sequence */
8030 Boolean positionBased =
8031 (search->positionBased && search->sbp->posMatrix);
8032
8033 /* Compute the number of characters to the left of the start
8034 to include in the word */
8035 left = -HSP_MAX_WINDOW/2;
8036 if (left < hsp->query.offset - hsp->query.gapped_start) {
8037 left = hsp->query.offset - hsp->query.gapped_start;
8038 }
8039 if (left < hsp->subject.offset - hsp->subject.gapped_start) {
8040 left = hsp->subject.offset - hsp->subject.gapped_start;
8041 }
8042
8043 /* Compute the number of characters to right to include in the word,
8044 including the starting point itself. */
8045 right = HSP_MAX_WINDOW/2 + 1;
8046 if (right > hsp->query.end - hsp->query.gapped_start) {
8047 right = hsp->query.end - hsp->query.gapped_start;
8048 }
8049 if (right > hsp->subject.end - hsp->subject.gapped_start) {
8050 right = hsp->subject.end - hsp->subject.gapped_start;
8051 }
8052
8053 /* Calculate the score of the word */
8054 score = 0;
8055 subject_var = subject + hsp->subject.gapped_start + left;
8056 subject_right = subject + hsp->subject.gapped_start + right;
8057 if ( !positionBased ) {
8058 Uint1Ptr query_var; /* Current character in the query */
8059 query_var = query + hsp->query.gapped_start + left;
8060 for ( ; subject_var < subject_right; subject_var++, query_var++) {
8061 score += matrix[*query_var][*subject_var];
8062 }
8063 } else {
8064 Int4 query_index; /* Current position in the query */
8065 query_index = hsp->query.gapped_start + left;
8066 for ( ; subject_var < subject_right; subject_var++, query_index++) {
8067 score += search->sbp->posMatrix[query_index][*subject_var];
8068 }
8069 }
8070 if (score <= 0) {
8071 return FALSE;
8072 } else {
8073 return TRUE;
8074 }
8075 }
8076
8077
8078 /*
8079 Gets the ratio used to change an evalue calculated with the subject
8080 sequence length to one with a db length.
8081 */
8082
8083 Nlm_FloatHi LIBCALL
8084 GetDbSubjRatio(BlastSearchBlkPtr search, Int4 subject_length)
8085 {
8086 Nlm_FloatHi db_subj_ratio;
8087
8088 db_subj_ratio =
8089 ((Nlm_FloatHi) search->context_factor * search->dblen) /
8090 ((Nlm_FloatHi) subject_length);
8091 if (StringCmp(search->prog_name, "tblastn") == 0 ||
8092 StringCmp(search->prog_name, "tblastx") == 0 ||
8093 StringCmp(search->prog_name, "psitblastn") == 0)
8094 {
8095 db_subj_ratio *= 3;
8096 }
8097
8098 return db_subj_ratio;
8099 }
8100
8101 /* The following value should be divisible by 3, to make sure that frames stay
8102 the same when translations are restricted to partial sequence. */
8103 #define SUBJECT_ADJUSTMENT 2100
8104 SeqAlignPtr LIBCALL
8105 BlastGetGapAlgnTbckWithReaddb (BlastSearchBlkPtr search, Int4 hit_number, Boolean ordinal_number)
8106
8107 {
8108 BLASTResultHitlistPtr result_hitlist;
8109 BioseqPtr subject_bsp;
8110 Boolean subject_allocated = FALSE;
8111 Int4 index1, subject_length, rev_subject_length;
8112 Int4 subject_start, subject_end;
8113 Int4 hsp_count;
8114 BLASTResultHspPtr hsp_array;
8115 SeqAlignPtr seqalign;
8116 SeqPortPtr spp;
8117 Uint1Ptr subject, rev_subject;
8118
8119 result_hitlist = search->result_struct->results[hit_number];
8120
8121 if (StringCmp(search->prog_name, "tblastn") == 0 ||
8122 StringCmp(search->prog_name, "psitblastn") == 0)
8123 {
8124 subject_bsp = readdb_get_bioseq(search->rdfp, result_hitlist->subject_id);
8125 spp = SeqPortNew(subject_bsp, 0, -1, Seq_strand_plus, Seq_code_ncbi4na);
8126 /* make one longer to "protect" ALIGN. */
8127 subject = MemNew((1+subject_bsp->length)*sizeof(Uint1));
8128 hsp_array = result_hitlist->hsp_array;
8129 hsp_count = result_hitlist->hspcnt;
8130 for (index1=0; index1<hsp_count; index1++)
8131 {
8132 if (hsp_array[index1].subject_frame > 0)
8133 { /* Get subsequence corresponding to this hsp. */
8134 Int4 offset;
8135
8136 subject_start = 3*hsp_array[index1].subject_offset;
8137 subject_end = subject_start + 3*hsp_array[index1].subject_length;
8138
8139 /* add SUBJECT_ADJUSTMENT bases to either end. */
8140 subject_start = MAX(subject_start - SUBJECT_ADJUSTMENT, 0);
8141 subject_end = MIN(subject_end + SUBJECT_ADJUSTMENT, subject_bsp->length);
8142
8143 SeqPortSeek(spp, subject_start, SEEK_SET);
8144
8145 for (offset=subject_start; offset<subject_end; offset++)
8146 subject[offset] = SeqPortGetResidue(spp);
8147
8148 if (subject_start == 0 && subject_end == subject_bsp->length)
8149 break; /* entire sequence has been fetched. */
8150 }
8151 }
8152 /* Gap character in last space. */
8153 subject[subject_bsp->length] = NULLB;
8154 subject_length = subject_bsp->length;
8155 spp = SeqPortFree(spp);
8156
8157 spp = SeqPortNew(subject_bsp, 0, -1, Seq_strand_minus, Seq_code_ncbi4na);
8158 /* make one longer to "protect" ALIGN. */
8159 rev_subject = MemNew((1+subject_bsp->length)*sizeof(Uint1));
8160 hsp_array = result_hitlist->hsp_array;
8161 hsp_count = result_hitlist->hspcnt;
8162 for (index1=0; index1<hsp_count; index1++)
8163 {
8164 if (hsp_array[index1].subject_frame < 0)
8165 { /* Get subsequence corresponding to this hsp. */
8166 Int4 offset;
8167
8168 subject_start = 3*hsp_array[index1].subject_offset;
8169 subject_end = subject_start + 3*hsp_array[index1].subject_length;
8170
8171 /* add SUBJECT_ADJUSTMENT bases to either end. */
8172 subject_start = MAX(subject_start - SUBJECT_ADJUSTMENT, 0);
8173 subject_end = MIN(subject_end + SUBJECT_ADJUSTMENT, subject_bsp->length);
8174
8175 SeqPortSeek(spp, subject_start, SEEK_SET);
8176
8177 for (offset=subject_start; offset<subject_end; offset++)
8178 rev_subject[offset] = SeqPortGetResidue(spp);
8179
8180 if (subject_start == 0 && subject_end == subject_bsp->length)
8181 break; /* entire sequence has been fetched. */
8182 }
8183 }
8184 /* Gap character in last space. */
8185 rev_subject[subject_bsp->length] = NULLB;
8186 rev_subject_length = subject_bsp->length;
8187 spp = SeqPortFree(spp);
8188 subject_bsp = BioseqFree(subject_bsp);
8189 subject_allocated = TRUE;
8190 }
8191 else
8192 {
8193 subject_length = readdb_get_sequence(search->rdfp, result_hitlist->subject_id, (Uint1Ptr PNTR) &subject);
8194 rev_subject = NULL;
8195 rev_subject_length = 0;
8196 }
8197
8198 seqalign = BlastGetGapAlgnTbck (search, hit_number, FALSE, ordinal_number, subject, subject_length, rev_subject, rev_subject_length);
8199
8200 if (subject_allocated)
8201 {
8202 subject = MemFree(subject);
8203 rev_subject = MemFree(rev_subject);
8204 }
8205
8206 return seqalign;
8207 }
8208
8209 int LIBCALLBACK
8210 query_offset_compare_hsp(VoidPtr v1, VoidPtr v2)
8211
8212 {
8213 BLAST_HSPPtr h1, h2;
8214 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8215
8216 hp1 = (BLAST_HSPPtr PNTR) v1;
8217 hp2 = (BLAST_HSPPtr PNTR) v2;
8218 h1 = *hp1;
8219 h2 = *hp2;
8220
8221 if (h1 == NULL) {
8222 return (h2 == NULL) ? 0 : 1;
8223 } else if (h2 == NULL) {
8224 return -1;
8225 }
8226
8227 if (h1->query.offset < h2->query.offset)
8228 return -1;
8229 if (h1->query.offset > h2->query.offset)
8230 return 1;
8231
8232 if (h1->subject.offset < h2->subject.offset)
8233 return -1;
8234 if (h1->subject.offset > h2->subject.offset)
8235 return 1;
8236
8237 return 0;
8238 }
8239
8240 int LIBCALLBACK
8241 query_end_compare_hsp(VoidPtr v1, VoidPtr v2)
8242
8243 {
8244 BLAST_HSPPtr h1, h2;
8245 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8246
8247 hp1 = (BLAST_HSPPtr PNTR) v1;
8248 hp2 = (BLAST_HSPPtr PNTR) v2;
8249 h1 = *hp1;
8250 h2 = *hp2;
8251
8252 if (h1 == NULL) {
8253 return (h2 == NULL) ? 0 : 1;
8254 } else if (h2 == NULL) {
8255 return -1;
8256 }
8257
8258 if (h1->query.end < h2->query.end)
8259 return -1;
8260 if (h1->query.end > h2->query.end)
8261 return 1;
8262
8263 if (h1->subject.end < h2->subject.end)
8264 return -1;
8265 if (h1->subject.end > h2->subject.end)
8266 return 1;
8267
8268 return 0;
8269 }
8270 /*
8271 Check the gapped alignments for an overlap of two different alignments.
8272 A sufficient overlap is when two alignments have the same start values
8273 or have the same final values.
8274
8275 The number of valid alignments remaining is returned.
8276 */
8277
8278 static Int4
8279 CheckGappedAlignmentsForOverlap(BlastSearchBlkPtr search, BLAST_HSPPtr *hsp_array, Int4 hsp_count, Int2 frame)
8280
8281 {
8282 Int4 index1, index, increment;
8283
8284 if (search == NULL || hsp_array == NULL || hsp_count == 0)
8285 return 0;
8286
8287 HeapSort(hsp_array, hsp_count, sizeof(BLAST_HSPPtr), query_offset_compare_hsp);
8288 index=0;
8289 increment=1;
8290 while (index < hsp_count-increment)
8291 { /* Check if both HSP's start on or end on the same digonal. */
8292 if (hsp_array[index+increment] == NULL)
8293 {
8294 increment++;
8295 continue;
8296 }
8297
8298 if (frame != 0 && hsp_array[index+increment]->subject.frame != frame)
8299 break;
8300
8301 if (hsp_array[index] && hsp_array[index]->query.offset == hsp_array[index+increment]->query.offset &&
8302 hsp_array[index]->subject.offset == hsp_array[index+increment]->subject.offset &&
8303 SIGN(hsp_array[index]->query.frame) == SIGN(hsp_array[index+increment]->query.frame))
8304 {
8305 if (hsp_array[index]->score > hsp_array[index+increment]->score)
8306 {
8307 hsp_array[index+increment] =
8308 BLAST_HSPFree(hsp_array[index+increment]);
8309 increment++;
8310 }
8311 else
8312 {
8313 hsp_array[index] =
8314 BLAST_HSPFree(hsp_array[index]);
8315 index++;
8316 increment = 1;
8317 }
8318 }
8319 else
8320 {
8321 index++;
8322 increment = 1;
8323 }
8324 }
8325
8326 HeapSort(hsp_array, hsp_count, sizeof(BLAST_HSPPtr), query_end_compare_hsp);
8327 index=0;
8328 increment=1;
8329 while (index < hsp_count-increment)
8330 { /* Check if both HSP's start on or end on the same digonal. */
8331 if (hsp_array[index+increment] == NULL)
8332 {
8333 increment++;
8334 continue;
8335 }
8336
8337 if (frame != 0 && hsp_array[index+increment]->subject.frame != frame)
8338 break;
8339
8340 if (hsp_array[index] &&
8341 hsp_array[index]->query.end == hsp_array[index+increment]->query.end &&
8342 hsp_array[index]->subject.end == hsp_array[index+increment]->subject.end &&
8343 SIGN(hsp_array[index]->query.frame) == SIGN(hsp_array[index+increment]->query.frame))
8344 {
8345 if (hsp_array[index]->score > hsp_array[index+increment]->score)
8346 {
8347 hsp_array[index+increment] =
8348 BLAST_HSPFree(hsp_array[index+increment]);
8349 increment++;
8350 }
8351 else
8352 {
8353 hsp_array[index] =
8354 BLAST_HSPFree(hsp_array[index]);
8355 index++;
8356 increment = 1;
8357 }
8358 }
8359 else
8360 {
8361 index++;
8362 increment = 1;
8363 }
8364 }
8365
8366 HeapSort(hsp_array,hsp_count,sizeof(BLAST_HSPPtr), score_compare_hsps);
8367
8368 index1 = 0;
8369 for (index=0; index<hsp_count; index++)
8370 {
8371 if (hsp_array[index] != NULL)
8372 index1++;
8373 }
8374
8375
8376 return index1;
8377
8378 }
8379
8380 /*
8381 Sort the HSP's by frame.
8382 */
8383
8384 int LIBCALLBACK
8385 frame_compare_hsp_m3(VoidPtr v1, VoidPtr v2)
8386
8387 {
8388 BLAST_HSPPtr h1, h2;
8389 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8390
8391 hp1 = (BLAST_HSPPtr PNTR) v1;
8392 hp2 = (BLAST_HSPPtr PNTR) v2;
8393 h1 = *hp1;
8394 h2 = *hp2;
8395
8396 if (h1->subject.frame == -3 && h2->subject.frame != -3)
8397 return -1;
8398 if (h2->subject.frame == -3 && h1->subject.frame != -3)
8399 return 1;
8400
8401 return 0;
8402 }
8403 int LIBCALLBACK
8404 frame_compare_hsp_m2(VoidPtr v1, VoidPtr v2)
8405
8406 {
8407 BLAST_HSPPtr h1, h2;
8408 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8409
8410 hp1 = (BLAST_HSPPtr PNTR) v1;
8411 hp2 = (BLAST_HSPPtr PNTR) v2;
8412 h1 = *hp1;
8413 h2 = *hp2;
8414
8415 if (h1->subject.frame == -2 && h2->subject.frame != -2)
8416 return -1;
8417 if (h2->subject.frame == -2 && h1->subject.frame != -2)
8418 return 1;
8419
8420 return 0;
8421 }
8422
8423 int LIBCALLBACK
8424 frame_compare_hsp_m1(VoidPtr v1, VoidPtr v2)
8425
8426 {
8427 BLAST_HSPPtr h1, h2;
8428 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8429
8430 hp1 = (BLAST_HSPPtr PNTR) v1;
8431 hp2 = (BLAST_HSPPtr PNTR) v2;
8432 h1 = *hp1;
8433 h2 = *hp2;
8434
8435 if (h1->subject.frame == -1 && h2->subject.frame != -1)
8436 return -1;
8437 if (h2->subject.frame == -1 && h1->subject.frame != -1)
8438 return 1;
8439
8440 return 0;
8441 }
8442 int LIBCALLBACK
8443 frame_compare_hsp_p1(VoidPtr v1, VoidPtr v2)
8444
8445 {
8446 BLAST_HSPPtr h1, h2;
8447 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8448
8449 hp1 = (BLAST_HSPPtr PNTR) v1;
8450 hp2 = (BLAST_HSPPtr PNTR) v2;
8451 h1 = *hp1;
8452 h2 = *hp2;
8453
8454 if (h1->subject.frame == 1 && h2->subject.frame != 1)
8455 return -1;
8456 if (h2->subject.frame == 1 && h1->subject.frame != 1)
8457 return 1;
8458
8459 return 0;
8460 }
8461 int LIBCALLBACK
8462 frame_compare_hsp_p2(VoidPtr v1, VoidPtr v2)
8463
8464 {
8465 BLAST_HSPPtr h1, h2;
8466 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8467
8468 hp1 = (BLAST_HSPPtr PNTR) v1;
8469 hp2 = (BLAST_HSPPtr PNTR) v2;
8470 h1 = *hp1;
8471 h2 = *hp2;
8472
8473 if (h1->subject.frame == 2 && h2->subject.frame != 2)
8474 return -1;
8475 if (h2->subject.frame == 2 && h1->subject.frame != 2)
8476 return 1;
8477
8478 return 0;
8479 }
8480 int LIBCALLBACK
8481 frame_compare_hsp_p3(VoidPtr v1, VoidPtr v2)
8482
8483 {
8484 BLAST_HSPPtr h1, h2;
8485 BLAST_HSPPtr PNTR hp1, PNTR hp2;
8486
8487 hp1 = (BLAST_HSPPtr PNTR) v1;
8488 hp2 = (BLAST_HSPPtr PNTR) v2;
8489 h1 = *hp1;
8490 h2 = *hp2;
8491
8492 if (h1->subject.frame == 3 && h2->subject.frame != 3)
8493 return -1;
8494 if (h2->subject.frame == 3 && h1->subject.frame != 3)
8495 return 1;
8496
8497 return 0;
8498 }
8499 /*
8500 Engine to get the gapped scores from an array of HSP's.
8501 */
8502 static BLAST_HSPPtr PNTR
8503 BlastGappedScoreInternal(BlastSearchBlkPtr search, Uint1Ptr subject, Int4 subject_length, GapAlignBlkPtr gap_align, BLAST_HSPPtr *hsp_array, Int4Ptr hspcnt, Int4Ptr hspcnt_max, Int4 hspmax, Int2 frame)
8504
8505 {
8506 BLAST_HSPPtr hsp, hsp1=NULL;
8507 BLAST_HSPPtr PNTR hsp_array_new;
8508 BLAST_HSP_helperPtr helper;
8509 Boolean hsp_start_is_contained, hsp_end_is_contained;
8510 Int4 hsp_cnt=0, index, index1;
8511 Int4 max_offset = 0, next_offset;
8512 Int4 query_num; /* AM: Added to support query concatenation */
8513
8514 /* helper contains most frequently used information to speed up access. */
8515 helper = Malloc((*hspcnt)*sizeof(BLAST_HSP_helper));
8516 for (index=0; index<(*hspcnt); index++)
8517 {
8518 hsp_start_is_contained = FALSE;
8519 hsp_end_is_contained = FALSE;
8520 hsp = hsp_array[index];
8521 /* This prefetches this value for the test below. */
8522 next_offset = hsp->query.offset;
8523
8524 if (frame != 0 && hsp->subject.frame != frame)
8525 break;
8526
8527 for (index1=0; index1<index; index1++)
8528 {
8529 hsp_start_is_contained = FALSE;
8530 hsp_end_is_contained = FALSE;
8531
8532 hsp1 = hsp_array[index1];
8533 if (hsp1 == NULL)
8534 continue;
8535
8536 /* Check with the helper array whether further
8537 tests are warranted. Having only two ints
8538 in the helper array speeds up access. */
8539 if (helper[index1].qoffset <= next_offset &&
8540 helper[index1].qend >= next_offset)
8541 {
8542 if (CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.offset, hsp1->subject.offset, hsp1->subject.end, hsp->subject.offset) == TRUE)
8543
8544 { /* Check that it's on diff. strands. */
8545 if (SIGN(hsp1->query.frame) == SIGN(hsp->query.frame) &&
8546 SIGN(hsp1->subject.frame) == SIGN(hsp->subject.frame))
8547 hsp_start_is_contained = TRUE;
8548 }
8549 if (hsp_start_is_contained && CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end, hsp->query.end, hsp1->subject.offset, hsp1->subject.end, hsp->subject.end) == TRUE)
8550
8551 { /* Check that it's on diff. strands. */
8552 if (SIGN(hsp1->query.frame) == SIGN(hsp->query.frame) &&
8553 SIGN(hsp1->subject.frame) == SIGN(hsp->subject.frame))
8554 hsp_end_is_contained = TRUE;
8555 if (hsp_start_is_contained && hsp_end_is_contained && hsp->score <= hsp1->score)
8556 {
8557 break;
8558 }
8559 }
8560 }
8561 }
8562
8563 if (hsp_start_is_contained == FALSE ||
8564 hsp_end_is_contained == FALSE ||
8565 (hsp1 == NULL) || (hsp->score > hsp1->score))
8566 {
8567 gap_align->include_query = 0;
8568
8569 if(!search->pbp->is_ooframe) {
8570 max_offset = GetStartForGappedAlignment(search, hsp, search->context[hsp->context].query->sequence, subject, search->sbp->matrix);
8571 }
8572
8573 #ifdef BLAST_COLLECT_STATS
8574 search->real_gap_number_of_hsps++;
8575 #endif
8576 Nlm_MemSet((VoidPtr) &(hsp_array[index]->hsp_link), 0, sizeof(BLAST_HSP_LINK));
8577 hsp_array[index]->linked_set = FALSE;
8578 hsp_array[index]->start_of_chain = FALSE;
8579 hsp_array[index]->num = 0;
8580 hsp_array[index]->xsum = 0.0;
8581
8582 if(search->pbp->is_ooframe) {
8583 gap_align->is_ooframe = TRUE;
8584 gap_align->query = subject;
8585 if(hsp->query.frame > 0) {
8586 gap_align->subject = search->query_dnap[0]->sequence;
8587 gap_align->subject_length = search->query_dnap[0]->length;
8588 } else {
8589 gap_align->subject = search->query_dnap[1]->sequence;
8590 gap_align->subject_length = search->query_dnap[1]->length;
8591 }
8592
8593 gap_align->query_length = subject_length;
8594
8595 gap_align->q_start = hsp->subject.offset;
8596 gap_align->s_start = hsp->query.offset;
8597
8598 hsp->query.gapped_start = gap_align->s_start;
8599 hsp->subject.gapped_start = gap_align->q_start;
8600
8601 } else {
8602 gap_align->query = search->context[hsp->context].query->sequence;
8603 gap_align->query_length = search->context[hsp->context].query->length;
8604 gap_align->q_start = max_offset;
8605 gap_align->s_start =
8606 (hsp->subject.offset - hsp->query.offset) + max_offset;
8607 hsp->query.gapped_start = gap_align->q_start;
8608 hsp->subject.gapped_start = gap_align->s_start;
8609
8610 gap_align->subject = subject;
8611 gap_align->subject_length = subject_length;
8612 }
8613
8614 /* For out-of frame gapping - query is protein
8615 and subject is DNA translated into 3 frames */
8616
8617 PerformGappedAlignment(gap_align);
8618
8619 if(search->pbp->is_ooframe) {
8620 hsp->query.offset = gap_align->subject_start;
8621 hsp->subject.offset = gap_align->query_start;
8622 /* The end is one further for BLAST than for the gapped align. */
8623 hsp->query.end = gap_align->subject_stop + 1;
8624 hsp->subject.end = gap_align->query_stop + 1;
8625 } else {
8626 hsp->query.offset = gap_align->query_start;
8627 hsp->query.end = gap_align->query_stop + 1;
8628 hsp->subject.offset = gap_align->subject_start;
8629 hsp->subject.end = gap_align->subject_stop + 1;
8630 /* The end is one further for BLAST than for the gapped align. */
8631 }
8632
8633 hsp->query.length = hsp->query.end - hsp->query.offset;
8634 hsp->subject.length = hsp->subject.end - hsp->subject.offset;
8635 hsp->score = gap_align->score;
8636 if( hsp->score >= search->pbp->cutoff_s1 ) {
8637 /* AM: Changed to support query concatenation */
8638 if( !search->mult_queries )
8639 hsp->evalue =
8640 BlastKarlinStoE_simple(hsp->score,
8641 search->sbp->
8642 kbp_gap[search->first_context],
8643 search->searchsp_eff);
8644 else {
8645 query_num = GetQueryNum( search->mult_queries,
8646 hsp->query.offset,
8647 hsp->query.end,
8648 hsp->query.frame );
8649 hsp->evalue =
8650 BlastKarlinStoE_simple(hsp->score,
8651 search->sbp->
8652 kbp_gap[search->first_context],
8653 search->mult_queries->
8654 SearchSpEff[query_num]);
8655 }
8656
8657 hsp_cnt++;
8658 /* Fill in the helper structure. */
8659 helper[index].qoffset = hsp->query.offset;
8660 helper[index].qend = hsp->query.end;
8661 } else {
8662 /* Score of the gapped extension is below the required
8663 cutoff, delete this hsp */
8664 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
8665 }
8666 }
8667 else
8668 { /* Contained within another HSP, delete. */
8669 hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
8670 }
8671 }
8672 helper = MemFree(helper);
8673
8674 hsp_cnt = CheckGappedAlignmentsForOverlap(search, hsp_array, *hspcnt, frame);
8675
8676 if (hsp_cnt < (*hspcnt))
8677 {
8678 /* Save HSP's again, discarding those that have been NULLed out. */
8679 hsp_array_new = MemNew(hspmax*sizeof(BLAST_HSPPtr));
8680 index1 = 0;
8681 for (index=0; index<(*hspcnt_max); index++)
8682 {
8683 if (hsp_array[index] != NULL)
8684 {
8685 hsp_array_new[index1] = hsp_array[index];
8686 index1++;