|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/tools/blastool.c |
source navigation diff markup identifier search freetext search file search |
1 static char const rcsid[] = "$Id: blastool.c,v 6.292 2007/03/15 20:39:17 madden Exp $";
2
3 /* ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================*/
26
27 /*****************************************************************************
28
29 File name: blastool.c
30
31 Author: Tom Madden
32
33 Contents: Utilities for BLAST
34
35 ******************************************************************************/
36 /*
37 * $Revision: 6.292 $
38 * $Log: blastool.c,v $
39 * Revision 6.292 2007/03/15 20:39:17 madden
40 * Fix incorrect setting of filter in BLAST_WizardOptionsMaskInit
41 *
42 * Revision 6.291 2006/09/21 13:42:37 madden
43 * BlastProcessGiLists returns a boolean to specify that an attempt was made to process a list of GIs. If no matches were found this can be reported back to the user
44 *
45 * Revision 6.290 2006/04/26 12:42:36 madden
46 * BlastSetUserErrorString and BlastDeleteUserErrorString moved from blastool.c to blfmtutl.c
47 *
48 * Revision 6.289 2006/03/21 22:35:27 camacho
49 * Add support for setting database length in BLAST_WizardOptions{Blk,Mask}
50 *
51 * Revision 6.288 2006/03/02 21:01:54 camacho
52 * Use lowercase masking in BLAST_Wizard
53 *
54 * Revision 6.287 2005/12/29 19:56:06 madden
55 * Moved functions to print tabular output to blfmtutl
56 *
57 * Revision 6.286 2005/11/18 14:19:25 madden
58 * Check pointer before dereferencing
59 *
60 * Revision 6.285 2005/10/06 12:52:23 madden
61 * Changes to support correct gapped stats for blastn
62 *
63 * Revision 6.284 2005/09/29 17:39:58 coulouri
64 * from mike gertz:
65 * Refactored BLASTPostSearchLogic so that RedoAlignmentCore may be
66 * called when concatenated queries are being used.
67 *
68 * Revision 6.283 2005/08/31 20:32:58 coulouri
69 * From Mike Gertz:
70 * Removed manipulation of options->hitlist_size,
71 * options->expect_value and search->pbp->cutoff_e; these should no
72 * longer be manipulated by this routine.
73 *
74 * Revision 6.282 2005/07/25 19:01:00 coulouri
75 * correction to previous commit
76 *
77 * Revision 6.281 2005/07/13 16:18:49 coulouri
78 * correct logic error when dealing with oid 0. fixes rt#15077917.
79 *
80 * Revision 6.280 2005/02/15 21:10:47 dondosha
81 * Set X-dropoff for the traceback in MegaBlastPrintAlignInfo to final X-dropoff parameter
82 *
83 * Revision 6.279 2004/12/09 16:21:21 camacho
84 * Do not do the traceback for non-Smith-Waterman blastpgp runs [by Mike Gertz]
85 *
86 * Revision 6.278 2004/11/19 13:22:05 madden
87 * Remove no_check_score completely (from Mike Gertz)
88 *
89 * Revision 6.277 2004/11/17 13:03:56 madden
90 * Always set no_score_check to TRUE
91 *
92 * Revision 6.276 2004/11/01 20:43:15 camacho
93 * + BlastErrorToString
94 *
95 * Revision 6.275 2004/08/31 17:07:14 dondosha
96 * For PRINT_SEQUENCES option in tabular output, always show query on forward strand; reverse complement subject is necessary
97 *
98 * Revision 6.274 2004/08/26 19:01:35 dondosha
99 * Retrieve Bioseq before attempting to calculate number of identities in alignment segments
100 *
101 * Revision 6.273 2004/08/16 19:37:26 dondosha
102 * Enabled uneven gap HSP linking for blastx
103 *
104 * Revision 6.272 2004/07/15 21:00:17 dondosha
105 * Print Accession.Version in megablast tabular output subject ids; print number of sequences in # Query comment if multiple queries
106 *
107 * Revision 6.271 2004/06/30 13:42:27 kans
108 * include <blfmtutl.h> to clear up Mac compiler missing prototype errors
109 *
110 * Revision 6.270 2004/06/30 12:29:39 madden
111 * Moved some functions to blfmtutl.c
112 *
113 * Revision 6.269 2004/06/24 21:16:34 dondosha
114 * Boolean argument in ScoreAndEvalueToBuffers changed to Uint1, so pass 0 instead of FALSE
115 *
116 * Revision 6.268 2004/05/21 13:53:04 dondosha
117 * Use BLAST_HSPFree to free BLAST_HSP structures, hence no need to call GapXEditBlockDelete in multiple places
118 *
119 * Revision 6.267 2004/05/14 15:38:11 dondosha
120 * Use newly public function ScoreAndEvalueToBuffers from txalign.h instead of a static function
121 *
122 * Revision 6.266 2004/05/14 14:41:03 bealer
123 * - Er. I mean .002, as per blastpgp.
124 *
125 * Revision 6.265 2004/05/14 14:39:45 bealer
126 * - Adjust ethresh to .001 for PSI blast.
127 *
128 * Revision 6.264 2004/04/30 15:25:20 dondosha
129 * Added argument in call to BXMLGetHspFromSeqAlign
130 *
131 * Revision 6.263 2004/04/28 20:32:55 bealer
132 * - Fix BLAST_Wizard defaults for PSI-Blast: comp-based-stats=TRUE, E=0.005.
133 *
134 * Revision 6.262 2004/03/22 22:04:43 dondosha
135 * Do not reassign kbp_gap pointers for megablast traceback, as they are already set due to recent changes
136 *
137 * Revision 6.261 2004/03/22 15:24:56 dondosha
138 * Do not set gapped-search-specific options for tblastx
139 *
140 * Revision 6.260 2004/02/26 15:52:30 papadopo
141 * Mike Gertz' modifications to unify handling of gapped Karlin blocks between protein and nucleotide searches
142 *
143 * Revision 6.259 2004/02/24 14:07:42 camacho
144 * BlastAdjustDbNumbers is DEPRECATED
145 *
146 * Revision 6.258 2003/12/30 15:14:40 camacho
147 * Fix to MergeDbGiListsWithOIDLists:
148 * When searching gi lists for matches in rdfp linked list, isolate each
149 * element of the linked list first.
150 *
151 * Revision 6.257 2003/12/23 23:28:27 dondosha
152 * Use number of identities from the seqalign to calculate percent identity in tabular output
153 *
154 * Revision 6.256 2003/12/12 16:01:23 madden
155 * Change to signature of BlastCutoffs, remove BlastCutoffs_simple
156 *
157 * Revision 6.255 2003/11/10 16:59:42 dondosha
158 * Print the seqid in AcknowledgeBlastQuery for any non-local id
159 *
160 * Revision 6.254 2003/10/29 17:46:59 dondosha
161 * Allow 2-stage greedy extension in megablast
162 *
163 * Revision 6.253 2003/09/12 16:02:06 dondosha
164 * If gap open is non-zero for megablast, require gap extension option to be non-zero too
165 *
166 * Revision 6.252 2003/08/21 16:06:37 dondosha
167 * Correction to previous change
168 *
169 * Revision 6.251 2003/08/20 22:13:18 dondosha
170 * Added BlastPrintTabularResults with an extra boolean parameter for OOF alignments
171 *
172 * Revision 6.250 2003/07/15 20:33:23 madden
173 * set is_rps_blast if service is rpsblast
174 *
175 * Revision 6.249 2003/07/15 14:33:43 dondosha
176 * Added a #define for fprintf substitute, needed for gzip compression of Web BLAST results
177 *
178 * Revision 6.248 2003/06/12 16:47:40 madden
179 * Fix so all query info on one line for tab. report
180 *
181 * Revision 6.247 2003/06/11 20:19:16 madden
182 * Fixes to PrintTabularOutputHeader
183 *
184 * Revision 6.246 2003/05/30 17:25:36 coulouri
185 * add rcsid
186 *
187 * Revision 6.245 2003/05/22 21:39:23 dondosha
188 * Correction to previous change
189 *
190 * Revision 6.244 2003/05/22 20:45:27 dondosha
191 * Fix in BlastCreateVirtualOIDList when rdfps with oidlist exist before rdfps with gi lists
192 *
193 * Revision 6.243 2003/05/13 16:02:53 coulouri
194 * make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
195 *
196 * Revision 6.242 2003/05/06 15:19:19 dondosha
197 * Removed extra memory freeing statement for megablast -D3 option
198 *
199 * Revision 6.241 2003/05/05 16:48:40 camacho
200 * Removed warning about gi list restrictions
201 *
202 * Revision 6.240 2003/04/25 18:56:14 camacho
203 * Updated MergeDbGiListsWithOIDLists to use gilists as opposed to gifiles
204 *
205 * Revision 6.239 2003/04/24 17:10:15 dondosha
206 * Fixed FastaCheckDna: check only one sequence at a time
207 *
208 * Revision 6.238 2003/04/23 23:34:07 boemker
209 * Bug fixes: BLAST_Wizard ignored mask->use_best_align,
210 * mask->use_real_db_size, mask->window_size.
211 *
212 * Revision 6.237 2003/04/23 23:22:37 boemker
213 * Bug fix: BLAST_Wizard ignored strand_option.
214 *
215 * Revision 6.236 2003/04/23 23:11:16 boemker
216 * Bug fix: BLAST_Wizard ignored pseudoCountConst.
217 *
218 * Revision 6.235 2003/04/23 22:55:27 boemker
219 * Bug fixes: BLAST_Wizard ignored first_db_seq, final_db_seq.
220 *
221 * Revision 6.234 2003/04/23 22:48:25 boemker
222 * Bug fix: BLAST_Wizard ignored is_ooframe.
223 *
224 * Revision 6.233 2003/04/22 21:52:13 dondosha
225 * Added function OOFBlastHSPGetNumIdentical
226 *
227 * Revision 6.232 2003/04/17 20:51:33 camacho
228 * Rolled back previous changes
229 *
230 * Revision 6.230 2003/04/14 19:53:18 camacho
231 * Use ISAMUninitSearch to avoid running into memory problems
232 *
233 * Revision 6.229 2003/04/04 20:02:38 dondosha
234 * ABR bug fix in BlastPrintTabulatedResultsEx
235 *
236 * Revision 6.228 2003/03/26 20:29:50 boemker
237 * By default, set required_end = 0 to match behavior of blastcgicmd.cpp
238 *
239 * Revision 6.227 2003/03/26 15:47:52 boemker
240 * sizeof x -> sizeof(x)
241 *
242 * Revision 6.226 2003/03/26 15:46:18 boemker
243 * In BLAST_OptionsBlkInit, use memset to clear structure.
244 *
245 * Revision 6.225 2003/03/26 13:36:06 boemker
246 * Corrected misspelling (opts => options) in BLAST_WizardOptionsBlkInit.
247 *
248 * Revision 6.224 2003/03/26 02:45:25 boemker
249 * Rewrote BLAST_WizardOptionsBlkInit to reflect changes in
250 * BLAST_WizardOptionsBlk.
251 *
252 * Revision 6.223 2003/03/25 22:25:57 boemker
253 * Replaced cutoff_s2, which isn't used, with cutoff_s, which is. Added
254 * query_lcase_mask. Made alignments and descriptions (arguments of function
255 * BLAST_Wizard) optional. Rewrote to avoid use of snprintf, which isn't
256 * portable.
257 *
258 * Revision 6.222 2003/03/25 19:58:18 boemker
259 * Moved code to initialize search options from blastcgicmd.cpp to here, as
260 * BLAST_Wizard et al.
261 *
262 * Revision 6.221 2003/03/24 19:42:14 madden
263 * Changes to support query concatenation for blastn and tblastn
264 *
265 * Revision 6.220 2003/02/28 18:34:41 dondosha
266 * Added a possibility to print sequences in the -D3 output
267 *
268 * Revision 6.219 2003/02/18 19:50:09 madden
269 * Correctly report number of sequences better than expect value for gapped search
270 *
271 * Revision 6.218 2003/01/31 17:57:27 camacho
272 * Fix to MergeDbGiFilesWithOIDLists
273 *
274 * Revision 6.217 2003/01/15 19:06:30 dondosha
275 * Correction in copying defline in BlastPrintTabulatedResultsEx
276 *
277 * Revision 6.216 2003/01/14 20:28:53 madden
278 * New function BLASTAddBlastDBTitleToSeqAnnotEx
279 *
280 * Revision 6.215 2002/12/04 23:32:49 camacho
281 * Do not set use_this_gi with nucleotide dbs (redundant)
282 *
283 * Revision 6.214 2002/11/26 23:02:34 madden
284 * Make ErrorMessage optional in ParseBlastOptions
285 *
286 * Revision 6.213 2002/11/21 21:28:27 camacho
287 * Mask the entire volume if no gis in rdfp->gilist belong to rdfp
288 *
289 * Revision 6.212 2002/11/08 20:52:01 dondosha
290 * After the traceback, reduce hitlist size back to the original
291 *
292 * Revision 6.211 2002/11/04 22:53:32 dondosha
293 * 1. Memory leak fixed in BlastClusterHitsFromSeqAlign
294 * 2. Removed redundant score sets from the seqaligns produced for translated searches
295 * 3. Added BlastHSPGetNumIdentical function
296 *
297 * Revision 6.210 2002/09/23 16:49:00 dondosha
298 * Include subject gis in megablast tabular output with full seqids option
299 *
300 * Revision 6.209 2002/09/19 18:12:46 dondosha
301 * Do not move around Karlin blocks if there are no results for some query in megablast
302 *
303 * Revision 6.208 2002/09/13 19:12:53 camacho
304 * Use correct query length in FormatBlastParameters
305 *
306 * Revision 6.207 2002/09/11 16:54:29 dondosha
307 * Removed erroneous assignment of cutoff_s for megablast in BLASTOptionNewEx that was restored in revision 6.204
308 *
309 * Revision 6.206 2002/08/22 12:36:36 madden
310 * Removed unused variables
311 *
312 * Revision 6.205 2002/08/19 18:24:20 camacho
313 * Removed unused variables
314 *
315 * Revision 6.204 2002/08/09 19:39:20 camacho
316 * Added constants for some blast search parameters
317 *
318 * Revision 6.203 2002/08/07 02:06:07 camacho
319 * Fixed problem when merging lists of gis
320 *
321 * Revision 6.202 2002/08/06 15:43:20 dondosha
322 * Set window size to 0 for megablast in BLASTOptionNewEx
323 *
324 * Revision 6.201 2002/08/06 15:41:49 camacho
325 * Changed return type of MergeDbGiFilesWithOIDLists to void
326 *
327 * Revision 6.200 2002/08/01 20:45:34 dondosha
328 * Changed prototype of the BLASTPostSearchLogic function to make it
329 * more convenient
330 *
331 * Revision 6.199 2002/07/24 21:11:38 kans
332 * reverted ncbi URL
333 *
334 * Revision 6.198 2002/07/24 15:38:07 dondosha
335 * Corrections in BlastPostSearchLogic for processing multiple-query intermediate results from megablast
336 *
337 * Revision 6.197 2002/07/23 16:50:04 kans
338 * changed www.ncbi.nlm.nih.gov to www.ncbi.nih.gov
339 *
340 * Revision 6.196 2002/07/15 17:34:18 camacho
341 * Fixed little/big endian problem in IntersectDoubleInt4ListWithOIDLists
342 *
343 * Revision 6.195 2002/07/09 16:09:43 camacho
344 * Changed interface to BlastCreateVirtualOIDList
345 *
346 * Revision 6.194 2002/07/02 17:08:01 dondosha
347 * Reverse previous change - not needed
348 *
349 * Revision 6.193 2002/07/02 01:36:40 dondosha
350 * For megablast use larger window in CheckStartForGappedAlignment
351 *
352 * Revision 6.192 2002/06/26 00:56:30 camacho
353 *
354 * 1. Fixed bug when searching a mixture of real and mask databases.
355 * 2. Clean up of code that calculates the number of sequences and database
356 * length.
357 *
358 * Revision 6.191 2002/06/25 19:39:38 camacho
359 * Made BlastCreateVirtualOIDList public for use by neighboring software
360 *
361 * Revision 6.190 2002/06/23 06:57:45 camacho
362 * Minor fix to previous commit
363 *
364 * Revision 6.189 2002/06/21 21:48:19 camacho
365 * Reorganized BlastProcessGiLists
366 *
367 * Revision 6.188 2002/06/19 22:50:33 dondosha
368 * Added all queries information for tabular output with multiple queries
369 *
370 * Revision 6.187 2002/06/13 20:48:45 dondosha
371 * Do not assign cutoff_s for megablast in BLASTOptionNewEx - it is not needed anyway
372 *
373 * Revision 6.186 2002/06/06 22:03:06 camacho
374 *
375 * 1. Fixed bug in BlastDbGiListToOidList
376 * 2. Removed statement in BlastProcessGiLists that sets num_seqs to 0
377 *
378 * Revision 6.185 2002/05/28 21:41:03 dondosha
379 * Correction for megablast -D3 output with non-affine extension when database has a gi list
380 *
381 * Revision 6.184 2002/05/10 22:38:49 dondosha
382 * Always do preliminary and then traceback gapped extension if dynamic programming extension is used
383 *
384 * Revision 6.183 2002/05/09 15:35:51 dondosha
385 * Added BLASTOptionNewEx function with an extra argument for megablast
386 *
387 * Revision 6.182 2002/04/18 19:05:08 camacho
388 * Modified BlastProcessGiLists to deal with multi-volume databases with empty oidlists
389 *
390 * Revision 6.181 2002/04/18 16:18:21 dondosha
391 * Added BlastPrintTabulatedResultsEx with extra argument to keep track of progress
392 *
393 * Revision 6.180 2002/04/15 20:51:38 jianye
394 * Added getFastaStyleTitle(BioseqPtr bsp)
395 *
396 * Revision 6.179 2002/04/10 17:42:02 madden
397 * Fix for blastn matrix check (last checkin)
398 *
399 * Revision 6.178 2002/04/10 15:50:15 dondosha
400 * Added validity checks for some megablast options
401 *
402 * Revision 6.177 2002/04/09 18:44:44 madden
403 * Set matrix for blastn correctly
404 *
405 * Revision 6.176 2002/03/22 17:05:09 dondosha
406 * For megablast tabular formatting of seqalign, count number of alignments for each query separately
407 *
408 * Revision 6.175 2002/02/27 20:13:56 dondosha
409 * Added checks for NULL pointer in BlastInsertList2Heap
410 *
411 * Revision 6.174 2002/02/14 20:37:38 dondosha
412 * Bug fix in callback for blastn tabular output when only reverse strand is searched
413 *
414 * Revision 6.173 2002/01/07 23:16:00 dondosha
415 * Fixed several memory leaks and allocation/freeing bugs in multithreaded megablast
416 *
417 * Revision 6.172 2001/12/28 20:41:09 dondosha
418 * 1. Mega BLAST related parameters moved to a separate structure
419 * 2. Disallow word sizes < 8 for Mega BLAST
420 *
421 * Revision 6.171 2001/11/26 20:40:15 camacho
422 * Used the correct maximum oid in BlastProcessGiLists
423 *
424 * Revision 6.170 2001/11/14 23:41:32 dondosha
425 * Minor change in MegaBlastPrintAlignInfo
426 *
427 * Revision 6.169 2001/11/13 14:13:02 egorov
428 * The change fixes a bug when database specified by a list of GIs is
429 * smaller than list of GIs returned from Entrez what causes segmentation fault.
430 *
431 * Revision 6.168 2001/11/05 17:30:24 egorov
432 * Initialize 'cutoff' variable. This eliminates UMR problem
433 * and some of blastsrv3.REAL crashes.
434 *
435 * Revision 6.167 2001/10/30 17:27:00 dondosha
436 * Check if title exists before printing it in PrintTabularOutputHeader
437 *
438 * Revision 6.166 2001/10/24 15:02:48 dondosha
439 * Correction in BlastPrintAlignInfo for 2 sequences case
440 *
441 * Revision 6.165 2001/10/15 19:53:20 vakatov
442 * Get rid of the extra LIBCALL at BLASTPostSearchLogic()
443 *
444 * Revision 6.164 2001/10/12 15:26:29 dondosha
445 * 1. Improvements to BlastClusterHitsFromSeqAlign
446 * 2. Added BLASTPostSearchLogic function
447 *
448 * Revision 6.163 2001/10/02 16:03:58 dondosha
449 * Fix for ungapped search tabular output
450 *
451 * Revision 6.162 2001/09/06 20:24:33 dondosha
452 * Removed threshold_first
453 *
454 * Revision 6.161 2001/09/06 17:11:16 dondosha
455 * Truncate query defline in tabular output header if it is too long
456 *
457 * Revision 6.160 2001/09/05 20:31:04 dondosha
458 * Fixed tiny memory leak in PrintTabularOutputHeader function
459 *
460 * Revision 6.159 2001/08/15 21:37:09 dondosha
461 * Added functions for clustering of hits from a seqalign
462 *
463 * Revision 6.158 2001/07/31 16:42:39 dondosha
464 * Added function FastaCheckDna
465 *
466 * Revision 6.157 2001/07/20 18:38:27 dondosha
467 * Minor bug fix in MegaBlastPrintAlignInfo
468 *
469 * Revision 6.156 2001/07/09 14:17:24 madden
470 * Fix PC-lint complaints from R. Williams
471 *
472 * Revision 6.155 2001/07/09 13:12:04 madden
473 * Removed unused variables
474 *
475 * Revision 6.154 2001/07/06 13:59:02 madden
476 * Fixed compiler and lint warnings
477 *
478 * Revision 6.153 2001/06/29 20:45:21 dondosha
479 * If percent identity close but less than 100, print 99.99 in tabular output
480 *
481 * Revision 6.152 2001/06/28 21:22:20 dondosha
482 * Do not print query information in tabular output header if it is not available
483 *
484 * Revision 6.151 2001/06/28 13:42:09 madden
485 * Fixes to prevent overflow on number of hits reporting
486 *
487 * Revision 6.150 2001/06/25 18:04:24 madden
488 * Fix to BlastComputeProbs to ignore score > max or < min
489 *
490 * Revision 6.149 2001/06/18 21:45:16 dondosha
491 * Adjusted PrintTabularOutputHeader for web BLAST
492 *
493 * Revision 6.148 2001/06/15 21:18:39 dondosha
494 * Added function PrintTabularOutputHeader
495 *
496 * Revision 6.147 2001/06/15 16:38:46 dondosha
497 * Correction to previous changes
498 *
499 * Revision 6.146 2001/06/15 15:48:38 dondosha
500 * Fixed uninitialized variable bug
501 *
502 * Revision 6.145 2001/06/14 22:09:15 dondosha
503 * Rearranged code for gi lists and oid masks processing to get rid of duplication
504 *
505 * Revision 6.144 2001/06/14 16:33:35 dondosha
506 * Fixed bug in the new function BlastDbGiListToOidList
507 *
508 * Revision 6.143 2001/06/13 21:45:09 dondosha
509 * Search of multiple databases with gi files implemented
510 *
511 * Revision 6.142 2001/06/07 19:30:03 dondosha
512 * Pass believe query argument to BlastPrintTabulatedResults
513 *
514 * Revision 6.141 2001/06/06 21:22:43 dondosha
515 * Added (query) Bioseq and SeqLoc arguments to function BlastPrintTabulatedResults
516 *
517 * Revision 6.140 2001/06/05 22:16:15 dondosha
518 * Check if KarlinBlk exists in FormatBlastParameters
519 *
520 * Revision 6.139 2001/05/29 22:00:46 dondosha
521 * Tabulated output bug fixes
522 *
523 * Revision 6.138 2001/05/24 16:25:22 dondosha
524 * Correction for query and subject ids printed with post-search tabulated format
525 *
526 * Revision 6.137 2001/05/23 22:38:48 dondosha
527 * Added option -m 9 to print post-search tabulated output
528 *
529 * Revision 6.136 2001/05/15 19:19:26 shavirin
530 * Fixed minor bug in the function convertSeqAlignListToValNodeList().
531 *
532 * Revision 6.135 2001/05/11 22:05:20 dondosha
533 * Added BlastPrintTabulatedResults function for post-search formatting
534 *
535 * Revision 6.134 2001/05/10 21:58:58 dondosha
536 * Always print full ids in non-megablast tabulated output
537 *
538 * Revision 6.133 2001/05/04 22:14:41 dondosha
539 * Do not call BioseqLockById from MegaBlastPrintAlignInfo in case of local ids in the database
540 *
541 * Revision 6.132 2001/05/04 14:14:28 madden
542 * Fixes for multiple patterns in phi-blast
543 *
544 * Revision 6.131 2001/04/26 16:43:10 madden
545 * Correction to convertValNodeListToSeqAlignList from AS
546 *
547 * Revision 6.130 2001/04/25 13:04:03 madden
548 * Fix from AS to convertValNodeListToSeqAlignList for multiple occurrences of a pattern
549 *
550 * Revision 6.129 2001/04/10 17:01:15 dondosha
551 * Fixed purify errors, including problem with multithreaded tabulated output
552 *
553 * Revision 6.128 2001/04/06 18:15:08 madden
554 * Move UNIX-specific stuff (HeyIAmInMemory) to bqueue.[ch]
555 *
556 * Revision 6.127 2001/04/04 20:32:40 dondosha
557 * Correction for tabulated output from bl2seq for translated searches
558 *
559 * Revision 6.126 2001/04/03 22:35:24 kans
560 * include simutil.h to get prototype for Change_Loc_Strand for Mac compiler
561 *
562 * Revision 6.125 2001/04/03 21:59:49 dondosha
563 * Implemented tabulated output for non-megablast bl2seq
564 *
565 * Revision 6.124 2001/03/27 15:46:16 madden
566 * Fix problem in BlastSetUserErrorString
567 *
568 * Revision 6.123 2001/03/22 21:37:41 madden
569 * Raise hsp_num_max to 400
570 *
571 * Revision 6.122 2001/03/19 22:39:24 dondosha
572 * Allow location on the first query sequence for megablast
573 *
574 * Revision 6.121 2001/03/19 18:54:16 madden
575 * Added BlastSeqLocFillDoubleIntEx, changed BlastSeqLocFillDoubleIntRev
576 *
577 * Revision 6.120 2001/03/13 14:00:24 madden
578 * Make ungapped case consistent with stand-alone binary
579 *
580 * Revision 6.119 2001/03/07 14:09:35 madden
581 * Default window size for blastn is 30
582 *
583 * Revision 6.118 2001/02/28 19:55:30 dondosha
584 * Corrected handling of local query IDs for megablast tabulated output
585 *
586 * Revision 6.117 2001/02/21 15:05:31 dondosha
587 * Round scaled matrix values to nearest integer instead of integer cast
588 *
589 * Revision 6.116 2001/02/16 22:16:22 dondosha
590 * BlastScaleMatrix: if new lambda is -1, stop changing factor
591 *
592 * Revision 6.115 2001/02/16 18:45:40 dondosha
593 * Fixed minor purify errors
594 *
595 * Revision 6.114 2001/02/08 20:41:15 dondosha
596 * Implemented tabulated output for all translated programs
597 *
598 * Revision 6.113 2001/02/08 16:33:03 kans
599 * include mblast.h for Mac compiler
600 *
601 * Revision 6.112 2001/02/07 21:16:05 dondosha
602 * Added callbacks to produce tabulated output
603 *
604 * Revision 6.111 2001/01/25 14:34:46 madden
605 * Fix for ungapped blastn
606 *
607 * Revision 6.110 2001/01/23 21:01:01 dondosha
608 * Change double quotes to single quotes in previous change
609 *
610 * Revision 6.109 2001/01/23 20:25:42 dondosha
611 * 1. Renamed BlastParceInputString to BlastParseInputString
612 * 2. Recognize a double quoted string as an option value in
613 * BlastParseInputString
614 *
615 * Revision 6.108 2001/01/17 17:44:34 madden
616 * Set default blastn window size to zero
617 *
618 * Revision 6.107 2001/01/16 14:03:54 madden
619 * Enable gapped check for blastn immediately after finding hits
620 *
621 * Revision 6.106 2001/01/12 16:57:29 shavirin
622 * Checked for NULL score frequencies to avoid coredump in case of empty
623 * sequence.
624 *
625 * Revision 6.105 2000/12/19 18:39:50 madden
626 * Add function BlastSetUserErrorString and BlastDeleteUserErrorString
627 *
628 * Revision 6.104 2000/12/15 19:27:17 dondosha
629 * Corrected the reference link and display for Mega BLAST
630 *
631 * Revision 6.103 2000/12/04 18:51:24 madden
632 * Fix memory leaks
633 *
634 * Revision 6.102 2000/11/29 16:32:59 dondosha
635 * Fixed minor memory leak in MegaBlastPrintReference
636 *
637 * Revision 6.101 2000/11/21 18:46:18 madden
638 * Remove settings for blosum62_20[ab]
639 *
640 * Revision 6.100 2000/11/09 16:12:36 madden
641 * Set do_sum_stats TRUE for ungapped blastp
642 *
643 * Revision 6.99 2000/11/07 16:30:26 madden
644 * Introduce intermediate score (before linking of HSPs) for blastx and tblastn
645 *
646 * Revision 6.98 2000/11/01 16:25:56 madden
647 * Changes from Futamura for psitblastn
648 *
649 * Revision 6.97 2000/10/26 18:52:56 dondosha
650 * Added function MegaBlastPrintReference
651 *
652 * Revision 6.96 2000/10/13 18:49:22 lewisg
653 * move structures out of unix ifdef
654 *
655 * Revision 6.95 2000/10/12 21:39:39 shavirin
656 * Added support for OOF in the function BLASTFilterOverlapRegions().
657 *
658 * Revision 6.94 2000/10/12 17:55:06 shavirin
659 * Fixed NT error for invalid structure identifier.
660 *
661 * Revision 6.93 2000/10/11 21:49:58 shavirin
662 * Added set of functions around BLASTFilterOverlapRegions() to sort and
663 * filter output SeqAlign by overlapping reagions.
664 *
665 * Revision 6.92 2000/10/05 19:41:43 shavirin
666 * Rewritten function BlastErrorPrintExtra().
667 *
668 * Revision 6.91 2000/09/15 17:11:53 dondosha
669 * Set gap parameters to zeros for megablast in BLASTOptionSetGapParams
670 *
671 * Revision 6.90 2000/08/29 19:35:49 madden
672 * Add gilist_not_owned to blast_gi_list
673 *
674 * Revision 6.89 2000/08/28 17:09:53 madden
675 * query_lcase_mask no longer freed in BLASTOptionDelete
676 *
677 * Revision 6.88 2000/08/23 18:48:44 madden
678 * Use BlastKarlinBlkGappedCalcEx in place of BlastKarlinBlkGappedCalc
679 *
680 * Revision 6.87 2000/08/18 19:34:38 madden
681 * Do not print lcl| in AcknowledgeBlastQuery
682 *
683 * Revision 6.86 2000/08/04 15:02:25 shavirin
684 * Changed default threshhold values for matrixes BLOSUM62_20,
685 * BLOSUM62_20A, BLOSUM62_20B.
686 *
687 * Revision 6.85 2000/07/31 16:41:00 shavirin
688 * Reduced POSIT_SCALE_FACTOR from 1000 to 200 to avoid overflow
689 * with BLOSUM80; moved declaration os POSIT_SCALE_FACTOR to posit.h
690 *
691 * Revision 6.84 2000/07/21 15:23:24 madden
692 * Set default threshold for BLOSUM62_20
693 *
694 * Revision 6.83 2000/07/12 23:07:30 kans
695 * reverse_seq moved from pseed3.c to blastool.c, placed in blast.h header, called by gapxdrop.c
696 *
697 * Revision 6.82 2000/06/20 15:50:46 shavirin
698 * Added new functions: BLASTAddBlastDBTitleToSeqAnnot and
699 * BLASTGetDatabaseTitleFromSeqAnnot().
700 *
701 * Revision 6.81 2000/04/18 16:29:27 madden
702 * Free gifile name
703 *
704 * Revision 6.80 2000/04/10 19:58:06 dondosha
705 * Added BlastSeqLocFillDoubleIntRev to fill mask locations for concatenated reverse strand in blastn
706 *
707 * Revision 6.79 2000/03/31 19:10:57 dondosha
708 * Changed some names related to MegaBlast
709 *
710 * Revision 6.78 2000/03/30 21:45:36 madden
711 * Make BlastDeleteHeap non-static
712 *
713 * Revision 6.77 2000/03/14 13:44:01 madden
714 * Set default of sort_gi_list to TRUE
715 *
716 * Revision 6.76 2000/03/01 20:45:15 shavirin
717 * Clear some memory in BLASTOptionsDelete() function
718 *
719 * Revision 6.75 2000/02/23 20:45:45 dondosha
720 * Modified heap operations for blastn to handle concatenated contexts
721 *
722 * Revision 6.74 2000/01/21 22:24:09 madden
723 * Use Nlm_Int8tostr in place of Ltostr
724 *
725 * Revision 6.73 2000/01/13 18:10:43 madden
726 * Fix problem with incorrect stat values for blastn and missing hits
727 *
728 * Revision 6.72 2000/01/07 16:01:24 madden
729 * Use readdb_get_totals_ex to get db number to report
730 *
731 * Revision 6.71 1999/12/31 14:23:19 egorov
732 * Add support for using mixture of real and maks database with gi-list files:
733 * 1. Change logic of creating rdfp list.
734 * 2. BlastGetDbChunk gets real databases first, then masks.
735 * 3. Propoper calculation of database sizes using alias files.
736 * 4. Change to CommonIndex to support using of mask databases.
737 * 5. Use correct gis in formated output (BlastGetAllowedGis()).
738 * 6. Other small changes
739 *
740 * Revision 6.70 1999/12/21 20:03:23 egorov
741 * readdb_gi2seq() has new parameter. Use NULL here.
742 *
743 * Revision 6.69 1999/12/21 16:58:00 madden
744 * Fix command-line parser for options in case there is no space between option and value
745 *
746 * Revision 6.68 1999/12/17 20:47:04 egorov
747 * Fix 'gcc -Wall' warnings
748 *
749 * Revision 6.67 1999/12/16 19:16:49 egorov
750 * Return a value from not-void function
751 *
752 * Revision 6.66 1999/12/14 15:35:13 madden
753 * Added BlastPrintFilterWarning
754 *
755 * Revision 6.65 1999/11/30 19:00:51 madden
756 * Added Nlm_SwapUint4 calls for the ordinal ID list
757 *
758 * Revision 6.64 1999/11/26 22:26:49 madden
759 * Change gap_x_dropoff value for blastn
760 *
761 * Revision 6.63 1999/11/09 14:16:53 madden
762 * made sum_stats default again, rolling back rev 6.61
763 *
764 * Revision 6.62 1999/11/02 15:22:18 madden
765 * Add BlastParseInputString and BlastGetLetterIndex
766 *
767 * Revision 6.61 1999/10/27 21:01:32 madden
768 * made do_sum_stats not default
769 *
770 * Revision 6.60 1999/09/29 17:14:50 shavirin
771 * Fixed memory leak in BLASTOptionsDelete()
772 *
773 * Revision 6.59 1999/09/22 20:59:20 egorov
774 * Add blast db mask stuff
775 *
776 * Revision 6.58 1999/09/16 16:54:43 madden
777 * Allow longer wordsizes
778 *
779 * Revision 6.57 1999/09/14 19:56:39 shavirin
780 * Fixed bug in PHI-Blast when number of hits to DB == 0
781 *
782 * Revision 6.56 1999/09/09 18:00:25 madden
783 * formatting problem for effective db length
784 *
785 * Revision 6.55 1999/08/27 20:22:51 shavirin
786 * Added default value for decline_align in the function BLASTOptionNew().
787 *
788 * Revision 6.53 1999/08/17 14:10:05 madden
789 * Validation for smith_waterman and tweak_parameters options
790 *
791 * Revision 6.52 1999/05/25 13:37:15 madden
792 * Call readdb_get_sequence_length only for seqs in db
793 *
794 * Revision 6.51 1999/04/27 17:22:27 madden
795 * Set hsp_num_max for ungapped BLAST
796 *
797 * Revision 6.50 1999/04/14 14:53:48 madden
798 * Correction for databases over 2 Gig
799 *
800 * Revision 6.49 1999/04/01 21:42:47 madden
801 * Fix memory leaks when gi list is used
802 *
803 * Revision 6.48 1999/03/22 15:19:03 beloslyu
804 * corrections to compile on Red Hat Linux v5.2
805 *
806 * Revision 6.47 1999/03/18 16:43:57 shavirin
807 * Added function Boolean HeyIAmInMemory(Int4 program)
808 *
809 * Revision 6.46 1999/03/17 13:21:07 madden
810 * Fix comment in comment problem
811 *
812 * Revision 6.45 1999/03/12 15:03:45 egorov
813 * Add proper Int4-long type casting
814 *
815 * Revision 6.44 1999/02/19 14:18:25 madden
816 * Added back check for negative nucl. penalty
817 *
818 * Revision 6.43 1999/02/19 14:16:20 madden
819 * list manipulation bug for seed
820 *
821 * Revision 6.41 1999/01/26 18:26:54 madden
822 * make updateLambdaK public
823 *
824 * Revision 6.40 1999/01/08 22:08:42 madden
825 * BlastScaleMatrix returns factor as FloatHi
826 *
827 * Revision 6.39 1998/12/31 18:17:06 madden
828 * Added strand option
829 *
830 * Revision 6.38 1998/12/29 17:45:07 madden
831 * Add do_sum_stats flag
832 *
833 * Revision 6.37 1998/12/03 15:19:32 madden
834 * Changes to speed up BlastFreeHeap and InsertToHeap
835 *
836 * Revision 6.36 1998/11/04 01:36:06 egorov
837 * Add support for entrez-query and org-name to blast3
838 *
839 * Revision 6.35 1998/10/13 22:06:24 madden
840 * Fixed AdjustDbNumbers
841 *
842 * Revision 6.34 1998/09/28 12:29:24 madden
843 * Check for problem in rescaling code
844 *
845 * Revision 6.33 1998/09/22 18:46:43 egorov
846 * Add BlastErrorPrintExtra()
847 *
848 * Revision 6.32 1998/09/17 19:53:02 madden
849 * Added fillCandLambda
850 *
851 * Revision 6.31 1998/09/16 18:59:35 madden
852 * Print subset information if entire database not searched
853 *
854 * Revision 6.30 1998/09/14 15:48:36 madden
855 * Fixed PHI-BLAST reference
856 *
857 * Revision 6.29 1998/09/14 15:11:15 egorov
858 * Add support for Int8 length databases; remove unused variables
859 *
860 * Revision 6.28 1998/09/10 22:36:09 madden
861 * Added convertSeqAlignListToValNodeList and convertValNodeListToSeqAlignList
862 *
863 * Revision 6.27 1998/09/09 21:18:09 madden
864 * Added PrintKAParametersExtra
865 *
866 * Revision 6.26 1998/09/04 14:45:42 madden
867 * Moved code from blast.c blastool.c
868 *
869 * Revision 6.25 1998/08/28 21:21:29 madden
870 * Changed PhiBlast ref
871 *
872 * Revision 6.24 1998/08/25 14:16:22 madden
873 * Added BlastGetPhiReference and BlastPrintPhiReference
874 *
875 * Revision 6.23 1998/07/28 15:23:47 madden
876 * Changed Number of sequences better printout again
877 *
878 * Revision 6.22 1998/07/27 21:54:02 madden
879 * Fixed printing of non-integral E-values
880 *
881 * Revision 6.21 1998/07/21 20:58:06 madden
882 * Changes to allow masking at hash only
883 *
884 * Revision 6.20 1998/07/17 15:40:01 madden
885 * Changes for Effective search space.
886 *
887 * Revision 6.19 1998/07/02 22:15:31 madden
888 * Fixed bug in BlastAdjustDbNumbers
889 *
890 * Revision 6.18 1998/06/17 18:10:24 madden
891 * Validate for isPatternSearch
892 *
893 * Revision 6.17 1998/06/12 15:52:49 madden
894 * Fixed warnings
895 *
896 * Revision 6.16 1998/06/05 15:50:35 madden
897 * Return 1 from BLASTOptionValidateEx if wordsize incorrect.
898 *
899 * Revision 6.15 1998/06/03 17:40:48 madden
900 * Added blastn check for wordsize in BLASTOptionValidateEx
901 *
902 * Revision 6.14 1998/05/28 19:59:42 madden
903 * Changed hsp_range_max
904 *
905 * Revision 6.13 1998/05/21 19:44:45 egorov
906 * Make word "Reference" be HTML link in case HTML output requested
907 *
908 * Revision 6.12 1998/05/03 17:21:04 madden
909 * Added error message for expect_value <= 0, fix typo setting open and expect values
910 *
911 * Revision 6.11 1998/05/01 18:33:56 egorov
912 * Add new parametes to BLASTOptionSetGapParam()
913 *
914 * Revision 6.10 1998/04/30 14:28:46 madden
915 * Raise thresholds for blastx, tblast[nx]
916 *
917 * Revision 6.9 1998/04/29 14:28:07 madden
918 * Fix reference formatting problem
919 *
920 * Revision 6.8 1998/04/27 16:47:34 madden
921 * Added window and threshold to BLASTOptionSetGapParams
922 *
923 * Revision 6.7 1998/04/24 19:28:29 madden
924 * Added BlastScaleMatrix (and other rescaling code moved from posit.c)
925 *
926 * Revision 6.6 1998/04/13 20:29:57 madden
927 * Add one to length of array for NULLB at end
928 *
929 * Revision 6.5 1998/03/24 15:38:23 madden
930 * Use BlastDoubleInt4Ptr to keep track of gis and ordinal_ids
931 *
932 * Revision 6.4 1998/03/18 14:14:18 madden
933 * Support random access by gi list
934 *
935 * Revision 6.3 1998/02/28 17:24:30 madden
936 * Default window_size zero for blastn
937 *
938 * Revision 6.2 1998/02/27 16:52:05 madden
939 * Added BlastGetSequenceFromBioseq
940 *
941 * Revision 6.1 1998/02/27 14:30:30 madden
942 * Tools (or utilities) for the BLAST programs
943 *
944 */
945
946 #include <ncbi.h>
947 #include <blastpri.h>
948 #include <objcode.h>
949 #include <objseq.h>
950 #include <sequtil.h>
951 #include <readdb.h>
952 #include <ncbithr.h>
953 #include <txalign.h>
954 #include <posit.h>
955 #include <seed.h>
956 #include <xmlblast.h>
957 #include <mblast.h>
958 #include <simutil.h>
959 #include <vecscrn.h>
960 #include <blfmtutl.h>
961
962 int (*blastool_fprintf)(FILE*, const char *, ...) = fprintf;
963 #define fprintf blastool_fprintf
964
965 typedef struct _FilterAlign {
966 SeqAlignPtr sap;
967 Char subject_sip[64];
968 HspPtr hsp;
969 } FilterAlign, PNTR FilterAlignPtr;
970
971
972 typedef struct _SappSet {
973 Int4 count;
974 FilterAlignPtr PNTR fapp;
975 } SappSet, PNTR SappSetPtr;
976
977
978 #define BUFFER_LENGTH 255
979
980 /*
981 Formats the BLAST parameters for the BLAST report.
982 One CharPtr is returned, newlines are indicated by tildes ('~').
983 */
984
985 CharPtr
986 FormatBlastParameters(BlastSearchBlkPtr search)
987
988 {
989 BLAST_ParameterBlkPtr pbp;
990 BLAST_Score cutoff = 0;
991 Char buffer[128];
992 CharPtr ret_buffer;
993 Int2 ret_buffer_length;
994 Int4 num_entries;
995 Int8 total_length;
996 Nlm_FloatHi evalue;
997
998 pbp = search->pbp;
999
1000 ret_buffer = NULL;
1001 ret_buffer_length = 0;
1002
1003
1004 sprintf(buffer, "Matrix: %s", search->sbp->name);
1005 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1006
1007 if (pbp->gapped_calculation)
1008 {
1009 sprintf(buffer, "Gap Penalties: Existence: %ld, Extension: %ld", (long) search->pbp->gap_open, (long) search->pbp->gap_extend);
1010 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1011 }
1012
1013 if (pbp->two_pass_method == FALSE)
1014 {
1015 sprintf(buffer, "Number of Hits to DB: %s", Nlm_Int8tostr((Int8) search->second_pass_hits, 1));
1016 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1017
1018 readdb_get_totals_ex(search->rdfp, &total_length, &num_entries, TRUE);
1019
1020 sprintf(buffer, "Number of Sequences: %ld", (long) num_entries);
1021 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1022 sprintf(buffer, "Number of extensions: %ld", (long) search->second_pass_extends);
1023 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1024 sprintf(buffer, "Number of successful extensions: %ld", (long) search->second_pass_good_extends);
1025 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1026 }
1027 else
1028 {
1029 sprintf(buffer, "Number of Hits to DB: 1st pass: %s, 2nd pass: %s",
1030 Nlm_Int8tostr((Int8) search->first_pass_hits, 1), Nlm_Int8tostr((Int8) search->second_pass_hits, 1));
1031 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1032 readdb_get_totals_ex(search->rdfp, &total_length, &num_entries, TRUE);
1033 sprintf(buffer, "Number of Sequences: 1st pass: %ld, 2nd pass: %ld",
1034 (long) num_entries, (long) search->second_pass_trys);
1035 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1036 sprintf(buffer, "Number of extensions: 1st pass: %ld, 2nd pass: %ld",
1037 (long) search->first_pass_extends, (long) search->second_pass_extends);
1038 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1039 sprintf(buffer, "Number of successful extensions: 1st pass: %ld, 2nd pass: %ld",
1040 (long) search->first_pass_good_extends, (long) search->second_pass_good_extends);
1041 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1042 }
1043
1044 if (pbp->cutoff_e > 0.1)
1045 {
1046 sprintf(buffer, "Number of sequences better than %4.1f: %ld",
1047 pbp->cutoff_e, (long) search->number_of_seqs_better_E);
1048 }
1049 else
1050 {
1051 sprintf(buffer, "Number of sequences better than %3.1e: %ld",
1052 pbp->cutoff_e, (long) search->number_of_seqs_better_E);
1053 }
1054 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1055
1056 if (pbp->gapped_calculation)
1057 {
1058 sprintf(buffer, "Number of HSP's better than %4.1f without gapping: %ld",
1059 pbp->cutoff_e, (long) search->prelim_gap_no_contest);
1060 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1061 sprintf(buffer, "Number of HSP's successfully gapped in prelim test: %ld",
1062 (long) search->prelim_gap_passed);
1063 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1064 sprintf(buffer, "Number of HSP's that attempted gapping in prelim test: %ld", (long) search->prelim_gap_attempts);
1065 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1066 sprintf(buffer, "Number of HSP's gapped (non-prelim): %ld", (long) search->real_gap_number_of_hsps);
1067 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1068 }
1069 /* The following makes sense only when there is only one query */
1070 if (search->last_context <= 1) {
1071 Int8 qlen;
1072 if (search->pbp->is_rps_blast) {
1073 qlen = (Int8) search->rps_qlen;
1074 } else {
1075 qlen = (Int8)
1076 search->context[search->first_context].query->length;
1077 }
1078 sprintf(buffer, "length of query: %s", Nlm_Int8tostr(qlen, 1));
1079 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1080 }
1081 sprintf(buffer, "length of database: %s", Nlm_Int8tostr ((Int8) search->dblen, 1));
1082 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1083
1084 sprintf(buffer, "effective HSP length: %ld", (long) search->length_adjustment);
1085 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1086
1087 if (search->last_context <= 1) {
1088 Int8 eff_qlen;
1089 if (search->pbp->is_rps_blast) {
1090 eff_qlen = (Int8) search->rps_qlen - search->length_adjustment;
1091 } else {
1092 eff_qlen = (Int8)
1093 search->context[search->first_context].query->effective_length;
1094 }
1095
1096 sprintf(buffer, "effective length of query: %s",
1097 Nlm_Int8tostr(eff_qlen, 1));
1098 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1099 }
1100 sprintf(buffer, "effective length of database: %s", Nlm_Int8tostr ((Int8) search->dblen_eff, 1));
1101 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1102 if (search->last_context <= 1) {
1103 Int8 eff_qlen;
1104 if (search->pbp->is_rps_blast) {
1105 eff_qlen = (Int8) search->rps_qlen - search->length_adjustment;
1106 } else {
1107 eff_qlen = (Int8)
1108 search->context[search->first_context].query->effective_length;
1109 }
1110 sprintf(buffer, "effective search space: %8.0f", ((Nlm_FloatHi)
1111 search->dblen_eff)*((Nlm_FloatHi) eff_qlen));
1112 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1113 }
1114 sprintf(buffer, "effective search space used: %8.0f", (Nlm_FloatHi) search->searchsp_eff);
1115 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1116
1117 if (StringCmp(search->prog_name, "blastx") == 0
1118 || StringCmp(search->prog_name, "tblastn") == 0
1119 || StringCmp(search->prog_name, "tblastx") == 0
1120 || StringCmp(search->prog_name, "psitblastn") == 0 )
1121 {
1122 sprintf(buffer, "frameshift window, decay const: %ld, %4.1f",
1123 (long) pbp->gap_size, pbp->gap_decay_rate);
1124 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1125 }
1126
1127 if (!search->pbp->mb_params) {
1128 sprintf(buffer, "T: %ld", (long) search->pbp->threshold_second);
1129 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1130 sprintf(buffer, "A: %ld", (long) search->pbp->window_size);
1131 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1132 sprintf(buffer, "X1: %ld (%4.1f bits)", (long) (-search->pbp->dropoff_2nd_pass), ((-search->pbp->dropoff_2nd_pass)*(search->sbp->kbp[search->first_context]->Lambda/NCBIMATH_LN2)));
1133 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1134 if (StringCmp(search->prog_name, "blastn") == 0 || search->pbp->gapped_calculation == FALSE) {
1135 sprintf(buffer, "X2: %ld (%4.1f bits)", (long) search->pbp->gap_x_dropoff, ((search->pbp->gap_x_dropoff)*(search->sbp->kbp[search->first_context]->Lambda/NCBIMATH_LN2)));
1136 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1137 } else {
1138 sprintf(buffer, "X2: %ld (%4.1f bits)", (long) search->pbp->gap_x_dropoff, ((search->pbp->gap_x_dropoff)*(search->sbp->kbp_gap[search->first_context]->Lambda/NCBIMATH_LN2)));
1139 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1140 sprintf(buffer, "X3: %ld (%4.1f bits)", (long) search->pbp->gap_x_dropoff_final, ((search->pbp->gap_x_dropoff_final)*(search->sbp->kbp_gap[search->first_context]->Lambda/NCBIMATH_LN2)));
1141 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1142 }
1143 sprintf(buffer, "S1: %ld (%4.1f bits)", (long) search->pbp->gap_trigger, ((((search->pbp->gap_trigger)*(search->sbp->kbp[search->first_context]->Lambda))-(search->sbp->kbp[search->first_context]->logK))/NCBIMATH_LN2));
1144 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1145 cutoff = 0;
1146 }
1147 if (search->last_context <= 1) {
1148 evalue = pbp->cutoff_e;
1149 if (search->pbp->gapped_calculation == FALSE) {
1150 /* AM: Changed to support query concatenation. */
1151 if( !search->mult_queries ) {
1152 Nlm_FloatHi search_sp =
1153 search->context[search->first_context]
1154 .query->effective_length *
1155 (Nlm_FloatHi) search->dblen_eff;
1156 BlastCutoffs(&cutoff, &evalue,
1157 search->sbp->kbp[search->first_context],
1158 search_sp, FALSE, 0.0 );
1159 } else {
1160 Nlm_FloatHi search_sp =
1161 search->mult_queries->MinLenEff *
1162 (Nlm_FloatHi) search->dblen_eff;
1163 BlastCutoffs(&cutoff, &evalue,
1164 search->sbp->kbp[search->first_context],
1165 search_sp, FALSE, 0.0 );
1166 }
1167
1168 sprintf(buffer, "S2: %ld (%4.1f bits)", (long) cutoff, (((cutoff)*(search->sbp->kbp[search->first_context]->Lambda))-(search->sbp->kbp[search->first_context]->logK))/NCBIMATH_LN2);
1169 } else {
1170 if( !search->mult_queries ) {
1171 Nlm_FloatHi search_sp =
1172 search->context[search->first_context]
1173 .query->effective_length *
1174 (Nlm_FloatHi) search->dblen_eff;
1175 BlastCutoffs(&cutoff, &evalue,
1176 search->sbp->kbp_gap[search->first_context],
1177 search_sp, FALSE, 0.0 );
1178 } else {
1179 Nlm_FloatHi search_sp =
1180 search->mult_queries->MinLenEff *
1181 (Nlm_FloatHi)search->dblen_eff;
1182 BlastCutoffs( &cutoff, &evalue,
1183 search->sbp->kbp_gap[search->first_context],
1184 search_sp, FALSE, 0.0 );
1185 }
1186 sprintf(buffer, "S2: %ld (%4.1f bits)", (long) cutoff, (((cutoff)*(search->sbp->kbp_gap[search->first_context]->Lambda))-(search->sbp->kbp_gap[search->first_context]->logK))/NCBIMATH_LN2);
1187 }
1188 add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
1189 }
1190 return ret_buffer;
1191 }
1192
1193 /*
1194 Deallocates *BlastErrorMsgPtr produced by BlastConstructErrorMessage.
1195 */
1196
1197 BlastErrorMsgPtr BlastDestroyErrorMessage(BlastErrorMsgPtr error_msg)
1198
1199 {
1200 if (error_msg == NULL)
1201 return NULL;
1202
1203 MemFree(error_msg->msg);
1204 MemFree(error_msg);
1205
1206 return NULL;
1207 }
1208
1209 /*
1210 Prepares error message and appends ValNodePtr, containing BlastErrorMsgPtr, to
1211 end of chain. The beginning of the ValNodePtr chain is returned.
1212 */
1213
1214 ValNodePtr BlastConstructErrorMessage(CharPtr function, CharPtr message, Uint1 level, ValNodePtr PNTR vnpp)
1215
1216 {
1217 CharPtr buffer;
1218 CharPtr ptr;
1219 BlastErrorMsgPtr error_msg;
1220 int length = 10;
1221
1222 if (vnpp == NULL)
1223 return NULL;
1224
1225 if (function != NULL)
1226 length += StringLen(function);
1227
1228 if (message != NULL)
1229 length += StringLen(message);
1230
1231 ptr = buffer = (Char*) MemNew(length*sizeof(char));
1232 if (function != NULL)
1233 {
1234 sprintf(buffer, "%s: ", function);
1235 ptr = buffer + StringLen(buffer);
1236 }
1237
1238 if (message != NULL)
1239 {
1240 sprintf(ptr, "%s", message);
1241 }
1242
1243 error_msg = (BlastErrorMsgPtr) MemNew(sizeof(BlastErrorMsg));
1244 error_msg->msg = buffer;
1245 error_msg->level = level;
1246
1247 ValNodeAddPointer(vnpp, 0, error_msg);
1248
1249 return *vnpp;
1250 }
1251
1252 /*
1253 Destroys a chain of ValNodes and the BlastErrorMsgPtr data.
1254 */
1255
1256 ValNodePtr BlastErrorChainDestroy(ValNodePtr vnp)
1257
1258 {
1259 ValNodePtr start = vnp;
1260
1261 while (vnp)
1262 {
1263 BlastDestroyErrorMessage(vnp->data.ptrvalue);
1264 vnp->data.ptrvalue = NULL;
1265 vnp = vnp->next;
1266 }
1267
1268 ValNodeFree(start);
1269
1270 return NULL;
1271 }
1272
1273 /*
1274 Prints the error messages.
1275 */
1276
1277 void LIBCALL BlastErrorPrint(ValNodePtr error_return)
1278 {
1279 BlastErrorMsgPtr error_msg;
1280
1281 if (error_return == NULL)
1282 return;
1283
1284 while (error_return)
1285 {
1286 error_msg = error_return->data.ptrvalue;
1287 switch (error_msg->level)
1288 {
1289 case 0:
1290 ErrPostEx(SEV_INFO, 0, 0, "%s", error_msg->msg);
1291 break;
1292 case 1:
1293 ErrPostEx(SEV_WARNING, 0, 0, "%s", error_msg->msg);
1294 break;
1295 case 2:
1296 ErrPostEx(SEV_ERROR, 0, 0, "%s", error_msg->msg);
1297 break;
1298 case 3:
1299 ErrPostEx(SEV_FATAL, 1, 0, "%s", error_msg->msg);
1300 break;
1301 default:
1302 ErrPostEx(SEV_WARNING, 0, 0, "Unknown BLAST error level");
1303 break;
1304 }
1305 error_return = error_return->next;
1306 }
1307 return;
1308
1309 }
1310
1311 CharPtr LIBCALL BlastErrorToString(ValNodePtr error_return)
1312 {
1313 BlastErrorMsgPtr error_msg;
1314 ValNodePtr tmp = NULL;
1315 Uint4 length = 0;
1316 CharPtr retval = NULL;
1317
1318 if ( !error_return ) {
1319 return NULL;
1320 }
1321
1322 /* First, determine the length of the error messages */
1323 for (tmp = error_return; tmp; tmp = tmp->next) {
1324 error_msg = (BlastErrorMsgPtr) tmp->data.ptrvalue;
1325 length += StringLen(error_msg->msg);
1326 switch (error_msg->level) {
1327 case 0: length += 4; break; /* INFO */
1328 case 1: length += 7; break; /* WARNING */
1329 case 2: length += 5; break; /* ERROR */
1330 case 3: length += 5; break; /* FATAL */
1331 default: break;
1332 }
1333 length += 2; /* for space and ':' */
1334 length += 2; /* for newline/NULL byte */
1335 }
1336
1337 retval = (CharPtr) Malloc(sizeof(Char)*length);
1338 if ( !retval ) {
1339 return NULL;
1340 }
1341 *retval = NULLB;
1342
1343 for (tmp = error_return; tmp; tmp = tmp->next) {
1344 Char buffer[BUFFER_LENGTH] = { NULLB };
1345 error_msg = (BlastErrorMsgPtr) tmp->data.ptrvalue;
1346
1347 switch (error_msg->level)
1348 {
1349 case 0:
1350 sprintf(buffer, "%s: %s\n", "INFO", error_msg->msg);
1351 break;
1352 case 1:
1353 sprintf(buffer, "%s: %s\n", "WARNING", error_msg->msg);
1354 break;
1355 case 2:
1356 sprintf(buffer, "%s: %s\n", "ERROR", error_msg->msg);
1357 break;
1358 case 3:
1359 sprintf(buffer, "%s: %s\n", "FATAL", error_msg->msg);
1360 break;
1361 default:
1362 ErrPostEx(SEV_WARNING, 0, 0, "Unknown BLAST error level");
1363 break;
1364 }
1365 StringNCat(retval, buffer, length);
1366 }
1367 return retval;
1368 }
1369
1370 void LIBCALL BlastErrorPrintExtra(ValNodePtr error_return, Boolean errpostex, FILE* fp)
1371 {
1372 BlastErrorMsgPtr error_msg;
1373 ErrSev err_sev;
1374 CharPtr msg;
1375
1376 CharPtr errsevmsg[] = { "UNKNOWN","INFO","WARNING","ERROR",
1377 "FATAL"};
1378
1379 if (error_return == NULL)
1380 return;
1381
1382 while (error_return) {
1383 error_msg = error_return->data.ptrvalue;
1384 msg = error_msg->msg;
1385
1386 if(error_msg->level > 5)
1387 error_msg->level = 0;
1388
1389 if(error_msg->level == 4)
1390 err_sev = SEV_FATAL;
1391 else
1392 err_sev = error_msg->level;
1393
1394 if (errpostex)
1395 ErrPostEx(err_sev, 0, 0, "%s", msg);
1396
1397 if (fp)
1398 fprintf(fp, "%s: %s\n", errsevmsg[error_msg->level], msg);
1399
1400 error_return = error_return->next;
1401 }
1402 return;
1403 }
1404
1405 CharPtr scan_to_break (CharPtr ptr)
1406 {
1407
1408 while (*ptr != NULLB)
1409 {
1410 if (*ptr == ';')
1411 {
1412 ptr++;
1413 break;
1414 }
1415 ptr++;
1416 }
1417
1418 return ptr;
1419 }
1420
1421 Boolean LIBCALL
1422 BlastPrintFilterWarning (CharPtr filter_string, Int4 line_length, FILE *outfp, Boolean html)
1423
1424 {
1425 CharPtr ptr;
1426
1427 ptr = filter_string;
1428
1429 if (filter_string == NULL || outfp == NULL)
1430 return FALSE;
1431
1432 while (*ptr != NULLB)
1433 {
1434 if (*ptr == 'S')
1435 {
1436 ptr = scan_to_break(ptr);
1437 }
1438 else if (*ptr == 'C')
1439 {
1440 ptr = scan_to_break(ptr);
1441 }
1442 else if (*ptr == 'D')
1443 {
1444 ptr = scan_to_break(ptr);
1445 }
1446 else if (*ptr == 'R')
1447 {
1448 ptr = scan_to_break(ptr);
1449 if (html)
1450 fprintf(outfp, "<B>NOTE:</B>");
1451 else
1452 fprintf(outfp, "NOTE:");
1453 fprintf(outfp, " This query has been filtered for human repeats.\n");
1454 fprintf(outfp, "This filtering is effective for 70-90%% of all repeats.\n\n");
1455 }
1456 else if (*ptr == 'L')
1457 { /* do low-complexity filtering; dust for blastn, otherwise seg.*/
1458 ptr = scan_to_break(ptr);
1459 }
1460 else
1461 {
1462 ptr++;
1463 }
1464 }
1465
1466 return TRUE;
1467
1468 }
1469
1470 #define BLAST_SMALLEST_EVALUE 1.0e-300
1471 /*
1472 Initialize the options structure.
1473
1474 The fields should be set to default values, that depend on the program.
1475 */
1476 BLAST_OptionsBlkPtr LIBCALL
1477 BLASTOptionNew(CharPtr progname, Boolean gapped)
1478
1479 {
1480 return BLASTOptionNewEx(progname, gapped, FALSE);
1481 }
1482
1483 BLAST_OptionsBlkPtr LIBCALL
1484 BLASTOptionNewEx(CharPtr progname, Boolean gapped, Boolean is_megablast)
1485
1486 {
1487 BLAST_OptionsBlkPtr options;
1488
1489 options = (BLAST_OptionsBlkPtr) MemNew(sizeof(BLAST_OptionsBlk));
1490
1491 options->perform_culling = FALSE; /* Results should not be culled at all right now. */
1492
1493 options->program_name = StringSave(progname);
1494 options->required_start = 0;
1495 options->required_end = -1; /* -1 indicates the end of the query. */
1496 options->db_length = 0; /* zero means that real size will be used. */
1497 options->searchsp_eff = 0; /* zero means that real size will be used. */
1498
1499 options->hsp_range_max = 100;
1500 options->entrez_query = NULL;
1501 options->gifile = NULL;
1502 options->gilist = NULL;
1503 options->sort_gi_list = TRUE;
1504 options->gilist_already_calculated = FALSE;
1505
1506 if (gapped)
1507 {
1508 options->gapped_calculation = TRUE;
1509 options->do_sum_stats = FALSE;
1510 }
1511 else
1512 {
1513 options->gapped_calculation = FALSE;
1514 options->do_sum_stats = TRUE;
1515 options->hsp_num_max = 400;
1516 }
1517
1518 options->discontinuous = FALSE; /* discontinuous is default. */
1519 if (StringICmp(progname, "blastn") == 0)
1520 {
1521 options->gap_decay_rate = 0.5;
1522 options->gap_prob = 0.5;
1523 options->gap_size = 40;
1524 options->threshold_second = WORD_THRESHOLD_BLASTN;
1525 options->expect_value = 10;
1526 options->hitlist_size = 500;
1527 options->two_pass_method = FALSE;
1528 options->multiple_hits_only = FALSE;
1529 options->number_of_bits = 0.0;
1530 /* 1st pass not done for blastn. */
1531 options->dropoff_2nd_pass = UNGAPPED_X_DROPOFF_NUCL;
1532 options->matrix = NULL;
1533 options->old_stats = FALSE;
1534 options->penalty = PENALTY;
1535 options->reward = REWARD;
1536 options->e2 = 0.05;
1537 /* Used in the post-process gapping of the blastn result. */
1538 if (!is_megablast) {
1539 options->wordsize = WORDSIZE_NUCL;
1540 options->gap_open = GAP_OPEN_NUCL;
1541 options->gap_extend = GAP_EXTN_NUCL;
1542 options->block_width = 20;
1543 options->window_size = WINDOW_SIZE_NUCL;
1544 options->cutoff_s = 0;
1545 options->cutoff_s2 = 0;
1546 options->gap_x_dropoff = GAP_X_DROPOFF_NUCL;
1547 } else {
1548 options->is_megablast_search = TRUE;
1549 options->wordsize = WORDSIZE_MEGABLAST;
1550 options->gap_open = GAP_OPEN_MEGABLAST;
1551 options->gap_extend = GAP_EXTN_MEGABLAST;
1552 options->cutoff_s2 = 28;
1553 options->window_size = WINDOW_SIZE_MEGABLAST;
1554 options->gap_x_dropoff = GAP_X_DROPOFF_MEGABLAST;
1555 options->dropoff_2nd_pass = UNGAPPED_X_DROPOFF_MEGABLAST;
1556 }
1557 options->decline_align = INT2_MAX;
1558 options->gap_x_dropoff_final = GAP_X_DROPOFF_FINAL_NUCL;
1559 options->gap_trigger = 25.0;
1560 options->strand_option = BLAST_BOTH_STRAND;
1561 if (gapped)
1562 options->do_sum_stats = FALSE;
1563 else
1564 options->do_sum_stats = TRUE;
1565 }
1566 else
1567 {
1568 options->gap_size = 40;
1569 options->window_size = WINDOW_SIZE_PROT;
1570 options->expect_value = 10;
1571 options->hitlist_size = 500;
1572 options->two_pass_method = FALSE;
1573 options->multiple_hits_only = TRUE;
1574 options->number_of_bits = 0.0;
1575 options->dropoff_1st_pass = 7;
1576 options->dropoff_2nd_pass = UNGAPPED_X_DROPOFF_PROT;
1577 options->matrix = StringSave("BLOSUM62");
1578 options->old_stats = FALSE;
1579 options->wordsize = WORDSIZE_PROT;
1580 options->penalty = 0;
1581 options->reward = 0;
1582 options->gap_decay_rate = 0.5;
1583 options->gap_prob = 0.5;
1584
1585 options->gap_open = GAP_OPEN_PROT;
1586 options->gap_extend = GAP_EXTN_PROT;
1587 options->decline_align = INT2_MAX;
1588 options->gap_x_dropoff = GAP_X_DROPOFF_PROT;
1589 options->gap_x_dropoff_final = GAP_X_DROPOFF_FINAL_PROT;
1590 options->gap_trigger = 22.0;
1591
1592 if (StringICmp(progname, "blastp") == 0)
1593 {
1594 options->e2 = BLAST_SMALLEST_EVALUE;
1595 options->threshold_second = WORD_THRESHOLD_BLASTP;
1596 }
1597 else if (StringICmp(progname, "blastx") == 0)
1598 {
1599 options->e2 = 1.0;
1600 options->genetic_code = 1;
1601 options->threshold_second = WORD_THRESHOLD_BLASTX;
1602 options->do_sum_stats = TRUE;
1603 options->strand_option = BLAST_BOTH_STRAND;
1604 }
1605
1606 else if ( StringICmp(progname, "tblastn") == 0
1607 || StringICmp(progname, "psitblastn") == 0 )
1608 {
1609 options->e2 = 1.0;
1610 options->db_genetic_code = 1;
1611 options->threshold_second = WORD_THRESHOLD_TBLASTN;
1612 options->do_sum_stats = TRUE;
1613 }
1614 else if (StringICmp(progname, "tblastx") == 0)
1615 {
1616 options->e2 = BLAST_SMALLEST_EVALUE;
1617 options->genetic_code = 1;
1618 options->db_genetic_code = 1;
1619 options->threshold_second = WORD_THRESHOLD_TBLASTX;
1620 options->gap_open = 0;
1621 options->gap_extend = 0;
1622 options->gap_x_dropoff = 0;
1623 options->gap_x_dropoff_final = 0;
1624 options->gapped_calculation = gapped = FALSE;
1625 options->do_sum_stats = TRUE;
1626 options->strand_option = BLAST_BOTH_STRAND;
1627 options->hsp_num_max = 100;
1628 }
1629 if (gapped)
1630 {
1631 options->two_pass_method = FALSE;
1632 options->multiple_hits_only = TRUE;
1633 options->dropoff_2nd_pass = options->dropoff_1st_pass;
1634 options->gap_decay_rate = 0.1;
1635 options->gap_prob = 1.0;
1636 }
1637 }
1638
1639 return options;
1640 }
1641
1642 /*
1643 Delete the Options structure.
1644 */
1645 BLAST_OptionsBlkPtr LIBCALL
1646 BLASTOptionDelete(BLAST_OptionsBlkPtr options)
1647 {
1648 if (options == NULL)
1649 return NULL;
1650
1651 if (options->matrix != NULL)
1652 MemFree(options->matrix);
1653
1654 if (options->program_name != NULL)
1655 MemFree(options->program_name);
1656
1657 MemFree(options->filter_string);
1658
1659 MemFree(options->gifile);
1660
1661 options = MemFree(options);
1662 return options;
1663 }
1664
1665
1666 /*
1667 Validate the Options structure. If an invalid option is found,
1668 call BLASTOptionDelete and issue an error message.
1669 */
1670 BLAST_OptionsBlkPtr LIBCALL
1671 BLASTOptionValidate(BLAST_OptionsBlkPtr options, CharPtr progname)
1672
1673 {
1674 Int2 status;
1675 ValNodePtr error_return=NULL;
1676
1677 status = BLASTOptionValidateEx(options, progname, &error_return);
1678
1679 if (status != 0)
1680 options = BLASTOptionDelete(options);
1681
1682 BlastErrorPrint(error_return);
1683 BlastErrorChainDestroy(error_return);
1684
1685 return options;
1686 }
1687
1688 /*
1689 Validate the Options structure. If an invalid option is found,
1690 call BLASTOptionDelete and issue an error message.
1691 */
1692 Int2 LIBCALL
1693 BLASTOptionValidateEx (BLAST_OptionsBlkPtr options, CharPtr progname, ValNodePtr PNTR error_return)
1694
1695 {
1696 Int2 status=0;
1697
1698 if (options->hitlist_size < 1)
1699 {
1700 BlastConstructErrorMessage("BLASTOptionValidateEx", "No hits are being saved", 1, error_return);
1701 return 1;
1702 }
1703
1704 if (options->expect_value <= 0.0 && options->cutoff_s == 0)
1705 {
1706 BlastConstructErrorMessage("BLASTOptionValidateEx", "expect value must be greater than zero", 1, error_return);
1707 return 1;
1708 }
1709
1710 if (options->wordsize <= 0)
1711 {
1712 BlastConstructErrorMessage("BLASTOptionValidateEx", "wordsize must be non-zero", 1, error_return);
1713 return 1;
1714 }
1715
1716 if (StringICmp(progname, "blastn") == 0)
1717 {
1718 if (options->penalty >= 0)
1719 {
1720 BlastConstructErrorMessage("BLASTOptionValidateEx", "BLASTN penalty must be negative", 1, error_return);
1721 return 1;
1722 }
1723
1724 if (options->is_megablast_search && options->wordsize < 8) {
1725 BlastConstructErrorMessage("BLASTOptionValidateEx",
1726 "Wordsize must be 8 or greater", 1, error_return);
1727 return 1;
1728 } else if (options->wordsize < 7) {
1729 BlastConstructErrorMessage("BLASTOptionValidateEx", "Wordsize must be 7 or greater", 1, error_return);
1730 return 1;
1731 }
1732
1733 if (options->is_megablast_search && options->gap_open > 0
1734 && options->gap_extend == 0)
1735 {
1736 BlastConstructErrorMessage("BLASTOptionValidateEx",
1737 "Gap extension penalty must be non-zero if gap open"
1738 " penalty specified", 1, error_return);
1739 return 1;
1740 }
1741
1742 if (options->threshold_second != 0)
1743 {
1744 BlastConstructErrorMessage("BLASTOptionValidateEx", "non-zero threshold not permitted with blastn", 1, error_return);
1745 return 1;
1746 }
1747
1748 if (options->two_pass_method == TRUE || options->number_of_bits != 0.0)
1749 {
1750 BlastConstructErrorMessage("BLASTOptionValidateEx", "two-passes not available for blastn", 1, error_return);
1751 return 1;
1752 }
1753
1754 if ((options->strand_option | BLAST_BOTH_STRAND) == 0)
1755 {
1756 BlastConstructErrorMessage("BLASTOptionValidateEx", "invalid strand specified", 1, error_return);
1757 return 1;
1758 }
1759
1760 if (options->gapped_calculation == TRUE)
1761 {
1762 BLAST_KarlinBlk* kbp = BlastKarlinBlkCreate();
1763 BLAST_KarlinBlk* kbp_ungap = BlastKarlinBlkCreate();
1764 Boolean round_down;
1765 /* We call this function to find out if these are supported parameters. */
1766 status = BlastKarlinBlkNuclGappedCalc(kbp, options->gap_open, options->gap_extend, options->reward,
1767 options->penalty, kbp_ungap, &round_down, error_return);
1768 kbp = BlastKarlinBlkDestruct(kbp);
1769 kbp_ungap = BlastKarlinBlkDestruct(kbp_ungap);
1770 if (!status)
1771 return status;
1772 }
1773
1774 /*
1775 if (options->multiple_hits_only == TRUE)
1776 {
1777 BlastConstructErrorMessage("BLASTOptionValidateEx", "multiple hits not available for blastn", 1, error_return);
1778 return 1;
1779 }
1780 */
1781 }
1782 else
1783 {
1784 if (StringICmp(progname, "blastx") == 0 || StringICmp(progname, "tblastx") == 0)
1785 {
1786 if ((options->strand_option | BLAST_BOTH_STRAND) == 0)
1787 {
1788 BlastConstructErrorMessage("BLASTOptionValidateEx", "invalid strand specified", 1, error_return);
1789 return 1;
1790 }
1791 }
1792 if (options->wordsize < 2 || options->wordsize > 3)
1793 {
1794 BlastConstructErrorMessage("BLASTOptionValidateEx", "Valid wordsize range is 2 to 3", 1, error_return);
1795 return 1;
1796 }
1797 if (options->threshold_second == 0)
1798 {
1799 BlastConstructErrorMessage("BLASTOptionValidateEx", "non-zero threshold required", 1, error_return );
1800 return 1;
1801 }
1802 if (options->penalty != 0 || options->reward != 0)
1803 {
1804 BlastConstructErrorMessage("BLASTOptionValidateEx", "penalty or reward can only be non-zero for blastn", 1, error_return);
1805 return 1;
1806 }
1807
1808 if (StringICmp(progname, "tblastx") == 0)
1809 {
1810 if (options->gapped_calculation == TRUE)
1811 {
1812 BlastConstructErrorMessage("BLASTOptionValidateEx", "gapped calculations not available with tblastx", 1, error_return);
1813 return 1;
1814 }
1815 }
1816
1817 if (options->gapped_calculation == TRUE)
1818 {
1819 status = BlastKarlinBlkGappedCalcEx(NULL, options->gap_open, options->gap_extend, options->decline_align, options->matrix, error_return);
1820 }
1821 }
1822
1823 if (options->isPatternSearch && (!options->gapped_calculation))
1824 {
1825 BlastConstructErrorMessage("BLASTOptionValidateEx", "PHI-BLAST cannot use ungapped alignments", 1, error_return);
1826 return 1;
1827 }
1828
1829 if (options->tweak_parameters && (!options->gapped_calculation))
1830 {
1831 BlastConstructErrorMessage("BLASTOptionValidateEx", "parameter adjustment not supported with ungapped alignments", 1, error_return);
1832 return 1;
1833 }
1834
1835 if (options->smith_waterman && (!options->gapped_calculation))
1836 {
1837 BlastConstructErrorMessage("BLASTOptionValidateEx", "locally optimal alignments not supported with ungapped alignments", 1, error_return);
1838 return 1;
1839 }
1840 if ((StringICmp(progname, "psitblastn") != 0) && (options->CheckpointFileName !=NULL))
1841 {
1842 BlastConstructErrorMessage("BLASTOptionValidateEx",
1843 "-R option is available only for psitblastn", 1, error_return);
1844 return 1;
1845 }
1846
1847 /* Check the validity of the megablast specific options */
1848 if (options->is_megablast_search) {
1849 if (options->mb_template_length > 0) {
1850 if (options->mb_template_length != 16 &&
1851 options->mb_template_length != 18 &&
1852 options->mb_template_length != 21) {
1853 BlastConstructErrorMessage("BLASTOptionValidateEx",
1854 "Only discontiguous template lengths 16, 18 and 21 are supported",
1855 1, error_return);
1856 return 1;
1857 }
1858
1859 if (options->wordsize < 11 || options->wordsize > 12) {
1860 BlastConstructErrorMessage("BLASTOptionValidateEx",
1861 "Only discontiguous word sizes 11 and 12 are supported",
1862 1, error_return);
1863 return 1;
1864 }
1865
1866 if (options->mb_disc_type > MB_TWO_TEMPLATES) {
1867 BlastConstructErrorMessage("BLASTOptionValidateEx",
1868 "Allowed discontiguous word types are: 0 - coding, 1 - maximal, 2 - two templates",
1869 1, error_return);
1870 return 1;
1871 }
1872 }
1873 }
1874
1875 /* --KM check -B option: query concatenation only with certain
1876 BLAST flavors */
1877 if (options->NumQueries>0 &&
1878 !( (StringICmp("blastn", progname) == 0) ||
1879 (StringICmp("tblastn", progname) == 0) ) ) {
1880
1881 BlastConstructErrorMessage("BLASTOptionValidateEx",
1882 "Query concatenation can only be used with blastn and tblastn",
1883 1, error_return);
1884 return 1;
1885 }
1886
1887
1888
1889
1890 return status;
1891 }
1892
1893 /*
1894 Changes the matrix value to the one given and sets the
1895 default parameters for that Matrix.
1896 */
1897
1898 Int2 LIBCALL
1899 BLASTOptionSetGapParams (BLAST_OptionsBlkPtr options,
1900 CharPtr matrix_name, Int4 open, Int4 extended)
1901
1902 {
1903 Boolean found_matrix=FALSE, threshold_set=FALSE;
1904 Int4 threshold, window_size;
1905
1906 if (options == NULL || matrix_name == NULL)
1907 return -1;
1908
1909 /* Save these in case we need to restore (for blastn). */
1910 threshold = options->threshold_second;
1911 window_size = options->window_size;
1912
1913 /* We check for protein matrices also for blastn in case a nucl.-nucl. matrix really
1914 has been specified. */
1915 if (StringICmp(matrix_name, "BLOSUM62") == 0) {
1916 options->gap_open = 11;
1917 options->gap_extend = 1;
1918 options->window_size = 40;
1919 options->threshold_second = 11;
1920 found_matrix = TRUE;
1921 threshold_set = TRUE;
1922 } else if (StringICmp(matrix_name, "BLOSUM45") == 0) {
1923 options->gap_open = 14;
1924 options->gap_extend = 2;
1925 options->window_size = 60;
1926 options->threshold_second = 14;
1927 found_matrix = TRUE;
1928 threshold_set = TRUE;
1929 } else if (StringICmp(matrix_name, "BLOSUM50") == 0) {
1930 options->gap_open = 13;
1931 options->gap_extend = 2;
1932 found_matrix = TRUE;
1933 } else if (StringICmp(matrix_name, "PAM250") == 0) {
1934 options->gap_open = 14;
1935 options->gap_extend = 2;
1936 found_matrix = TRUE;
1937 } else if (StringICmp(matrix_name, "BLOSUM62_20") == 0) {
1938 options->gap_open = 100;
1939 options->gap_extend = 10;
1940 options->threshold_second = 100;
1941 found_matrix = TRUE;
1942 } else if (StringICmp(matrix_name, "BLOSUM90") == 0) {
1943 options->gap_open = 10;
1944 options->gap_extend = 1;
1945 found_matrix = TRUE;
1946 } else if (StringICmp(matrix_name, "BLOSUM80") == 0) {
1947 options->gap_open = 10;
1948 options->gap_extend = 1;
1949 options->window_size = 25;
1950 options->threshold_second = 12;
1951 found_matrix = TRUE;
1952 threshold_set = TRUE;
1953 } else if (StringICmp(matrix_name, "PAM30") == 0) {
1954 options->gap_open = 9;
1955 options->gap_extend = 1;
1956 options->window_size = 15;
1957 options->threshold_second = 16;
1958 found_matrix = TRUE;
1959 threshold_set = TRUE;
1960 } else if (StringICmp(matrix_name, "PAM70") == 0) {
1961 options->gap_open = 10;
1962 options->gap_extend = 1;
1963 options->window_size = 20;
1964 options->threshold_second = 14;
1965 found_matrix = TRUE;
1966 threshold_set = TRUE;
1967 }
1968
1969 if (open)
1970 options->gap_open = open;
1971 if (extended)
1972 options->gap_extend = extended;
1973
1974
1975 /* blastn and megablast are different. */
1976 if (StringICmp("blastn", options->program_name) == 0) {
1977 options->threshold_second = threshold;
1978 options->window_size = window_size;
1979 if (!options->is_megablast_search) {
1980 options->gap_open = 5;
1981 options->gap_extend = 2;
1982 if (found_matrix == FALSE) {
1983 /* probably not a protein-protein matrix. */
1984 options->matrix = StringSave(matrix_name);
1985 }
1986 } else {
1987 options->gap_open = GAP_OPEN_MEGABLAST;
1988 options->gap_extend = GAP_EXTN_MEGABLAST;
1989 }
1990 return 0;
1991 }
1992
1993 if (matrix_name)
1994 {
1995 if (options->matrix)
1996 MemFree(options->matrix);
1997 options->matrix = StringSave(matrix_name);
1998 }
1999
2000 if (!found_matrix)
2001 return -1;
2002
2003 if (threshold_set) {
2004 if (StringICmp(options->program_name, "blastx") == 0) {
2005 options->threshold_second++;
2006
2007 } else if (StringICmp(options->program_name, "tblastn") == 0
2008 || StringICmp(options->program_name, "tblastx") == 0
2009 || StringICmp(options->program_name, "psitblastn") == 0) {
2010 options->threshold_second += 2;
2011 }
2012 }
2013 return 0;
2014 }
2015
2016 /*
2017 This function obtains the sequence from a BioseqPtr in ASCII alphabet.
2018 The return value is a Uint1Ptr containing the sequence, the Int4Ptr
2019 length inidcates the length of the seqeunce.
2020 */
2021
2022 Uint1Ptr
2023 BlastGetSequenceFromBioseq (BioseqPtr bsp, Int4Ptr length)
2024
2025 {
2026 Int4 index;
2027 SeqPortPtr spp;
2028 Uint1 residue;
2029 Uint1Ptr sequence;
2030
2031 *length = 0;
2032 if (bsp == NULL)
2033 return NULL;
2034
2035 sequence = MemNew((1+bsp->length)*sizeof(Uint1));
2036 if (sequence == NULL)
2037 return NULL;
2038
2039 if (ISA_na(bsp->mol))
2040 spp = SeqPortNew(bsp, 0, -1, Seq_strand_plus, Seq_code_iupacna);
2041 else
2042 spp = SeqPortNew(bsp, 0, -1, Seq_strand_unknown, Seq_code_ncbieaa);
2043
2044 index=0;
2045 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
2046 {
2047 if (residue == SEQPORT_VIRT)
2048 continue;
2049 sequence[index] = residue;
2050 index++;
2051 }
2052 sequence[index] = NULLB;
2053 spp = SeqPortFree(spp);
2054
2055 *length = index;
2056 return sequence;
2057 }
2058
2059 /*
2060 Adjusts the length and number of sequences in a database according
2061 to the gi_list or seqIdPtr list given.
2062
2063 06/24/2002: This function returns the number of sequences and number of
2064 bases/residues based on the information on the rdfp parameter (all others
2065 are ignored). This function should be called after BlastProcessGiLists and
2066 before calculating the effective search space.
2067
2068 02/12/2004: This function is *deprecated*
2069 */
2070
2071 Boolean
2072 BlastAdjustDbNumbers (ReadDBFILEPtr rdfp, Int8Ptr db_length, Int4Ptr db_number, SeqIdPtr seqid_list, BlastDoubleInt4Ptr gi_list, OIDListPtr oidlist, BlastDoubleInt4Ptr PNTR gi_list_pointers, Int4 gi_list_total)
2073
2074 {
2075 return readdb_get_totals_ex2(rdfp, db_length, db_number, FALSE, TRUE);
2076 }
2077
2078 /*
2079 Deletes only the BlastGiListPtr, not
2080 the associated arrays.
2081 */
2082 BlastGiListPtr
2083 BlastGiListDestruct(BlastGiListPtr blast_gi_list, Boolean contents)
2084 {
2085 if (blast_gi_list == NULL)
2086 return NULL;
2087
2088 if (contents)
2089 { /* On the main thread. Deallocate contents. */
2090 if (blast_gi_list->gilist_not_owned == FALSE)
2091 MemFree(blast_gi_list->gi_list);
2092 MemFree(blast_gi_list->gi_list_pointer);
2093 }
2094
2095 return MemFree(blast_gi_list);
2096 }
2097
2098 /*
2099 Allocates BlastGiListPtr. The caller still owns
2100 the gi_list and must delete it.
2101 */
2102
2103 BlastGiListPtr BlastGiListNew(BlastDoubleInt4Ptr gi_list, Int4 tot)
2104 {
2105 BlastGiListPtr blast_gi_list = NULL;
2106 register Int4 i;
2107
2108 if (gi_list == NULL || tot == 0)
2109 return NULL;
2110
2111 if ((blast_gi_list = MemNew(sizeof(BlastGiList))) == NULL) {
2112 ErrPostEx(SEV_ERROR,0,0,"BlastGiListNew: Out of memory\n");
2113 return NULL;
2114 }
2115
2116 blast_gi_list->gi_list = MemNew(sizeof(BlastDoubleInt4)*tot);
2117 blast_gi_list->gi_list_pointer = MemNew(sizeof(BlastDoubleInt4Ptr)*tot);
2118 if ((blast_gi_list->gi_list == NULL) ||
2119 (blast_gi_list->gi_list_pointer == NULL)) {
2120 ErrPostEx(SEV_ERROR,0,0,"BlastGiListNew: Out of memory\n");
2121 return NULL;
2122 }
2123
2124 for (i = 0; i < tot; i++) {
2125 MemCpy(&(blast_gi_list->gi_list[i]),&gi_list[i],
2126 sizeof(BlastDoubleInt4));
2127 blast_gi_list->gi_list_pointer[i] = &(blast_gi_list->gi_list[i]);
2128 }
2129 blast_gi_list->total = tot;
2130 blast_gi_list->current = 0;
2131 blast_gi_list->gilist_not_owned = FALSE; /* yes, we own this */
2132
2133 return blast_gi_list;
2134 }
2135
2136 /* This is a simple callback comparison function for sorting */
2137 static int LIBCALLBACK
2138 blast_double_int_oid_compare(VoidPtr v1, VoidPtr v2)
2139 {
2140 BlastDoubleInt4Ptr h1, h2;
2141 BlastDoubleInt4Ptr *hp1, *hp2;
2142
2143 hp1 = (BlastDoubleInt4Ptr PNTR) v1;
2144 hp2 = (BlastDoubleInt4Ptr PNTR) v2;
2145 h1 = *hp1;
2146 h2 = *hp2;
2147
2148 if (h1->ordinal_id < h2->ordinal_id)
2149 return -1;
2150 if (h1->ordinal_id > h2->ordinal_id)
2151 return 1;
2152 if (h1->start == h2->start) { /* break the tie with gis */
2153 if (h1->gi < h2->gi)
2154 return -1;
2155 if (h1->gi > h2->gi)
2156 return 1;
2157 }
2158
2159 return 0;
2160 }
2161 static int LIBCALLBACK
2162 blast_double_int_gi_compare(VoidPtr v1, VoidPtr v2)
2163 {
2164 BlastDoubleInt4Ptr h1, h2;
2165 BlastDoubleInt4Ptr *hp1, *hp2;
2166
2167 hp1 = (BlastDoubleInt4Ptr PNTR) v1;
2168 hp2 = (BlastDoubleInt4Ptr PNTR) v2;
2169 h1 = *hp1;
2170 h2 = *hp2;
2171
2172 if (h1->gi < h2->gi)
2173 return -1;
2174 if (h1->gi > h2->gi)
2175 return 1;
2176
2177 return 0;
2178 }
2179 static int LIBCALLBACK
2180 blast_double_int_start_compare(VoidPtr v1, VoidPtr v2)
2181 {
2182 BlastDoubleInt4Ptr h1, h2;
2183 BlastDoubleInt4Ptr *hp1, *hp2;
2184
2185 hp1 = (BlastDoubleInt4Ptr PNTR) v1;
2186 hp2 = (BlastDoubleInt4Ptr PNTR) v2;
2187 h1 = *hp1;
2188 h2 = *hp2;
2189
2190 if (h1->start < h2->start)
2191 return -1;
2192 if (h1->start > h2->start)
2193 return 1;
2194 if (h1->start == h2->start) { /* break the tie with oids */
2195 if (h1->ordinal_id < h2->ordinal_id)
2196 return -1;
2197 if (h1->ordinal_id > h2->ordinal_id)
2198 return 1;
2199 }
2200
2201 return 0;
2202 }
2203
2204 /* Returns the intersection of list1 and list2. Caller is reponsible to
2205 * deallocate the return value */
2206 static BlastGiListPtr IntersectBlastGiLists(
2207 BlastDoubleInt4Ptr list1, Int4 total1,
2208 BlastDoubleInt4Ptr list2, Int4 total2)
2209 {
2210 BlastGiListPtr retval = NULL;
2211 BlastDoubleInt4Ptr new_list = NULL;
2212 BlastDoubleInt4Ptr *list_ptrs1 = NULL, *list_ptrs2 = NULL;
2213 Int4 idx1, idx2, new_idx = 0, short_list_sz = 0, gi;
2214
2215 if ((!list1 && !list2) || (total1 <= 0 && total2 <= 0)) {
2216 ErrPostEx(SEV_WARNING, 0, 0, "IntersectBlastGiLists: No lists to "
2217 "combine?");
2218 return NULL;
2219 }
2220 if (!list1) total1 = 0;
2221 if (!list2) total2 = 0;
2222
2223 if (total1 > total2)
2224 short_list_sz = total2;
2225 else
2226 short_list_sz = total1;
2227
2228 new_list = (BlastDoubleInt4Ptr)
2229 MemNew(short_list_sz*sizeof(BlastDoubleInt4));
2230 if (!new_list) {
2231 ErrPostEx(SEV_WARNING, 0, 0, "IntersectBlastGiLists: Failed to "
2232 "allocate memory for gi list");
2233 return NULL;
2234 }
2235
2236 /* sort the two lists by gi to facilitate intersection */
2237 list_ptrs1 = (BlastDoubleInt4Ptr *)
2238 Malloc(sizeof(BlastDoubleInt4Ptr)*total1);
2239 list_ptrs2 = (BlastDoubleInt4Ptr *)
2240 Malloc(sizeof(BlastDoubleInt4Ptr)*total2);
2241
2242 for (idx1 = 0; idx1 < total1; idx1++)
2243 list_ptrs1[idx1] = &(list1[idx1]);
2244 for (idx2 = 0; idx2 < total2; idx2++)
2245 list_ptrs2[idx2] = &(list2[idx2]);
2246 HeapSort(list_ptrs1, total1, sizeof(BlastDoubleInt4Ptr *),
2247 blast_double_int_gi_compare);
2248 HeapSort(list_ptrs2, total2, sizeof(BlastDoubleInt4Ptr *),
2249 blast_double_int_gi_compare);
2250
2251 /* Intersect! */
2252 for (idx1 = 0, idx2 = 0; idx1 < total1; idx1++) {
2253 gi = list_ptrs1[idx1]->gi;
2254
2255 for (; idx2 < total2 && list_ptrs2[idx2]->gi < gi; idx2++);
2256
2257 if (idx2 < total2 && list_ptrs2[idx2]->gi == gi) {
2258 MemCpy((VoidPtr)&new_list[new_idx], list_ptrs1[idx1],
2259 sizeof(BlastDoubleInt4));
2260 new_idx++;
2261 }
2262 }
2263
2264 /* Save the intersection and return it */
2265 if (new_idx > 0)
2266 retval = BlastGiListNew(new_list, new_idx);
2267
2268 if (list_ptrs1)
2269 MemFree(list_ptrs1);
2270 if (list_ptrs2)
2271 MemFree(list_ptrs2);
2272 MemFree(new_list);
2273
2274 return retval;
2275 }
2276
2277 /* Return the union of list1 and list2. Size of the return value will be
2278 * total1 + total2. This function does not eliminate repeated entries.
2279 * Caller is responsible for deallocating the return value.*/
2280 static
2281 BlastGiListPtr CombineDoubleInt4Lists(
2282 BlastDoubleInt4Ptr list1, Int4 total1,
2283 BlastDoubleInt4Ptr list2, Int4 total2)
2284 {
2285 BlastGiListPtr retval = NULL;
2286 BlastDoubleInt4Ptr new_list;
2287
2288 if ((!list1 && !list2) || (total1 <= 0 && total2 <= 0))
2289 return NULL;
2290
2291 if (!list1) total1 = 0;
2292 if (!list2) total2 = 0;
2293
2294 new_list = (BlastDoubleInt4Ptr)
2295 Malloc((total1+total2)*sizeof(BlastDoubleInt4));
2296 if (!new_list) {
2297 ErrPostEx(SEV_WARNING, 0, 0, "CombineDoubleInt4Lists: Failed to "
2298 "allocate memory for gi list");
2299 return NULL;
2300 }
2301 MemCpy(new_list, list1, total1*sizeof(BlastDoubleInt4));
2302 MemCpy(&(new_list[total1]), list2, total2*sizeof(BlastDoubleInt4));
2303
2304 retval = BlastGiListNew(new_list, total1+total2);
2305 MemFree(new_list);
2306
2307 return retval;
2308 }
2309
2310 /* Intersects (or creates) oidlists and attaches them to individual rdfp's in
2311 * the rdfp_chain. If any rdfp->oidlist is populated before calling this
2312 * function, it will be freed and replaced by the intersection of the gilist
2313 * parameter and itself. */
2314 static
2315 Boolean IntersectDoubleInt4ListWithOIDLists(
2316 BlastDoubleInt4Ptr gilist, Int4 total,
2317 ReadDBFILEPtr rdfp_chain)
2318 {
2319 OIDListPtr new_oidlist = NULL, lcl_oidlist = NULL;
2320 ReadDBFILEPtr rdfp = NULL;
2321 BlastDoubleInt4Ptr *gilist_ptrs = NULL;
2322 Int4 gilist_idx = 0; /* index into gilist_ptrs */
2323 Int4 i, new_oidlistsz = 0, new_oidlist_count = 0;
2324 Int4 maxoid, lcl_mask_index, lcl_oid;
2325 Uint4 lcl_oid_bit = 0, lcl_mask = 0;
2326
2327 gilist_ptrs = (BlastDoubleInt4Ptr *)
2328 Malloc(total*sizeof(BlastDoubleInt4Ptr));
2329 for (i=0; i < total; i++)
2330 gilist_ptrs[i] = &(gilist[i]);
2331
2332 /* Sort by start field */
2333 HeapSort(gilist_ptrs, total,
2334 sizeof(BlastDoubleInt4Ptr PNTR), blast_double_int_start_compare);
2335
2336 /* merge gis from gilist and oids as you walk through rdfp_chain */
2337 for (rdfp = rdfp_chain; rdfp; rdfp = rdfp->next) {
2338
2339 lcl_oidlist = rdfp->oidlist;
2340
2341 while (gilist_idx < total &&
2342 gilist_ptrs[gilist_idx]->start == rdfp->start) {
2343
2344 if (!new_oidlist) {
2345 /* determine the maxoid for this rdfp */
2346 for (i = gilist_idx;
2347 i < total && gilist_ptrs[i]->start == rdfp->start;
2348 i++);
2349
2350 if (i == total && rdfp->start != gilist_ptrs[i-1]->start)
2351 break;
2352 maxoid = gilist_ptrs[i-1]->ordinal_id - rdfp->start;
2353
2354 new_oidlist = (OIDListPtr) MemNew(sizeof(OIDList));
2355 new_oidlist->total = maxoid + 1;
2356 new_oidlistsz = maxoid/MASK_WORD_SIZE + 2;
2357 new_oidlist->list = (Uint4Ptr) MemNew(
2358 (new_oidlistsz)*sizeof(Uint4));
2359 new_oidlist->memory = new_oidlist->list;
2360 }
2361
2362 lcl_oid = gilist_ptrs[gilist_idx]->ordinal_id - rdfp->start;
2363
2364 if (lcl_oid >= 0) {
2365
2366 lcl_mask_index = lcl_oid/MASK_WORD_SIZE;
2367 lcl_oid_bit = 0x1 << (MASK_WORD_SIZE - 1 -
2368 lcl_oid % MASK_WORD_SIZE);
2369
2370 /* In a database with volumes and oidlists we might have empty
2371 * masks (ie: month est subset in a multi-volume est real
2372 * database), so reset the mask index to avoid accessing
2373 * outside the bounds of the lcl_oidlist->list */
2374 if (lcl_oidlist) {
2375 if (lcl_oidlist->total == 0)
2376 lcl_mask_index = 0;
2377 /* also remember the mask */
2378 lcl_mask = SwapUint4(lcl_oidlist->list[lcl_mask_index]);
2379 }
2380
2381 /* if there is an oidlist already, make sure this gi is in both
2382 * the gilist and the oidlist */
2383 if (!lcl_oidlist ||
2384 (lcl_oidlist && lcl_oidlist->list &&
2385 (lcl_oid < new_oidlist->total) && (lcl_mask & lcl_oid_bit)))
2386 {
2387 new_oidlist->list[lcl_mask_index] |= lcl_oid_bit;
2388 new_oidlist_count++;
2389 }
2390 }
2391 gilist_idx++;
2392 }
2393 /* Save the newly created (or combined) oidlist */
2394 if (new_oidlist_count != 0) {
2395 for (i = 0; i < new_oidlistsz; i++)
2396 new_oidlist->list[i] = SwapUint4(new_oidlist->list[i]);
2397 rdfp->oidlist = OIDListFree(rdfp->oidlist);
2398 rdfp->oidlist = new_oidlist;
2399 new_oidlist = NULL;
2400 new_oidlist_count = 0;
2401 } else {
2402 new_oidlist = OIDListFree(new_oidlist);
2403 }
2404 }
2405
2406 MemFree(gilist_ptrs);
2407
2408 return TRUE;
2409 }
2410
2411 /* The purpose of this function is to merge (or convert) any rdfp->gifile(s)
2412 in the rdfp_chain to rdfp->oidlist(s). These oidlists would be local to
2413 each rdfp (ie: if one gi list is given for a multivolume database,
2414 multiple oidlists could be created corresponding to each of the rdfp in
2415 the rdfp_chain).
2416 If there are any ordinal id lists present in the rdfp_chain, these will be
2417 intersected with the oidlists created from the gilists.
2418 The gis that were listed in each rdfp->gilist that belong to this database
2419 are returned in the bglp. Caller is responsible for deallocating this
2420 value.
2421 */
2422 static
2423 void MergeDbGiListsWithOIDLists(ReadDBFILEPtr rdfp_chain, ValNodePtr *err_ret,
2424 BlastGiListPtr *bglpp)
2425 {
2426 ReadDBFILEPtr rdfp = NULL; /* holds an individual element of the
2427 rdfp_chain linked list */
2428 ReadDBFILEPtr rdfp_tmp = NULL; /* holds the rest of the rdfp linked list
2429 when searching for gis in a rdfp element
2430 */
2431 BlastDoubleInt4Ptr list=NULL, global_list = NULL;
2432 BlastGiListPtr tmp_list = NULL;
2433 Int4 i, index, ngis = 0, total_num_gis = 0, start;
2434 Char buf[256];
2435
2436 /* If this argument is not passed in, any gis from rdfp->gifile will
2437 * be lost for formatting purposes */
2438 if (!bglpp)
2439 return;
2440
2441 /**
2442 * Gather all gis from all rdfp->gilist(s).
2443 */
2444 for (rdfp = rdfp_chain; rdfp; rdfp = rdfp->next) {
2445 if (!rdfp->gilist)
2446 continue;
2447
2448 ngis = rdfp->gilist->count;
2449 list = (BlastDoubleInt4Ptr) Malloc(sizeof(BlastDoubleInt4)*ngis);
2450 if (!list) {
2451 ErrPostEx(SEV_FATAL, 1, 0, "Out of memory");
2452 return;
2453 }
2454
2455 /* Isolate the current rdfp element */
2456 rdfp_tmp = rdfp->next;
2457 rdfp->next = NULL;
2458
2459 /* See which gis in rdfp->gilist belong to this rdfp */
2460 for (index=0, i=0; i < ngis; i++) {
2461 list[index].ordinal_id = readdb_gi2seq(rdfp_chain,
2462 rdfp->gilist->i[i], &start);
2463 if (list[index].ordinal_id >= 0) {
2464 list[index].gi = rdfp->gilist->i[i];
2465 list[index].start = start;
2466 index++;
2467 }
2468 }
2469
2470 /* Restore the rdfp_chain linked list */
2471 rdfp->next = rdfp_tmp;
2472
2473 /* After this point we don't really need the rdfp->gilist */
2474 rdfp->gilist = Int4ListFree(rdfp->gilist);
2475
2476 /* No gis in rdfp->gilist belong to this rdfp, so restrict this entire
2477 * rdfp (mask all of its sequences) */
2478 if ( (ngis = index) == 0) {
2479 Int4 maxoid = rdfp->stop - rdfp->start;
2480
2481 rdfp->oidlist = (OIDListPtr) MemNew(sizeof(OIDList));
2482 rdfp->oidlist->total = maxoid + 1;
2483 rdfp->oidlist->list = (Uint4Ptr)
2484 MemNew(sizeof(Uint4)*(maxoid/MASK_WORD_SIZE + 2));
2485 rdfp->oidlist->memory = rdfp->oidlist->list;
2486 sprintf(buf,"%s database restricted with gis not present in the "
2487 "database. Restricting the entire database",
2488 FileNameFind(rdfp->filename));
2489 ErrPostEx(SEV_INFO, 0, 0, buf);
2490 /*BlastConstructErrorMessage("MergeDbGiListsWithOIDLists", buf,
2491 SEV_WARNING, err_ret);*/
2492 list = MemFree(list);
2493 continue;
2494 }
2495
2496 tmp_list = CombineDoubleInt4Lists(global_list, total_num_gis,
2497 list, ngis);
2498 if (tmp_list) {
2499 global_list = MemFree(global_list);
2500 global_list = tmp_list->gi_list;
2501 tmp_list->gilist_not_owned = TRUE;
2502 BlastGiListDestruct(tmp_list, TRUE);
2503 total_num_gis += ngis;
2504 }
2505 list = MemFree(list);
2506
2507 }
2508
2509 /**
2510 * If there are any lists, create (or intersect) oidlists for the
2511 * appropriate rdfp's
2512 */
2513 if (global_list && total_num_gis > 0) {
2514
2515 if (IntersectDoubleInt4ListWithOIDLists(global_list, total_num_gis,
2516 rdfp_chain) == FALSE) {
2517 ErrPostEx(SEV_WARNING, 0, 0, "MergeDbGiListsWithOIDLists: Could "
2518 "not intersect gi lists with oidlists");
2519 MemFree(global_list);
2520 return;
2521 }
2522
2523 *bglpp = BlastGiListNew(global_list, total_num_gis);
2524 MemFree(global_list);
2525 }
2526 return;
2527 }
2528
2529 /* Purpose: Create the virtual oidlist to limit this blast search.
2530 Parameters:
2531 bglp contains the list of gis to limit the search with. It is freed by this
2532 function.
2533 rdfp_chain must be the head of the linked list of rdfps attached to the
2534 BlastSearchBlk structure.
2535 oidlist_forall_rdfp determines whether the gis in bglp mask only
2536 certain rdfps or mask the entire rdfp_chain.
2537 Finally, in the options parameter, the the gilist_already_calculated
2538 field is a flag to determine whether the ordinal ids for the bglp should
2539 be calculated. This is not the case for the neighboring software
2540 (ie: nabrd.cpp), so we do not calculate them again. This assumes that *
2541 only* real databases are used and that the virtual oidlist is freed for
2542 every subsequent invocation of the BLASTSetUpSearch function.
2543 The options->sort_gi_list flag is also used by the neighboring software
2544 where no sorting of the gi list (bglp parameter) is needed.
2545
2546 The bglp parameter is used to construct the virtual oidlist. If there are
2547 any local oidlists in the rdfp_chain, these are intersected with the bglp.
2548 If the bglp is NULL, any oidlists in the rdfp will be consolidated into
2549 the virtual oidlist but these will not be part of the return value. If
2550 there are no oidlists and bglp is NULL, the search is not limited at all.
2551 */
2552 BlastGiListPtr
2553 BlastCreateVirtualOIDList(BlastGiListPtr bglp, ReadDBFILEPtr rdfp_chain,
2554 Boolean oidlist_forall_rdfp, BLAST_OptionsBlkPtr options)
2555 {
2556 BlastDoubleInt4Ptr gilist, real_gilist = NULL, *real_gilist_ptrs = NULL;
2557 ReadDBFILEPtr rdfp, mask_rdfp = NULL;
2558 OIDListPtr virtual_oidlist = NULL, lcl_oidlist = NULL;
2559 Int4 start, maxoid = 0, virtual_oidlistsz, real_ngis = 0, real_idx = 0;
2560 Int4 virtual_oid, virtual_mask_index, virtual_oid_bit;
2561 Int4 lcl_mask_index, lcl_bit, lcl_oid;
2562 Uint4 lcl_mask = 0;
2563 register Int4 i;
2564
2565 {
2566 Boolean there_are_oidlists = FALSE;
2567 for(rdfp=rdfp_chain;rdfp;rdfp=rdfp->next) {
2568 if (rdfp->oidlist) {
2569 there_are_oidlists = TRUE;
2570 break;
2571 }
2572 }
2573 if ( (bglp == NULL) && (there_are_oidlists == FALSE) )
2574 return NULL;
2575 }
2576
2577 /* initialize the start and oid fields of gilist, as well as maxoid */
2578 if (bglp) {
2579 gilist = bglp->gi_list;
2580 for (i = 0; i < bglp->total; i++) {
2581 if (!options->gilist_already_calculated && /* nabrd does this */
2582 gilist[i].ordinal_id <= 0) { /* assumes gilist was
2583 memset'd to zero */
2584 gilist[i].ordinal_id = readdb_gi2seq(rdfp_chain, gilist[i].gi,
2585 &start);
2586 gilist[i].start = start;
2587 }
2588 maxoid = MAX(maxoid, gilist[i].ordinal_id);
2589 if (gilist[i].ordinal_id != -1)
2590 real_ngis++; /* keep track of how many gis are actually found */
2591 }
2592 } else {
2593 for (rdfp = rdfp_chain; rdfp; rdfp = rdfp->next) {
2594 if (rdfp->oidlist)
2595 maxoid = MAX(maxoid, rdfp->oidlist->total+rdfp->start-1);
2596 }
2597 }
2598
2599 /* Allocate the virtual oidlist */
2600 if (!(virtual_oidlist = (OIDListPtr) MemNew(sizeof(OIDList)))) {
2601 ErrPostEx(SEV_WARNING, 0, 0,"BlastCreateVirtualOIDList: Could not "
2602 "allocate memory for global oidlist");
2603 return NULL;
2604 }
2605 virtual_oidlist->total = maxoid + 1;
2606 virtual_oidlistsz = maxoid/MASK_WORD_SIZE+2;
2607 virtual_oidlist->list = (Uint4Ptr) MemNew(virtual_oidlistsz*sizeof(Uint4));
2608 virtual_oidlist->memory = virtual_oidlist->list;
2609 if (!virtual_oidlist->list) {
2610 ErrPostEx(SEV_WARNING, 0, 0,"BlastCreateVirtualOIDList: Could not "
2611 "allocate memory for global oidlist");
2612 return NULL;
2613 }
2614
2615 /* real_gilist will store the gilist that corresponds to the intersection
2616 * of the bglp and the oidlists in the rdfp_chain */
2617 if (real_ngis > 0) {
2618 real_gilist = (BlastDoubleInt4Ptr)
2619 MemNew(sizeof(BlastDoubleInt4)*real_ngis);
2620 real_gilist_ptrs = (BlastDoubleInt4Ptr *)
2621 MemNew(sizeof(BlastDoubleInt4 *)*real_ngis);
2622 if (!real_gilist || !real_gilist_ptrs) {
2623 ErrPostEx(SEV_WARNING, 0, 0, "BlastCreateVirtualOIDList: Out of "
2624 "memory");
2625 MemFree(virtual_oidlist);
2626 return NULL;
2627 }
2628 }
2629
2630 if (bglp) {
2631 Boolean first_gi_found = FALSE;
2632 Int4 j, virtual_oid1;
2633
2634
2635 /* Iterate through the gilist, initializing the virtual oidlist */
2636 for (i = 0; i < bglp->total; i++) {
2637 if ((virtual_oid = gilist[i].ordinal_id) < 0)
2638 continue;
2639
2640 for (rdfp = rdfp_chain; rdfp; rdfp = rdfp->next) {
2641 if (!first_gi_found && rdfp->start < gilist[i].start) {
2642 if ((lcl_oidlist = rdfp->oidlist) && !oidlist_forall_rdfp) {
2643
2644 for (j = 0; j < lcl_oidlist->total; j++) {
2645 lcl_mask_index = j/MASK_WORD_SIZE;
2646 lcl_bit = 0x1 <<
2647 (MASK_WORD_SIZE - 1 - j % MASK_WORD_SIZE);
2648 virtual_oid1 = j + rdfp->start;
2649 virtual_mask_index = virtual_oid1/MASK_WORD_SIZE;
2650 virtual_oid_bit = 0x1 <<
2651 (MASK_WORD_SIZE - 1 - virtual_oid1 % MASK_WORD_SIZE);
2652
2653 lcl_mask =
2654 SwapUint4(lcl_oidlist->list[lcl_mask_index]);
2655 if (lcl_mask & lcl_bit) {
2656 virtual_oidlist->list[virtual_mask_index] |=
2657 virtual_oid_bit;
2658 }
2659 }
2660 }
2661 } else if (rdfp->start == gilist[i].start) {
2662 first_gi_found = TRUE;
2663 break;
2664 }
2665 }
2666 if (!rdfp) continue;
2667
2668 virtual_mask_index = virtual_oid/MASK_WORD_SIZE;
2669 virtual_oid_bit = 0x1 << (MASK_WORD_SIZE - 1 -
2670 virtual_oid % MASK_WORD_SIZE);
2671 if ((lcl_oidlist = rdfp->oidlist)) {
2672 lcl_oid = gilist[i].ordinal_id - gilist[i].start;
2673 lcl_mask_index = lcl_oid/MASK_WORD_SIZE;
2674 lcl_bit = 0x1 << (MASK_WORD_SIZE - 1 -
2675 lcl_oid % MASK_WORD_SIZE);
2676 if (lcl_oidlist->total == 0)
2677 lcl_mask_index = 0;
2678 }
2679
2680 if (!lcl_oidlist ||
2681 (lcl_oidlist && lcl_oidlist->list &&
2682 lcl_oid < lcl_oidlist->total &&
2683 (SwapUint4(lcl_oidlist->list[lcl_mask_index])&lcl_bit)))
2684 {
2685 virtual_oidlist->list[virtual_mask_index] |= virtual_oid_bit;
2686 MemCpy((VoidPtr) &real_gilist[real_idx], (VoidPtr) &gilist[i],
2687 sizeof(BlastDoubleInt4));
2688 real_idx++;
2689 }
2690 }
2691 } else {
2692 /* Create virtual oidlist by combining existing oidlists */
2693
2694 for (rdfp = rdfp_chain; rdfp; rdfp = rdfp->next) {
2695
2696 if (!(lcl_oidlist = rdfp->oidlist) || oidlist_forall_rdfp)
2697 continue;
2698
2699 for (i = 0; i < lcl_oidlist->total; i++) {
2700 lcl_mask_index = i/MASK_WORD_SIZE;
2701 lcl_bit = 0x1 << (MASK_WORD_SIZE - 1 - i % MASK_WORD_SIZE);
2702 virtual_oid = i + rdfp->start;
2703 virtual_mask_index = virtual_oid/MASK_WORD_SIZE;
2704 virtual_oid_bit = 0x1 << (MASK_WORD_SIZE - 1 -
2705 virtual_oid % MASK_WORD_SIZE);
2706
2707 lcl_mask = SwapUint4(lcl_oidlist->list[lcl_mask_index]);
2708 if (lcl_mask & lcl_bit) {
2709 virtual_oidlist->list[virtual_mask_index] |=
2710 virtual_oid_bit;
2711 }
2712 }
2713 }
2714 }
2715 for (i = 0; i < virtual_oidlistsz; i++)
2716 virtual_oidlist->list[i] = SwapUint4(virtual_oidlist->list[i]);
2717
2718 /* Determine the first rdfp with an oidlist, and free the local oidlists */
2719 for (rdfp = rdfp_chain; rdfp; rdfp = rdfp->next) {
2720 if (rdfp->oidlist && !mask_rdfp)
2721 mask_rdfp = rdfp;
2722 rdfp->oidlist = OIDListFree(rdfp->oidlist);
2723 }
2724
2725 /* attach mask to appropriate place */
2726 if (oidlist_forall_rdfp && real_ngis > 0) {
2727 rdfp_chain->oidlist = virtual_oidlist;
2728 } else {
2729 if (mask_rdfp)
2730 mask_rdfp->oidlist = virtual_oidlist;
2731 else {
2732 /* Should never happen */
2733 ErrPostEx(SEV_ERROR, 0, 0, "BlastCreateVirtualOIDList: Missing "
2734 "oidlists to attach virtual_oidlist");
2735 OIDListFree(virtual_oidlist);
2736 return NULL;
2737 }
2738 }
2739
2740 /* Discard bglp and use real_gilist to store the gilist that corresponds
2741 * to the virtual oidlist */
2742 if (bglp) {
2743 bglp = BlastGiListDestruct(bglp, TRUE);
2744 for (i = 0; i < real_idx; i++)
2745 real_gilist_ptrs[i] = &(real_gilist[i]);
2746 if (options->sort_gi_list == TRUE) {
2747 /* sort by oid to allow quick searches in BlastGetAllowedGis */
2748 HeapSort(real_gilist_ptrs, real_idx,
2749 sizeof(BlastDoubleInt4Ptr PNTR), blast_double_int_oid_compare);
2750 }
2751 gilist = (BlastDoubleInt4Ptr)MemNew(real_idx*sizeof(BlastDoubleInt4));
2752 for (i = 0; i < real_idx; i++)
2753 MemCpy((VoidPtr)&gilist[i], real_gilist_ptrs[i],
2754 sizeof(BlastDoubleInt4));
2755 bglp = BlastGiListNew(gilist, real_idx);
2756 MemFree(real_gilist);
2757 MemFree(real_gilist_ptrs);
2758 MemFree(gilist);
2759 }
2760
2761 return bglp;
2762 }
2763
2764 /* These are the criteria that can restrict a blast search:
2765 a) OIDListPtr rdfp->oidlist;
2766 b) Int4ListPtr rdfp->gilist;
2767 c) ValNodePtr options->gilist;
2768 d) CharPtr options->gifile;
2769 e) BlastDoubleInt4Ptr gi_list;
2770
2771 The policy to restrict blast searches follows:
2772 All rdfp->gilist(s) are merged (union). The resulting gilist is then
2773 intersected with any rdfp->oidlist(s) in (a)
2774 (see MergeDbGiListsWithOIDLists). Gis from rdfp->gilist(s) will be returned
2775 so that they can be added to the search->thr_info->blast_gi_list structure.
2776
2777 If non-NULL, (c), (d), and (e) are intersected and this result is
2778 intersected with the gilist obtained previously.
2779
2780 A single 'virtual' oidlist is created and attached to the first rdfp in the
2781 search->rdfp chain that has been restricted (see BlastCreateVirtualOIDList).
2782
2783 The resulting gi list from (b), (c), (d), and (e) is attached to the
2784 BlastGiListPtr search->thr_info->blast_gi_list field. The definite source
2785 for the actual search resides in the virtual oidlist (if any).
2786 */
2787 Boolean
2788 BlastProcessGiLists(BlastSearchBlkPtr search, BLAST_OptionsBlkPtr options,
2789 BlastDoubleInt4Ptr gi_list, Int4 gi_list_size)
2790 {
2791 BlastGiListPtr bglp = NULL, bglp_tmp = NULL;
2792 BlastDoubleInt4Ptr tmp_list = NULL;
2793 ValNodePtr vnp;
2794 Int4 ngis = 0;
2795 /* determine if final oidlist should cover all rdfps or it should start
2796 * with the first rdfp that had a rdfp->oidlist or a rdfp->gilist */
2797 Boolean oidlist_forall_rdfp = FALSE;
2798 Boolean looking_for_gis = FALSE;
2799
2800 /* if any of the following parameters is non-NULL, we need to create an
2801 * oidlist that covers all rdfp(s) in search->rdfp. */
2802 if (gi_list || options->gifile || options->gilist)
2803 oidlist_forall_rdfp = TRUE;
2804
2805 /* Create individual oidlists for those databases which have gi lists */
2806 MergeDbGiListsWithOIDLists(search->rdfp, &search->error_return, &bglp);
2807
2808 if (gi_list)
2809 {
2810 bglp = CombineDoubleInt4Lists(gi_list, gi_list_size, NULL, 0);
2811 looking_for_gis = TRUE;
2812 }
2813
2814 if (options->gifile) {
2815
2816 if ((tmp_list = GetGisFromFile(options->gifile, &ngis))) {
2817 if (bglp) {
2818 bglp_tmp = IntersectBlastGiLists(tmp_list, ngis,
2819 bglp->gi_list, bglp->total);
2820 } else {
2821 bglp_tmp = CombineDoubleInt4Lists(tmp_list, ngis, NULL, 0);
2822 }
2823
2824 BlastGiListDestruct(bglp, TRUE);
2825 bglp = bglp_tmp;
2826 bglp_tmp = NULL;
2827 tmp_list = (BlastDoubleInt4Ptr) MemFree(tmp_list);
2828 ngis = 0;
2829 }
2830 looking_for_gis = TRUE;
2831 }
2832
2833 /* this is very inefficient, needs to be changed in ASN.1 spec? */
2834 if (options->gilist) {
2835 Int4 alloc_chunk = 1024, alloc = 1024;
2836 looking_for_gis = TRUE;
2837
2838 tmp_list = (BlastDoubleInt4Ptr) MemNew(sizeof(BlastDoubleInt4)*alloc);
2839 if (!tmp_list) {
2840 ErrPostEx(SEV_WARNING, 0, 0, "BlastProcessGiLists: Out of memory");
2841 BlastGiListDestruct(bglp, TRUE);
2842 return looking_for_gis;
2843 }
2844
2845 for (vnp = options->gilist; vnp; vnp = vnp->next) {
2846 if (ngis >= alloc) {
2847 alloc += alloc_chunk;
2848 tmp_list = (BlastDoubleInt4Ptr)
2849 Realloc(tmp_list, sizeof(BlastDoubleInt4)*alloc);
2850 MemSet(&tmp_list[ngis], 0, alloc_chunk*sizeof(BlastDoubleInt4));
2851 }
2852 tmp_list[ngis++].gi = vnp->data.intvalue;
2853 }
2854
2855 if (bglp) {
2856 bglp_tmp = IntersectBlastGiLists(tmp_list, ngis,
2857 bglp->gi_list, bglp->total);
2858 } else {
2859 /* (options->gifile intersection gi_list) resulted in an empty
2860 * set, then we don't recreate bglp. */
2861 if (!options->gifile && !gi_list)
2862 bglp_tmp = CombineDoubleInt4Lists(tmp_list, ngis, NULL, 0);
2863 }
2864
2865 BlastGiListDestruct(bglp, TRUE);
2866 bglp = bglp_tmp;
2867 bglp_tmp = NULL;
2868 tmp_list = (BlastDoubleInt4Ptr) MemFree(tmp_list);
2869 }
2870
2871 /* Save the final gi list for formatting purposes */
2872 search->thr_info->blast_gi_list =
2873 BlastCreateVirtualOIDList(bglp, search->rdfp, oidlist_forall_rdfp,
2874 options);
2875
2876 return looking_for_gis;
2877 }
2878
2879
2880 #define POSIT_PERCENT 0.05
2881 #define POSIT_NUM_ITERATIONS 10
2882
2883 #define Xchar 21 /*character for low-complexity columns*/
2884
2885
2886 /* This should be more than large enough for any alphabet. */
2887 #define POSIT_MAX_ALPHABET_SIZE 30
2888
2889 /*Compute probabilities for each score in posMatrix,
2890 also sets minScore and maxScore*/
2891 static BLAST_ScoreFreqPtr
2892 BlastComputeProbs(BlastMatrixRescalePtr matrix_rescale, Boolean position_dependent)
2893 {
2894 Int2 alphabet_total;
2895 Int4 c; /*index on characters*/
2896 Int4 p; /*index on positions*/
2897 Int4 s; /*index on scores */
2898 Int4 dim1, dim2;
2899 BLAST_Score score_min, score_max;
2900 BLAST_ScoreFreqPtr sfp;
2901 Int4 score; /*one score in the matrix*/
2902 Nlm_FloatHi increment; /*Increment in probability due to one score*/
2903 Int4 effectiveLength;
2904 Int4Ptr *matrix;
2905 Uint1 std_alphabet[POSIT_MAX_ALPHABET_SIZE];
2906
2907 alphabet_total = BlastGetStdAlphabet(Seq_code_ncbistdaa, std_alphabet, POSIT_MAX_ALPHABET_SIZE);
2908 if (alphabet_total <= 0)
2909 return NULL;
2910
2911 matrix = matrix_rescale->matrix;
2912 score_min = 0;
2913 score_max = 0;
2914 if(position_dependent)
2915 {
2916 dim1 = matrix_rescale->query_length;
2917 dim2 = alphabet_total;
2918 }
2919 else
2920 {
2921 dim1 = alphabet_total;
2922 dim2 = alphabet_total;
2923 }
2924
2925 effectiveLength = 0;
2926 for (p = 0; p < matrix_rescale->query_length; p++)
2927 if (Xchar != matrix_rescale->query[p])
2928 effectiveLength++;
2929 for (p = 0; p < dim1; p++)
2930 if (Xchar != matrix_rescale->query[p])
2931 for (c = 0; c < dim2; c++) {
2932 if (matrix[p][std_alphabet[c]] <= BLAST_SCORE_MIN || matrix[p][std_alphabet[c]] >= BLAST_SCORE_MAX)
2933 continue;
2934 if (matrix[p][std_alphabet[c]] < (score_min))
2935 (score_min) = matrix[p][std_alphabet[c]];
2936 if (matrix[p][std_alphabet[c]] > (score_max))
2937 (score_max) = matrix[p][std_alphabet[c]];
2938 }
2939
2940 if((sfp = BlastScoreFreqNew(score_min, score_max)) == NULL) {
2941 ErrPostEx(SEV_ERROR, 0, 0,
2942 "BlastComputeProbs(): Unable to create Score Frequencies");
2943 return NULL;
2944 }
2945
2946 sfp->obs_min = sfp->score_min;
2947 sfp->obs_max = sfp->score_max;
2948 for (p = 0; p < dim1; p++)
2949 if (Xchar != matrix_rescale->query[p])
2950 for (c = 0; c < dim2; c++) {
2951 /*Increment the weight for the score in position [p][std_alphabet[c]] */
2952 score = matrix[p][std_alphabet[c]];
2953 if (score <= BLAST_SCORE_MIN || score >= BLAST_SCORE_MAX)
2954 continue;
2955 increment =
2956 (matrix_rescale->standardProb[std_alphabet[c]]/ effectiveLength);
2957 sfp->sprob[score]+= increment;
2958 }
2959
2960 sfp->score_avg = 0.0;
2961 for(s = sfp->score_min; s <= sfp->score_max; s++)
2962 sfp->score_avg += s*sfp->sprob[s];
2963 return(sfp);
2964 }
2965
2966 void LIBCALL
2967 updateLambdaK(BlastMatrixRescalePtr matrix_rescale, Boolean position_dependent)
2968 {
2969 BLAST_ScoreFreqPtr sfp;
2970
2971 if((sfp = BlastComputeProbs(matrix_rescale, position_dependent)) == NULL)
2972 return;
2973
2974 BlastKarlinBlkCalc(matrix_rescale->kbp_psi[0], sfp);
2975 matrix_rescale->kbp_gap_psi[0]->K = (matrix_rescale->kbp_psi[0]->K)*(matrix_rescale->kbp_gap_std[0]->K)/matrix_rescale->K_ideal;
2976 matrix_rescale->kbp_gap_psi[0]->logK = log(matrix_rescale->kbp_gap_psi[0]->K);
2977 sfp = BlastScoreFreqDestruct(sfp);
2978 }
2979
2980 Nlm_FloatHi
2981 BlastScaleMatrix(BlastMatrixRescalePtr matrix_rescale, Boolean position_dependent)
2982 {
2983 Int4 dim1, dim2;
2984 Int4 a,c; /*loop indices*/
2985 Boolean too_high=TRUE, first_time;
2986 Nlm_FloatHi factor, factor_low=1.0, factor_high=1.0;
2987 Nlm_FloatHi lambda, new_lambda; /*Karlin-Altschul parameter*/
2988 Int4 index; /*loop index for binary search*/
2989 Int4Ptr *private_matrix;
2990 Int4Ptr *matrix;
2991
2992 private_matrix = matrix_rescale->private_matrix;
2993 matrix = matrix_rescale->matrix;
2994
2995
2996 /* Bracket the values. */
2997 if (position_dependent)
2998 {
2999 dim1 = matrix_rescale->query_length;
3000 dim2 = matrix_rescale->alphabet_size;
3001 }
3002 else
3003 {
3004 dim1 = matrix_rescale->alphabet_size;
3005 dim2 = matrix_rescale->alphabet_size;
3006 }
3007 lambda = matrix_rescale->lambda_ideal;
3008
3009 first_time = TRUE;
3010 factor = 1.0;
3011 while (1)
3012 {
3013 for(c = 0; c < dim1; c++)
3014 {
3015 for(a = 0; a < dim2; a++)
3016 {
3017 if (private_matrix[c][a] == BLAST_SCORE_MIN)
3018 {
3019 matrix[c][a] = BLAST_SCORE_MIN;
3020 }
3021 else
3022 {
3023 matrix[c][a] = Nlm_Nint(((FloatHi)(factor*private_matrix[c][a]))/POSIT_SCALE_FACTOR);
3024 }
3025 }
3026 }
3027
3028 updateLambdaK(matrix_rescale, position_dependent);
3029
3030 if(matrix_rescale->kbp_psi[0] != NULL)
3031 new_lambda = matrix_rescale->kbp_psi[0]->Lambda;
3032 else
3033 return 0.0;
3034
3035 if (new_lambda > lambda)
3036 {
3037 if (first_time)
3038 {
3039 factor_high = 1.0 + POSIT_PERCENT;
3040 factor = factor_high;
3041 factor_low = 1.0;
3042 too_high = TRUE;
3043 first_time = FALSE;
3044 }
3045 else
3046 {
3047 if (too_high == FALSE)
3048 break;
3049 factor_high += (factor_high-1.0);
3050 factor = factor_high;
3051 }
3052 }
3053 else if (new_lambda > 0)
3054 {
3055 if (first_time)
3056 {
3057 factor_high = 1.0;
3058 factor_low = 1.0 - POSIT_PERCENT;
3059 factor = factor_low;
3060 too_high = FALSE;
3061 first_time = FALSE;
3062 }
3063 else
3064 {
3065 if (too_high == TRUE)
3066 break;
3067 factor_low += (factor_low-1.0);
3068 factor = factor_low;
3069 }
3070 } else {
3071 ErrPostEx(SEV_FATAL, 1, 0, "Matrix has positive average score");
3072 return 0.0;
3073 }
3074
3075 }
3076
3077 /* binary search for ten times. */
3078 for (index=0; index<POSIT_NUM_ITERATIONS; index++)
3079 {
3080 factor = 0.5*(factor_high+factor_low);
3081 for(c = 0; c < dim1; c++)
3082 {
3083 for(a = 0; a < dim2; a++)
3084 {
3085 if (private_matrix[c][a] == BLAST_SCORE_MIN)
3086 {
3087 matrix[c][a] = BLAST_SCORE_MIN;
3088 }
3089 else
3090 {
3091 matrix[c][a] = Nlm_Nint(((FloatHi)(factor*private_matrix[c][a]))/POSIT_SCALE_FACTOR);
3092 }
3093 }
3094 }
3095 updateLambdaK(matrix_rescale, position_dependent);
3096
3097 if(matrix_rescale->kbp_psi[0] != NULL)
3098 new_lambda = matrix_rescale->kbp_psi[0]->Lambda;
3099 else
3100 return 0.0;
3101
3102 if (new_lambda > lambda)
3103 {
3104 factor_low = factor;
3105 }
3106 else
3107 {
3108 factor_high = factor;
3109 }
3110 }
3111
3112 for(a = 0; a < matrix_rescale->alphabet_size; a++)
3113 matrix[dim1][a] = BLAST_SCORE_MIN;
3114
3115 return factor;
3116 }
3117
3118
3119 /*
3120 Deallocates only memory for BlastMatrixRescalePtr.
3121 */
3122 BlastMatrixRescalePtr
3123 BlastMatrixRescaleDestruct (BlastMatrixRescalePtr matrix_rescale)
3124
3125 {
3126 if (matrix_rescale == NULL)
3127 return NULL;
3128
3129 return MemFree(matrix_rescale);
3130 }
3131
3132 /*
3133 Allocates and fills the BlastMatrixRescalePtr.
3134 */
3135
3136 BlastMatrixRescalePtr
3137 BlastMatrixRescaleNew(Int4 alphabet_size, Int4 query_length, Uint1Ptr query, Nlm_FloatHiPtr standardProb, Int4Ptr *matrix, Int4Ptr *private_matrix, BLAST_KarlinBlkPtr *kbp_std, BLAST_KarlinBlkPtr *kbp_psi, BLAST_KarlinBlkPtr *kbp_gap_std, BLAST_KarlinBlkPtr *kbp_gap_psi, Nlm_FloatHi lambda_ideal, Nlm_FloatHi K_ideal)
3138
3139 {
3140 BlastMatrixRescalePtr matrix_rescale;
3141
3142 matrix_rescale = (BlastMatrixRescalePtr) MemNew(sizeof(BlastMatrixRescale));
3143
3144 matrix_rescale->alphabet_size = alphabet_size;
3145 matrix_rescale->query_length = query_length;
3146 matrix_rescale->query = query;
3147 matrix_rescale->standardProb = standardProb;
3148 matrix_rescale->matrix = matrix;
3149 matrix_rescale->private_matrix = private_matrix;
3150 matrix_rescale->kbp_std = kbp_std;
3151 matrix_rescale->kbp_psi = kbp_psi;
3152 matrix_rescale->kbp_gap_std = kbp_gap_std;
3153 matrix_rescale->kbp_gap_psi = kbp_gap_psi;
3154 matrix_rescale->lambda_ideal = lambda_ideal;
3155 matrix_rescale->K_ideal = K_ideal;
3156
3157 return matrix_rescale;
3158 }
3159
3160 /*
3161 BlastSeqLocCount counts the number of SeqLoc's
3162 */
3163 static Int4
3164 BlastSeqLocCount (SeqLocPtr mask_slp)
3165
3166 {
3167 Int4 index=0;
3168 SeqLocPtr slp;
3169
3170 while (mask_slp)
3171 {
3172 slp=NULL;
3173 while((slp = SeqLocFindNext(mask_slp, slp))!=NULL)
3174 {
3175 index++;
3176 }
3177 mask_slp = mask_slp->next;
3178 }
3179
3180 return index;
3181 }
3182
3183 static Boolean
3184 BlastIntervalSort (BlastDoubleInt4Ptr *link_ptr, Int4 link_value, Int4 total, int (LIBCALLBACK *callback )PROTO ((Nlm_VoidPtr, Int4 start, Int4 stop)), VoidPtr ptr)
3185
3186 {
3187 Boolean do_callback, start=TRUE;
3188 Int4 index, start_pos, stop_pos, largest_stop_pos;
3189
3190 index=0;
3191 while (index < total)
3192 {
3193 if (start == TRUE)
3194 {
3195 start_pos = link_ptr[index]->gi;
3196 start = FALSE;
3197 largest_stop_pos = 0;
3198 }
3199 else
3200 {
3201 /* Keep track of largest stop position. */
3202 largest_stop_pos = MAX(largest_stop_pos, link_ptr[index]->ordinal_id);
3203 do_callback = FALSE;
3204 if (index == total-1) /* Last one. */
3205 {
3206 stop_pos = link_ptr[index]->ordinal_id;
3207 start = TRUE;
3208 do_callback = TRUE;
3209 }
3210 else if (largest_stop_pos+link_value < link_ptr[index+1]->gi)
3211 { /* Check overlap with next one. */
3212 stop_pos = link_ptr[index]->ordinal_id;
3213 start = TRUE;
3214 do_callback = TRUE;
3215 }
3216
3217 if (do_callback)
3218 {
3219 callback(ptr, start_pos, MAX(largest_stop_pos, stop_pos));
3220 }
3221 index++;
3222 }
3223 }
3224
3225 return TRUE;
3226 }
3227
3228 static int LIBCALLBACK
3229 list_ptr_compare(VoidPtr v1, VoidPtr v2)
3230
3231 {
3232 BlastDoubleInt4Ptr h1, h2;
3233 BlastDoubleInt4Ptr *hp1, *hp2;
3234
3235 hp1 = (BlastDoubleInt4Ptr PNTR) v1;
3236 hp2 = (BlastDoubleInt4Ptr PNTR) v2;
3237 h1 = *hp1;
3238 h2 = *hp2;
3239
3240 if (h1->gi < h2->gi)
3241 return -1;
3242 if (h1->gi > h2->gi)
3243 return 1;
3244
3245 return 0;
3246 }
3247
3248 int LIBCALLBACK
3249 slp_callback(VoidPtr ptr, Int4 start, Int4 stop)
3250
3251 {
3252 ValNodePtr *vnpp;
3253
3254
3255 vnpp = (ValNodePtr PNTR) ptr;
3256
3257 ValNodeAddInt(vnpp, 0, start);
3258 ValNodeAddInt(vnpp, 1, stop);
3259
3260 return 1;
3261 }
3262
3263
3264 ValNodePtr
3265 BlastSeqLocFillDoubleInt (SeqLocPtr mask_slp, Int4 max_length, Boolean reverse)
3266
3267 {
3268 return BlastSeqLocFillDoubleIntEx(mask_slp, 0, max_length, reverse, 0);
3269
3270 }
3271
3272 /*
3273 SeqLocPtr mask_slp: location to be masked
3274 Int4 full_query_length: length of the query, not just portion being blasted.
3275 Int4 max_length: length of query being blasted.
3276 Boolean reverse: do minus strand.
3277 Int4 offset: offset in full query to portion being blasted.
3278
3279 the coordinates reported here are in terms of the portion of the query
3280 being blasted for use by the function that makes up words.
3281 */
3282
3283 ValNodePtr
3284 BlastSeqLocFillDoubleIntEx (SeqLocPtr mask_slp, Int4 full_query_length, Int4 max_length, Boolean reverse, Int4 offset)
3285
3286 {
3287 Int4 count, index, start, stop;
3288 BlastDoubleInt4Ptr list_pri, *list_ptr_pri;
3289 SeqLocPtr slp;
3290 ValNodePtr vnp;
3291
3292 vnp = NULL;
3293 if (mask_slp == NULL)
3294 {
3295 ValNodeAddInt(&vnp, 1, -1);
3296 ValNodeAddInt(&vnp, 0, max_length);
3297 return vnp;
3298 }
3299
3300 /* use length of segment if full_query_length no set. */
3301 if (full_query_length == 0)
3302 full_query_length = max_length;
3303
3304 count = BlastSeqLocCount (mask_slp);
3305 list_pri = (BlastDoubleInt4Ptr) MemNew(count*sizeof(BlastDoubleInt4));
3306 list_ptr_pri = (BlastDoubleInt4Ptr PNTR) MemNew(count*sizeof(BlastDoubleInt4Ptr));
3307
3308 index=0;
3309 while (mask_slp)
3310 {
3311 slp=NULL;
3312 while((slp = SeqLocFindNext(mask_slp, slp))!=NULL)
3313 {
3314 if (reverse)
3315 {
3316 start = full_query_length - 1 - SeqLocStop(slp);
3317 stop = full_query_length - 1 - SeqLocStart(slp);
3318 }
3319 else
3320 {
3321 start = SeqLocStart(slp);
3322 stop = SeqLocStop(slp);
3323 }
3324
3325 list_pri[index].gi = start - offset;
3326 list_pri[index].ordinal_id = stop -offset;
3327 list_ptr_pri[index] = &(list_pri[index]);
3328 index++;
3329 }
3330 mask_slp = mask_slp->next;
3331 }
3332
3333 HeapSort(list_ptr_pri, count, sizeof(BlastHitRangePtr PNTR), list_ptr_compare);
3334
3335 /* Allows the proper start. */
3336 ValNodeAddInt(&vnp, 1, -1);
3337 BlastIntervalSort(list_ptr_pri, 0, count, slp_callback, (VoidPtr) &vnp);
3338
3339 list_pri = MemFree(list_pri);
3340 list_ptr_pri = MemFree(list_ptr_pri);
3341
3342 return vnp;
3343 }
3344
3345 ValNodePtr
3346 BlastSeqLocFillDoubleIntRev (ValNodePtr location, SeqLocPtr mask_slp, Int4 max_length, Int4 full_query_length, Int4 offset)
3347
3348 {
3349 Int4 count, index, start, stop;
3350 BlastDoubleInt4Ptr list_pri, *list_ptr_pri;
3351 SeqLocPtr slp;
3352 ValNodePtr vnp;
3353
3354 vnp = location;
3355 if (mask_slp == NULL) {
3356 if (!vnp) {
3357 ValNodeAddInt(&vnp, 1, -1);
3358 ValNodeAddInt(&vnp, 0, max_length);
3359 }
3360 return vnp;
3361 }
3362
3363 /* use length of segment if full_query_length no set. */
3364 if (full_query_length == 0)
3365 full_query_length = max_length;
3366
3367
3368 count = BlastSeqLocCount (mask_slp);
3369 list_pri = (BlastDoubleInt4Ptr) MemNew(count*sizeof(BlastDoubleInt4));
3370 list_ptr_pri = (BlastDoubleInt4Ptr PNTR) MemNew(count*sizeof(BlastDoubleInt4Ptr));
3371
3372 index=0;
3373 while (mask_slp) {
3374 slp=NULL;
3375 while((slp = SeqLocFindNext(mask_slp, slp))!=NULL) {
3376 start = full_query_length + max_length - SeqLocStop(slp);
3377 stop = full_query_length + max_length - SeqLocStart(slp);
3378
3379 list_pri[index].gi = start - offset;
3380 list_pri[index].ordinal_id = stop - offset;
3381 list_ptr_pri[index] = &(list_pri[index]);
3382 index++;
3383 }
3384 mask_slp = mask_slp->next;
3385 }
3386
3387 HeapSort(list_ptr_pri, count, sizeof(BlastHitRangePtr PNTR), list_ptr_compare);
3388 BlastIntervalSort(list_ptr_pri, 0, count, slp_callback, (VoidPtr) &vnp);
3389
3390 list_pri = MemFree(list_pri);
3391 list_ptr_pri = MemFree(list_ptr_pri);
3392
3393 return vnp;
3394 }
3395
3396
3397
3398 static Boolean
3399 small(BLASTResultHspPtr a, BLASTResultHspPtr b)
3400 {
3401 if (a->e_value > b->e_value) return TRUE;
3402 if (a->e_value < b->e_value) return FALSE;
3403 if (a->score > b->score) return FALSE;
3404 if (a->score < b->score) return TRUE;
3405 if (a->point_back->subject_id > b->point_back->subject_id) return FALSE;
3406 return TRUE;
3407 }
3408
3409
3410 /*
3411 this is some sort of HeapSort (or it makes the heap
3412 as it is used in HeapSort).
3413 */
3414 static void
3415 BlastHeapify(BLASTHeapPtr which_heap, Int4 position)
3416 {
3417 Int4 heap_size, index, lim, left_son, small_son;
3418 BLASTResultHspPtr tmp, PNTR heap;
3419
3420 heap_size = which_heap->num_in_heap;
3421 heap = which_heap->heap;
3422 index = position; lim = heap_size/2;
3423
3424 while (index < lim) {
3425 left_son = 2*index + 1;
3426 if (left_son == heap_size-1)
3427 small_son = left_son;
3428 else {
3429 if (small(heap[left_son],heap[left_son+1])) small_son = left_son;
3430 else small_son = left_son+1;
3431 }
3432 /* If heap[small_son] is less significant than heap[index], then
3433 switch them. Otherwise exit the loop.
3434 */
3435 if (small(heap[small_son], heap[index])) {
3436 tmp = heap[index];
3437 heap[index] = heap[small_son];
3438 heap[small_son] = tmp;
3439 index = small_son;
3440 } else
3441 break;
3442 }
3443 }
3444
3445 /*
3446 Insert a new element into the BLASTHeapPtr list, which
3447 appears to be ordered (here) from least to most significant.
3448 the order does not seem to be strictly from least to
3449 most significant, or is it?
3450 */
3451 static void
3452 BlastInsertHeap(BLASTHeapPtr which_heap, BLASTResultHspPtr hp)
3453 {
3454 Int4 heap_size, index, father;
3455 BLASTResultHspPtr PNTR heap;
3456
3457 hp->point_back->num_ref+= 1;
3458 /* Increase heap size by one for new entry. */
3459 heap_size = (which_heap->num_in_heap+=1);
3460 heap = which_heap->heap;
3461 index = heap_size-1;
3462
3463 while (index > 0) {
3464 father = (index-1)/2;
3465 /* If heap[father] is LESS significant than hp, then exit loop. */
3466 if (small(heap[father], hp)) {
3467 break;
3468 }
3469 /* The 1st time this is called heap[index] is NULL as it's one past
3470 the last filled in element. The idea here is obviously to
3471 move the more significant elements to the end, but don't
3472 they get out of order then?
3473 */
3474 heap[index] = heap[father];
3475 index = father;
3476 }
3477 /* Fill in new element. */
3478 heap[index] = hp;
3479 }
3480
3481 Int4
3482 BlastDeleteHeap(BLASTHeapPtr which_heap, Int4 position)
3483 {
3484 Int4 last, return_value;
3485 BLASTResultHspPtr PNTR heap;
3486
3487 last = (which_heap->num_in_heap-=1);
3488 heap = which_heap->heap;
3489 return_value = (heap[position]->point_back->num_ref -= 1);
3490 if (position != last) {
3491 heap[position] = heap[last];
3492 BlastHeapify(which_heap, position);
3493 }
3494 return return_value;
3495 }
3496
3497 static void
3498 BlastInsertWholeHeap(BlastSearchBlkPtr search, BLASTHeapPtr which_heap, Int4 cutvalue)
3499 {
3500 BLASTHeapPtr hp;
3501 Int4 i;
3502
3503 hp = (BLASTHeapPtr) MemNew(sizeof(BLASTHeapStruct));
3504 hp->heap = (BLASTResultHspPtr PNTR) MemNew(sizeof(BLASTResultHspPtr)*search->pbp->hsp_range_max);
3505 hp->next = which_heap;
3506 hp->num_of_ref = 0;
3507 hp->cutvalue = cutvalue;
3508 if (which_heap->prev) {
3509 hp->prev = which_heap->prev;
3510 which_heap->prev->next = hp;
3511 } else {
3512 hp->prev = NULL;
3513 search->result_struct->heap_ptr = hp;
3514 }
3515 which_heap->prev = hp;
3516 hp->num_in_heap = which_heap->num_in_heap;
3517 for (i = 0; i < which_heap->num_in_heap; i++) {
3518 hp->heap[i] = which_heap->heap[i];
3519 which_heap->heap[i]->point_back->num_ref +=1;
3520 }
3521 }
3522
3523 static void
3524 BlastDeleteWholeHeap(BlastSearchBlkPtr search, BLASTHeapPtr which_heap)
3525 {
3526 Int4 i;
3527 if ((which_heap->num_of_ref -= 1) >0) return;
3528 if (which_heap->next) which_heap->next->prev = which_heap->prev;
3529 if (which_heap->prev) which_heap->prev->next = which_heap->next;
3530 else search->result_struct->heap_ptr = which_heap->next;
3531 for (i = 0; i < which_heap->num_in_heap; i++) {
3532 which_heap->heap[i]->point_back->num_ref -= 1;
3533 }
3534 which_heap->heap = MemFree(which_heap->heap);
3535 which_heap = MemFree(which_heap);
3536 }
3537
3538 static Int2
3539 BlastPossibleDeleteWholeHeap(BlastSearchBlkPtr search, BLASTHeapPtr PNTR hhp, BLASTResultHspPtr heap0)
3540 /* if the deleted hit result a remove of the right end point,
3541 return 1 to make the program rerun
3542 */
3543 {
3544 BLASTHeapPtr hp = *hhp;
3545
3546 if (heap0 == NULL || hp == NULL)
3547 return 0;
3548
3549 if (heap0->back_left && hp->prev && heap0->back_left == hp->prev) {
3550 heap0->back_left = NULL;
3551 if ((hp->prev->num_of_ref -= 1) == 0) {
3552 BlastDeleteWholeHeap(search, hp->prev);
3553 }
3554 }
3555 if (heap0->back_right == hp) {
3556 heap0->back_right = NULL;
3557 if ((hp->num_of_ref -= 1)==0) {
3558 *hhp = hp->next;
3559 BlastDeleteWholeHeap(search, hp);
3560 *hhp = (*hhp)->prev;
3561 return 1;
3562 }
3563 }
3564 return 0;
3565 }
3566
3567
3568
3569 Int2
3570 BlastInsertList2Heap(BlastSearchBlkPtr search, BLASTResultHitlistPtr result_hitlist)
3571 {
3572 BLASTResultHspPtr PNTR heap, hsp;
3573 BLASTResultHspPtr hsp_array;
3574 Int4 index, hsp_range_max;
3575 Int4 begin, end, hspcnt, query_length;
3576 Int2 context;
3577 Boolean hsp_deleted, new_inserted;
3578 BLASTHeapPtr hp;
3579
3580 if (search->pbp->perform_culling == FALSE ||
3581 search->pbp->mb_params) /* Culling is turned off. */
3582 return 3;
3583
3584 hsp_deleted = new_inserted = FALSE;
3585 hsp_range_max = search->pbp->hsp_range_max;
3586 hspcnt = result_hitlist->hspcnt;
3587 hsp_array = result_hitlist->hsp_array;
3588
3589 if (search->prog_number==blast_type_blastn)
3590 query_length = search->query_context_offsets[1] - 1;
3591
3592 for (index = 0; index < hspcnt; index++) {
3593 hsp = &hsp_array[index];
3594 if (search->prog_number==blast_type_blastn) {
3595 context = hsp->query_offset / query_length;
3596 if (context == 0) {
3597 begin = hsp->query_offset;
3598 end = (hsp->query_offset+hsp->query_length-1);
3599 } else {
3600 end = 2*query_length - hsp->query_offset;
3601 begin = end - hsp->query_length + 1;
3602 }
3603 } else {
3604 begin = hsp->query_offset;
3605 end = (hsp->query_offset+hsp->query_length-1);
3606 }
3607 for (hp = search->result_struct->heap_ptr; hp; hp = hp->next)
3608 if (hp->cutvalue >= begin) break;
3609 if (hp && (hp->num_in_heap < hsp_range_max || small(hp->heap[0], hsp))) {
3610 if (!hp->prev || hp->prev->cutvalue != begin-1) {
3611 BlastInsertWholeHeap(search, hp, begin-1);
3612 }
3613 hp->prev->num_of_ref +=1;
3614 hsp->back_left = hp->prev;
3615 } else hsp->back_left = NULL;
3616 for (; hp; hp = hp->next) {
3617 if (end <= hp->cutvalue) break;
3618 heap = hp->heap;
3619 if (hp->num_in_heap >= hsp_range_max) {
3620 if (small(heap[0], hsp)) {
3621 if (BlastPossibleDeleteWholeHeap(search, &hp, heap[0])) continue;
3622 if ((heap[0]->point_back->num_ref-=1)==0)
3623 hsp_deleted = TRUE;
3624 heap = hp->heap;
3625 heap[0] = hsp;
3626 hsp->point_back->num_ref +=1;
3627 BlastHeapify(hp, 0);
3628 new_inserted = TRUE;
3629 }
3630 } else {
3631 BlastInsertHeap(hp, hsp);
3632 new_inserted = TRUE;
3633 }
3634 }
3635 hp = search->result_struct->heap_ptr;
3636 if (hp && (hp->num_in_heap < hsp_range_max || small(hp->heap[0], hsp))) {
3637 if (end != hp->cutvalue) {
3638 BlastInsertWholeHeap(search, hp, end);
3639 hp = hp->prev;
3640 }
3641 hp->num_of_ref += 1;
3642 heap = hp->heap;
3643 if (hp->num_in_heap >= hsp_range_max) {
3644 BlastPossibleDeleteWholeHeap(search, &hp, heap[0]);
3645 if ((heap[0]->point_back->num_ref-=1)==0)
3646 hsp_deleted = TRUE;
3647 heap = hp->heap;
3648 heap[0] = hsp;
3649 hsp->point_back->num_ref+=1;
3650 BlastHeapify(hp, 0);
3651 new_inserted = TRUE;
3652 } else {
3653 BlastInsertHeap(hp, hsp);
3654 new_inserted = TRUE;
3655 }
3656 hsp->back_right = hp;
3657 } else hsp->back_right = NULL;
3658 }
3659 if (hsp_deleted) return 1;
3660 if (new_inserted) return 2;
3661 return 0;
3662 }
3663
3664 void
3665 BlastFreeHeap(BlastSearchBlkPtr search, BLASTResultHitlistPtr result_hitlist)
3666 {
3667 BLASTResultHspPtr PNTR heap, hsp;
3668 BLASTResultHspPtr hsp_array;
3669 Int4 index;
3670 Int4 begin, end, i, hspcnt, query_length;
3671 Int2 context;
3672 BLASTHeapPtr hp;
3673
3674 if (search->pbp->perform_culling == FALSE ||
3675 search->pbp->mb_params) /* Culling is turned off. */
3676 return;
3677
3678 hspcnt = result_hitlist->hspcnt;
3679 hsp_array = result_hitlist->hsp_array;
3680
3681 if (search->prog_number==blast_type_blastn)
3682 query_length = search->query_context_offsets[1] - 1;
3683
3684 for (index = 0; index < hspcnt; index++) {
3685 hsp = &hsp_array[index];
3686 if (search->prog_number==blast_type_blastn) {
3687 context = hsp->query_offset / query_length;
3688 if (context == 0) {
3689 begin = hsp->query_offset;
3690 end = (hsp->query_offset+hsp->query_length-1);
3691 } else {
3692 end = 2*query_length - hsp->query_offset;
3693 begin = end - hsp->query_length + 1;
3694 }
3695 } else {
3696 begin = hsp->query_offset;
3697 end = (hsp->query_offset+hsp->query_length-1);
3698 }
3699 if (hsp->back_left) {
3700 hp = hsp->back_left->next;
3701 if (BlastPossibleDeleteWholeHeap(search, &hp, hsp)) continue;
3702 } else {
3703 for (hp = search->result_struct->heap_ptr; hp; hp = hp->next) {
3704 if (hp->cutvalue >= begin) break;
3705 }
3706 }
3707 for (; hp; hp = hp->next) {
3708 if (hp->cutvalue > end ) break;
3709 heap = hp->heap;
3710 for (i = 0; i < hp->num_in_heap; i++) {
3711 if (heap[i] == hsp) {
3712 BlastDeleteHeap(hp, i);
3713 break;
3714 }
3715 }
3716 }
3717 if (hp)
3718 hp = hp->prev;
3719 BlastPossibleDeleteWholeHeap(search, &hp, hsp);
3720 }
3721 }
3722
3723 /*converts a 1-level list of SeqAligns to a 2-level
3724 list store in a ValNodePtr; the list lastSeqAligns
3725 specifies where to break up seqAlignList */
3726 ValNodePtr convertSeqAlignListToValNodeList(SeqAlignPtr seqAlignList, SeqAlignPtr * lastSeqAligns, Int4 numLastSeqAligns)
3727 {
3728 ValNodePtr returnValNodePtr, thisValNodePtr, nextValNodePtr;
3729 SeqAlignPtr thisSeqAlign;
3730 Int4 lastAlignIndex;
3731 Int4 lastNonEmptyAlignIndex;
3732
3733 if(seqAlignList == NULL)
3734 return NULL;
3735
3736 lastAlignIndex = 0;
3737 lastNonEmptyAlignIndex = 0;
3738 returnValNodePtr = (ValNodePtr) MemNew (sizeof(ValNode));
3739 if (NULL == lastSeqAligns[lastAlignIndex])
3740 returnValNodePtr->data.ptrvalue = NULL;
3741 else
3742 returnValNodePtr->data.ptrvalue = seqAlignList;
3743 returnValNodePtr->next = NULL;
3744 thisValNodePtr = returnValNodePtr;
3745 thisSeqAlign = seqAlignList;
3746
3747 /*Find first sub-list that is non-NULL and set it
3748 to point to seqAlignList*/
3749 while ((lastAlignIndex < (numLastSeqAligns -1) &&
3750 (NULL == lastSeqAligns[lastAlignIndex]))) {
3751 nextValNodePtr = (ValNodePtr) MemNew (sizeof(ValNode));
3752 if (NULL == lastSeqAligns[lastAlignIndex+1])
3753 nextValNodePtr->data.ptrvalue = NULL;
3754 else {
3755 nextValNodePtr->data.ptrvalue = seqAlignList;
3756 lastNonEmptyAlignIndex = lastAlignIndex +1;
3757 }
3758 thisValNodePtr->next = nextValNodePtr;
3759 nextValNodePtr->next = NULL;
3760 thisValNodePtr = nextValNodePtr;
3761 lastAlignIndex++;
3762 }
3763
3764 while ((NULL != thisSeqAlign) || (lastAlignIndex < (numLastSeqAligns -1))) {
3765 if ((lastAlignIndex < (numLastSeqAligns -1)) &&
3766 ((NULL == lastSeqAligns[lastAlignIndex]) ||
3767 (thisSeqAlign == lastSeqAligns[lastAlignIndex]))) {
3768 nextValNodePtr = (ValNodePtr) MemNew (sizeof(ValNode));
3769
3770 if (NULL != lastSeqAligns[lastAlignIndex+1]) {
3771 nextValNodePtr->data.ptrvalue = thisSeqAlign->next;
3772 thisSeqAlign = thisSeqAlign->next;
3773 lastSeqAligns[lastNonEmptyAlignIndex]->next = NULL;
3774 lastNonEmptyAlignIndex = lastAlignIndex +1;
3775 } else {
3776 nextValNodePtr->data.ptrvalue = NULL;
3777 }
3778 thisValNodePtr->next = nextValNodePtr;
3779 nextValNodePtr->next = NULL;
3780 thisValNodePtr = nextValNodePtr;
3781 lastAlignIndex++;
3782 } else {
3783 if (NULL != thisSeqAlign)
3784 thisSeqAlign = thisSeqAlign->next;
3785 }
3786 }
3787 return (returnValNodePtr);
3788 }
3789
3790 /*converts a 2-level list of SeqAligns stored as a ValNodePtr */
3791
3792 SeqAlignPtr
3793 convertValNodeListToSeqAlignList(ValNodePtr seqAlignDoubleList,
3794 SeqAlignPtr ** lastSeqAligns,
3795 Int4 * numLastSeqAligns)
3796 {
3797 ValNodePtr thisValNodePtr, secondValNodePtr;
3798 SeqAlignPtr returnSeqAlign, thisSeqAlign;
3799 Int4 numValNodePtrs, indexValNodePtrs;
3800
3801
3802 thisValNodePtr = seqAlignDoubleList;
3803 numValNodePtrs = 0;
3804
3805 while (NULL != thisValNodePtr) {
3806 numValNodePtrs++;
3807 thisValNodePtr = thisValNodePtr->next;
3808 }
3809
3810 if (seqAlignDoubleList == NULL ||
3811 seqAlignDoubleList->data.ptrvalue == NULL) {
3812 *numLastSeqAligns = 0;
3813 *lastSeqAligns = NULL;
3814 return NULL;
3815 } else {
3816 indexValNodePtrs = 0;
3817 *numLastSeqAligns = numValNodePtrs;
3818 *lastSeqAligns = (SeqAlignPtr *) MemNew (numValNodePtrs * sizeof(SeqAlignPtr));
3819 returnSeqAlign = seqAlignDoubleList->data.ptrvalue;
3820 thisValNodePtr = seqAlignDoubleList;
3821 do {
3822 thisSeqAlign = thisValNodePtr->data.ptrvalue;
3823 if (thisSeqAlign != NULL) {
3824 while (thisSeqAlign->next != NULL)
3825 thisSeqAlign = thisSeqAlign->next;
3826 }
3827 (*lastSeqAligns)[indexValNodePtrs] = thisSeqAlign;
3828 indexValNodePtrs++;
3829 /*find next non-empty list to link in*/
3830 if (thisSeqAlign != NULL) {
3831 secondValNodePtr = thisValNodePtr->next;
3832 while (NULL != secondValNodePtr) {
3833 if (NULL != secondValNodePtr->data.ptrvalue)
3834 {
3835 thisSeqAlign->next = secondValNodePtr->data.ptrvalue;
3836 break;
3837 }
3838 else
3839 secondValNodePtr = secondValNodePtr->next;
3840 }
3841 /*if all remaning ValNodePtrs have NULL seqAligns, finish off
3842 list of seqAligns with a NULL*/
3843 if (NULL == secondValNodePtr)
3844 thisSeqAlign->next = NULL;
3845 }
3846 thisValNodePtr = thisValNodePtr->next;
3847 } while (NULL != thisValNodePtr);
3848 return returnSeqAlign;
3849 }
3850 }
3851
3852 void LIBCALL
3853 fillCandLambda(seedSearchItems * seedSearch, Char *matrixName, BLAST_OptionsBlkPtr options)
3854 {
3855 if (0 == StringCmp("BLOSUM62", matrixName)) {
3856 seedSearch->paramC = 0.50;
3857 if ((11 == options->gap_open) && (1 == options->gap_extend)) {
3858 seedSearch->paramLambda = 0.270;
3859 seedSearch->paramK = 0.047;
3860 return;
3861 }
3862 if ((9 == options->gap_open) && (2 == options->gap_extend)) {
3863 seedSearch->paramLambda = 0.285;
3864 seedSearch->paramK = 0.075;
3865 return;
3866 }
3867 if ((8 == options->gap_open) && (2 == options->gap_extend)) {
3868 seedSearch->paramLambda = 0.265;
3869 seedSearch->paramK = 0.046;
3870 return;
3871 }
3872 if ((7 == options->gap_open) && (2 == options->gap_extend)) {
3873 seedSearch->paramLambda = 0.243;
3874 seedSearch->paramK = 0.032;
3875 return;
3876 }
3877 if ((12 == options->gap_open) && (1 == options->gap_extend)) {
3878 seedSearch->paramLambda = 0.281;
3879 seedSearch->paramK = 0.057;
3880 return;
3881 }
3882 if ((10 == options->gap_open) && (1 == options->gap_extend)) {
3883 seedSearch->paramLambda = 0.250;
3884 seedSearch->paramK = 0.033;
3885 return;
3886 }
3887 ErrPostEx(SEV_FATAL, 1, 0, "The combination %d for gap opening cost and %d for gap extension is not supported in PHI-BLAST with matrix %s\n", options->gap_open, options->gap_extend, matrixName);
3888 }
3889 else {
3890 if (0 == StringCmp("PAM30", matrixName)) {
3891 seedSearch->paramC = 0.30;
3892 if ((9 == options->gap_open) && (1 == options->gap_extend)) {
3893 seedSearch->paramLambda = 0.295;
3894 seedSearch->paramK = 0.13;
3895 return;
3896 }
3897 if ((7 == options->gap_open) && (2 == options->gap_extend)) {
3898 seedSearch->paramLambda = 0.306;
3899 seedSearch->paramK = 0.15;
3900 return;
3901 }
3902 if ((6 == options->gap_open) && (2 == options->gap_extend)) {
3903 seedSearch->paramLambda = 0.292;
3904 seedSearch->paramK = 0.13;
3905 return;
3906 }
3907 if ((5 == options->gap_open) && (2 == options->gap_extend)) {
3908 seedSearch->paramLambda = 0.263;
3909 seedSearch->paramK = 0.077;
3910 return;
3911 }
3912 if ((10 == options->gap_open) && (1 == options->gap_extend)) {
3913 seedSearch->paramLambda = 0.309;
3914 seedSearch->paramK = 0.15;
3915 return;
3916 }
3917 if ((8 == options->gap_open) && (1 == options->gap_extend)) {
3918 seedSearch->paramLambda = 0.270;
3919 seedSearch->paramK = 0.070;
3920 return;
3921 }
3922 ErrPostEx(SEV_FATAL, 1, 0, "The combination %d for gap opening cost and %d for gap extension is not supported in PHI-BLAST with matrix %s\n", options->gap_open, options->gap_extend, matrixName);
3923 }
3924 else {
3925 if (0 == StringCmp("PAM70", matrixName)) {
3926 seedSearch->paramC = 0.35;
3927 if ((10 == options->gap_open) && (1 == options->gap_extend)) {
3928 seedSearch->paramLambda = 0.291;
3929 seedSearch->paramK = 0.089;
3930 return;
3931 }
3932 if ((8 == options->gap_open) && (2 == options->gap_extend)) {
3933 seedSearch->paramLambda = 0.303;
3934 seedSearch->paramK = 0.13;
3935 return;
3936 }
3937 if ((7 == options->gap_open) && (2 == options->gap_extend)) {
3938 seedSearch->paramLambda = 0.287;
3939 seedSearch->paramK = 0.095;
3940 return;
3941 }
3942 if ((6 == options->gap_open) && (2 == options->gap_extend)) {
3943 seedSearch->paramLambda = 0.269;
3944 seedSearch->paramK = 0.079;
3945 return;
3946 }
3947 if ((11 == options->gap_open) && (1 == options->gap_extend)) {
3948 seedSearch->paramLambda = 0.307;
3949 seedSearch->paramK = 0.13;
3950 return;
3951 }
3952 if ((9 == options->gap_open) && (1 == options->gap_extend)) {
3953 seedSearch->paramLambda = 0.269;
3954 seedSearch->paramK = 0.058;
3955 return;
3956 }
3957 ErrPostEx(SEV_FATAL, 1, 0, "The combination %d for gap opening cost and %d for gap extension is not supported in PHI-BLAST with matrix %s\n", options->gap_open, options->gap_extend, matrixName);
3958 }
3959 else {
3960 if (0 == StringCmp("BLOSUM80", matrixName)) {
3961 seedSearch->paramC = 0.40;
3962 if ((10 == options->gap_open) && (1 == options->gap_extend)) {
3963 seedSearch->paramLambda = 0.300;
3964 seedSearch->paramK = 0.072;
3965 return;
3966 }
3967 if ((8 == options->gap_open) && (2 == options->gap_extend)) {
3968 seedSearch->paramLambda = 0.308;
3969 seedSearch->paramK = 0.089;
3970 return;
3971 }
3972 if ((7 == options->gap_open) && (2 == options->gap_extend)) {
3973 seedSearch->paramLambda = 0.295;
3974 seedSearch->paramK = 0.077;
3975 return;
3976 }
3977 if ((6 == options->gap_open) && (2 == options->gap_extend)) {
3978 seedSearch->paramLambda = 0.271;
3979 seedSearch->paramK = 0.051;
3980 return;
3981 }
3982 if ((11 == options->gap_open) && (1 == options->gap_extend)) {
3983 seedSearch->paramLambda = 0.314;
3984 seedSearch->paramK = 0.096;
3985 return;
3986 }
3987 if ((9 == options->gap_open) && (1 == options->gap_extend)) {
3988 seedSearch->paramLambda = 0.277;
3989 seedSearch->paramK = 0.046;
3990 return;
3991 }
3992 ErrPostEx(SEV_FATAL, 1, 0, "The combination %d for gap opening cost and %d for gap extension is not supported in PHI-BLAST with matrix %s\n", options->gap_open, options->gap_extend, matrixName);
3993 }
3994 else {
3995 if (0 == StringCmp("BLOSUM45", matrixName)) {
3996 seedSearch->paramC = 0.60;
3997 if ((14 == options->gap_open) && (2 == options->gap_extend)) {
3998 seedSearch->paramLambda = 0.199;
3999 seedSearch->paramK = 0.040;
4000 return;
4001 }
4002 if ((13 == options->gap_open) && (3 == options->gap_extend)) {
4003 seedSearch->paramLambda = 0.209;
4004 seedSearch->paramK = 0.057;
4005 return;
4006 }
4007 if ((12 == options->gap_open) && (3 == options->gap_extend)) {
4008 seedSearch->paramLambda = 0.203;
4009 seedSearch->paramK = 0.049;
4010 return;
4011 }
4012 if ((11 == options->gap_open) && (3 == options->gap_extend)) {
4013 seedSearch->paramLambda = 0.193;
4014 seedSearch->paramK = 0.037;
4015 return;
4016 }
4017 if ((10 == options->gap_open) && (3 == options->gap_extend)) {
4018 seedSearch->paramLambda = 0.182;
4019 seedSearch->paramK = 0.029;
4020 return;
4021 }
4022 if ((15 == options->gap_open) && (2 == options->gap_extend)) {
4023 seedSearch->paramLambda = 0.206;
4024 seedSearch->paramK = 0.049;
4025 return;
4026 }
4027 if ((13 == options->gap_open) && (2 == options->gap_extend)) {
4028 seedSearch->paramLambda = 0.190;
4029 seedSearch->paramK = 0.032;
4030 return;
4031 }
4032 if ((12 == options->gap_open) && (2 == options->gap_extend)) {
4033 seedSearch->paramLambda = 0.177;
4034 seedSearch->paramK = 0.023;
4035 return;
4036 }
4037 if ((19 == options->gap_open) && (1 == options->gap_extend)) {
4038 seedSearch->paramLambda = 0.209;
4039 seedSearch->paramK = 0.049;
4040 return;
4041 }
4042 if ((18 == options->gap_open) && (1 == options->gap_extend)) {
4043 seedSearch->paramLambda = 0.202;
4044 seedSearch->paramK = 0.041;
4045 return;
4046 }
4047 if ((17 == options->gap_open) && (1 == options->gap_extend)) {
4048 seedSearch->paramLambda = 0.195;
4049 seedSearch->paramK = 0.034;
4050 return;
4051 }
4052 if ((16 == options->gap_open) && (1 == options->gap_extend)) {
4053 seedSearch->paramLambda = 0.183;
4054 seedSearch->paramK = 0.024;
4055 return;
4056 }
4057 ErrPostEx(SEV_FATAL, 1, 0, "The combination %d for gap opening cost and %d for gap extension is not supported in PHI-BLAST with matrix %s\n", options->gap_open, options->gap_extend, matrixName);
4058
4059 }
4060 else {
4061 ErrPostEx(SEV_FATAL, 1, 0, "Matrix %s not allowed in PHI-BLAST\n", matrixName);
4062 }
4063 }
4064 }
4065 }
4066 }
4067 }
4068
4069 /*
4070 Used by BlastParseInputString to get the 'index' of the
4071 option and to check it's validity.
4072 */
4073 Int4 BlastGetLetterIndex(CharPtr letters, Char ch)
4074 {
4075 Int4 index;
4076
4077 for(index = 0; letters[index] != NULLB; index++) {
4078 if (letters[index] == ch) {
4079 return index;
4080 }
4081 }
4082 return -1;
4083 }
4084
4085 /*
4086 Parses a string of input options.
4087 For use in the Web page and filtering options.
4088 Not for use in command-line programs - use GetArgs.
4089 */
4090
4091 Boolean BlastParseInputString(CharPtr string,
4092 CharPtr letters,
4093 CharPtr PNTR *values_in,
4094 CharPtr PNTR ErrorMessage)
4095 {
4096 CharPtr chptr;
4097 Int4 i, index = 0, max_par_num;
4098 Char option[1024];
4099 CharPtr PNTR values;
4100 Char message[1024];
4101
4102 if(string == NULL || letters == NULL ||
4103 *letters == '\0' || values_in == NULL) {
4104 return FALSE;
4105 }
4106
4107 max_par_num = StringLen(letters);
4108
4109 values = (CharPtr PNTR)MemNew(max_par_num * sizeof(CharPtr));
4110 *values_in = values;
4111
4112 chptr = string;
4113
4114 while(1) {
4115 while(IS_WHITESP(*chptr)) /* Rolling spaces */
4116 chptr++;
4117
4118 if(*chptr == NULLB) /* Check for NULLB */
4119 break;
4120
4121 if (*chptr != '-') { /* Check for the option sign */
4122 sprintf(message, "Invalid input string started from \"%s\"",
4123 chptr);
4124 if (ErrorMessage)
4125 *ErrorMessage = StringSave(message);
4126 return FALSE;
4127 } else {
4128 chptr++;
4129 }
4130
4131 /* checking index in options table */
4132
4133 if((index = BlastGetLetterIndex(letters, *chptr)) < 0) {
4134 sprintf(message, "Character \'%c\' is not a valid option",
4135 *chptr);
4136 if (ErrorMessage)
4137 *ErrorMessage = StringSave(message);
4138 return FALSE;
4139 }
4140
4141 if(*chptr == NULLB) /* Check for NULLB */
4142 break;
4143
4144 chptr++;
4145
4146 while(IS_WHITESP(*chptr)) /* Rolling spaces */
4147 chptr++;
4148
4149 if(*chptr == NULLB) /* Check for NULLB */
4150 break;
4151
4152 /* If option is in double quotes, get the entire string between the
4153 quotes */
4154 if (*chptr == '\'') {
4155 for (i=0; *++chptr != NULLB && *chptr != '\''; i++)
4156 option[i] = *chptr;
4157 if (*chptr == '\'')
4158 chptr++;
4159 } else {
4160 for(i=0; !IS_WHITESP(*chptr) && *chptr != NULLB; i++, chptr++)
4161 option[i] = *chptr;
4162 }
4163 option[i] = NULLB;
4164
4165 MemFree(values[index]);
4166 values[index] = StringSave(option);
4167 }
4168
4169 return TRUE;
4170 }
4171
4172 CharPtr BLASTGetDatabaseTitleFromSeqAnnot(SeqAnnotPtr seqannot)
4173 {
4174 ValNodePtr vnp;
4175
4176 UserObjectPtr uop;
4177 UserFieldPtr ufp;
4178 ObjectIdPtr oip;
4179
4180 for(vnp = seqannot->desc; vnp != NULL; vnp = vnp->next) {
4181
4182 if (vnp->choice == Annot_descr_user) {
4183 uop = (UserObjectPtr) vnp->data.ptrvalue;
4184 oip = uop->type;
4185 if(!StringICmp(oip->str, "BLAST database title")) {
4186 ufp = uop->data;
4187 oip = ufp->label;
4188 return oip->str;
4189 }
4190 }
4191 }
4192 return NULL;
4193 }
4194
4195
4196 void BLASTAddBlastDBTitleToSeqAnnotEx(SeqAnnotPtr seqannot, CharPtr title, Boolean is_na)
4197 {
4198 UserObjectPtr uop;
4199 UserFieldPtr ufp;
4200 ObjectIdPtr oip;
4201
4202 uop = UserObjectNew();
4203 oip = ObjectIdNew();
4204 uop->type = oip;
4205 oip->str = StringSave("BLAST database title");
4206
4207 ufp = UserFieldNew();
4208 oip = ObjectIdNew();
4209 ufp->label = oip;
4210 uop->data = ufp;
4211
4212 oip->str = StringSave(title);
4213 ufp->choice = 4;
4214 ufp->data.boolvalue = is_na; /* Nucleotide if TRUE. */
4215
4216 ValNodeAddPointer(&(seqannot->desc), Annot_descr_user, (Pointer)uop);
4217 }
4218
4219 /* BLASTAddBlastDBTitleToSeqAnnotEx is preferred if the type of database is known. */
4220
4221 void BLASTAddBlastDBTitleToSeqAnnot(SeqAnnotPtr seqannot, CharPtr title)
4222 {
4223 BLASTAddBlastDBTitleToSeqAnnotEx(seqannot, title, TRUE);
4224 return;
4225 }
4226
4227 /* pos is assumed to be the address of a chracter in the array seq
4228 if so, copy from pos backwards to the start of seq
4229 into target.
4230 return the number of characters copied*/
4231 Int4 reverse_seq(Uint1 *seq, Uint1 *pos, Uint1 *target)
4232 {
4233 Uint1 *c; /*index over addresses of characters in seq*/
4234 Int4 numCopied; /*number of characters copied*/
4235
4236 for (c = pos, numCopied = 0; c >= seq; c--, numCopied++)
4237 *target++ = *c;
4238 *target = '\0';
4239 return numCopied;
4240 }
4241
4242 static void FilterAlignFree(FilterAlignPtr fap)
4243 {
4244 HspFree(fap->hsp);
4245
4246 if(fap->sap != NULL)
4247 SeqAlignSetFree(fap->sap);
4248
4249 MemFree(fap);
4250
4251 return;
4252 }
4253
4254 static void SappSetFree(SappSetPtr ssp)
4255 {
4256 Int4 i;
4257 FilterAlignPtr fap;
4258
4259 for(i = 0; i < ssp->count; i++) {
4260 if ((fap = ssp->fapp[i]) != NULL) {
4261 FilterAlignFree(fap);
4262 }
4263 }
4264
4265 MemFree(ssp->fapp);
4266 MemFree(ssp);
4267
4268 return;
4269 }
4270 static int LIBCALLBACK SappArrayScoreCmp(VoidPtr v1, VoidPtr v2)
4271 {
4272 FilterAlignPtr f1, f2;
4273
4274 f1 = (*(FilterAlignPtr PNTR) v1);
4275 f2 = (*(FilterAlignPtr PNTR) v2);
4276
4277 if (f1->hsp->score < f2->hsp->score)
4278 return 1;
4279 else if (f1->hsp->score > f2->hsp->score)
4280 return -1;
4281
4282 if (f1->hsp->evalue > f2->hsp->evalue)
4283 return 1;
4284 else if (f1->hsp->evalue < f2->hsp->evalue)
4285 return -1;
4286
4287 else return 0;
4288 }
4289
4290 static Boolean BLASTSortSapArray(SappSetPtr ssp)
4291 {
4292
4293 HeapSort((VoidPtr)ssp->fapp, ssp->count, sizeof(FilterAlignPtr),
4294 SappArrayScoreCmp);
4295
4296 return TRUE;
4297 }
4298
4299 static Boolean BLASTTestHSPPosition(HspPtr hsp1, HspPtr hsp2, Int4 pct)
4300 {
4301 Boolean from_is_inside, to_is_inside;
4302 Int4 length;
4303 Nlm_FloatHi pct_overlap = 0.0;
4304
4305 if(hsp1 == NULL || hsp2 == NULL)
4306 return TRUE;
4307
4308 from_is_inside = FALSE;
4309 to_is_inside = FALSE;
4310
4311 if((SIGN(hsp1->query_frame) != SIGN(hsp2->query_frame)) ||
4312 (SIGN(hsp1->hit_frame) != SIGN(hsp2->hit_frame)))
4313 return TRUE;
4314
4315
4316 if(SIGN(hsp1->query_frame) == SIGN(hsp1->hit_frame)) {
4317
4318 if(CONTAINED_IN_HSP(hsp1->query_from, hsp1->query_to, hsp2->query_from,hsp1->hit_from, hsp1->hit_to, hsp2->hit_from)) {
4319 from_is_inside = TRUE;
4320 }
4321
4322 if(CONTAINED_IN_HSP(hsp1->query_from, hsp1->query_to, hsp2->query_from, hsp1->hit_from, hsp1->hit_to, hsp2->hit_from)) {
4323 to_is_inside = TRUE;
4324 }
4325 } else {
4326 if(CONTAINED_IN_HSP(hsp1->query_to, hsp1->query_from, hsp2->query_from, hsp1->hit_from, hsp1->hit_to, hsp2->hit_from)) {
4327 from_is_inside = TRUE;
4328 }
4329
4330 if(CONTAINED_IN_HSP(hsp1->query_to, hsp1->query_from, hsp2->query_from, hsp1->hit_from, hsp1->hit_to, hsp2->hit_from)) {
4331 to_is_inside = TRUE;
4332 }
4333
4334 }
4335
4336 if(from_is_inside && to_is_inside)
4337 return FALSE;
4338
4339 length = abs(hsp2->query_to - hsp2->query_from);
4340
4341 if(from_is_inside) {
4342 pct_overlap = (length - (MIN(abs(hsp2->query_to - hsp1->query_from), abs(hsp2->query_to - hsp1->query_to))))*100.0/length;
4343
4344 } else if (to_is_inside) {
4345
4346 pct_overlap = (length - (MIN(abs(hsp2->query_from - hsp1->query_from), abs(hsp2->query_from - hsp1->query_to))))*100.0/length;
4347
4348 }
4349
4350 if(pct_overlap > pct)
4351 return FALSE;
4352
4353 return TRUE;
4354 }
4355 static SeqAlignPtr BLASTFilterSapArray(SappSetPtr ssp, Int4 pct,
4356 SeqAlignPtr PNTR next_tail)
4357 {
4358 Int4 i, j;
4359 HspPtr hsp1, hsp2;
4360 SeqAlignPtr sap_head;
4361
4362 if(ssp == NULL || ssp->count == 0)
4363 return NULL;
4364
4365
4366 for(i = 0; i < ssp->count; i++) {
4367
4368 if(ssp->fapp[i] == NULL)
4369 continue;
4370
4371 hsp1 = ssp->fapp[i]->hsp;
4372
4373 for(j = 0; j < i; j++) {
4374
4375 if(ssp->fapp[j] == NULL)
4376 continue;
4377
4378 hsp2 = ssp->fapp[j]->hsp;
4379
4380 if(!BLASTTestHSPPosition(hsp1, hsp2, pct)) {
4381 /* FALSE mean hsp2 to be removed */
4382 if(ssp->fapp[j] != NULL) {
4383 FilterAlignFree(ssp->fapp[j]);
4384 ssp->fapp[j] = NULL;
4385 }
4386 break;
4387 }
4388 }
4389 }
4390
4391 sap_head = NULL;
4392 *next_tail = NULL;
4393
4394 for(i = 0; i < ssp->count; i++) {
4395 if(ssp->fapp[i] != NULL) {
4396 if(sap_head == NULL) {
4397 sap_head = ssp->fapp[i]->sap;
4398 *next_tail = sap_head;
4399 } else {
4400 (*next_tail)->next = ssp->fapp[i]->sap;
4401 *next_tail = (*next_tail)->next;
4402 }
4403 ssp->fapp[i]->sap = NULL;
4404 }
4405 }
4406
4407 return sap_head;
4408 }
4409
4410 static SappSetPtr BLASTCreateSappArray(SeqAlignPtr sap_in,
4411 Boolean subject_is_aa,
4412 Boolean is_ooframe)
4413 {
4414 Int4 count;
4415 SeqAlignPtr sap;
4416 SappSetPtr ssp;
4417
4418 if(sap_in == NULL)
4419 return NULL;
4420
4421 for(count = 0, sap = sap_in; sap != NULL; sap = sap->next)
4422 count++;
4423
4424 ssp = (SappSetPtr) MemNew(sizeof(SappSet));
4425
4426 ssp->count = count;
4427
4428 ssp->fapp = (FilterAlignPtr PNTR) MemNew(count*sizeof(FilterAlignPtr));
4429
4430 for(count = 0, sap = sap_in; sap != NULL; count++) {
4431 ssp->fapp[count] = MemNew(sizeof(FilterAlign));
4432 ssp->fapp[count]->sap = sap;
4433 SeqIdWrite(TxGetSubjectIdFromSeqAlign(sap),
4434 ssp->fapp[count]->subject_sip,
4435 PRINTID_FASTA_LONG,
4436 sizeof(ssp->fapp[count]->subject_sip));
4437 ssp->fapp[count]->hsp = BXMLGetHspFromSeqAlign(sap, subject_is_aa, 0,
4438 is_ooframe, NULL);
4439
4440 sap = sap->next;
4441 ssp->fapp[count]->sap->next = NULL; /* Single SAP */
4442 }
4443
4444 return ssp;
4445 }
4446
4447 static SappSetPtr BLASTGetNextSapArray(SappSetPtr ssp_in, Int4 pct)
4448 {
4449 CharPtr current_sip;
4450 Int4 count, i;
4451 SappSetPtr ssp;
4452
4453 /* First - calculating number of SeqAligns for the next sap array */
4454 current_sip = NULL;
4455 count = 0;
4456 for(i = 0; i < ssp_in->count; i++) {
4457
4458 if (ssp_in->fapp[i] != NULL) {
4459 if(current_sip == NULL) {
4460 current_sip = ssp_in->fapp[i]->subject_sip;
4461 count++;
4462 continue;
4463 }
4464
4465 if(!StringCmp(current_sip, ssp_in->fapp[i]->subject_sip))
4466 count++;
4467 }
4468 }
4469
4470 if(count == 0)
4471 return NULL;
4472
4473 ssp = (SappSetPtr) MemNew(sizeof(SappSet));
4474 ssp->count = count;
4475 ssp->fapp = (FilterAlignPtr PNTR) MemNew(sizeof(FilterAlignPtr) * ssp->count);
4476
4477 for(i = 0, count = 0; i < ssp_in->count && count < ssp->count; i++) {
4478 if (ssp_in->fapp[i] != NULL) {
4479 if(!StringCmp(current_sip, ssp_in->fapp[i]->subject_sip)) {
4480 ssp->fapp[count] = ssp_in->fapp[i];
4481 ssp_in->fapp[i] = NULL;
4482 count++;
4483 }
4484 }
4485 }
4486 return ssp;
4487 }
4488
4489 /* -----------------------------------------------------------------
4490 This function will filter given SeqAlignPtr for overlaping
4491 regions for the same query/subject pair. Another input parameter is
4492 percentage of overlapping required for the alignment to be removed
4493 from SeqAlignPtr
4494 ---------------------------------------------------------------- */
4495
4496 SeqAlignPtr BLASTFilterOverlapRegions(SeqAlignPtr sap, Int4 pct,
4497 Boolean subject_is_aa,
4498 Boolean is_ooframe,
4499 Boolean sort_array)
4500 {
4501 SeqAlignPtr seqalign, head, tail=NULL, next_tail;
4502 SappSetPtr ssp, ssp_tmp;
4503
4504 if(sap == NULL)
4505 return NULL;
4506
4507 head = NULL;
4508
4509 ssp = BLASTCreateSappArray(sap, subject_is_aa, is_ooframe);
4510
4511 /* Generally speaking SeqAlignPtr coming from Blast engine should be
4512 correctly sorted - if not sorting below may be used */
4513
4514 if(sort_array)
4515 BLASTSortSapArray(ssp);
4516
4517 while(TRUE) {
4518
4519 if((ssp_tmp = BLASTGetNextSapArray(ssp, pct)) == NULL)
4520 break;
4521
4522 BLASTSortSapArray(ssp_tmp);
4523
4524 seqalign = BLASTFilterSapArray(ssp_tmp, pct, &next_tail);
4525
4526 SappSetFree(ssp_tmp);
4527
4528 if(head == NULL) {
4529 head = seqalign;
4530 } else {
4531 tail->next = seqalign;
4532 }
4533
4534 tail = next_tail;
4535 }
4536
4537 SappSetFree(ssp);
4538
4539 return head;
4540 }
4541
4542 Int4
4543 BlastGetNumIdentical(Uint1Ptr query, Uint1Ptr subject, Int4 q_start,
4544 Int4 s_start, Int4 length, Boolean reverse)
4545 {
4546 Int4 i, ident = 0;
4547 Uint1Ptr q, s;
4548
4549 q = &query[q_start];
4550 if (!reverse)
4551 s = &subject[s_start];
4552 else
4553 s = &subject[s_start + length - 1];
4554
4555 for (i=0; i<length; i++) {
4556 if (*q == *s)
4557 ident++;
4558 q++;
4559 if (!reverse)
4560 s++;
4561 else
4562 s--;
4563 }
4564
4565 return ident;
4566 }
4567
4568
4569 int LIBCALLBACK BlastPrintAlignInfo(VoidPtr srch)
4570 {
4571 BlastSearchBlkPtr search = (BlastSearchBlkPtr) srch;
4572 BLAST_HSPPtr hsp, PNTR hsp_array;
4573 Int4 i;
4574 CharPtr title, ptr;
4575 Char query_buffer[BUFFER_LENGTH+1], subject_buffer[BUFFER_LENGTH+1];
4576 SeqIdPtr sip, subject_id, query_id;
4577 Int4 index, score;
4578 Int4 num_mismatches, num_gap_opens, align_length, num_ident;
4579 Uint1Ptr query_seq, subject_start=NULL, subject_seq, rev_subject=NULL;
4580 FloatHi perc_ident, bit_score, evalue;
4581 Char bit_score_buff[10];
4582 CharPtr eval_buff = NULL;
4583 Int4 length=0, query_length, subject_length=0, rev_subject_length=0;
4584 Int4 q_start, q_end, s_start, s_end, q_shift=0, s_shift=0;
4585 CharPtr subject_descr = NULL;
4586 Int4 cutoff_s;
4587 Boolean is_na, is_translated, db_is_na;
4588 Int4 hspcnt, number, numseg;
4589 SeqAlignPtr seqalign = NULL, sap;
4590 FILE *fp = (FILE *)search->output;
4591 BioseqPtr bsp = NULL, query_bsp;
4592 SeqPortPtr spp;
4593 SeqLocPtr subject_slp = NULL;
4594 Uint1 residue;
4595 AlignSumPtr asp = NULL;
4596
4597 if (search->current_hitlist == NULL || search->current_hitlist->hspcnt <= 0) {
4598 search->subject_info = BLASTSubjectInfoDestruct(search->subject_info);
4599 return 0;
4600 }
4601
4602 hspcnt = search->current_hitlist->hspcnt;
4603 is_na = (search->prog_number == blast_type_blastn);
4604 is_translated = (search->prog_number != blast_type_blastn &&
4605 search->prog_number != blast_type_blastp);
4606
4607 subject_slp = search->query_slp->next;
4608 /* Only for the two sequences case, get offset shift if subject
4609 is a subsequence */
4610 if (!search->rdfp && subject_slp)
4611 s_shift = SeqLocStart(subject_slp);
4612 /* Check if query is a subsequence */
4613 q_shift = SeqLocStart(search->query_slp);
4614
4615 if (is_na)
4616 cutoff_s = search->pbp->cutoff_s;
4617 else
4618 cutoff_s = 0;
4619
4620 db_is_na = (search->prog_number != blast_type_blastp &&
4621 search->prog_number != blast_type_tblastn);
4622
4623 if (search->rdfp) {
4624 ReadDBBioseqFetchEnable("blastall", search->rdfp->filename,
4625 db_is_na, TRUE);
4626 readdb_get_descriptor(search->rdfp, search->subject_id, &sip,
4627 &subject_descr);
4628 } else
4629 sip = SeqIdSetDup(search->subject_info->sip);
4630
4631 if (search->prog_number == blast_type_tblastn) {
4632 if (subject_slp) {
4633 spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbi4na);
4634 subject_length = SeqLocLen(subject_slp);
4635 } else {
4636 if (search->rdfp)
4637 bsp = readdb_get_bioseq(search->rdfp, search->subject_id);
4638 else
4639 bsp = BioseqLockById(sip);
4640 spp = SeqPortNew(bsp, 0, -1, Seq_strand_plus, Seq_code_ncbi4na);
4641 subject_length = bsp->length;
4642 }
4643 /* make one longer to "protect" ALIGN. */
4644 subject_start = subject_seq =
4645 MemNew((1+subject_length)*sizeof(Uint1));
4646 for (index=0; index<subject_length; index++) {
4647 subject_seq[index] = SeqPortGetResidue(spp);
4648 }
4649 /* Gap character in last space. */
4650 subject_seq[subject_length] = NULLB;
4651 spp = SeqPortFree(spp);
4652
4653 if (subject_slp && !bsp) {
4654 Change_Loc_Strand(subject_slp, Seq_strand_minus);
4655 spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbi4na);
4656 Change_Loc_Strand(subject_slp, Seq_strand_both);
4657 } else if (bsp)
4658 spp = SeqPortNew(bsp, 0, -1, Seq_strand_minus, Seq_code_ncbi4na);
4659
4660 /* make one longer to "protect" ALIGN. */
4661 rev_subject = MemNew((1+subject_length)*sizeof(Uint1));
4662 for (index=0; index<subject_length; index++)
4663 rev_subject[index] = SeqPortGetResidue(spp);
4664 /* Gap character in last space. */
4665 rev_subject[subject_length] = NULLB;
4666 rev_subject_length = subject_length;
4667 spp = SeqPortFree(spp);
4668 if (search->rdfp)
4669 bsp = BioseqFree(bsp);
4670 else if (bsp)
4671 BioseqUnlock(bsp);
4672 } else if (search->rdfp) {
4673 if (is_na) {
4674 subject_length =
4675 readdb_get_sequence_ex(search->rdfp, search->subject_id,
4676 &subject_start, &length, TRUE);
4677 subject_seq = subject_start + 1;
4678 } else {
4679 subject_length =
4680 readdb_get_sequence(search->rdfp, search->subject_id,
4681 &subject_seq);
4682 }
4683 } else if (search->prog_number == blast_type_blastn) {
4684 bsp = BioseqLockById(sip);
4685 if (subject_slp) {
4686 spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbi4na);
4687 subject_length = SeqLocLen(subject_slp);
4688 } else {
4689 spp = SeqPortNew(bsp, 0, -1, Seq_strand_plus, Seq_code_ncbi4na);
4690 subject_length = bsp->length;
4691 }
4692 if (bsp->repr == Seq_repr_delta)
4693 SeqPortSet_do_virtual(spp, TRUE);
4694 /* make one longer to "protect" ALIGN. */
4695 subject_start = MemNew((2+subject_length)*sizeof(Uint1));
4696 subject_start[0] = ncbi4na_to_blastna[0];
4697 subject_seq = subject_start+1;
4698 index=0;
4699 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
4700 if (IS_residue(residue))
4701 subject_seq[index++] = ncbi4na_to_blastna[residue];
4702 }
4703 /* Gap character in last space. */
4704 subject_seq[index] = ncbi4na_to_blastna[0];
4705 spp = SeqPortFree(spp);
4706 BioseqUnlock(bsp);
4707 } else {
4708 if (subject_slp) {
4709 subject_length = SeqLocLen(subject_slp);
4710 spp = SeqPortNewByLoc(subject_slp, Seq_code_ncbistdaa);
4711 } else {
4712 bsp = BioseqLockById(sip);
4713 subject_length = bsp->length;
4714 spp = SeqPortNew(bsp, 0, -1, Seq_strand_unknown,
4715 Seq_code_ncbistdaa);
4716 BioseqUnlock(bsp);
4717 }
4718 subject_start = (Uint1Ptr) MemNew(((subject_length)+2)*sizeof(Uint1));
4719 /* The first residue is the sentinel. */
4720 subject_start[0] = NULLB;
4721 subject_seq = subject_start+1;
4722 index = 0;
4723 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF) {
4724 if (IS_residue(residue))
4725 subject_seq[index++] = residue;
4726 }
4727 subject_seq[index] = NULLB;
4728 spp = SeqPortFree(spp);
4729 }
4730
4731 if (sip->choice != SEQID_GENERAL ||
4732 StringCmp(((DbtagPtr)sip->data.ptrvalue)->db, "BL_ORD_ID")) {
4733 ptr = (CharPtr) Malloc(BUFFER_LENGTH+1);
4734 SeqIdWrite(sip, ptr, PRINTID_FASTA_LONG, BUFFER_LENGTH);
4735 if (StringNCmp(ptr, "lcl|", 4))
4736 StringCpy(subject_buffer, ptr);
4737 else
4738 StringCpy(subject_buffer, ptr+4);
4739 ptr = MemFree(ptr);
4740 } else {
4741 ptr = StringTokMT(subject_descr, " \t", &subject_descr);
4742 subject_descr = ptr;
4743 StringCpy(subject_buffer, ptr);
4744 }
4745
4746 query_id = search->query_id;
4747 if (is_na)
4748 query_length = search->query_context_offsets[search->first_context+1] -
4749 search->query_context_offsets[search->first_context] - 1;
4750 else
4751 query_length = search->context[0].query->length;
4752
4753 query_bsp = BioseqLockById(query_id);
4754 if (query_id->choice == SEQID_LOCAL) {
4755 title = BioseqGetTitle(query_bsp);
4756 if (title) {
4757 ptr = StringTokMT(title, " ", &title);
4758 StringCpy(query_buffer, ptr);
4759 } else {
4760 ptr = (CharPtr) Malloc(BUFFER_LENGTH+1);
4761 SeqIdWrite(query_bsp->id, ptr, PRINTID_FASTA_LONG,
4762 BUFFER_LENGTH);
4763 if (StringNCmp(ptr, "lcl|", 4))
4764 StringCpy(query_buffer, ptr);
4765 else
4766 StringCpy(query_buffer, ptr+4);
4767 ptr = MemFree(ptr);
4768
4769 }
4770 } else {
4771 SeqIdWrite(query_bsp->id, query_buffer, PRINTID_FASTA_LONG, BUFFER_LENGTH);
4772 }
4773 BioseqUnlock(query_bsp);
4774
4775 hsp_array = search->current_hitlist->hsp_array;
4776
4777 subject_id = GetTheSeqAlignID(sip);
4778 sip = SeqIdSetFree(sip);
4779 if (search->prog_number != blast_type_tblastx &&
4780 search->pbp->gapped_calculation)
4781 RealBlastGetGappedAlignmentTraceback(search, subject_seq, subject_length, rev_subject, rev_subject_length, subject_id, hsp_array, hspcnt, &seqalign, NULL, cutoff_s, FALSE, search->subject_id, TRUE);
4782 else {
4783 SeqIdPtr new_sip, gi_list=NULL;
4784 StdSegPtr ssp;
4785
4786 gi_list = BlastGetAllowedGis(search, search->subject_id, &new_sip);
4787 sip = SeqIdDup(query_id);
4788 if (new_sip)
4789 sip->next = new_sip;
4790 else
4791 sip->next = subject_id;
4792 for (index=0; index<hspcnt; index++) {
4793 hsp = hsp_array[index];
4794 ssp = BLASTHspToStdSeg(search, subject_length, hsp, sip,
4795 FALSE, gi_list);
4796 ssp->ids = SeqIdSetDup(sip);
4797 if (!seqalign)
4798 sap = seqalign = SeqAlignNew();
4799 else {
4800 sap->next = SeqAlignNew();
4801 sap = sap->next;
4802 }
4803 sap->type = 2;
4804 sap->segtype = 3;
4805 sap->segs = ssp;
4806 }
4807 SeqIdFree(sip);
4808 }
4809
4810 subject_id = SeqIdSetFree(subject_id);
4811
4812 if (is_translated || !search->pbp->gapped_calculation) {
4813 asp = MemNew(sizeof(AlignSum));
4814 asp->matrix = NULL;
4815 asp->matrix = load_default_matrix();
4816 asp->is_aa = !is_na;
4817 if (is_translated)
4818 AdjustOffSetsInSeqAlign(seqalign, search->query_slp, subject_slp);
4819 }
4820
4821 /* Evalue buffer is dynamically allocated to avoid compiler warnings
4822 in calls to ScoreAndEvalueToBuffers. */
4823 eval_buff = Malloc(10);
4824
4825 /* Now print the tab-delimited fields, using seqalign */
4826 for (sap = seqalign; sap; sap = sap->next) {
4827 perc_ident = 0;
4828 align_length = 0;
4829 num_gap_opens = 0;
4830 num_mismatches = 0;
4831
4832 GetScoreAndEvalue(sap, &score, &bit_score, &evalue, &number);
4833
4834 /* Do not allow knocking off digit in evalue buffer, so parsers are
4835 not confused. */
4836 ScoreAndEvalueToBuffers(bit_score, evalue,
4837 bit_score_buff, &eval_buff, 0);
4838
4839 query_seq = search->context[search->first_context].query->sequence;
4840
4841 if (!is_translated && search->pbp->gapped_calculation) {
4842 DenseSegPtr dsp = (DenseSegPtr) sap->segs;
4843 Boolean reverse = (dsp->strands[0] != dsp->strands[1]);
4844 numseg = dsp->numseg;
4845
4846 for (i=0; i<numseg; i++) {
4847 align_length += dsp->lens[i];
4848 if (dsp->starts[2*i] != -1 && dsp->starts[2*i+1] != -1) {
4849 if (reverse) {
4850 q_start =
4851 query_length - dsp->starts[2*i] - dsp->lens[i] + 1;
4852 } else {
4853 q_start = dsp->starts[2*i];
4854 }
4855 num_ident = BlastGetNumIdentical(query_seq, subject_seq,
4856 q_start, dsp->starts[2*i+1], dsp->lens[i], FALSE);
4857 perc_ident += num_ident;
4858 num_mismatches += dsp->lens[i] - num_ident;
4859 } else
4860 num_gap_opens++;
4861 }
4862 perc_ident = perc_ident / align_length * 100;
4863
4864 /* Adjust offsets for (concatenated) minus strand */
4865 if (is_na && dsp->starts[0] >= query_length) {
4866 q_start = 2*query_length - dsp->starts[2*numseg-2] -
4867 dsp->lens[numseg-1] + 2;
4868 q_end = 2*query_length - dsp->starts[0] + 1;
4869 s_start = dsp->starts[2*numseg-1] + dsp->lens[numseg-1];
4870 s_end = dsp->starts[1]+1;
4871 } else if (reverse) {
4872 q_start = dsp->starts[0];
4873 q_end = dsp->starts[2*numseg-2] + dsp->lens[numseg-1] - 1;
4874 s_end = dsp->starts[2*numseg-1] + dsp->lens[numseg-1];
4875 s_start = dsp->starts[1]+1;
4876 } else {
4877 q_start = dsp->starts[0] + 1;
4878 q_end = dsp->starts[2*numseg-2] + dsp->lens[numseg-1];
4879 s_start = dsp->starts[1] + 1;
4880 s_end = dsp->starts[2*numseg-1] + dsp->lens[numseg-1];
4881 }
4882 } else {
4883 StdSegPtr ssp = (StdSegPtr) sap->segs;
4884
4885 find_score_in_align(sap, 1, asp);
4886
4887 if (asp->m_frame < 0)
4888 q_start = SeqLocStop(ssp->loc) + 1;
4889 else
4890 q_start = SeqLocStart(ssp->loc) + 1;
4891
4892 if (asp->t_frame < 0)
4893 s_start = SeqLocStop(ssp->loc->next) + 1;
4894 else
4895 s_start = SeqLocStart(ssp->loc->next) + 1;
4896
4897 for (index=1; ssp->next; index++)
4898 ssp = ssp->next;
4899
4900 if (asp->m_frame < 0)
4901 q_end = SeqLocStart(ssp->loc) + 1;
4902 else
4903 q_end = SeqLocStop(ssp->loc) + 1;
4904
4905 if (asp->t_frame < 0)
4906 s_end = SeqLocStart(ssp->loc->next) + 1;
4907 else
4908 s_end = SeqLocStop(ssp->loc->next) + 1;
4909
4910 align_length = asp->totlen;
4911 num_gap_opens = index / 2;
4912 num_mismatches = asp->totlen - asp->gaps - asp->identical;
4913 perc_ident = ((FloatHi) 100*asp->identical)/ (asp->totlen);
4914 }
4915
4916 if (!is_translated) {
4917 /* Adjust coordinates if query and/or subject is a subsequence */
4918 q_start += q_shift;
4919 q_end += q_shift;
4920 s_start += s_shift;
4921 s_end += s_shift;
4922 }
4923
4924 if (perc_ident >= 99.995 && perc_ident < 100.00)
4925 perc_ident = 99.99;
4926
4927 fprintf(fp,
4928 "%s\t%s\t%.2f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\n",
4929 query_buffer, subject_buffer, perc_ident, align_length,
4930 num_mismatches, num_gap_opens, q_start,
4931 q_end, s_start, s_end, eval_buff, bit_score_buff);
4932 }
4933
4934 eval_buff = MemFree(eval_buff);
4935
4936 if (is_translated) {
4937 free_default_matrix(asp->matrix);
4938 MemFree(asp);
4939 }
4940
4941 if (search->rdfp)
4942 ReadDBBioseqFetchDisable();
4943
4944 MemFree(subject_start);
4945 MemFree(rev_subject);
4946 SeqAlignSetFree(seqalign);
4947 MemFree(subject_descr);
4948 fflush(fp);
4949 return 0;
4950 }
4951
4952 static void
4953 FillSequenceBuffers(Uint1* query_seq, Uint1* subject_seq,
4954 Int4* starts, Int4* lengths, Int4 numseg,
4955 char* query_buffer, char* subject_buffer,
4956 Boolean reverse)
4957 {
4958 Int4 index, index1;
4959 const char* blastna_to_iupacna = "ACGTRYMKWSBDHVN-";
4960 const char* blastna_to_iupacna_rev = "TGCAYRKMSWVHDBN-";
4961 Uint1* query_ptr;
4962 Uint1* subject_ptr;
4963 Int4 offset;
4964 char* buffer;
4965
4966 offset = 0;
4967 if (!reverse) {
4968 for (index = 0; index < numseg; ++index) {
4969 buffer = &query_buffer[offset];
4970 if (starts[2*index] != -1) {
4971 query_ptr = &query_seq[starts[2*index]];
4972 for (index1 = 0; index1 < lengths[index]; ++index1) {
4973 *buffer = blastna_to_iupacna[*query_ptr];
4974 buffer++;
4975 query_ptr++;
4976 }
4977 } else {
4978 memset(buffer, '-', lengths[index]);
4979 }
4980 buffer = &subject_buffer[offset];
4981 if (starts[2*index+1] != -1) {
4982 subject_ptr = &subject_seq[starts[2*index+1]];
4983 for (index1 = 0; index1 < lengths[index]; ++index1) {
4984 *buffer = blastna_to_iupacna[*subject_ptr];
4985 buffer++;
4986 subject_ptr++;
4987 }
4988 } else {
4989 memset(buffer, '-', lengths[index]);
4990 }
4991 offset += lengths[index];
4992 }
4993 } else {
4994 for (index = numseg-1; index >=0; --index) {
4995 buffer = &query_buffer[offset];
4996 if (starts[2*index] != -1) {
4997 query_ptr = &query_seq[starts[2*index]+lengths[index]-1];
4998 for (index1 = 0; index1 < lengths[index]; ++index1) {
4999 *buffer = blastna_to_iupacna_rev[*query_ptr];
5000 buffer++;
5001 query_ptr--;
5002 }
5003 } else {
5004 memset(buffer, '-', lengths[index]);
5005 }
5006 buffer = &subject_buffer[offset];
5007 if (starts[2*index+1] != -1) {
5008 subject_ptr = &subject_seq[starts[2*index+1]+lengths[index]-1];
5009 for (index1 = 0; index1 < lengths[index]; ++index1) {
5010 *buffer = blastna_to_iupacna_rev[*subject_ptr];
5011 buffer++;
5012 subject_ptr--;
5013 }
5014 } else {
5015 memset(buffer, '-', lengths[index]);
5016 }
5017 offset += lengths[index];
5018 }
5019 }
5020 }
5021
5022 int LIBCALLBACK
5023 MegaBlastPrintAlignInfo(VoidPtr ptr)
5024 {
5025 BlastSearchBlkPtr search = (BlastSearchBlkPtr) ptr;
5026 BLAST_HSPPtr hsp;
5027 Int4 i, subject_gi;
5028 Int2 context;
5029 CharPtr query_buffer, title;
5030 SeqIdPtr sip, subject_id, query_id;
5031 Int4 hsp_index;
5032 Int4 num_mismatches, num_gap_opens, align_length, num_ident;
5033 BLAST_KarlinBlkPtr kbp;
5034 Uint1Ptr query_seq, subject_seq = NULL;
5035 FloatHi perc_ident, bit_score;
5036 Char bit_score_buff[10];
5037 CharPtr eval_buff = NULL;
5038 GapXEditScriptPtr esp;
5039 Int4 q_start, q_end, s_start, s_end, query_length, numseg;
5040 Int4 q_off, q_shift = 0, s_off, s_shift = 0;
5041 Int4Ptr length, start;
5042 Uint1Ptr strands;
5043 CharPtr subject_descr=NULL, subject_buffer, buffer;
5044 Boolean numeric_sip_type = FALSE;
5045 FILE *fp = (FILE *) search->output;
5046 CharPtr query_seq_buffer, subject_seq_buffer;
5047 Boolean print_sequences;
5048
5049 if (search->current_hitlist == NULL || search->current_hitlist->hspcnt <= 0) {
5050 search->subject_info = BLASTSubjectInfoDestruct(search->subject_info);
5051 return 0;
5052 }
5053
5054 print_sequences = (getenv("PRINT_SEQUENCES") != NULL);
5055
5056 subject_seq = search->subject->sequence_start + 1;
5057
5058 if (search->rdfp) {
5059 readdb_get_descriptor(search->rdfp, search->subject_id, &sip,
5060 &subject_descr);
5061 } else
5062 sip = SeqIdSetDup(search->subject_info->sip);
5063
5064 if (!search->pbp->mb_params->full_seqids)
5065 subject_id = SeqIdFindBestAccession(sip);
5066 else
5067 subject_id = sip;
5068
5069 if (subject_id->choice == SEQID_GI) {
5070 SeqIdPtr new_subject_seqid = NULL;
5071 ValNodePtr gi_list =
5072 BlastGetAllowedGis(search, search->subject_id, &new_subject_seqid);
5073
5074 if (gi_list) {
5075 /* change subject's gi with this 'use_this_gi' gi */
5076 subject_id->data.intvalue = gi_list->data.intvalue;
5077 }
5078 }
5079
5080 if (subject_id->choice != SEQID_GENERAL ||
5081 StringCmp(((DbtagPtr)subject_id->data.ptrvalue)->db, "BL_ORD_ID")) {
5082 if (search->pbp->mb_params->full_seqids) {
5083 subject_buffer = (CharPtr) Malloc(BUFFER_LENGTH + 1);
5084 SeqIdWrite(subject_id, subject_buffer, PRINTID_FASTA_LONG, BUFFER_LENGTH);
5085 } else {
5086 numeric_sip_type =
5087 GetAccessionVersionFromSeqId(subject_id, &subject_gi,
5088 &subject_buffer, TRUE);
5089 }
5090 } else {
5091 subject_buffer = StringTokMT(subject_descr, " \t", &subject_descr);
5092 subject_descr = subject_buffer;
5093 }
5094
5095 buffer = (CharPtr) Malloc(LARGE_BUFFER_LENGTH);
5096
5097 /* Only for the two sequences case, get offset shift if subject
5098 is a subsequence */
5099 if (!search->rdfp && search->query_slp->next)
5100 s_shift = SeqLocStart(search->query_slp->next);
5101 /* Get offset shift if query is a subsequence */
5102 q_shift = SeqLocStart(search->query_slp);
5103
5104 /* Evalue buffer is dynamically allocated to avoid compiler warnings
5105 in calls to ScoreAndEvalueToBuffers. */
5106 eval_buff = Malloc(10);
5107
5108 for (hsp_index=0; hsp_index<search->current_hitlist->hspcnt; hsp_index++) {
5109 hsp = search->current_hitlist->hsp_array[hsp_index];
5110 if (hsp==NULL || (search->pbp->cutoff_e > 0 &&
5111 hsp->evalue > search->pbp->cutoff_e)) {
5112 continue;
5113 }
5114 context = hsp->context;
5115 query_id = search->qid_array[context/2];
5116
5117
5118 if (query_id == NULL) /* Bad hsp, something wrong */
5119 continue;
5120 hsp->context = context & 1;
5121
5122 query_length = search->query_context_offsets[context+1] -
5123 search->query_context_offsets[context] - 1;
5124
5125 query_seq = search->context[context].query->sequence;
5126 kbp = search->sbp->kbp[context];
5127
5128 /* If traceback hasn't been done yet, do it now */
5129 if (!hsp->gap_info) {
5130 /* This must have already been allocated */
5131 GapAlignBlkPtr gap_align = search->gap_align;
5132 FloatHi searchsp_eff;
5133 Int4 max_offset, max_start = MAX_DBSEQ_LEN / 2, start_shift;
5134
5135 /* Set the X-dropoff to the final X dropoff parameter. */
5136 gap_align->x_parameter = search->pbp->gap_x_dropoff_final;
5137 gap_align->query = query_seq;
5138 gap_align->query_length = query_length;
5139 gap_align->subject = subject_seq;
5140 gap_align->subject_length = search->subject->length;
5141 if ((hsp->query.gapped_start == 0 && hsp->subject.gapped_start == 0) ||
5142 CheckStartForGappedAlignment(search, hsp, gap_align->query, gap_align->subject, search->sbp->matrix) == FALSE) {
5143 max_offset = GetStartForGappedAlignment(search, hsp, gap_align->query, gap_align->subject, search->sbp->matrix);
5144 gap_align->q_start = max_offset;
5145 gap_align->s_start = (hsp->subject.offset - hsp->query.offset) + max_offset;
5146 hsp->query.gapped_start = gap_align->q_start;
5147 hsp->subject.gapped_start = gap_align->s_start;
5148 } else {
5149 gap_align->q_start = hsp->query.gapped_start;
5150 gap_align->s_start = hsp->subject.gapped_start;
5151 }
5152
5153 if (gap_align->s_start > max_start) {
5154 start_shift = (gap_align->s_start / max_start) * max_start;
5155 gap_align->subject = gap_align->subject + start_shift;
5156
5157 gap_align->s_start %= max_start;
5158 } else
5159 start_shift = 0;
5160
5161 gap_align->subject_length =
5162 MIN(gap_align->subject_length - start_shift,
5163 gap_align->s_start + hsp->subject.length + max_start);
5164 PerformGappedAlignmentWithTraceback(gap_align);
5165 hsp->query.offset = gap_align->query_start;
5166 hsp->subject.offset = gap_align->subject_start + start_shift;
5167 /* The end is one further for BLAST than for the gapped align. */
5168 hsp->query.end = gap_align->query_stop + 1;
5169 hsp->subject.end = gap_align->subject_stop + 1 + start_shift;
5170 if (gap_align->edit_block && start_shift > 0) {
5171 gap_align->edit_block->start2 += start_shift;
5172 gap_align->edit_block->length2 += start_shift;
5173 }
5174 hsp->query.length = hsp->query.end - hsp->query.offset;
5175 hsp->subject.length = hsp->subject.end - hsp->subject.offset;
5176 hsp->score = gap_align->score;
5177 hsp->gap_info = gap_align->edit_block;
5178 searchsp_eff = (FloatHi) search->dblen_eff *
5179 (FloatHi) search->context[hsp->context].query->effective_length;
5180
5181 hsp->evalue =
5182 BlastKarlinStoE_simple(hsp->score, kbp, searchsp_eff);
5183 if (hsp->evalue > search->pbp->cutoff_e) {
5184 continue;
5185 }
5186 }
5187
5188 if (query_id->choice == SEQID_LOCAL &&
5189 search->pbp->mb_params->full_seqids) {
5190 Int4 local_index;
5191 if (GetAccessionVersionFromSeqId(query_id, &local_index,
5192 &query_buffer, TRUE)) {
5193 /* If local id is numeric, it is a fake id, so try to retrieve the
5194 real one */
5195 BioseqPtr query_bsp = BioseqLockById(query_id);
5196 title = StringSave(BioseqGetTitle(query_bsp));
5197 if (title)
5198 query_buffer = StringTokMT(title, " ", &title);
5199 else /* Failed to find any string for this id */
5200 query_buffer = StringSave("QUERY");
5201 BioseqUnlock(query_bsp);
5202 }
5203 } else {
5204 query_buffer = (CharPtr) Malloc(BUFFER_LENGTH + 1);
5205 if (!search->pbp->mb_params->full_seqids) {
5206 SeqIdWrite(SeqIdFindBestAccession(query_id), query_buffer,
5207 PRINTID_TEXTID_ACC_VER, BUFFER_LENGTH);
5208 } else {
5209 SeqIdWrite(query_id, query_buffer, PRINTID_FASTA_LONG,
5210 BUFFER_LENGTH);
5211 }
5212 }
5213
5214 bit_score = (hsp->score*kbp->Lambda - kbp->logK) / NCBIMATH_LN2;
5215
5216 esp = hsp->gap_info->esp;
5217
5218 for (numseg=0; esp; esp = esp->next, numseg++);
5219
5220 q_off = hsp->query.offset;
5221 s_off = hsp->subject.offset;
5222
5223 GXECollectDataForSeqalign(hsp->gap_info, hsp->gap_info->esp, numseg,
5224 &start, &length, &strands,
5225 &q_off, &s_off);
5226
5227 if (start[0] < 0) {
5228 length[0] += start[0];
5229 start[1] -= start[0];
5230 start[0] = 0;
5231 }
5232 if (start[2*(numseg-1)] + length[numseg-1] > query_length)
5233 length[numseg-1] = query_length - start[2*(numseg-1)];
5234
5235 perc_ident = 0;
5236 align_length = 0;
5237 num_gap_opens = 0;
5238 num_mismatches = 0;
5239 for (i=0; i<numseg; i++) {
5240 if (start[2*i] != -1 && start[2*i+1] != -1) {
5241 num_ident = MegaBlastGetNumIdentical(query_seq, subject_seq,
5242 start[2*i], start[2*i+1],
5243 length[i], FALSE);
5244 perc_ident += num_ident;
5245 num_mismatches += length[i] - num_ident;
5246 } else {
5247 num_gap_opens++;
5248 }
5249 align_length += length[i];
5250 }
5251 perc_ident = perc_ident / align_length * 100;
5252 /* Avoid printing 100.00 when the hit is not an exact match */
5253 if (perc_ident >= 99.995 && perc_ident < 100.00)
5254 perc_ident = 99.99;
5255
5256 if (perc_ident < search->pbp->mb_params->perc_identity) {
5257 MemFree(start);
5258 MemFree(length);
5259 MemFree(strands);
5260 MemFree(query_buffer);
5261 continue;
5262 }
5263
5264 if (print_sequences) {
5265 /* Fill the query and subject sequence buffers */
5266 query_seq_buffer = MemNew((align_length+1));
5267 subject_seq_buffer = MemNew((align_length+1));
5268 FillSequenceBuffers(query_seq, subject_seq, start, length, numseg,
5269 query_seq_buffer, subject_seq_buffer,
5270 (context%2 == 1));
5271 }
5272
5273 if (hsp->context) {
5274 hsp->query.end = query_length - hsp->query.offset;
5275 hsp->query.offset =
5276 hsp->query.end - hsp->query.length;
5277 s_end = hsp->subject.offset + 1;
5278 s_start = hsp->subject.end;
5279 } else {
5280 hsp->query.end = hsp->query.offset + hsp->query.length;
5281 s_start = hsp->subject.offset + 1;
5282 s_end = hsp->subject.offset + hsp->subject.length;
5283 }
5284
5285 q_start = hsp->query.offset + 1;
5286 q_end = hsp->query.end;
5287
5288 /* Adjust offsets if query is a subsequence, only for first query */
5289 if (context < 2) {
5290 q_start += q_shift;
5291 q_end += q_shift;
5292 }
5293
5294 s_start += s_shift;
5295 s_end += s_shift;
5296
5297 /* Do not allow knocking off digit in evalue buffer, so parsers are
5298 not confused. */
5299 ScoreAndEvalueToBuffers(bit_score, hsp->evalue,
5300 bit_score_buff, &eval_buff, 0);
5301
5302 if (print_sequences) {
5303 if (numeric_sip_type) {
5304 fprintf(fp,
5305 "%s\t%ld\t%.2f\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%s\t%s\t%s\t%s\n",
5306 query_buffer, (long) subject_gi, perc_ident,
5307 (long) align_length, (long) num_mismatches,
5308 (long) num_gap_opens, (long) q_start, (long) q_end,
5309 (long) s_start, (long) s_end, eval_buff, bit_score_buff,
5310 query_seq_buffer, subject_seq_buffer);
5311 } else {
5312 fprintf(fp,
5313 "%s\t%s\t%.2f\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%s\t%s\t%s\t%s\n",
5314 query_buffer, subject_buffer, perc_ident,
5315 (long) align_length, (long) num_mismatches,
5316 (long) num_gap_opens, (long) q_start, (long) q_end,
5317 (long) s_start, (long) s_end, eval_buff, bit_score_buff,
5318 query_seq_buffer, subject_seq_buffer);
5319 }
5320 MemFree(query_seq_buffer);
5321 MemFree(subject_seq_buffer);
5322 } else {
5323 if (numeric_sip_type) {
5324 fprintf(fp,
5325 "%s\t%ld\t%.2f\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%s\t%s\n",
5326 query_buffer, (long) subject_gi, perc_ident,
5327 (long) align_length, (long) num_mismatches,
5328 (long) num_gap_opens, (long) q_start, (long) q_end,
5329 (long) s_start, (long) s_end, eval_buff, bit_score_buff);
5330 } else {
5331 fprintf(fp,
5332 "%s\t%s\t%.2f\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%s\t%s\n",
5333 query_buffer, subject_buffer, perc_ident,
5334 (long) align_length, (long) num_mismatches,
5335 (long) num_gap_opens, (long) q_start, (long) q_end,
5336 (long) s_start, (long) s_end, eval_buff, bit_score_buff);
5337 }
5338 }
5339
5340 MemFree(start);
5341 MemFree(length);
5342 MemFree(strands);
5343 MemFree(query_buffer);
5344 } /* End loop on hsp's */
5345 if (!numeric_sip_type && subject_buffer != subject_descr)
5346 MemFree(subject_buffer);
5347 MemFree(subject_descr);
5348 MemFree(buffer);
5349 MemFree(eval_buff);
5350
5351 sip = SeqIdSetFree(sip);
5352 fflush(fp);
5353 return 0;
5354 }
5355
5356
5357 Boolean FastaCheckDna(CharPtr seq)
5358 {
5359 Uint2 len = 100;
5360 CharPtr ptr = NULL;
5361
5362 if (*seq == '>') {
5363 for (++seq; *seq != NULLB && *seq != '\n'; seq++) ;
5364 }
5365
5366 if ((ptr = StrChr(seq, '>')) != NULL)
5367 len = MIN(len, ptr - seq);
5368 else
5369 len = MIN(len, StringLen(seq));
5370
5371 return (CheckDnaResidue(seq, len, NULL));
5372 }
5373
5374 BLASTHSPSegmentPtr BLASTHSPSegmentFromSeqAlign (SeqAlignPtr sap)
5375 {
5376 BLASTHSPSegmentPtr hsp = (BLASTHSPSegmentPtr)
5377 MemNew(sizeof(BLASTHSPSegment));
5378 Int4 numseg;
5379 DenseDiagPtr ddp;
5380 DenseSegPtr dsp;
5381 StdSegPtr ssp;
5382
5383 switch(sap->segtype) {
5384 case SAS_DENDIAG:
5385 ddp = (DenseDiagPtr) sap->segs;
5386 if (ddp->strands == NULL || ddp->strands[0] != Seq_strand_minus) {
5387 hsp->q_start = ddp->starts[0] + 1;
5388 hsp->q_end = ddp->starts[0] + ddp->len;
5389 } else {
5390 hsp->q_start = ddp->starts[0] + ddp->len;
5391 hsp->q_end = ddp->starts[0] + 1;
5392 }
5393 if (ddp->strands == NULL || ddp->strands[1] != Seq_strand_minus) {
5394 hsp->s_start = ddp->starts[1] + 1;
5395 hsp->s_end = ddp->starts[1] + ddp->len;
5396 } else {
5397 hsp->s_start = ddp->starts[1] + ddp->len;
5398 hsp->s_end = ddp->starts[1] + 1;
5399 }
5400 break;
5401 case SAS_DENSEG:
5402 dsp = (DenseSegPtr) sap->segs;
5403 numseg = dsp->numseg;
5404 if (dsp->strands == NULL || dsp->strands[0] != Seq_strand_minus) {
5405 hsp->q_start = dsp->starts[0] + 1;
5406 hsp->q_end = dsp->starts[2*numseg-2] + dsp->lens[numseg-1];
5407 } else {
5408 hsp->q_start = dsp->starts[2*numseg-2] + 1;
5409 hsp->q_end = dsp->starts[0] + dsp->lens[0];
5410 }
5411
5412 if (dsp->strands == NULL || dsp->strands[1] != Seq_strand_minus) {
5413 hsp->s_start = dsp->starts[1] + 1;
5414 hsp->s_end = dsp->starts[2*numseg-1] + dsp->lens[numseg-1];
5415 } else {
5416 hsp->s_start = dsp->starts[2*numseg-1] + 1;
5417 hsp->s_end = dsp->starts[1] + dsp->lens[0];
5418 }
5419 break;
5420 case SAS_STD:
5421 ssp = (StdSegPtr) sap->segs;
5422
5423 if (SeqLocStrand(ssp->loc) != Seq_strand_minus)
5424 hsp->q_start = SeqLocStart(ssp->loc);
5425 else
5426 hsp->q_end = SeqLocStop(ssp->loc);
5427 if (SeqLocStrand(ssp->loc->next) != Seq_strand_minus)
5428 hsp->s_start = SeqLocStart(ssp->loc->next);
5429 else
5430 hsp->s_end = SeqLocStop(ssp->loc->next);
5431
5432 while (ssp->next)
5433 ssp = ssp->next;
5434
5435 if (SeqLocStrand(ssp->loc) != Seq_strand_minus)
5436 hsp->q_end = SeqLocStop(ssp->loc);
5437 else
5438 hsp->q_start = SeqLocStart(ssp->loc);
5439 if (SeqLocStrand(ssp->loc->next) != Seq_strand_minus)
5440 hsp->s_end = SeqLocStop(ssp->loc->next);
5441 else
5442 hsp->s_start = SeqLocStart(ssp->loc->next);
5443 break;
5444 default:
5445 break;
5446 }
5447 return hsp;
5448 }
5449
5450 #define CLUSTER_GOOD_HIT_LTHRESH 50
5451
5452 SeqAlignPtr BlastClusterHitsFromSeqAlign(SeqAlignPtr seqalign,
5453 CharPtr prog_name,
5454 CharPtr database,
5455 BLAST_OptionsBlkPtr options,
5456 FloatHi length_thresh,
5457 FloatHi score_thresh,
5458 FloatHi overlap_thresh,
5459 Boolean two_sided)
5460 {
5461 SeqAlignPtr sap, prev_sap, head, next_sap;
5462 SeqAlignPtr PNTR sapp;
5463 SeqIdPtr sip, next_sip;
5464 Int4 index, hspcnt, index1, q_overlap;
5465 ValNodePtr mask;
5466 BLASTHSPSegmentPtr PNTR hit_array;
5467 BLASTHSPSegmentPtr hsp, hsp1;
5468 BioseqPtr PNTR bspp;
5469 BioseqPtr bsp1, bsp2;
5470 BlastSearchBlkPtr search1;
5471 BLASTResultHitlistPtr hitlist;
5472 BLASTResultHspPtr best_hsp;
5473 BLAST_KarlinBlkPtr kbp;
5474 FloatHi bit_score, length_coverage, score_coverage;
5475 Boolean db_is_na;
5476 Int4 root1, root2;
5477 Int4Ptr root;
5478
5479 for (sap = seqalign, hspcnt=0; sap; sap = sap->next, hspcnt++);
5480
5481 sapp = (SeqAlignPtr PNTR) Malloc(hspcnt*sizeof(SeqAlignPtr));
5482 bspp = (BioseqPtr PNTR) Malloc(hspcnt*sizeof(BioseqPtr));
5483 hit_array = (BLASTHSPSegmentPtr PNTR)
5484 Malloc(hspcnt*sizeof(BLASTHSPSegmentPtr));
5485
5486 db_is_na = TRUE;
5487 if (!StrCmp(prog_name, "blastp") || !StrCmp(prog_name, "blastx"))
5488 db_is_na = FALSE;
5489
5490 ReadDBBioseqFetchEnable ("blastall", database, db_is_na, TRUE);
5491
5492 sip = next_sip = NULL;
5493 prev_sap = NULL;
5494 for (sap = seqalign, hspcnt=0; sap; sap = sap->next) {
5495 next_sip = TxGetSubjectIdFromSeqAlign(sap);
5496 if (SeqIdComp(next_sip, sip) != SIC_YES) {
5497 sapp[hspcnt] = sap;
5498 if (prev_sap)
5499 prev_sap->next = NULL;
5500 bspp[hspcnt]= BioseqLockById(next_sip);
5501 hit_array[hspcnt] = BLASTHSPSegmentFromSeqAlign(sap);
5502 hspcnt++;
5503 sip = next_sip;
5504 }
5505 prev_sap = sap;
5506 }
5507
5508 root = (Int4Ptr) Malloc(hspcnt*sizeof(Int4));
5509 for (index=0; index<hspcnt; index++)
5510 root[index] = index;
5511
5512 index = 0;
5513 while (index<hspcnt) {
5514 hsp = hit_array[index];
5515
5516 bsp1 = bspp[index];
5517
5518 search1 =
5519 BlastQuerySequenceSetUp(bsp1, prog_name, options);
5520 root2 = index;
5521 while (root[root2] != root2)
5522 root2 = root[root2];
5523
5524 for (index1=index+1; index1<hspcnt; index1++) {
5525 root1 = index1;
5526 while (root[root1] != root1)
5527 root1 = root[root1];
5528 /* If these hits are already in one cluster, no need to compare them */
5529 if (root1 == root2)
5530 continue;
5531 hsp1 = hit_array[index1];
5532 bsp2 = bspp[index1];
5533
5534 /* For blastn, check if these hits are good enough to be involved in
5535 clustering, to avoid pairwise comparison of too many sequences */
5536 if (!StrCmp(prog_name, "blastn") &&
5537 MIN(hsp->q_end-hsp->q_start, hsp1->q_end-hsp1->q_start) <
5538 CLUSTER_GOOD_HIT_LTHRESH)
5539 continue;
5540 /* Check if the two lengths are comparable, for the 2-sided case */
5541 if (two_sided && ((FloatHi)ABS(bsp1->length - bsp2->length)) /
5542 MIN(bsp1->length, bsp2->length) > length_thresh)
5543 continue;
5544 /* Check if the next hit passes a simple query overlap test to be a
5545 candidate to belong to this cluster */
5546 q_overlap =
5547 MIN(hsp->q_end, hsp1->q_end) -
5548 MAX(hsp->q_start, hsp1->q_start);
5549 if (((FloatHi)q_overlap) /
5550 (MAX(hsp->q_end-hsp->q_start, hsp1->q_end-hsp1->q_start) + 1) <
5551 overlap_thresh)
5552 continue;
5553
5554 /* Do the two sequences search to determine whether this
5555 candidate in fact belongs to this cluster */
5556 search1 = BlastSequencesOnTheFlyEx(search1, bsp2);
5557
5558 if (search1) {
5559 hitlist = search1->result_struct->results[0];
5560 if (hitlist && hitlist->hspcnt > 0) {
5561 best_hsp = &hitlist->hsp_array[0];
5562 if (search1->pbp->gapped_calculation)
5563 kbp = search1->sbp->kbp_gap[search1->first_context];
5564 else
5565 kbp = search1->sbp->kbp[search1->first_context];
5566 bit_score = ((hitlist->high_score *
5567 kbp->Lambda) - kbp->logK)/NCBIMATH_LN2;
5568
5569 if (two_sided)
5570 length_coverage = MIN(((FloatHi)best_hsp->query_length) /
5571 bsp1->length,
5572 ((FloatHi)best_hsp->subject_length) /
5573 bsp2->length);
5574 else
5575 length_coverage = MAX(((FloatHi)best_hsp->query_length) /
5576 bsp1->length,
5577 ((FloatHi)best_hsp->subject_length) /
5578 bsp2->length);
5579 score_coverage = bit_score /
5580 MAX(best_hsp->query_length, best_hsp->subject_length);
5581 if (score_coverage > score_thresh &&
5582 length_coverage > length_thresh) {
5583 /* remove the respective hit */
5584 root1 = index1;
5585 while (root[root1] != root1)
5586 root1 = root[root1];
5587 root[root1] = root[root2];
5588 }
5589 }
5590 }
5591 }
5592 if (search1) {
5593 mask = search1->mask;
5594 while (mask) {
5595 SeqLocSetFree(mask->data.ptrvalue);
5596 mask = mask->next;
5597 }
5598 ValNodeFree(search1->mask);
5599 search1 = BlastSearchBlkDestruct(search1);
5600 }
5601 for (++index; index<hspcnt && hit_array[index]==NULL; index++);
5602 }
5603
5604 /* Remove the unneeded seqaligns */
5605 head = NULL;
5606
5607 for (index=0; index<hspcnt; index++) {
5608 if (root[index] == index) {
5609 if (!head) {
5610 head = (SeqAlignPtr) Malloc(sizeof(SeqAlign));
5611 MemCpy(head, sapp[index], sizeof(SeqAlign));
5612 for (sap = head; sap->next; sap = sap->next) {
5613 next_sap = sap->next;
5614 sap->next = (SeqAlignPtr) Malloc(sizeof(SeqAlign));
5615 MemCpy(sap->next, next_sap, sizeof(SeqAlign));
5616 }
5617 sapp[index] = head;
5618 } else {
5619 sap->next = sapp[index];
5620 for (index1=0; sap->next; sap = sap->next, index1=1) {
5621 next_sap = sap->next;
5622 sap->next = (SeqAlignPtr) Malloc(sizeof(SeqAlign));
5623 MemCpy(sap->next, next_sap, sizeof(SeqAlign));
5624 if (index1==0)
5625 sapp[index] = sap->next;
5626 }
5627 }
5628 } else {
5629 /* Save the ids of sequences in the cluster in the 'master' field of
5630 SeqAlign structure (kludge) */
5631 if (!sapp[root[index]]->master) {
5632 sapp[root[index]]->master =
5633 SeqIdDup(TxGetSubjectIdFromSeqAlign(sapp[index]));
5634 } else {
5635 sip = sapp[root[index]]->master;
5636 while (sip->next)
5637 sip = sip->next;
5638 sip->next = SeqIdDup(TxGetSubjectIdFromSeqAlign(sapp[index]));
5639 }
5640 }
5641 }
5642
5643 for (index=0; index<hspcnt; index++)
5644 BioseqUnlock(bspp[index]);
5645 MemFree(bspp);
5646
5647 ReadDBBioseqFetchDisable();
5648
5649 return head;
5650 }
5651
5652 /* This function does everything necessary to produce a SeqAlign from the
5653 intermediate BLAST results structures;
5654 In case of multiple queries, the SeqAlign array must be allocated outside.
5655 */
5656 void
5657 BLASTPostSearchLogic(BlastSearchBlkPtr search, BLAST_OptionsBlkPtr options,
5658 SeqAlignPtr PNTR seqalignp, Boolean single_chain)
5659 {
5660 BLASTResultHspPtr hsp;
5661 GapXEditBlockPtr edit_block;
5662 SeqIdPtr gi_list=NULL,subject_id;
5663 SeqAlignPtr head,seqalign,tail;
5664 BLASTResultsStructPtr result_struct;
5665 BLASTResultHitlistPtr result_hitlist;
5666 Uint1Ptr sequence;
5667 Int4 length,hitlist_count,sequence_length,index,index1,total_num_hsp=0,hspcnt;
5668 Char buffer[512];
5669 Int2 num_queries, query_index;
5670
5671 /* AM: Support for query multiplexing. */
5672 MQ_DivideResultsInfoPtr subjects = NULL;
5673 Int4 index2;
5674 Int4 nhlists = 0;
5675
5676 head = NULL;
5677 tail = NULL;
5678 if (single_chain)
5679 *seqalignp = NULL;
5680
5681 if (search->prog_number==blast_type_blastn) {
5682 /* Unconcatenate the strands by adjusting the query offsets in
5683 all hsps */
5684 search->context[search->first_context].query->length =
5685 search->query_context_offsets[search->first_context+1] - 1;
5686 }
5687
5688 if (StringCmp(search->prog_name, "blastn") == 0 &&
5689 search->pbp->gapped_calculation)
5690 {
5691 num_queries = search->last_context / 2 + 1;
5692
5693 for (query_index=0; query_index<num_queries; query_index++) {
5694 if (options) {
5695 search->pbp->gap_open = options->gap_open;
5696 search->pbp->gap_extend = options->gap_extend;
5697 }
5698 /*
5699 search->pbp->gap_x_dropoff = (BLAST_Score) (options->gap_x_dropoff*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
5700 search->pbp->gap_x_dropoff_final = (BLAST_Score) (options->gap_x_dropoff_final*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
5701 */
5702 if (!search->pbp->mb_params) {
5703 result_struct = search->result_struct;
5704 } else {
5705 result_struct = search->mb_result_struct[query_index];
5706 }
5707
5708 if (!result_struct)
5709 continue;
5710 hitlist_count = result_struct->hitlist_count;
5711
5712 sequence=NULL;
5713 sequence_length=0;
5714
5715 for (index=0; index<hitlist_count; index++)
5716 {
5717 if (search->pbp->mb_params && !search->pbp->mb_params->no_traceback
5718 && !search->pbp->mb_params->use_dyn_prog) {
5719 /* Traceback has already been computed */
5720 seqalign =
5721 MegaBlastGapInfoToSeqAlign(search, index, query_index);
5722 } else {
5723 length = readdb_get_sequence_ex(search->rdfp,
5724 result_struct->results[index]->subject_id,
5725 &sequence, &sequence_length, TRUE);
5726
5727 if (!search->pbp->mb_params) {
5728 /* Traditional Blastn */
5729 seqalign = SumBlastGetGappedAlignmentTraceback(
5730 search, index, FALSE, FALSE,
5731 sequence+1, length);
5732 } else {
5733 /* Mega BLAST with non-greedy extension */
5734 SumBlastGetGappedAlignmentEx(search, index, FALSE, FALSE,
5735 sequence+1, length, TRUE,
5736 &seqalign, NULL, query_index);
5737 }
5738 }
5739 result_struct->results[index]->seqalign = seqalign;
5740 total_num_hsp += result_struct->results[index]->hspcnt;
5741 BLASTResultFreeHsp(result_struct->results[index]);
5742 if (search->pbp->total_hsp_limit > 0 && total_num_hsp > search->pbp->total_hsp_limit)
5743 {
5744 sprintf(buffer, "Only alignments to %ld best database sequences returned", (long) index+1);
5745 BlastConstructErrorMessage("EngineCore", buffer, 1, &(search->error_return));
5746 break;
5747 }
5748 }
5749 sequence = MemFree(sequence);
5750 sequence_length = 0;
5751
5752 HeapSort(result_struct->results, hitlist_count, sizeof(BLASTResultHitlistPtr), evalue_compare_hits);
5753
5754 /*
5755 The next loop organizes the SeqAligns (and the alignments
5756 in the BLAST report) in the same order as the deflines.
5757 */
5758 if (!single_chain) {
5759 head = tail = NULL;
5760 }
5761
5762 /* AM: Support for query multiplexing. */
5763 if( search->mult_queries )
5764 {
5765 subjects = (MQ_DivideResultsInfoPtr)MemNew( sizeof( MQ_DivideResultsInfo )*hitlist_count );
5766 nhlists = hitlist_count;
5767 }
5768 else nhlists = MIN(hitlist_count, options->hitlist_size);
5769
5770 for (index=0, index2 = 0; index<nhlists; index++) {
5771 seqalign = result_struct->results[index]->seqalign;
5772 if (seqalign) {
5773
5774 /* AM: Support for query multiplexing. */
5775 if( search->mult_queries )
5776 {
5777 subjects[index2].evalue
5778 = result_struct->results[index]->best_evalue;
5779 subjects[index2++].subject_id
5780 = result_struct->results[index]->subject_id;
5781 }
5782
5783 if (head == NULL) {
5784 head = seqalign;
5785 } else {
5786 for (; tail->next; tail = tail->next);
5787 tail->next = seqalign;
5788 }
5789 tail = seqalign;
5790 }
5791 }
5792
5793 /* AM: Support for query multiplexing. */
5794 if( search->mult_queries )
5795 {
5796 search->mult_queries->sap_array_data->sap_array
5797 = DivideSeqAligns( options, head,
5798 search->mult_queries, subjects );
5799 subjects = MemFree( subjects );
5800 }
5801
5802 /* Free the extra seqaligns */
5803 for ( ; index < hitlist_count; index++)
5804 result_struct->results[index]->seqalign =
5805 SeqAlignSetFree(result_struct->results[index]->seqalign);
5806
5807 if (!single_chain)
5808 seqalignp[query_index] = head;
5809 }
5810 if (!single_chain)
5811 head = NULL;
5812 }
5813 else if (search->pbp->gapped_calculation)
5814 /* Non-blastn programs */
5815 {
5816 result_struct = search->result_struct;
5817 hitlist_count = result_struct->hitlist_count;
5818
5819 if (!options || !(options->smith_waterman ||
5820 options->tweak_parameters)) {
5821
5822 for (index=0; index<hitlist_count; index++) {
5823 seqalign = BlastGetGapAlgnTbckWithReaddb(search, index, FALSE);
5824 result_struct->results[index]->seqalign = seqalign;
5825 /*BLASTResultFreeHsp(result_struct->results[index]);*/
5826 total_num_hsp += result_struct->results[index]->hspcnt;
5827 if (search->pbp->total_hsp_limit > 0 && total_num_hsp > search->pbp->total_hsp_limit)
5828 {
5829 sprintf(buffer, "Only alignments to %ld best database sequences returned", (long) index+1);
5830 BlastConstructErrorMessage("EngineCore", buffer, 1, &(search->error_return));
5831 break;
5832 }
5833 }
5834 HeapSort(result_struct->results, hitlist_count,
5835 sizeof(BLASTResultHitlistPtr), evalue_compare_hits);
5836
5837 for (index=0; index<hitlist_count; index++) {
5838 BLASTResultFreeHsp(result_struct->results[index]);
5839 }
5840 index = 0;
5841 /* AM: Support for query multiplexing. */
5842 if (search->mult_queries) {
5843 subjects = (MQ_DivideResultsInfoPtr)
5844 MemNew(sizeof(MQ_DivideResultsInfo) * hitlist_count);
5845 nhlists = hitlist_count;
5846 } else {
5847 nhlists = MIN(hitlist_count, options->hitlist_size);
5848 }
5849 /* Only save alignments up to the original hitlist size */
5850 for (index2 = 0; index < nhlists; index++) {
5851 seqalign = result_struct->results[index]->seqalign;
5852 /* AM: Support for query multiplexing. */
5853 if (search->mult_queries) {
5854 subjects[index2].evalue
5855 = result_struct->results[index]->best_evalue;
5856 subjects[index2++].subject_id
5857 = result_struct->results[index]->subject_id;
5858 }
5859 if (seqalign) {
5860 if (head == NULL) {
5861 head = seqalign;
5862 } else {
5863 for ( ; tail->next; tail = tail->next) ;
5864 tail->next = seqalign;
5865 }
5866 tail = seqalign;
5867 }
5868 }
5869 /* AM: Support for query multiplexing. */
5870 if (search->mult_queries) {
5871 search->mult_queries->sap_array_data->sap_array
5872 = DivideSeqAligns(options, head,
5873 search->mult_queries, subjects);
5874 subjects = MemFree(subjects);
5875 }
5876 } else {
5877 SeqAlignPtr * sap =
5878 RedoAlignmentCore(search, options, hitlist_count,
5879 options->tweak_parameters,
5880 options->smith_waterman);
5881 if (search->mult_queries) {
5882 int query_index;
5883 search->mult_queries->sap_array_data->sap_array = sap;
5884 /* Set head to the first non-null list of seqAligns; or
5885 * NULL if none exists -- a nonnil value of head
5886 * indicates that results were found, even if head
5887 * doesn't contain all results. */
5888 head = NULL;
5889 for (query_index = 0;
5890 query_index < search->mult_queries->NumQueries;
5891 query_index++) {
5892 if (sap[query_index] != NULL) {
5893 head = sap[query_index];
5894 break;
5895 }
5896 }
5897 } else {
5898 if (sap)
5899 {
5900 head = sap[0];
5901 MemFree(sap);
5902 }
5903 }
5904 for (index=0; index<hitlist_count; index++) {
5905 BLASTResultFreeHsp(result_struct->results[index]);
5906 }
5907 /* Set index = 0 to free any seqaligns remaining in the
5908 * results_struct; RedoAlignmentCore doesn't add seqaligns
5909 * to results_struct, so if there are any they are the
5910 * result of an early phase of blast. */
5911 index = 0;
5912 }
5913 /* Free the unused seqaligns (all in case of PSI BLAST,
5914 extra ones otherwise */
5915 for ( ; index<hitlist_count; index++) {
5916 result_struct->results[index]->seqalign =
5917 SeqAlignSetFree(result_struct->results[index]->seqalign);
5918 }
5919 search->number_of_seqs_better_E =
5920 MIN(hitlist_count, options->hitlist_size);
5921 } else {
5922 /* Ungapped psi-blast. */
5923 if (StringCmp("blastp", search->prog_name) == 0 && search->positionBased == TRUE)
5924 {
5925 result_struct = search->result_struct;
5926 hitlist_count = result_struct->hitlist_count;
5927 for (index=0; index<hitlist_count; index++)
5928 {
5929 result_hitlist = result_struct->results[index];
5930 gi_list = BlastGetAllowedGis(search, result_hitlist->subject_id, NULL);
5931 hspcnt = result_hitlist->hspcnt;
5932 subject_id = BlastGetSubjectId(search, index, TRUE, NULL);
5933 for (index1=0; index1<hspcnt; index1++)
5934 {
5935 hsp = &(result_hitlist->hsp_array[index1]);
5936 edit_block = SimpleIntervalToGapXEditBlock(hsp->query_offset, hsp->subject_offset, hsp->query_length);
5937 seqalign = GapXEditBlockToSeqAlign(edit_block, SeqIdDup(subject_id), SeqIdDup(search->query_id));
5938 seqalign->score = GetScoreSetFromBlastResultHsp(hsp, gi_list);
5939
5940 if (head == NULL) {
5941 head = seqalign;
5942 } else {
5943 for ( ; tail->next; tail = tail->next);
5944 tail->next = seqalign;
5945 }
5946 tail = seqalign;
5947 total_num_hsp += result_struct->results[index]->hspcnt;
5948 if (search->pbp->total_hsp_limit > 0 && total_num_hsp > search->pbp->total_hsp_limit)
5949 {
5950 sprintf(buffer, "Only alignments to %ld best database sequences returned", (long) index+1);
5951 BlastConstructErrorMessage("EngineCore", buffer, 1, &(search->error_return));
5952 break;
5953 }
5954 }
5955 }
5956 } /* end blastp */
5957 else
5958 {
5959 Boolean discontinuous = ((options != NULL) && options->discontinuous);
5960 if (StringCmp("blastn", search->prog_name) == 0 || StringCmp("blastp", search->prog_name) == 0)
5961 head = GetSeqAlignForResultHitList(search, TRUE, FALSE, discontinuous, FALSE, FALSE);
5962 else
5963 head = GetSeqAlignForResultHitList(search, FALSE, FALSE, discontinuous, FALSE, FALSE);
5964 }
5965 }
5966
5967 gi_list = SeqIdSetFree(gi_list);
5968
5969 if (head)
5970 *seqalignp = head;
5971 }
5972
5973 /*return query fasta style title(id+title). New memory was allocated for this title*/
5974 CharPtr getFastaStyleTitle(BioseqPtr bsp){
5975
5976 CharPtr titleBuf=NULL;
5977
5978 if(bsp){
5979
5980 if (bsp->id)
5981 {
5982 Char idBuf[BUFFER_LENGTH+1];
5983 CharPtr seqTitle, temp=NULL;
5984
5985 SeqIdWrite(bsp->id, idBuf, PRINTID_FASTA_LONG, BUFFER_LENGTH);
5986 temp=idBuf;
5987 if (StringNCmp(idBuf, "lcl|", 4) == 0)
5988 temp=StrStr(idBuf, "gi|");
5989
5990 titleBuf=MemNew(sizeof(Char)*(BUFFER_LENGTH+1));
5991 StringCpy(titleBuf,temp);
5992 seqTitle= BioseqGetTitle(bsp);
5993 StringNCat(titleBuf, seqTitle, BUFFER_LENGTH-StringLen(temp));
5994
5995 }
5996 }
5997 return titleBuf;
5998 }
5999
6000 Int2
6001 BlastHSPGetNumIdentical(BlastSearchBlkPtr search, BLAST_HSPPtr hsp,
6002 BLASTResultHspPtr result_hsp,
6003 Int4Ptr num_ident_ptr, Int4Ptr align_length_ptr)
6004 {
6005 Int4 i, num_ident, align_length, q_off, s_off;
6006 Int2 context;
6007 Uint1Ptr query_seq, subject_seq = NULL, q, s;
6008 GapXEditBlockPtr gap_info;
6009 GapXEditScriptPtr esp;
6010
6011 gap_info = (hsp ? hsp->gap_info : result_hsp->gap_info);
6012
6013 if (!gap_info && search->pbp->gapped_calculation)
6014 return -1;
6015
6016 context = (hsp ? hsp->context : result_hsp->context);
6017 q_off = (hsp ? hsp->query.offset : result_hsp->query_offset);
6018 s_off = (hsp ? hsp->subject.offset : result_hsp->subject_offset);
6019
6020 if (search->pbp->is_ooframe)
6021 q_off /= CODON_LENGTH;
6022
6023 query_seq = search->context[context].query->sequence;
6024 if (search->subject->sequence_start)
6025 subject_seq = search->subject->sequence_start + 1;
6026 else
6027 subject_seq = search->subject->sequence;
6028
6029 if (!subject_seq)
6030 return -1;
6031
6032 q = &query_seq[q_off];
6033 s = &subject_seq[s_off];
6034
6035 num_ident = 0;
6036 align_length = 0;
6037
6038
6039 if (!gap_info) {
6040 /* Ungapped case */
6041 align_length = (hsp ? hsp->query.length : result_hsp->query_length);
6042 for (i=0; i<align_length; i++) {
6043 if (*q++ == *s++)
6044 num_ident++;
6045 }
6046 } else {
6047 for (esp = gap_info->esp; esp; esp = esp->next) {
6048 align_length += esp->num;
6049 switch (esp->op_type) {
6050 case GAPALIGN_SUB:
6051 for (i=0; i<esp->num; i++) {
6052 if (*q++ == *s++)
6053 num_ident++;
6054 }
6055 break;
6056 case GAPALIGN_DEL:
6057 s += esp->num;
6058 break;
6059 case GAPALIGN_INS:
6060 q += esp->num;
6061 break;
6062 default:
6063 s += esp->num;
6064 q += esp->num;
6065 break;
6066 }
6067 }
6068 }
6069
6070 *align_length_ptr = align_length;
6071 *num_ident_ptr = num_ident;
6072 return 0;
6073 }
6074
6075 Int2
6076 OOFBlastHSPGetNumIdentical(Uint1Ptr query_seq, Uint1Ptr subject_seq,
6077 BLAST_HSPPtr hsp, BLASTResultHspPtr result_hsp,
6078 Int4Ptr num_ident_ptr, Int4Ptr align_length_ptr)
6079 {
6080 Int4 i, num_ident, align_length, q_off, s_off;
6081 Int2 context;
6082 Uint1Ptr q, s;
6083 GapXEditBlockPtr gap_info;
6084 GapXEditScriptPtr esp;
6085
6086 gap_info = (hsp ? hsp->gap_info : result_hsp->gap_info);
6087
6088 if (!gap_info)
6089 return -1;
6090
6091 context = (hsp ? hsp->context : result_hsp->context);
6092 s_off = (hsp ? hsp->query.offset : result_hsp->query_offset);
6093 q_off = (hsp ? hsp->subject.offset : result_hsp->subject_offset);
6094
6095 if (!subject_seq || !query_seq)
6096 return -1;
6097
6098 q = &query_seq[q_off];
6099 s = &subject_seq[s_off];
6100
6101 num_ident = 0;
6102 align_length = 0;
6103
6104
6105 for (esp = gap_info->esp; esp; esp = esp->next) {
6106 switch (esp->op_type) {
6107 case 3: /* Substitution */
6108 align_length += esp->num;
6109 for (i=0; i<esp->num; i++) {
6110 if (*q == *s)
6111 num_ident++;
6112 ++q;
6113 s += CODON_LENGTH;
6114 }
6115 break;
6116 case 6: /* Insertion */
6117 align_length += esp->num;
6118 s += esp->num * CODON_LENGTH;
6119 break;
6120 case 0: /* Deletion */
6121 align_length += esp->num;
6122 q += esp->num;
6123 break;
6124 case 1: /* Gap of two nucleotides. */
6125 s -= 2;
6126 break;
6127 case 2: /* Gap of one nucleotide. */
6128 s -= 1;
6129 break;
6130 case 4: /* Insertion of one nucleotide. */
6131 s += 1;
6132 break;
6133 case 5: /* Insertion of two nucleotides. */
6134 s += 2;
6135 break;
6136 default:
6137 s += esp->num * CODON_LENGTH;
6138 q += esp->num;
6139 break;
6140 }
6141 }
6142
6143 *align_length_ptr = align_length;
6144 *num_ident_ptr = num_ident;
6145 return 0;
6146 }
6147
6148
6149 /* --------------------------------------------------------------------
6150 *
6151 * BLAST_Wizard & related functions
6152 *
6153 * --------------------------------------------------------------------
6154 */
6155
6156 void
6157 BLAST_WizardOptionsBlkInit(BLAST_WizardOptionsBlkPtr options)
6158 {
6159 memset(options, 0, sizeof(BLAST_WizardOptionsBlk));
6160 }
6161
6162 void
6163 BLAST_WizardOptionsBlkDone(BLAST_WizardOptionsBlkPtr options)
6164 {
6165 if(options->entrez_query) {
6166 MemFree(options->entrez_query);
6167 options->entrez_query = 0;
6168 }
6169 if(options->filter_string) {
6170 MemFree(options->filter_string);
6171 options->filter_string = 0;
6172 }
6173 ValNodeFree(options->gilist);
6174 options->gilist = 0;
6175 if(options->matrix) {
6176 MemFree(options->matrix);
6177 options->matrix = 0;
6178 }
6179 if(options->phi_pattern) {
6180 MemFree(options->phi_pattern);
6181 options->phi_pattern = 0;
6182 }
6183 if(options->query_lcase_mask) {
6184 ValNodeFreeData(options->query_lcase_mask);
6185 options->query_lcase_mask = 0;
6186 }
6187 if(options->string_options) {
6188 MemFree(options->string_options);
6189 options->string_options = 0;
6190 }
6191 }
6192
6193 void
6194 BLAST_WizardOptionsMaskInit(BLAST_WizardOptionsMaskPtr mask)
6195 {
6196 mask->block_width = FALSE;
6197 mask->cutoff_s = FALSE;
6198 mask->db_genetic_code = FALSE;
6199 mask->ethresh = FALSE;
6200 mask->expect_value = FALSE;
6201 mask->first_db_seq = FALSE;
6202 mask->final_db_seq = FALSE;
6203 mask->gap_extend = FALSE;
6204 mask->gap_open = FALSE;
6205 mask->gapped_calculation = FALSE;
6206 mask->genetic_code = FALSE;
6207 mask->hitlist_size = FALSE;
6208 mask->hsp_range_max = FALSE;
6209 mask->is_ooframe = FALSE;
6210 mask->mb_disc_type = FALSE;
6211 mask->mb_template_length = FALSE;
6212 mask->no_traceback = FALSE;
6213 mask->penalty = FALSE;
6214 mask->perc_identity = FALSE;
6215 mask->perform_culling = FALSE;
6216 mask->pseudoCountConst = FALSE;
6217 mask->required_end = FALSE;
6218 mask->required_start = FALSE;
6219 mask->reward = FALSE;
6220 mask->db_length = FALSE;
6221 mask->searchsp_eff = FALSE;
6222 mask->smith_waterman = FALSE;
6223 mask->strand_option = FALSE;
6224 mask->threshold_first = FALSE;
6225 mask->threshold_second = FALSE;
6226 mask->tweak_parameters = FALSE;
6227 mask->use_best_align = FALSE;
6228 mask->use_real_db_size = FALSE;
6229 mask->window_size = FALSE;
6230 mask->wordsize = FALSE;
6231
6232 mask->two_hits = FALSE;
6233 }
6234
6235 BLAST_OptionsBlkPtr
6236 BLAST_Wizard(
6237 const char* program,
6238 const char* service,
6239 BLAST_WizardOptionsBlkPtr options,
6240 BLAST_WizardOptionsMaskPtr mask,
6241 int* alignments,
6242 int* descriptions,
6243 char** error)
6244 {
6245 int dummy_alignments = -1;
6246 int dummy_descriptions = -1;
6247 BLAST_OptionsBlkPtr out = 0;
6248 Boolean ungapped_alignment;
6249 *error = 0;
6250 if(!alignments)
6251 alignments = &dummy_alignments;
6252 if(!descriptions)
6253 descriptions = &dummy_descriptions;
6254 if(mask->expect_value && mask->cutoff_s) {
6255 *error = StringSave("can't specify both expect_value and cutoff_s");
6256 goto end;
6257 }
6258 ungapped_alignment = mask->gapped_calculation ?
6259 !options->gapped_calculation :
6260 FALSE;
6261 if(!StringCmp(service, "vecscreen")) {
6262 out = VSBlastOptionNew();
6263 out->gapped_calculation = !ungapped_alignment;
6264 }
6265 else {
6266 Boolean is_megablast_search = !StringICmp(service, "megablast");
6267 out = BLASTOptionNewEx(
6268 (char*) program, !ungapped_alignment, is_megablast_search);
6269
6270 /* set some defaults for backward compat. with blastcgicmd.cpp */
6271 if(!StringCmp(service, "psi"))
6272 out->ethresh = 0.002;
6273 else if (!StringCmp(service, "rpsblast"))
6274 out->is_rps_blast = TRUE;
6275
6276 /* ignore some fields for backward compat. with blastcgicmd.cpp */
6277 if(is_megablast_search)
6278 mask->block_width = FALSE;
6279 else {
6280 mask->mb_disc_type = FALSE;
6281 mask->mb_template_length = FALSE;
6282 mask->no_traceback = FALSE;
6283 mask->two_hits = FALSE;
6284 }
6285 }
6286 if(out->is_megablast_search) {
6287 out->no_traceback = mask->no_traceback ?
6288 options->no_traceback :
6289 FALSE;
6290 }
6291 if(mask->is_ooframe)
6292 out->is_ooframe = options->is_ooframe;
6293 if(mask->first_db_seq)
6294 out->first_db_seq = options->first_db_seq;
6295 if(mask->final_db_seq)
6296 out->final_db_seq = options->final_db_seq;
6297 if(mask->pseudoCountConst)
6298 out->pseudoCountConst = options->pseudoCountConst;
6299 if(mask->strand_option)
6300 out->strand_option = options->strand_option;
6301 if(mask->use_best_align)
6302 out->use_best_align = options->use_best_align;
6303 if(mask->use_real_db_size)
6304 out->use_real_db_size = options->use_real_db_size;
6305 if(mask->window_size)
6306 out->window_size = options->window_size;
6307 if(mask->expect_value)
6308 out->expect_value = options->expect_value;
6309 if(mask->cutoff_s)
6310 out->cutoff_s = options->cutoff_s;
6311 if(mask->threshold_first)
6312 out->threshold_first = options->threshold_first;
6313 if(mask->threshold_second)
6314 out->threshold_second = options->threshold_second;
6315 if(mask->perc_identity)
6316 out->perc_identity = options->perc_identity;
6317 if(out->is_megablast_search) {
6318 if (out->perc_identity >= 95.0) {
6319 out->reward = 1;
6320 out->penalty = -3;
6321 }
6322 else if(out->perc_identity >= 85.0
6323 || out->perc_identity == 0.0) {
6324 out->reward = 1;
6325 out->penalty = -2;
6326 }
6327 else if (out->perc_identity >= 80.0) {
6328 out->reward = 2;
6329 out->penalty = -3;
6330 }
6331 else if (out->perc_identity >= 75.0) {
6332 out->reward = 4;
6333 out->penalty = -5;
6334 }
6335 else if (out->perc_identity >= 60.0) {
6336 out->reward = 2;
6337 out->penalty = -2;
6338 }
6339 }
6340 if(mask->penalty)
6341 out->penalty = options->penalty;
6342 if(mask->reward)
6343 out->reward = options->reward;
6344 if(mask->wordsize)
6345 out->wordsize = options->wordsize;
6346 if(mask->db_length)
6347 out->db_length = options->db_length;
6348 if(mask->searchsp_eff)
6349 out->searchsp_eff = options->searchsp_eff;
6350 if(mask->hsp_range_max)
6351 out->hsp_range_max = options->hsp_range_max;
6352 if(!out->is_megablast_search) {
6353 if(mask->block_width)
6354 out->block_width = options->block_width;
6355 }
6356 else {
6357 if(mask->mb_template_length)
6358 out->mb_template_length = options->mb_template_length;
6359 if(mask->two_hits && options->two_hits)
6360 out->window_size = 40;
6361 if(mask->mb_disc_type)
6362 out->mb_disc_type = options->mb_disc_type;
6363 if(out->mb_template_length > 0
6364 && out->wordsize <= 12
6365 && out->wordsize >= 11
6366 && out->perc_identity < 90.0) {
6367 if(out->perc_identity >= 85.0) {
6368 out->gap_open = 4;
6369 out->gap_extend = 1;
6370 }
6371 else if(out->perc_identity >= 80.0) {
6372 out->gap_open = 5;
6373 out->gap_extend = 2;
6374 }
6375 else if(out->perc_identity >= 75) {
6376 out->gap_open = 7;
6377 out->gap_extend = 2;
6378 }
6379 else if(out->perc_identity >= 60) {
6380 out->gap_open = 3;
6381 out->gap_extend = 1;
6382 }
6383 }
6384 }
6385 if(!out->perform_culling)
6386 out->perform_culling = mask->perform_culling ?
6387 options->perform_culling :
6388 FALSE;
6389 if(mask->gap_open)
6390 out->gap_open = options->gap_open;
6391 if(mask->gap_extend)
6392 out->gap_extend = options->gap_extend;
6393 if(mask->genetic_code)
6394 out->genetic_code = options->genetic_code;
6395 if(mask->db_genetic_code)
6396 out->db_genetic_code = options->db_genetic_code;
6397 if(!out->tweak_parameters)
6398 out->tweak_parameters = mask->tweak_parameters ?
6399 options->tweak_parameters :
6400 FALSE;
6401 if(!strcmp(service, "psi")) {
6402 out->ethresh = mask->ethresh ?
6403 options->ethresh :
6404 0.002;
6405 out->tweak_parameters = mask->tweak_parameters ?
6406 options->tweak_parameters :
6407 TRUE;
6408 }
6409 if(options->entrez_query) {
6410 out->entrez_query = options->entrez_query; /* take ownership */
6411 options->entrez_query = 0;
6412 out->gifile = 0;
6413 }
6414 if(options->matrix) {
6415 out->matrix = options->matrix; /* take ownership */
6416 options->matrix = 0;
6417 }
6418 if(options->phi_pattern) {
6419 out->phi_pattern = options->phi_pattern; /* take ownership */
6420 options->phi_pattern = 0;
6421 }
6422 if(out->is_megablast_search) {
6423 if(out->wordsize < 8)
6424 out->wordsize = 8;
6425 out->cutoff_s2 = out->wordsize * out->reward;
6426 }
6427 out->hitlist_size = -1;
6428 if(mask->hitlist_size) {
6429 out->hitlist_size = options->hitlist_size;
6430 *alignments = MIN(out->hitlist_size, *alignments);
6431 *descriptions = MIN(out->hitlist_size, *descriptions);
6432 }
6433 else
6434 out->hitlist_size = -1;
6435 if(options->string_options) {
6436 int other_adv_descriptions = -1;
6437 int other_adv_alignments = -1;
6438 parse_blast_options(
6439 out,
6440 options->string_options,
6441 error,
6442 NULL,
6443 &other_adv_descriptions,
6444 &other_adv_alignments);
6445 if(*error)
6446 goto end;
6447 if (other_adv_alignments != -1) {
6448 *alignments = other_adv_alignments;
6449 out->hitlist_size =
6450 MAX(out->hitlist_size, other_adv_alignments);
6451 }
6452 if (other_adv_descriptions != -1) {
6453 *descriptions = other_adv_descriptions;
6454 out->hitlist_size =
6455 MAX(out->hitlist_size, other_adv_descriptions);
6456 }
6457 if(!StringCmp(out->filter_string, "T")) {
6458 MemFree(out->filter_string);
6459 if(StringICmp(program, "blastn"))
6460 out->filter_string = StringSave("S");
6461 else
6462 out->filter_string = StringSave("D");
6463 }
6464 }
6465 /* append ; for backward compat. with blastcgicmd.cpp */
6466 if(out->filter_string) {
6467 char buf[10240];
6468 if(strlen(out->filter_string)+2 > sizeof buf) {
6469 *error = StringSave("internal error: buffer too small");
6470 goto end;
6471 }
6472 sprintf(buf, "%s;", out->filter_string);
6473 MemFree(out->filter_string);
6474 out->filter_string = StringSave(buf);
6475 }
6476 if(options->filter_string) {
6477 char buf[10240];
6478 char* s = out->filter_string ? out->filter_string : "";
6479 if(strlen(s)+strlen(options->filter_string)+1 > sizeof buf) {
6480 *error = StringSave("internal error: buffer too small");
6481 goto end;
6482 }
6483 sprintf(buf, "%s%s", s, options->filter_string);
6484 if(out->filter_string)
6485 MemFree(out->filter_string);
6486 out->filter_string = StringSave(buf);
6487 }
6488 if(out->hitlist_size != -1) {
6489 if(*descriptions == -1)
6490 *descriptions = MIN(100, out->hitlist_size);
6491 if(*alignments == -1)
6492 *alignments = MIN(100, out->hitlist_size);
6493 } else {
6494 if(*descriptions == -1)
6495 *descriptions = 100;
6496 if(*alignments == -1)
6497 *alignments = 100;
6498 out->hitlist_size = MAX(*descriptions, *alignments);
6499 }
6500 if(!strcmp(service, "vecscreen"))
6501 out->hitlist_size = 1000;
6502 if(mask->required_start)
6503 out->required_start = options->required_start;
6504 if(mask->required_end)
6505 out->required_end = options->required_end;
6506 else
6507 out->required_end = 0; /* backward compat. with blastcgicmd.cpp */
6508 if(options->gilist) {
6509 out->gilist = options->gilist; /* take ownership */
6510 options->gilist = 0;
6511 }
6512 if (options->query_lcase_mask) {
6513 out->query_lcase_mask = options->query_lcase_mask; /* take ownership */
6514 options->query_lcase_mask = 0;
6515 }
6516 end:
6517 if(*error) {
6518 BLASTOptionDelete(out);
6519 out = 0;
6520 }
6521 return out;
6522 }
6523 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |