|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/demo/blastclust.c |
source navigation diff markup identifier search freetext search file search |
1 static char const rcsid[] = "$Id: blastclust.c,v 6.49 2007/12/21 14:00:47 madden Exp $";
2
3 /* $RCSfile: blastclust.c,v $ $Revision: 6.49 $ $Date: 2007/12/21 14:00:47 $
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * Author: Ilya Dondoshansky
29 *
30 * File Description:
31 * Program to perform a single-linkage clustering of a set of protein or DNA
32 * sequences. See file README.bcl for description
33 *
34 * ---------------------------------------------------------------------------
35 * $Log: blastclust.c,v $
36 * Revision 6.49 2007/12/21 14:00:47 madden
37 * Exit if query cannot be processed, JIRA SB-32
38 *
39 * Revision 6.48 2005/08/19 20:12:32 dondosha
40 * Added extensive doxygen-style comments
41 *
42 * Revision 6.47 2005/07/28 16:16:46 coulouri
43 * remove dead code
44 *
45 * Revision 6.46 2005/07/27 15:52:02 coulouri
46 * remove unused queue_callback
47 *
48 * Revision 6.45 2004/08/02 20:08:49 dondosha
49 * Write and read header integers separately instead of as a structure; changed first number from 1 byte to 4, to prevent padding with garbage
50 *
51 * Revision 6.44 2004/06/30 21:03:57 madden
52 * Add include for blfmtutl.h
53 *
54 * Revision 6.43 2004/05/03 19:59:48 dondosha
55 * Display version number when run with no arguments
56 *
57 * Revision 6.42 2004/02/24 15:18:15 dondosha
58 * Fixed several memory leaks for nucleotide clustering
59 *
60 * Revision 6.41 2004/02/18 20:42:59 dondosha
61 * Cleaned up how id strings are retrieved for output
62 *
63 * Revision 6.40 2004/02/18 15:18:46 dondosha
64 * Minor fix when freeing title strings
65 *
66 * Revision 6.39 2003/07/22 17:59:48 dondosha
67 * Added call to BlastErrorPrint to make warnings more informative
68 *
69 * Revision 6.38 2003/06/13 19:59:29 dondosha
70 * Fixed 2 purify errors
71 *
72 * Revision 6.37 2003/05/30 17:31:09 coulouri
73 * add rcsid
74 *
75 * Revision 6.36 2003/05/13 16:02:42 coulouri
76 * make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
77 *
78 * Revision 6.35 2003/05/06 20:21:13 dondosha
79 * Typo correction
80 *
81 * Revision 6.34 2003/05/06 20:19:45 dondosha
82 * Changed a confusing error message to a meaningful one
83 *
84 * Revision 6.33 2002/10/08 15:32:42 dondosha
85 * Corrected file description comment in the NCBI header
86 *
87 * Revision 6.32 2002/09/03 16:01:11 dondosha
88 * Introduced a restriction on the total length of concatenated sequences for nucleotide case
89 *
90 * Revision 6.31 2002/05/30 16:20:16 dondosha
91 * 1. Correction in evaluation of hits for DNA case,
92 * 2. Added word size parameter to make DNA clustering more flexible
93 *
94 * Revision 6.30 2002/03/14 21:07:11 dondosha
95 * Slight improvement in finding the used id list from a gi list when reclustering
96 *
97 * Revision 6.29 2002/02/11 21:32:10 dondosha
98 * Free Bioseqs after use to avoid accumulation
99 *
100 * Revision 6.28 2001/11/16 20:34:17 dondosha
101 * Bug fix: BioseqUnlock was missing; also added ObjMgrFreeCache calls
102 *
103 * Revision 6.27 2001/08/15 17:49:08 dondosha
104 * Correction in previous change
105 *
106 * Revision 6.26 2001/08/15 17:27:17 dondosha
107 * Consider and save only the top HSP for each pair of sequences being compared
108 *
109 * Revision 6.25 2001/07/26 18:23:48 dondosha
110 * Added a -c option: configuration file with advanced BLAST options
111 *
112 * Revision 6.24 2001/02/13 16:26:03 dondosha
113 * Bug fix: memory freed in the wrong place
114 *
115 * Revision 6.23 2001/01/31 19:48:14 dondosha
116 * Change #define TMPDIR to environment variable
117 *
118 * Revision 6.22 2001/01/26 20:13:05 dondosha
119 * If at least one sequence id non-numeric, make them all non-numeric
120 *
121 * Revision 6.21 2000/12/19 18:52:38 dondosha
122 * Added option to disallow id parsing when formatting database
123 *
124 * Revision 6.20 2000/12/14 20:51:14 dondosha
125 * Do several Mega BLAST searches if more than 16383 sequences in input
126 *
127 * Revision 6.19 2000/12/07 22:57:34 dondosha
128 * Implemented nucleotide version using Mega BLAST
129 *
130 * Revision 6.18 2000/12/01 17:57:04 dondosha
131 * Handle local and general ids differently to avoid non-informative output
132 *
133 * Revision 6.17 2000/09/01 18:30:55 dondosha
134 * Create database memory map only once for all searches
135 *
136 * Revision 6.16 2000/08/08 17:58:55 dondosha
137 * Change back to create database with indexes
138 *
139 * Revision 6.15 2000/08/07 19:40:57 dondosha
140 * Removed sequence lengths from ClusterLogInfo structure - redundant information
141 *
142 * Revision 6.14 2000/08/07 15:13:36 dondosha
143 * Changed comment for -S option
144 *
145 * Revision 6.13 2000/08/04 23:24:40 dondosha
146 * Added functionality to choose neighbors based on percentage of identities
147 *
148 * Revision 6.12 2000/06/08 20:40:15 dondosha
149 * In case of equal lengths of sequences within a cluster or equal number of cluster elements, use alphabetic or numeric order when sorting ids and clusters
150 *
151 * Revision 6.11 2000/06/08 13:34:57 dondosha
152 * Increased buffer size for reading id file
153 *
154 * Revision 6.10 2000/06/07 21:57:04 dondosha
155 * Added option to provide a file with id list for reclustering
156 *
157 * Revision 6.9 2000/06/07 14:23:20 dondosha
158 * Changed ctime_r call to portable DayTimeStr
159 *
160 * Revision 6.8 2000/06/06 19:32:41 dondosha
161 * Changed progress interval back to 1000
162 *
163 * Revision 6.7 2000/06/06 19:23:42 dondosha
164 * Added option to finish incomplete clustering from the point where previous search ended
165 *
166 * Revision 6.5 2000/05/22 18:19:59 dondosha
167 * In case search set up fails, destruct all necessary stuff before going to next query
168 *
169 * Revision 6.4 2000/05/17 17:44:50 dondosha
170 * Cleaned from unused variables
171 *
172 * Revision 6.3 2000/05/05 18:16:56 dondosha
173 * Enhanced to process all types of SeqIds
174 *
175 * Revision 6.2 2000/05/03 16:50:40 dondosha
176 * Removed system calls, added sorting of clusters by size, sequences within clusters by length
177 *
178 * Revision 6.1 2000/04/27 14:47:18 dondosha
179 * Clustering of protein neighbours - initial revision
180 *
181 * Revision 1.1 2000/04/12 12:53:04 dondosha
182 * Clustering of protein neighbors
183 *
184 * ===========================================================================
185 */
186 #include <ncbi.h>
187 #include <objseq.h>
188 #include <objsset.h>
189 #include <sequtil.h>
190 #include <seqport.h>
191 #include <tofasta.h>
192 #include <blast.h>
193 #include <blastpri.h>
194 #include <simutil.h>
195 #include <txalign.h>
196 #include <gapxdrop.h>
197 #include <sqnutils.h>
198 #include <mbalign.h>
199 #include <mblast.h>
200 #include <blfmtutl.h>
201
202 #define DEBUG 0
203
204 /** File to write cluster output. */
205 FILE *global_fp=NULL;
206 /** Array of indices pointing to root sequences of each cluster. */
207 static Int4Ptr root;
208 /** Mutex for modifying the cluster root array. */
209 static TNlmMutex root_mutex;
210
211 /** HSP information, used internally. */
212 typedef struct blastclust_hsp_info
213 {
214 Int4 id; /**< Subject ordinal number. */
215 Int4 score; /**< HSP score */
216 Int4 index;
217 } BlastClustHspInfo, PNTR BlastClustHspInfoPtr;
218
219 /** User parameters for clustering. */
220 typedef struct cluster_parameters
221 {
222 Boolean bidirectional; /**< Is bydirectional coverage required? */
223 FloatHi length_threshold; /**< Length coverage threshold. */
224 FloatHi score_threshold; /**< Score coverage threshold. */
225 FILE *logfp; /**< File to write hit data to. */
226 } ClusterParameters, PNTR ClusterParametersPtr;
227
228 /** Globally accessible parameters structure. */
229 static ClusterParametersPtr global_parameters;
230
231 /** Basic structure written to the hit file. */
232 typedef struct cluster_log_info
233 {
234 Int4 id1; /**< Index of the first sequence for this HSP. */
235 Int4 id2; /**< Index of the second sequence for this HSP. */
236 Int4 hsp_length1; /**< Length of this HSP in first sequence. */
237 Int4 hsp_length2; /**< Length of this HSP in second sequence. */
238 FloatHi bit_score;/**< This HSP's bit score. */
239 FloatHi perc_identity; /**< This HSP's percent identity. */
240 } ClusterLogInfo, PNTR ClusterLogInfoPtr;
241
242 /** Input sequence information. */
243 typedef struct blast_cluster_element
244 {
245 Int4 gi; /**< Gi or other numeric identifier. */
246 CharPtr id;/**< String identifier. */
247 Int4 len; /**< Sequence length. */
248 } BlastClusterElement, PNTR BlastClusterElementPtr;
249
250 /** Information about all input sequences. */
251 typedef struct blast_cluster
252 {
253 Int4 size; /** Number of sequences. */
254 BlastClusterElementPtr PNTR elements; /** Array of pointers to input sequence
255 information structures. */
256 } BlastCluster, PNTR BlastClusterPtr;
257
258 /** Comparison function for sorting input sequences in the clusters.
259 * Sequences are sorted as follows, with decreasing priorities
260 * 1. Decreasing order of length
261 * 2. Increasing order of numeric identifiers
262 * 3. Lexicographic order of string identifiers.
263 */
264 static int LIBCALLBACK
265 compare_cluster_elements(VoidPtr v1, VoidPtr v2)
266 {
267 BlastClusterElementPtr e1, e2;
268
269 e1 = *(BlastClusterElementPtr PNTR) v1;
270 e2 = *(BlastClusterElementPtr PNTR) v2;
271
272 if (e1->len > e2->len)
273 return -1;
274 else if (e1->len < e2->len)
275 return 1;
276 else if (e1->gi > 0 && e2->gi>0) {
277 if (e1->gi < e2->gi)
278 return -1;
279 else if (e1->gi > e2->gi)
280 return 1;
281 } else
282 return StrCmp(e1->id, e2->id);
283
284 return 0;
285 }
286
287 /** Comparison function for sorting clusters by size. Sequences within clusters
288 * are sorted in a specific order @sa compare_cluster_elements.
289 */
290 static int LIBCALLBACK
291 compare_clusters(VoidPtr v1, VoidPtr v2)
292 {
293 BlastClusterPtr c1, c2;
294
295 c1 = *(BlastClusterPtr PNTR) v1;
296 c2 = *(BlastClusterPtr PNTR) v2;
297
298 if (c1->size > c2->size)
299 return -1;
300 else if (c1->size < c2->size)
301 return 1;
302 else
303 return compare_cluster_elements(c1->elements, c2->elements);
304 }
305
306 /** Comparison function for sorting HSP information structures. Sorting is
307 * performed according to the following rules with decreasing priority:
308 * 1. In increasing order of sequence index;
309 * 2. In decreasing order of scores.
310 */
311 static int LIBCALLBACK
312 hsp_info_id_score_compare(VoidPtr v1, VoidPtr v2)
313 {
314 BlastClustHspInfoPtr c1, c2;
315
316 c1 = *(BlastClustHspInfoPtr PNTR) v1;
317 c2 = *(BlastClustHspInfoPtr PNTR) v2;
318
319 if (c1->id == c2->id && c1->score == c2->score)
320 return 0;
321 else if ((c1->id < c2->id) ||
322 (c1->id == c2->id && c1->score > c2->score))
323 return -1;
324 else
325 return 1;
326 }
327
328 /** Starting size for a cluster, which can be dynamically reallocated when
329 * necessary.
330 */
331 #define ORIGINAL_CLUSTER_SIZE 10
332
333 /** Creates the clusters, based on the root array, and prints them to the
334 * cluster output file.
335 * @param num_queries Number of sequences being clustered. [in]
336 * @param seq_len Array of sequence lengths [in]
337 * @param id_list Array of sequence string identifiers [in]
338 * @param gi_list array of sequence numeric identifiers [in]
339 * @param used_id_index Array of flags indicating whether id with a given index
340 * has already been processed. [in]
341 */
342 static int
343 BlastClusterNeighbours(Int4 num_queries, Int4Ptr seq_len,
344 CharPtr PNTR id_list, Int4Ptr gi_list,
345 Boolean PNTR used_id_index)
346 {
347 BlastClusterPtr PNTR cluster;
348 Int4 num_clusters, available_size, index, i;
349 Boolean numeric_id_type;
350
351 if (gi_list)
352 numeric_id_type = TRUE;
353 else if (id_list)
354 numeric_id_type = FALSE;
355 else
356 return 0;
357
358 for (index=1; index<num_queries; index++) {
359 i = index;
360 while (root[i] != i)
361 i = root[i];
362 root[index] = i;
363 }
364
365 cluster = (BlastClusterPtr PNTR)
366 MemNew(num_queries*sizeof(BlastClusterPtr));
367 num_clusters = 0;
368
369 /* Loop over input sequence indices. */
370 for (index = 0; index < num_queries; index++) {
371 /* If this sequence hasn't been processed yet, skip it. */
372 if (used_id_index != NULL && !used_id_index[index])
373 continue;
374 /* If this sequence is the root of its corresponding cluster,
375 start a new cluster. */
376 if (root[index] == index) {
377 cluster[num_clusters] =
378 (BlastClusterPtr) MemNew(sizeof(BlastCluster));
379 cluster[num_clusters]->size = 1;
380 cluster[num_clusters]->elements =
381 (BlastClusterElementPtr PNTR)
382 Malloc(ORIGINAL_CLUSTER_SIZE*sizeof(BlastClusterElementPtr));
383 cluster[num_clusters]->elements[0] =
384 (BlastClusterElementPtr) MemNew(sizeof(BlastClusterElement));
385
386 if (numeric_id_type)
387 cluster[num_clusters]->elements[0]->gi = gi_list[index];
388 else
389 cluster[num_clusters]->elements[0]->id = id_list[index];
390 cluster[num_clusters]->elements[0]->len = seq_len[index];
391 available_size = ORIGINAL_CLUSTER_SIZE;
392
393 /* Find other sequences belonging to this cluster and add them. */
394 for (i = index+1; i < num_queries; i++) {
395 if (root[i] == index) {
396 Int4 size = cluster[num_clusters]->size;
397 /* Reallocate the elements array if necessary. */
398 if (size >= available_size) {
399 available_size *= 2;
400 cluster[num_clusters]->elements =
401 (BlastClusterElementPtr PNTR)
402 Realloc(cluster[num_clusters]->elements,
403 available_size*sizeof(BlastClusterElementPtr));
404 }
405 cluster[num_clusters]->elements[size] =
406 (BlastClusterElementPtr) MemNew(sizeof(BlastClusterElement));
407 if (numeric_id_type)
408 cluster[num_clusters]->elements[size]->gi = gi_list[i];
409 else
410 cluster[num_clusters]->elements[size]->id = id_list[i];
411
412 cluster[num_clusters]->elements[size]->len = seq_len[i];
413 ++cluster[num_clusters]->size;
414 }
415 }
416 num_clusters++;
417 }
418 }
419
420 /* Sort each cluster in decreasing order of sequence lengths */
421 for (index=0; index<num_clusters; index++)
422 HeapSort(cluster[index]->elements, cluster[index]->size,
423 sizeof(BlastClusterElementPtr), compare_cluster_elements);
424 /* Sort clusters in decreasing order of sizes */
425 HeapSort(cluster, num_clusters, sizeof(BlastClusterPtr), compare_clusters);
426 /* Print out all clusters */
427 for (index=0; index<num_clusters; index++) {
428 for (i=0; i<cluster[index]->size; i++) {
429 if (numeric_id_type)
430 fprintf(global_fp, "%ld ", cluster[index]->elements[i]->gi);
431 else
432 fprintf(global_fp, "%s ", cluster[index]->elements[i]->id);
433 MemFree(cluster[index]->elements[i]);
434 }
435 fprintf(global_fp, "\n");
436 MemFree(cluster[index]->elements);
437 MemFree(cluster[index]);
438 }
439 MemFree(cluster);
440 return 1;
441 }
442
443 #define INFO_LIST_SIZE 1000
444 #define FILE_BUFFER_SIZE 4096
445
446 /* Reclusters saved hits and returns the ordinal id of the last query found in
447 * the hits file, plus 1.
448 * @param infofp File to read hit information from [in]
449 * @param outfp File to write clusters to [in]
450 * @param gilp Pointer to an array of gis (numeric ids) found in the hits
451 * file [out]
452 * @param idlp Pointer to an array of string sequence ids found in the hits
453 * file [out]
454 * @param seqlp Pointer to an array of lengths of sequences found in the hits
455 * file [out]
456 * @param idfp File to read a list of ids that need to be reclustered
457 * (optional) [in]
458 * @return Index of the last query found in the hits file ( == ordinal id + 1)
459 */
460
461 static Int4 ReclusterFromFile(FILE *infofp, FILE *outfp, Int4Ptr PNTR gilp,
462 CharPtr PNTR PNTR idlp, Int4Ptr PNTR seqlp,
463 FILE *idfp)
464 {
465 ClusterLogInfoPtr info;
466 Int4 num_queries, num_hits, i, root1, root2, total_id_len;
467 Int4Ptr gi_list = NULL, seq_len = NULL;
468 CharPtr PNTR id_list = NULL;
469 CharPtr ptr, id_string = NULL;
470 FloatHi length_coverage, score_coverage;
471 Uint4 header_size, numeric_id_type;
472 Int4 last_seq = -1;
473 CharPtr id = NULL;
474 Boolean PNTR used_id_index = NULL;
475
476 /* Read header data from the hits file. */
477 FileRead(&numeric_id_type, sizeof(Int4), 1, infofp);
478 FileRead(&header_size, sizeof(Int4), 1, infofp);
479
480 if (numeric_id_type) {
481 num_queries = header_size;
482 gi_list = (Int4Ptr) MemNew(num_queries*sizeof(Int4));
483 FileRead(gi_list, sizeof(Int4), num_queries, infofp);
484 } else {
485 total_id_len = header_size;
486 num_queries = 0;
487 id_string = (CharPtr) MemNew(total_id_len+1);
488 FileRead(id_string, sizeof(Char), total_id_len, infofp);
489 ptr = id_string;
490 /* Count the ID's and change delimiter from space to null character */
491 for (i=0; i<total_id_len; i++) {
492 if (*ptr==' ') {
493 num_queries++;
494 *ptr = '\0';
495 }
496 ptr++;
497 }
498 id_list = (CharPtr PNTR) MemNew(num_queries*sizeof(CharPtr));
499 for (i=0; i<num_queries; i++) {
500 /* No need for allocation of new memory */
501 id_list[i] = id_string;
502 id_string += StringLen(id_string) + 1;
503 }
504 id_string = id_list[0];
505 }
506
507 seq_len = (Int4Ptr) Malloc(num_queries*sizeof(Int4));
508 FileRead(seq_len, sizeof(Int4), num_queries, infofp);
509
510 info = (ClusterLogInfoPtr) MemNew(INFO_LIST_SIZE*sizeof(ClusterLogInfo));
511
512 /* Initialize the root array, putting each sequence in its own
513 one-element cluster. */
514 root = (Int4Ptr) Malloc(num_queries*sizeof(Int4));
515 for (i=0; i<num_queries; i++)
516 root[i] = i;
517
518 /* Test for list of ids to use for reclustering, and fill the array of
519 ids used in this reclustering. */
520 if (idfp) {
521 CharPtr used_id_string;
522 CharPtr delimiter_list = " \t\n,;";
523 Int4 length;
524
525 used_id_string = (CharPtr) MemNew(FILE_BUFFER_SIZE+1);
526 used_id_index = (Boolean PNTR) MemNew(num_queries*sizeof(Boolean));
527 while((length =
528 FileRead(used_id_string, sizeof(Char), FILE_BUFFER_SIZE, idfp))) {
529 ptr = used_id_string;
530 while ((id = StringTokMT(ptr, delimiter_list, &ptr))
531 != NULL) {
532 if (ptr==NULL && length==FILE_BUFFER_SIZE) {
533 fseek(idfp, -StrLen(id), SEEK_CUR);
534 break;
535 }
536 for (i=0; i<num_queries; i++) {
537 if ((gi_list && atoi(id) == gi_list[i]) ||
538 (!gi_list && !StrCmp(id_list[i], id)))
539 break;
540 }
541 if (i < num_queries)
542 used_id_index[i] = TRUE;
543 }
544 }
545 }
546
547 /* Read the HSP data. */
548 while ((num_hits = FileRead(info, sizeof(ClusterLogInfo), INFO_LIST_SIZE,
549 infofp)) > 0) {
550 for (i=0; i<num_hits; i++) {
551 /* If one of the sequences involved in this HSP is not of interest,
552 skip this HSP. */
553 if (used_id_index &&
554 (!used_id_index[info[i].id1] || !used_id_index[info[i].id2]))
555 continue;
556
557 /* Calculate the coverage numbers. */
558 if (global_parameters->bidirectional)
559 length_coverage = MIN(((FloatHi)info[i].hsp_length1) /
560 seq_len[info[i].id1],
561 ((FloatHi)info[i].hsp_length2) /
562 seq_len[info[i].id2]);
563 else
564 length_coverage = MAX(((FloatHi)info[i].hsp_length1) /
565 seq_len[info[i].id1],
566 ((FloatHi)info[i].hsp_length2) /
567 seq_len[info[i].id2]);
568
569 if (global_parameters->score_threshold < 3.0)
570 score_coverage = info[i].bit_score /
571 (MAX(info[i].hsp_length1, info[i].hsp_length2));
572 else
573 score_coverage = info[i].perc_identity;
574
575 /* If coverage satisfies the input requirements, update the cluster
576 root information. */
577 if (length_coverage >= global_parameters->length_threshold &&
578 score_coverage >= global_parameters->score_threshold) {
579 root1 = info[i].id1;
580 while (root[root1] != root1)
581 root1 = root[root1];
582 root2 = info[i].id2;
583 while (root[root2] != root2)
584 root2 = root[root2];
585
586 if (root1 < root2)
587 root[root2] = root1;
588 else if (root1 > root2)
589 root[root1] = root2;
590 }
591 } /* End loop on hits from a chunk */
592 last_seq = info[num_hits-1].id1;
593 } /* End loop on chunks of hits */
594
595 /* Create the cluster structures and print out. */
596 if (outfp != NULL) {
597 global_fp = outfp;
598 BlastClusterNeighbours(num_queries, seq_len, id_list, gi_list,
599 used_id_index);
600 }
601 *gilp = gi_list;
602 *idlp = id_list;
603 *seqlp = seq_len;
604 MemFree(id_string);
605 MemFree(info);
606 return last_seq + 1;
607 }
608
609 /** Calculates percent identity given a gapped alignment block.
610 * @param HSP gapped alignment structure, returned from a gapped alignment
611 * routine. [in]
612 * @return percent of identical matches in this alignment.
613 */
614 static FloatHi
615 GapAlignPercentIdentity(GapAlignBlkPtr gabp)
616 {
617 GapXEditScriptPtr esp = gabp->edit_block->esp;
618 Int4 identical, total, q_index, s_index, i;
619 FloatHi perc_identity;
620 Uint1Ptr query, subject;
621
622 q_index = gabp->query_start;
623 s_index = gabp->subject_start;
624 query = gabp->query + q_index;
625 subject = gabp->subject + s_index;
626 identical = total = 0;
627
628 while (esp) {
629 if (esp->op_type == GAPALIGN_SUB || esp->op_type == GAPALIGN_DECLINE) {
630 for (i=0; i<esp->num; i++) {
631 if (*query == *subject)
632 identical++;
633 query++;
634 subject++;
635 }
636 } else if (esp->op_type == GAPALIGN_DEL)
637 subject += esp->num;
638 else if (esp->op_type == GAPALIGN_INS)
639 query += esp->num;
640 total += esp->num;
641 esp = esp->next;
642 }
643 perc_identity = ((FloatHi)identical) / total * 100;
644
645 return perc_identity;
646 }
647
648 /** Line length for configuration file with advanced options. */
649 #define BUFFER_SIZE 80
650
651 /** Callback for printing out HSP data into the hits file.
652 * @param ptr BlastSearchBlkPtr, containing current list of HSPs. [in]
653 */
654 static int LIBCALLBACK
655 PrintNeighbors(VoidPtr ptr)
656 {
657 BLAST_HitListPtr current_hitlist;
658 BLAST_HSPPtr hsp;
659 Int4 index;
660 BlastSearchBlkPtr search;
661 Int4 id1, id2, root1, root2, hspcnt;
662 Int4 subject_length;
663 Uint1Ptr subject, query;
664 ClusterLogInfoPtr loginfo = NULL;
665 BlastClustHspInfoPtr PNTR hsp_info_array;
666 Int4 query_count;
667
668 if (ptr == NULL)
669 return 0;
670
671 search = (BlastSearchBlkPtr) ptr;
672
673 if (search->current_hitlist == NULL || search->current_hitlist->hspcnt <= 0) {
674 /* No hits to save. */
675 search->subject_info = BLASTSubjectInfoDestruct(search->subject_info);
676 return 0;
677 }
678
679 current_hitlist = search->current_hitlist;
680
681 if (search->prog_number == blast_type_blastp)
682 subject_length = readdb_get_sequence(search->rdfp, search->subject_id, &subject);
683 else { /* Mega BLAST saves ncbi4na-encoded sequence */
684 subject = search->subject->sequence_start + 1;
685 subject_length = search->subject->length;
686 }
687 hspcnt = current_hitlist->hspcnt;
688
689 if (search->prog_number == blast_type_blastp)
690 id1 = SeqId2OrdinalId(search->rdfp, search->query_id);
691 else
692 id1 = -1;
693 id2 = search->subject_id;
694
695 /* For blastp, save only HSPs with first sequence index smaller than
696 second. */
697 if (id1 < id2) {
698 #define BUF_CHUNK_SIZE 1024
699 Int4 query_length, q_length, s_length;
700 FloatHi length_coverage, bit_score, score_coverage, perc_identity;
701 BLAST_KarlinBlkPtr kbp;
702 GapAlignBlkPtr gap_align = search->gap_align;
703 Int2 context;
704 Int4 max_score = 0;
705
706 if (search->prog_number == blast_type_blastp) {
707 query = search->context[0].query->sequence;
708 query_length = search->context[0].query->length;
709 }
710
711 query_count = 1;
712
713 /* For blastn, the current hit list contains HSPs from multiple "query"
714 sequences. Create an array of HSP information structures, and find
715 highest scoring HSP for every pair of sequences present in the
716 current list. */
717 if (search->prog_number == blast_type_blastn) {
718 hsp_info_array = (BlastClustHspInfoPtr PNTR)
719 Malloc(hspcnt*sizeof(BlastClustHspInfoPtr));
720 for (index=0; index<hspcnt; index++) {
721 hsp_info_array[index] = (BlastClustHspInfoPtr)
722 Malloc(sizeof(BlastClustHspInfo));
723 hsp = current_hitlist->hsp_array[index];
724 hsp_info_array[index]->id =
725 SeqId2OrdinalId(search->rdfp,
726 search->qid_array[hsp->context/2]);
727 hsp_info_array[index]->score = hsp->score;
728 hsp_info_array[index]->index = index;
729 }
730 HeapSort(hsp_info_array, hspcnt, sizeof(BlastClustHspInfoPtr),
731 hsp_info_id_score_compare);
732 /* Leave only one highest scoring hsp per "query" sequence.
733 Deallocate memory for all others. */
734 for (index=1; index<hspcnt; index++) {
735 if (hsp_info_array[index]->id < id2 &&
736 hsp_info_array[index]->id !=
737 hsp_info_array[query_count-1]->id) {
738 hsp_info_array[query_count] = hsp_info_array[index];
739 query_count++;
740 } else
741 hsp_info_array[index] = MemFree(hsp_info_array[index]);
742 }
743
744 hsp = current_hitlist->hsp_array[hsp_info_array[0]->index];
745 } else {
746 /* For blastp, just find the highest-scoring HSP. */
747 for (index=0; index<hspcnt; index++) {
748 if (current_hitlist->hsp_array[index]->score > max_score) {
749 hsp = current_hitlist->hsp_array[index];
750 max_score = hsp->score;
751 }
752 }
753 }
754
755 if (global_parameters->logfp)
756 loginfo = (ClusterLogInfoPtr)
757 MemNew(query_count*sizeof(ClusterLogInfo));
758
759 index = 0;
760 while (hsp) {
761 /* Calculate the coverage values for this HSP. */
762 if (search->prog_number == blast_type_blastn) {
763 context = hsp->context;
764 id1 = hsp_info_array[index]->id;
765 query_length =
766 search->query_context_offsets[context+1] -
767 search->query_context_offsets[context] - 1;
768 }
769 q_length = hsp->query.length;
770 s_length = hsp->subject.length;
771
772 if (global_parameters->bidirectional)
773 length_coverage = MIN(((FloatHi)q_length) / query_length,
774 ((FloatHi)s_length) / subject_length);
775 else
776 length_coverage = MAX(((FloatHi)q_length) / query_length,
777 ((FloatHi)s_length) / subject_length);
778
779 if (search->prog_number == blast_type_blastp)
780 kbp = search->sbp->kbp_gap[search->first_context];
781 else
782 kbp = search->sbp->kbp[context];
783 bit_score = ((hsp->score*kbp->Lambda) -
784 kbp->logK)/NCBIMATH_LN2;
785
786 /* Calculate percent identity, performing traceback search, if
787 necessary. */
788 if (search->prog_number == blast_type_blastp) {
789 gap_align->query_frame = ContextToFrame(search,
790 hsp->context);
791 gap_align->subject_frame = hsp->subject.frame;
792 gap_align->q_start = hsp->query.gapped_start;
793 gap_align->s_start = hsp->subject.gapped_start;
794 PerformGappedAlignmentWithTraceback(gap_align);
795 perc_identity = GapAlignPercentIdentity(gap_align);
796 gap_align->state_struct =
797 GapXDropStateDestroy(gap_align->state_struct);
798 gap_align->edit_block =
799 GapXEditBlockDelete(gap_align->edit_block);
800 } else {
801 Int4Ptr length, start;
802 Uint1Ptr strands;
803 Int4 align_length = 0, numseg, i;
804 GapXEditScriptPtr esp;
805 query = search->context[context].query->sequence;
806 esp = hsp->gap_info->esp;
807 for (numseg=0; esp; esp = esp->next, numseg++);
808 GXECollectDataForSeqalign(hsp->gap_info,
809 hsp->gap_info->esp, numseg,
810 &start, &length, &strands,
811 &hsp->query.offset,
812 &hsp->subject.offset);
813 perc_identity = 0;
814 for (i=0; i<numseg; i++) {
815 align_length += length[i];
816 if (start[2*i] != -1 && start[2*i+1] != -1)
817 perc_identity +=
818 MegaBlastGetNumIdentical(query, subject, start[2*i],
819 start[2*i+1], length[i],
820 FALSE);
821 }
822 perc_identity = perc_identity / align_length * 100;
823 start = MemFree(start);
824 length = MemFree(length);
825 strands = MemFree(strands);
826 }
827 if (global_parameters->score_threshold < 3.0)
828 score_coverage = bit_score / (MAX(q_length, s_length));
829 else
830 score_coverage = perc_identity;
831
832 /* If hits are being saved, save this HSP's information. */
833 if (global_parameters->logfp) {
834 loginfo[index].id1 = id1;
835 loginfo[index].id2 = id2;
836 loginfo[index].hsp_length1 = q_length;
837 loginfo[index].hsp_length2 = s_length;
838 loginfo[index].bit_score = bit_score;
839 loginfo[index].perc_identity = perc_identity;
840 }
841
842 /* If this HSP satisfies the coverage criteria, update the
843 cluster roots. */
844 if (length_coverage >= global_parameters->length_threshold &&
845 score_coverage >= global_parameters->score_threshold) {
846 root1 = id1;
847
848 NlmMutexLockEx(&root_mutex);
849 while (root[root1] != root1)
850 root1 = root[root1];
851 root2 = id2;
852 while (root[root2] != root2)
853 root2 = root[root2];
854
855 if (root1 < root2)
856 root[root2] = root1;
857 else if (root1 > root2)
858 root[root1] = root2;
859 NlmMutexUnlock(root_mutex);
860 }
861
862 /* For blastn, get to the highest scoring hsp for the next
863 "query" */
864 if (search->prog_number == blast_type_blastn &&
865 ++index < query_count)
866 hsp =
867 current_hitlist->hsp_array[hsp_info_array[index]->index];
868 else
869 hsp = NULL;
870 }
871
872 /* Deallocate memory for the auxiliary array of HSP info
873 structures. Also free all gap information structures,
874 because they are not freed when hit list is cleaned. */
875 if (search->prog_number == blast_type_blastn) {
876 for (index=0; index<query_count; index++)
877 MemFree(hsp_info_array[index]);
878 hsp_info_array = MemFree(hsp_info_array);
879
880 for (index=0; index < current_hitlist->hspcnt; ++index) {
881 hsp = current_hitlist->hsp_array[index];
882 hsp->gap_info = GapXEditBlockDelete(hsp->gap_info);
883 }
884 }
885
886 if (global_parameters->logfp && loginfo) {
887 FileWrite(loginfo, sizeof(ClusterLogInfo), query_count,
888 global_parameters->logfp);
889 loginfo = MemFree(loginfo);
890 fflush(global_parameters->logfp);
891 }
892 } else {
893 /* This can't happen in normal situation. If it does, the most likely
894 reason is presense of non-unique identifiers in the input file */
895 ErrPostEx(SEV_FATAL, 1, 0, "Blastclust cannot process input files with"
896 " non-unique sequence identifiers\n");
897 return 1;
898 }
899
900 return 0;
901 }
902
903 #ifdef NUMARG
904 #undef NUMARG
905 #endif
906 #define NUMARG (sizeof(myargs)/sizeof(myargs[0]))
907
908 static Args myargs [] = {
909 { "FASTA input file (program will format the database and remove files in the end)", /* 0 */
910 "stdin", NULL, NULL, FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
911 { "Number of CPU's to use", /* 1 */
912 "1", NULL, NULL, FALSE, 'a', ARG_INT, 0.0, 0, NULL},
913 { "Output file for list of clusters", /* 2 */
914 "stdout", NULL, NULL, FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
915 { "Length coverage threshold", /* 3 */
916 "0.9", NULL, NULL, FALSE, 'L', ARG_FLOAT, 0.0, 0, NULL},
917 { "Score coverage threshold (bit score / length if < 3.0, percentage of identities otherwise)", /* 4 */
918 "1.75", NULL, NULL, FALSE, 'S', ARG_FLOAT, 0.0, 0, NULL},
919 { "Require coverage on both neighbours?", /* 5 */
920 "TRUE", NULL, NULL, FALSE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
921 { "File to save all neighbours", /* 6 */
922 NULL, NULL, NULL, TRUE, 's', ARG_FILE_OUT, 0.0, 0, NULL},
923 { "File to restore neighbors for reclustering", /* 7 */
924 NULL, NULL, NULL, TRUE, 'r', ARG_FILE_IN, 0.0, 0, NULL},
925 { "Input as a database", /* 8 */
926 NULL, NULL, NULL, TRUE, 'd', ARG_FILE_IN, 0.0, 0, NULL},
927 { "Print progress messages (verbose mode)", /* 9 */
928 "stdout", NULL, NULL, FALSE, 'v', ARG_FILE_OUT, 0.0, 0, NULL},
929 { "Complete unfinished clustering", /* 10 */
930 "FALSE", NULL, NULL, FALSE, 'C', ARG_BOOLEAN, 0.0, 0, NULL},
931 { "Restrict reclustering to id list", /* 11 */
932 NULL, NULL, NULL, TRUE, 'l', ARG_FILE_IN, 0.0, 0, NULL},
933 { "Is input proteins?", /* 12 */
934 "TRUE", NULL, NULL, FALSE, 'p', ARG_BOOLEAN, 0.0, 0, NULL},
935 { "Enable id parsing in database formatting?", /* 13 */
936 "TRUE", NULL, NULL, FALSE, 'e', ARG_BOOLEAN, 0.0, 0, NULL},
937 { "Configuration file with advanced options", /* 14 */
938 NULL, NULL, NULL, TRUE, 'c', ARG_FILE_IN, 0.0, 0, NULL},
939 { "Word size to use (0 for default: proteins 3, nucleotides 32)",/* 15 */
940 "0", NULL, NULL, FALSE, 'W', ARG_INT, 0.0, 0, NULL}
941
942 };
943
944 /** Print progress messages after how many processed sequences? */
945 #define PROGRESS_INTERVAL 1000
946 /** Maximal number of sequences to concatenate for a blastn search. */
947 #define MAX_NUM_QUERIES 16383 /* == 1/2 INT2_MAX */
948 /** Maximal total length of concatenated sequences in a blastn search. */
949 #define MAX_TOTAL_LENGTH 5000000
950
951 Int2 Main (void)
952 {
953 BLAST_OptionsBlkPtr options;
954 BlastSearchBlkPtr search;
955 Boolean db_is_na, query_is_na;
956 Int4 qsize, dbsize, first_seq;
957 ReadDBFILEPtr rdfp;
958 Uint1 align_type;
959 SeqIdPtr sip;
960 BioseqPtr query_bsp = NULL, PNTR query_bsp_array;
961 CharPtr blast_program, blast_inputfile, blast_outputfile, blast_database,
962 progress_file = NULL;
963 CharPtr logfile, info_file, input_name;
964 Int4 total_id_len = 0;
965 FILE *outfp, *infofp, *progressfp, *idfp;
966 Int4 index, i, num_queries, num_bsps, num_left;
967 Int8 total_length;
968 Int4Ptr gi_list = NULL, seq_len = NULL;
969 CharPtr PNTR id_list = NULL, id_string = NULL;
970 Boolean db_formatted = FALSE;
971 Boolean numeric_id_type = TRUE;
972 Char db_file[BUFFER_SIZE];
973 Boolean print_progress, finish_incomplete, is_prot, parse_mode;
974 FDB_optionsPtr fdb_options;
975 Char timestr[24];
976 CharPtr tmpdir;
977 Int4 total_query_length;
978 CharPtr title = NULL;
979 Char buf[256] = { '\0' };
980
981 StringCpy(buf, "blastclust ");
982 StringNCat(buf, BlastGetVersionNumber(), sizeof(buf)-StringLen(buf)-1);
983
984 if (! GetArgs (buf, NUMARG, myargs))
985 return (1);
986
987 UseLocalAsnloadDataAndErrMsg ();
988
989 if (! SeqEntryLoad())
990 return 1;
991
992 ErrSetMessageLevel(SEV_WARNING);
993
994 global_parameters = (ClusterParametersPtr) MemNew(sizeof(ClusterParameters));
995
996 global_parameters->length_threshold = myargs[3].floatvalue;
997 global_parameters->score_threshold = myargs[4].floatvalue;
998 global_parameters->bidirectional = (Boolean) myargs[5].intvalue;
999 finish_incomplete = (Boolean) myargs[10].intvalue;
1000
1001 print_progress = (Boolean) StrCmp(myargs[9].strvalue, "F");
1002 if (print_progress)
1003 progress_file = myargs[9].strvalue;
1004
1005 if (progress_file != NULL &&
1006 (progressfp = FileOpen(progress_file, "w")) == NULL) {
1007 ErrPostEx(SEV_FATAL, 1, 0, "blastclust: Unable to open progress file %s\n",
1008 progress_file);
1009 return (1);
1010 }
1011
1012 blast_outputfile = myargs[2].strvalue;
1013 outfp = NULL;
1014 if (blast_outputfile != NULL) {
1015 if ((outfp = FileOpen(blast_outputfile, "w")) == NULL) {
1016 ErrPostEx(SEV_FATAL, 1, 0, "blastclust: Unable to open output file %s\n", blast_outputfile);
1017 return (1);
1018 }
1019 }
1020
1021 info_file = myargs[7].strvalue;
1022 if (info_file) {
1023 /* Non-empty string means only retrieve neighbors for reclustering */
1024 if ((infofp = FileOpen(info_file, "rb")) == NULL) {
1025 ErrPostEx(SEV_FATAL, 1, 0, "blastclust: Unable to open neighbors file %s for reading\n", info_file);
1026 return (1);
1027 }
1028 if (myargs[11].strvalue) {
1029 if ((idfp = FileOpen(myargs[11].strvalue, "r")) == NULL) {
1030 ErrPostEx(SEV_FATAL, 1, 0, "blastclust: Unable to open id file %s for reading\n", myargs[11].strvalue);
1031 return (1);
1032 }
1033 } else
1034 idfp = NULL;
1035 /* No need for another search, simply get all the neighbours
1036 and reculster them using new thresholds */
1037 ReclusterFromFile(infofp, outfp, &gi_list, &id_list, &seq_len, idfp);
1038 MemFree(gi_list);
1039 MemFree(id_list);
1040 MemFree(seq_len);
1041 FileClose(infofp);
1042 FileClose(outfp);
1043 return 0;
1044 } else
1045 infofp = NULL;
1046
1047 is_prot = (Boolean) myargs[12].intvalue;
1048 parse_mode = (Boolean) myargs[13].intvalue;
1049 if (is_prot)
1050 blast_program = StringSave("blastp");
1051 else
1052 blast_program = StringSave("blastn");
1053
1054 if (myargs[8].strvalue)
1055 db_formatted = TRUE;
1056
1057 if (db_formatted) {
1058 blast_database = myargs[8].strvalue;
1059 } else {
1060 /* Need to format the database */
1061 blast_inputfile = myargs[0].strvalue;
1062 input_name = FileNameFind(blast_inputfile);
1063
1064 if (tmpdir = getenv("TMPDIR")) {
1065 blast_database =
1066 Malloc(StringLen(input_name) + StringLen(tmpdir) + 2);
1067 sprintf(blast_database, "%s/%s", tmpdir, input_name);
1068 } else
1069 blast_database = blast_inputfile;
1070
1071 fdb_options = FDBOptionsNew(blast_inputfile, is_prot, NULL, FALSE,
1072 FALSE, FALSE, FALSE, TRUE, parse_mode,
1073 blast_database, NULL, 0, 0, FORMATDB_VER,
1074 FALSE, FALSE);
1075 FastaToBlastDB(fdb_options, 0);
1076
1077 fdb_options = FDBOptionsFree(fdb_options);
1078 }
1079
1080 logfile = myargs[6].strvalue;
1081 if (logfile) { /* Empty string means do not write log information */
1082 if (finish_incomplete) {
1083 if ((global_parameters->logfp = FileOpen(logfile, "ab+")) == NULL) {
1084 ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open log file %s for appending\n", logfile);
1085 return (1);
1086 }
1087 } else {
1088 if ((global_parameters->logfp = FileOpen(logfile, "wb+")) == NULL) {
1089 ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open log file %s for writing\n", logfile);
1090 return (1);
1091 }
1092 }
1093 }
1094 global_fp = outfp;
1095
1096 align_type = BlastGetTypes(blast_program, &query_is_na, &db_is_na);
1097
1098 rdfp = readdb_new(blast_database, is_prot);
1099
1100 options = BLASTOptionNew(blast_program, TRUE);
1101
1102 if (options == NULL)
1103 return 3;
1104
1105 /* Set the default search options. */
1106 options->use_real_db_size = TRUE;
1107 options->sort_gi_list = FALSE;
1108
1109 options->expect_value = 1e-6;
1110 qsize = 300;
1111 dbsize = (20*1000*1000);
1112 options->searchsp_eff = (FloatHi) qsize * (FloatHi) dbsize;
1113 options->perform_culling = FALSE;
1114 options->do_not_reevaluate = TRUE;
1115 options->do_sum_stats = FALSE;
1116 options->number_of_cpus = myargs[1].intvalue;
1117 if (!is_prot) {
1118 options->is_megablast_search = TRUE;
1119 options->gap_open = options->gap_extend = 0;
1120 options->wordsize = 32;
1121 options->block_width = 0;
1122 options->window_size = 0;
1123 }
1124
1125 if (myargs[15].intvalue != 0)
1126 options->wordsize = myargs[15].intvalue;
1127
1128 if (myargs[14].strvalue) {
1129 CharPtr string_options = NULL;
1130 Int2 string_options_len = 0;
1131 CharPtr error = NULL;
1132 FILE *config_file;
1133 Char opt_line[BUFFER_SIZE+1];
1134
1135 if ((config_file = FileOpen(myargs[14].strvalue, "r")) == NULL) {
1136 ErrPostEx(SEV_FATAL, 1, 0,
1137 "Cannot open advanced options configuration file %s\n",
1138 myargs[14].strvalue);
1139 }
1140
1141 /* Read advanced options from a configuration file, one line at a
1142 time. */
1143 while (fgets(opt_line, BUFFER_SIZE, config_file) != 0) {
1144 if (*opt_line == '#')
1145 continue;
1146 if (!string_options) {
1147 string_options = (CharPtr) MemNew(BUFFER_SIZE+1);
1148 string_options_len = BUFFER_SIZE;
1149 } else {
1150 if (StrLen(string_options) + StrLen(opt_line) + 1 >
1151 string_options_len) {
1152 CharPtr tmp = (CharPtr) Realloc(string_options,
1153 2*string_options_len);
1154 if (tmp) {
1155 string_options = tmp;
1156 string_options_len *= 2;
1157 }
1158 }
1159 StringCat(string_options, " ");
1160 }
1161 StringCat(string_options, opt_line);
1162 }
1163 if (string_options)
1164 parse_blast_options(options, string_options, &error,
1165 NULL, NULL, NULL);
1166 if (error) {
1167 ErrPostEx(SEV_WARNING, 0, 0, "blastclust: parse_blast_options: %s\n",
1168 error);
1169 }
1170 }
1171
1172 readdb_get_totals_ex(rdfp, &total_length, &num_queries, TRUE);
1173
1174 root = (Int4Ptr) Malloc(num_queries*sizeof(Int4));
1175
1176 /* If this is a continuation of a previous search that has not been
1177 completed, read the previous search information and start from there. */
1178 if (is_prot && finish_incomplete) {
1179 first_seq = ReclusterFromFile(global_parameters->logfp, NULL, &gi_list,
1180 &id_list, &seq_len, NULL);
1181 } else {
1182 Uint4 header_size = 0;
1183 Uint4 header_numeric_id_type = 0;
1184 first_seq = 0;
1185 /* Put each sequence in its own one-element cluster. */
1186 for (index=0; index<num_queries; index++)
1187 root[index] = index;
1188
1189 gi_list = (Int4Ptr) MemNew(num_queries*sizeof(Int4));
1190 id_list = (CharPtr PNTR) MemNew(num_queries*sizeof(CharPtr));
1191 seq_len = (Int4Ptr) MemNew(num_queries*sizeof(Int4));
1192
1193 /* Get the query identificators, decide if they are string or
1194 numeric. */
1195 for (index=0; index<num_queries; index++) {
1196 readdb_get_descriptor(rdfp, index, &sip, &title);
1197 seq_len[index] = readdb_get_sequence_length(rdfp, index);
1198
1199 if (sip->choice != SEQID_GENERAL ||
1200 StringCmp(((DbtagPtr)sip->data.ptrvalue)->db, "BL_ORD_ID")) {
1201 numeric_id_type &=
1202 GetAccessionFromSeqId(sip, &gi_list[index],
1203 &id_list[index]);
1204 } else {
1205 CharPtr dummy_ptr = NULL;
1206 numeric_id_type = FALSE;
1207 if (title && *title != NULLB) {
1208 id_list[index] =
1209 StringSave(StringTokMT(title, " ", &dummy_ptr));
1210 } else {
1211 id_list[index] = (CharPtr) Malloc(BUFFER_SIZE+1);
1212 SeqIdWrite(sip, id_list[index],
1213 PRINTID_FASTA_SHORT, BUFFER_SIZE);
1214 }
1215 }
1216 title = MemFree(title);
1217 sip = SeqIdSetFree(sip);
1218 }
1219
1220 if (numeric_id_type) {
1221 id_list = MemFree(id_list);
1222 header_size = num_queries;
1223 header_numeric_id_type = 1;
1224 } else {
1225 total_id_len = 0;
1226 /* Check if some ids were gis and convert them to strings */
1227 for (i=0; i<num_queries; i++) {
1228 if (gi_list[i] > 0) {
1229 id_list[i] = (CharPtr) MemNew(10);
1230 sprintf(id_list[i], "%ld", gi_list[i]);
1231 }
1232 total_id_len += StringLen(id_list[i]) + 1;
1233 }
1234 gi_list = MemFree(gi_list);
1235 id_string = (CharPtr) MemNew(total_id_len+1);
1236 for (i=0; i<num_queries; i++) {
1237 StringCat(id_string, id_list[i]);
1238 StringCat(id_string, " ");
1239 }
1240 header_size = total_id_len;
1241 }
1242
1243 /* Write the sequence identifiers and lengths into the hits file
1244 header. */
1245 FileWrite(&header_numeric_id_type, sizeof(Uint4), 1,
1246 global_parameters->logfp);
1247 FileWrite(&header_size, sizeof(Uint4), 1, global_parameters->logfp);
1248
1249 if (numeric_id_type)
1250 FileWrite(gi_list, sizeof(Int4), num_queries,
1251 global_parameters->logfp);
1252 else {
1253 FileWrite(id_string, sizeof(Char), total_id_len,
1254 global_parameters->logfp);
1255 MemFree(id_string);
1256 }
1257
1258 FileWrite(seq_len, sizeof(Int4), num_queries, global_parameters->logfp);
1259 fflush(global_parameters->logfp);
1260 }
1261
1262 /* Print the first progress message, if necessary. */
1263 if (print_progress) {
1264 DayTimeStr(timestr, TRUE, TRUE);
1265 if (finish_incomplete)
1266 fprintf(progressfp,
1267 "%s Finish clustering of %ld queries, starting from query %ld\n",
1268 timestr, num_queries, first_seq);
1269 else
1270 fprintf(progressfp, "%s Start clustering of %ld queries\n",
1271 timestr, num_queries);
1272 }
1273
1274 if (!is_prot) {
1275 SeqAlignPtr PNTR head; /* Will not be used */
1276
1277 query_bsp_array = (BioseqPtr PNTR)
1278 Malloc((MIN(num_queries, MAX_NUM_QUERIES)+1)*sizeof(BioseqPtr));
1279 num_left = num_queries;
1280 /* For blastn, concatenate as many queries as possible and run the
1281 search engine. */
1282 while (num_left>0) {
1283 num_bsps = MIN(num_left, MAX_NUM_QUERIES);
1284 total_query_length = 0;
1285 for (index=0; index<num_bsps; index++) {
1286 query_bsp_array[index] = readdb_get_bioseq(rdfp, index);
1287 total_query_length += query_bsp_array[index]->length;
1288 if (total_query_length > MAX_TOTAL_LENGTH)
1289 break;
1290 }
1291 num_bsps = index;
1292 query_bsp_array[num_bsps] = NULL;
1293
1294 head = BioseqMegaBlastEngine(query_bsp_array, blast_program,
1295 blast_database, options, NULL, NULL,
1296 NULL, NULL, NULL, 0, PrintNeighbors);
1297 for (index=0; index<num_bsps; index++)
1298 query_bsp_array[index] = BioseqFree(query_bsp_array[index]);
1299 num_left -= num_bsps;
1300 ObjMgrFreeCache(0);
1301 }
1302 MemFree(query_bsp_array);
1303 } else {
1304 /* Loop over all sequences, searching each time only against
1305 sequences with larger indices. */
1306 for (index=first_seq; index<num_queries; index++) {
1307 query_bsp = readdb_get_bioseq(rdfp, index);
1308 /* Set the starting database sequence for the search. */
1309 options->first_db_seq = index + 1;
1310
1311 /* Set up search. */
1312 search = BLASTSetUpSearchWithReadDbInternal(NULL, query_bsp,
1313 blast_program, seq_len[index], blast_database,
1314 options, NULL, NULL, NULL, 0, rdfp);
1315 if (search != NULL && !search->query_invalid) {
1316 search->handle_results = PrintNeighbors;
1317
1318 /* Run BLAST. */
1319 do_the_blast_run(search);
1320 } else if (search) {
1321 BlastErrorPrint(search->error_return);
1322 ErrPostEx(SEV_ERROR, 1, 0, "Failed to process query number %ld", (long) index);
1323 return 1;
1324 }
1325 search = BlastSearchBlkDestruct(search);
1326 query_bsp = BioseqFree(query_bsp);
1327
1328 if (print_progress && (index + 1)%PROGRESS_INTERVAL == 0) {
1329 DayTimeStr(timestr, TRUE, TRUE);
1330 fprintf(progressfp, "%s Finished processing of %ld queries\n",
1331 timestr, index+1);
1332 }
1333 } /* End of loop on queries */
1334 }
1335 rdfp = readdb_destruct(rdfp);
1336
1337 /* Create the clusters and print them out. */
1338 BlastClusterNeighbours(num_queries, seq_len, id_list, gi_list, NULL);
1339
1340 MemFree(seq_len);
1341 if (numeric_id_type)
1342 MemFree(gi_list);
1343 else {
1344 for (i=0; i<num_queries; i++)
1345 MemFree(id_list[i]);
1346 MemFree(id_list);
1347 }
1348 MemFree(root);
1349
1350 fflush(global_fp);
1351 options = BLASTOptionDelete(options);
1352 blast_program = (CharPtr) MemFree(blast_program);
1353 FileClose(outfp);
1354 if (global_parameters->logfp)
1355 FileClose(global_parameters->logfp);
1356
1357 /* Remove the temporary BLAST database files. */
1358 if (!db_formatted && StringLen(blast_database) > 0) {
1359 Char p_or_n = (is_prot) ? 'p' : 'n';
1360 sprintf(db_file, "%s.%chr", blast_database, p_or_n);
1361 FileRemove(db_file);
1362 sprintf(db_file, "%s.%cin", blast_database, p_or_n);
1363 FileRemove(db_file);
1364 sprintf(db_file, "%s.%csq", blast_database, p_or_n);
1365 FileRemove(db_file);
1366 if (parse_mode) {
1367 sprintf(db_file, "%s.%cnd", blast_database, p_or_n);
1368 FileRemove(db_file);
1369 sprintf(db_file, "%s.%cni", blast_database, p_or_n);
1370 FileRemove(db_file);
1371 sprintf(db_file, "%s.%csd", blast_database, p_or_n);
1372 FileRemove(db_file);
1373 sprintf(db_file, "%s.%csi", blast_database, p_or_n);
1374 FileRemove(db_file);
1375 }
1376 }
1377 MemFree(blast_database);
1378 return 0;
1379 }
1380
1381 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |