NCBI C Toolkit Cross Reference

C/demo/blastclust.c


  1 static char const rcsid[] = "$Id: blastclust.c,v 6.49 2007/12/21 14:00:47 madden Exp $";
  2 
  3 /*  $RCSfile: blastclust.c,v $  $Revision: 6.49 $  $Date: 2007/12/21 14:00:47 $
  4 * ===========================================================================
  5 *
  6 *                            PUBLIC DOMAIN NOTICE
  7 *               National Center for Biotechnology Information
  8 *
  9 *  This software/database is a "United States Government Work" under the
 10 *  terms of the United States Copyright Act.  It was written as part of
 11 *  the author's official duties as a United States Government employee and
 12 *  thus cannot be copyrighted.  This software/database is freely available
 13 *  to the public for use. The National Library of Medicine and the U.S.
 14 *  Government have not placed any restriction on its use or reproduction.
 15 *
 16 *  Although all reasonable efforts have been taken to ensure the accuracy
 17 *  and reliability of the software and data, the NLM and the U.S.
 18 *  Government do not and cannot warrant the performance or results that
 19 *  may be obtained by using this software or data. The NLM and the U.S.
 20 *  Government disclaim all warranties, express or implied, including
 21 *  warranties of performance, merchantability or fitness for any particular
 22 *  purpose.
 23 *
 24 *  Please cite the author in any work or product based on this material.
 25 *
 26 * ===========================================================================
 27 *
 28 * Author: Ilya Dondoshansky 
 29 *
 30 * File Description:
 31 *   Program to perform a single-linkage clustering of a set of protein or DNA 
 32 *   sequences. See file README.bcl for description
 33 *
 34 * ---------------------------------------------------------------------------
 35 * $Log: blastclust.c,v $
 36 * Revision 6.49  2007/12/21 14:00:47  madden
 37 * Exit if query cannot be processed, JIRA SB-32
 38 *
 39 * Revision 6.48  2005/08/19 20:12:32  dondosha
 40 * Added extensive doxygen-style comments
 41 *
 42 * Revision 6.47  2005/07/28 16:16:46  coulouri
 43 * remove dead code
 44 *
 45 * Revision 6.46  2005/07/27 15:52:02  coulouri
 46 * remove unused queue_callback
 47 *
 48 * Revision 6.45  2004/08/02 20:08:49  dondosha
 49 * Write and read header integers separately instead of as a structure; changed first number from 1 byte to 4, to prevent padding with garbage
 50 *
 51 * Revision 6.44  2004/06/30 21:03:57  madden
 52 * Add include for blfmtutl.h
 53 *
 54 * Revision 6.43  2004/05/03 19:59:48  dondosha
 55 * Display version number when run with no arguments
 56 *
 57 * Revision 6.42  2004/02/24 15:18:15  dondosha
 58 * Fixed several memory leaks for nucleotide clustering
 59 *
 60 * Revision 6.41  2004/02/18 20:42:59  dondosha
 61 * Cleaned up how id strings are retrieved for output
 62 *
 63 * Revision 6.40  2004/02/18 15:18:46  dondosha
 64 * Minor fix when freeing title strings
 65 *
 66 * Revision 6.39  2003/07/22 17:59:48  dondosha
 67 * Added call to BlastErrorPrint to make warnings more informative
 68 *
 69 * Revision 6.38  2003/06/13 19:59:29  dondosha
 70 * Fixed 2 purify errors
 71 *
 72 * Revision 6.37  2003/05/30 17:31:09  coulouri
 73 * add rcsid
 74 *
 75 * Revision 6.36  2003/05/13 16:02:42  coulouri
 76 * make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
 77 *
 78 * Revision 6.35  2003/05/06 20:21:13  dondosha
 79 * Typo correction
 80 *
 81 * Revision 6.34  2003/05/06 20:19:45  dondosha
 82 * Changed a confusing error message to a meaningful one
 83 *
 84 * Revision 6.33  2002/10/08 15:32:42  dondosha
 85 * Corrected file description comment in the NCBI header
 86 *
 87 * Revision 6.32  2002/09/03 16:01:11  dondosha
 88 * Introduced a restriction on the total length of concatenated sequences for nucleotide case
 89 *
 90 * Revision 6.31  2002/05/30 16:20:16  dondosha
 91 * 1. Correction in evaluation of hits for DNA case,
 92 * 2. Added word size parameter to make DNA clustering more flexible
 93 *
 94 * Revision 6.30  2002/03/14 21:07:11  dondosha
 95 * Slight improvement in finding the used id list from a gi list when reclustering
 96 *
 97 * Revision 6.29  2002/02/11 21:32:10  dondosha
 98 * Free Bioseqs after use to avoid accumulation
 99 *
100 * Revision 6.28  2001/11/16 20:34:17  dondosha
101 * Bug fix: BioseqUnlock was missing; also added ObjMgrFreeCache calls
102 *
103 * Revision 6.27  2001/08/15 17:49:08  dondosha
104 * Correction in previous change
105 *
106 * Revision 6.26  2001/08/15 17:27:17  dondosha
107 * Consider and save only the top HSP for each pair of sequences being compared
108 *
109 * Revision 6.25  2001/07/26 18:23:48  dondosha
110 * Added a -c option: configuration file with advanced BLAST options
111 *
112 * Revision 6.24  2001/02/13 16:26:03  dondosha
113 * Bug fix: memory freed in the wrong place
114 *
115 * Revision 6.23  2001/01/31 19:48:14  dondosha
116 * Change #define TMPDIR to environment variable
117 *
118 * Revision 6.22  2001/01/26 20:13:05  dondosha
119 * If at least one sequence id non-numeric, make them all non-numeric
120 *
121 * Revision 6.21  2000/12/19 18:52:38  dondosha
122 * Added option to disallow id parsing when formatting database
123 *
124 * Revision 6.20  2000/12/14 20:51:14  dondosha
125 * Do several Mega BLAST searches if more than 16383 sequences in input
126 *
127 * Revision 6.19  2000/12/07 22:57:34  dondosha
128 * Implemented nucleotide version using Mega BLAST
129 *
130 * Revision 6.18  2000/12/01 17:57:04  dondosha
131 * Handle local and general ids differently to avoid non-informative output
132 *
133 * Revision 6.17  2000/09/01 18:30:55  dondosha
134 * Create database memory map only once for all searches
135 *
136 * Revision 6.16  2000/08/08 17:58:55  dondosha
137 * Change back to create database with indexes
138 *
139 * Revision 6.15  2000/08/07 19:40:57  dondosha
140 * Removed sequence lengths from ClusterLogInfo structure - redundant information
141 *
142 * Revision 6.14  2000/08/07 15:13:36  dondosha
143 * Changed comment for -S option
144 *
145 * Revision 6.13  2000/08/04 23:24:40  dondosha
146 * Added functionality to choose neighbors based on percentage of identities
147 *
148 * Revision 6.12  2000/06/08 20:40:15  dondosha
149 * In case of equal lengths of sequences within a cluster or equal number of cluster elements, use alphabetic or numeric order when sorting ids and clusters
150 *
151 * Revision 6.11  2000/06/08 13:34:57  dondosha
152 * Increased buffer size for reading id file
153 *
154 * Revision 6.10  2000/06/07 21:57:04  dondosha
155 * Added option to provide a file with id list for reclustering
156 *
157 * Revision 6.9  2000/06/07 14:23:20  dondosha
158 * Changed ctime_r call to portable DayTimeStr
159 *
160 * Revision 6.8  2000/06/06 19:32:41  dondosha
161 * Changed progress interval back to 1000
162 *
163 * Revision 6.7  2000/06/06 19:23:42  dondosha
164 * Added option to finish incomplete clustering from the point where previous search ended
165 *
166 * Revision 6.5  2000/05/22 18:19:59  dondosha
167 * In case search set up fails, destruct all necessary stuff before going to next query
168 *
169 * Revision 6.4  2000/05/17 17:44:50  dondosha
170 * Cleaned from unused variables
171 *
172 * Revision 6.3  2000/05/05 18:16:56  dondosha
173 * Enhanced to process all types of SeqIds
174 *
175 * Revision 6.2  2000/05/03 16:50:40  dondosha
176 * Removed system calls, added sorting of clusters by size, sequences within clusters by length
177 *
178 * Revision 6.1  2000/04/27 14:47:18  dondosha
179 * Clustering of protein neighbours - initial revision
180 *
181 * Revision 1.1  2000/04/12 12:53:04   dondosha
182 * Clustering of protein neighbors
183 *
184 * ===========================================================================
185 */
186 #include <ncbi.h>
187 #include <objseq.h>
188 #include <objsset.h>
189 #include <sequtil.h>
190 #include <seqport.h>
191 #include <tofasta.h>
192 #include <blast.h>
193 #include <blastpri.h>
194 #include <simutil.h>
195 #include <txalign.h>
196 #include <gapxdrop.h>
197 #include <sqnutils.h>
198 #include <mbalign.h>
199 #include <mblast.h>
200 #include <blfmtutl.h>
201 
202 #define DEBUG 0
203 
204 /** File to write cluster output. */
205 FILE *global_fp=NULL;
206 /** Array of indices pointing to root sequences of each cluster. */
207 static Int4Ptr root;
208 /** Mutex for modifying the cluster root array. */
209 static TNlmMutex root_mutex;
210 
211 /** HSP information, used internally. */
212 typedef struct blastclust_hsp_info
213 {
214    Int4 id;    /**< Subject ordinal number. */
215    Int4 score; /**< HSP score */
216    Int4 index;
217 } BlastClustHspInfo, PNTR BlastClustHspInfoPtr;
218 
219 /** User parameters for clustering. */
220 typedef struct cluster_parameters
221 {
222    Boolean bidirectional;    /**< Is bydirectional coverage required? */
223    FloatHi length_threshold; /**< Length coverage threshold. */
224    FloatHi score_threshold;  /**< Score coverage threshold. */
225    FILE *logfp;              /**< File to write hit data to. */
226 } ClusterParameters, PNTR ClusterParametersPtr;
227 
228 /** Globally accessible parameters structure. */
229 static ClusterParametersPtr global_parameters;
230 
231 /** Basic structure written to the hit file. */
232 typedef struct cluster_log_info
233 {
234    Int4 id1; /**< Index of the first sequence for this HSP. */
235    Int4 id2; /**< Index of the second sequence for this HSP. */
236    Int4 hsp_length1; /**< Length of this HSP in first sequence. */ 
237    Int4 hsp_length2; /**< Length of this HSP in second sequence. */ 
238    FloatHi bit_score;/**< This HSP's bit score. */
239    FloatHi perc_identity; /**< This HSP's percent identity. */
240 } ClusterLogInfo, PNTR ClusterLogInfoPtr;
241 
242 /** Input sequence information. */
243 typedef struct blast_cluster_element
244 {
245    Int4 gi;   /**< Gi or other numeric identifier. */
246    CharPtr id;/**< String identifier. */
247    Int4 len;  /**< Sequence length. */
248 } BlastClusterElement, PNTR BlastClusterElementPtr;
249 
250 /** Information about all input sequences. */
251 typedef struct blast_cluster
252 {
253    Int4 size; /** Number of sequences. */
254    BlastClusterElementPtr PNTR elements; /** Array of pointers to input sequence
255                                              information structures. */
256 } BlastCluster, PNTR BlastClusterPtr;
257 
258 /** Comparison function for sorting input sequences in the clusters.
259  * Sequences are sorted as follows, with decreasing priorities
260  * 1. Decreasing order of length
261  * 2. Increasing order of numeric identifiers
262  * 3. Lexicographic order of string identifiers.
263  */
264 static int LIBCALLBACK
265 compare_cluster_elements(VoidPtr v1, VoidPtr v2)
266 {
267    BlastClusterElementPtr e1, e2;
268    
269    e1 = *(BlastClusterElementPtr PNTR) v1;
270    e2 = *(BlastClusterElementPtr PNTR) v2;
271 
272    if (e1->len > e2->len) 
273       return -1;
274    else if (e1->len < e2->len) 
275       return 1; 
276    else if (e1->gi > 0 && e2->gi>0) {
277       if (e1->gi < e2->gi)
278          return -1;
279       else if (e1->gi > e2->gi)
280          return 1;
281    } else 
282       return StrCmp(e1->id, e2->id);
283 
284    return 0;
285 }
286 
287 /** Comparison function for sorting clusters by size. Sequences within clusters
288  * are sorted in a specific order @sa compare_cluster_elements. 
289  */ 
290 static int LIBCALLBACK
291 compare_clusters(VoidPtr v1, VoidPtr v2)
292 {
293    BlastClusterPtr c1, c2;
294 
295    c1 = *(BlastClusterPtr PNTR) v1;
296    c2 = *(BlastClusterPtr PNTR) v2;
297 
298    if (c1->size > c2->size) 
299       return -1;
300    else if (c1->size < c2->size) 
301       return 1;
302    else 
303       return compare_cluster_elements(c1->elements, c2->elements);
304 }
305 
306 /** Comparison function for sorting HSP information structures. Sorting is 
307  * performed according to the following rules with decreasing priority:
308  * 1. In increasing order of sequence index;
309  * 2. In decreasing order of scores.
310  */
311 static int LIBCALLBACK
312 hsp_info_id_score_compare(VoidPtr v1, VoidPtr v2)
313 {
314    BlastClustHspInfoPtr c1, c2;
315 
316    c1 = *(BlastClustHspInfoPtr PNTR) v1;
317    c2 = *(BlastClustHspInfoPtr PNTR) v2;
318 
319    if (c1->id == c2->id && c1->score == c2->score)
320       return 0;
321    else if ((c1->id < c2->id) || 
322             (c1->id == c2->id && c1->score > c2->score)) 
323       return -1;
324    else 
325       return 1;
326 }
327 
328 /** Starting size for a cluster, which can be dynamically reallocated when
329  * necessary. 
330  */
331 #define ORIGINAL_CLUSTER_SIZE 10
332 
333 /** Creates the clusters, based on the root array, and prints them to the
334  * cluster output file. 
335  * @param num_queries Number of sequences being clustered. [in]
336  * @param seq_len Array of sequence lengths [in]
337  * @param id_list Array of sequence string identifiers [in]
338  * @param gi_list array of sequence numeric identifiers [in]
339  * @param used_id_index Array of flags indicating whether id with a given index
340  *                      has already been processed. [in]
341  */
342 static int 
343 BlastClusterNeighbours(Int4 num_queries, Int4Ptr seq_len, 
344                        CharPtr PNTR id_list, Int4Ptr gi_list,
345                        Boolean PNTR used_id_index)
346 {
347    BlastClusterPtr PNTR cluster;
348    Int4 num_clusters, available_size, index, i;
349    Boolean numeric_id_type;
350 
351    if (gi_list)
352       numeric_id_type = TRUE;
353    else if (id_list)
354       numeric_id_type = FALSE;
355    else 
356       return 0;
357 
358     for (index=1; index<num_queries; index++) {
359         i = index;
360         while (root[i] != i)
361             i = root[i];
362         root[index] = i;
363     }
364 
365     cluster = (BlastClusterPtr PNTR) 
366        MemNew(num_queries*sizeof(BlastClusterPtr));
367     num_clusters = 0;
368 
369     /* Loop over input sequence indices. */
370     for (index = 0; index < num_queries; index++) {
371         /* If this sequence hasn't been processed yet, skip it. */
372         if (used_id_index != NULL && !used_id_index[index])
373             continue;
374         /* If this sequence is the root of its corresponding cluster, 
375            start a new cluster. */
376         if (root[index] == index) {
377             cluster[num_clusters] = 
378                 (BlastClusterPtr) MemNew(sizeof(BlastCluster)); 
379             cluster[num_clusters]->size = 1;
380             cluster[num_clusters]->elements = 
381                 (BlastClusterElementPtr PNTR) 
382                 Malloc(ORIGINAL_CLUSTER_SIZE*sizeof(BlastClusterElementPtr));
383             cluster[num_clusters]->elements[0] = 
384                 (BlastClusterElementPtr) MemNew(sizeof(BlastClusterElement));
385             
386             if (numeric_id_type)
387                 cluster[num_clusters]->elements[0]->gi = gi_list[index];
388             else
389                 cluster[num_clusters]->elements[0]->id = id_list[index];
390             cluster[num_clusters]->elements[0]->len = seq_len[index];
391             available_size = ORIGINAL_CLUSTER_SIZE;
392             
393             /* Find other sequences belonging to this cluster and add them. */
394             for (i = index+1; i < num_queries; i++) {
395                 if (root[i] == index) {
396                     Int4 size = cluster[num_clusters]->size;
397                     /* Reallocate the elements array if necessary. */
398                     if (size >= available_size) {
399                         available_size *= 2;
400                         cluster[num_clusters]->elements = 
401                             (BlastClusterElementPtr PNTR) 
402                             Realloc(cluster[num_clusters]->elements, 
403                                     available_size*sizeof(BlastClusterElementPtr));
404                     } 
405                     cluster[num_clusters]->elements[size] = 
406                         (BlastClusterElementPtr) MemNew(sizeof(BlastClusterElement));
407                     if (numeric_id_type)
408                         cluster[num_clusters]->elements[size]->gi = gi_list[i];
409                     else
410                         cluster[num_clusters]->elements[size]->id = id_list[i];
411                     
412                     cluster[num_clusters]->elements[size]->len = seq_len[i];
413                     ++cluster[num_clusters]->size;
414                 }
415             }
416             num_clusters++;
417         }
418     }
419 
420     /* Sort each cluster in decreasing order of sequence lengths */
421     for (index=0; index<num_clusters; index++) 
422        HeapSort(cluster[index]->elements, cluster[index]->size, 
423                 sizeof(BlastClusterElementPtr), compare_cluster_elements);
424     /* Sort clusters in decreasing order of sizes */
425     HeapSort(cluster, num_clusters, sizeof(BlastClusterPtr), compare_clusters);
426     /* Print out all clusters */
427     for (index=0; index<num_clusters; index++) {
428         for (i=0; i<cluster[index]->size; i++) {
429             if (numeric_id_type)
430                 fprintf(global_fp, "%ld ", cluster[index]->elements[i]->gi); 
431             else 
432                 fprintf(global_fp, "%s ", cluster[index]->elements[i]->id);
433             MemFree(cluster[index]->elements[i]);
434         }
435         fprintf(global_fp, "\n");
436         MemFree(cluster[index]->elements);
437         MemFree(cluster[index]);
438     }
439     MemFree(cluster);
440     return 1;
441 }
442 
443 #define INFO_LIST_SIZE 1000
444 #define FILE_BUFFER_SIZE 4096
445 
446 /* Reclusters saved hits and returns the ordinal id of the last query found in
447  * the hits file, plus 1.
448  * @param infofp File to read hit information from [in]
449  * @param outfp File to write clusters to [in]
450  * @param gilp Pointer to an array  of gis (numeric ids) found in the hits 
451  *             file [out]
452  * @param idlp Pointer to an array of string sequence ids found in the hits 
453  *             file [out]
454  * @param seqlp Pointer to an array of lengths of sequences found in the hits
455  *              file [out]
456  * @param idfp File to read a list of ids that need to be reclustered 
457  *             (optional) [in]
458  * @return Index of the last query found in the hits file ( == ordinal id + 1)
459  */
460    
461 static Int4 ReclusterFromFile(FILE *infofp, FILE *outfp, Int4Ptr PNTR gilp,
462                               CharPtr PNTR PNTR idlp, Int4Ptr PNTR seqlp,
463                               FILE *idfp)
464 {
465    ClusterLogInfoPtr info;
466    Int4 num_queries, num_hits, i, root1, root2, total_id_len;
467    Int4Ptr gi_list = NULL, seq_len = NULL;
468    CharPtr PNTR id_list = NULL;
469    CharPtr ptr, id_string = NULL;
470    FloatHi length_coverage, score_coverage; 
471    Uint4 header_size, numeric_id_type;
472    Int4 last_seq = -1;
473    CharPtr id = NULL;
474    Boolean PNTR used_id_index = NULL;
475 
476    /* Read header data from the hits file. */
477    FileRead(&numeric_id_type, sizeof(Int4), 1, infofp);
478    FileRead(&header_size, sizeof(Int4), 1, infofp);
479 
480    if (numeric_id_type) {
481        num_queries = header_size;
482        gi_list = (Int4Ptr) MemNew(num_queries*sizeof(Int4));
483        FileRead(gi_list, sizeof(Int4), num_queries, infofp);
484    } else {
485        total_id_len = header_size;
486        num_queries = 0;
487        id_string = (CharPtr) MemNew(total_id_len+1);
488        FileRead(id_string, sizeof(Char), total_id_len, infofp);
489        ptr = id_string;
490        /* Count the ID's and change delimiter from space to null character */
491        for (i=0; i<total_id_len; i++) {
492            if (*ptr==' ') {
493                num_queries++;
494                *ptr = '\0';
495            }
496            ptr++;
497        }
498        id_list = (CharPtr PNTR) MemNew(num_queries*sizeof(CharPtr));
499        for (i=0; i<num_queries; i++) {
500            /* No need for allocation of new memory */
501            id_list[i] = id_string;
502            id_string += StringLen(id_string) + 1;
503        }
504        id_string = id_list[0];
505    }
506 
507    seq_len = (Int4Ptr) Malloc(num_queries*sizeof(Int4));
508    FileRead(seq_len, sizeof(Int4), num_queries, infofp);
509 
510    info = (ClusterLogInfoPtr) MemNew(INFO_LIST_SIZE*sizeof(ClusterLogInfo));
511 
512    /* Initialize the root array, putting each sequence in its own 
513       one-element cluster. */
514    root = (Int4Ptr) Malloc(num_queries*sizeof(Int4));
515    for (i=0; i<num_queries; i++)
516       root[i] = i;
517 
518    /* Test for list of ids to use for reclustering, and fill the array of
519       ids used in this reclustering. */
520    if (idfp) {
521       CharPtr used_id_string;
522       CharPtr delimiter_list = " \t\n,;";
523       Int4 length;
524 
525       used_id_string = (CharPtr) MemNew(FILE_BUFFER_SIZE+1);
526       used_id_index = (Boolean PNTR) MemNew(num_queries*sizeof(Boolean));
527       while((length = 
528              FileRead(used_id_string, sizeof(Char), FILE_BUFFER_SIZE, idfp))) {
529           ptr = used_id_string;
530           while ((id = StringTokMT(ptr, delimiter_list, &ptr)) 
531                  != NULL) {
532               if (ptr==NULL && length==FILE_BUFFER_SIZE) {
533                   fseek(idfp, -StrLen(id), SEEK_CUR);
534                   break;
535               }
536               for (i=0; i<num_queries; i++) {
537                   if ((gi_list && atoi(id) == gi_list[i]) ||
538                       (!gi_list && !StrCmp(id_list[i], id)))
539                       break;
540               }
541               if (i < num_queries)
542                   used_id_index[i] = TRUE;
543           }
544       }
545    }
546 
547    /* Read the HSP data. */
548    while ((num_hits = FileRead(info, sizeof(ClusterLogInfo), INFO_LIST_SIZE,
549                                infofp)) > 0) {
550       for (i=0; i<num_hits; i++) {
551           /* If one of the sequences involved in this HSP is not of interest,
552              skip this HSP. */
553           if (used_id_index && 
554               (!used_id_index[info[i].id1] || !used_id_index[info[i].id2]))
555               continue;
556 
557           /* Calculate the coverage numbers. */
558           if (global_parameters->bidirectional)
559               length_coverage = MIN(((FloatHi)info[i].hsp_length1) / 
560                                     seq_len[info[i].id1], 
561                                     ((FloatHi)info[i].hsp_length2) / 
562                                     seq_len[info[i].id2]);
563           else
564               length_coverage = MAX(((FloatHi)info[i].hsp_length1) / 
565                                     seq_len[info[i].id1], 
566                                     ((FloatHi)info[i].hsp_length2) / 
567                                     seq_len[info[i].id2]);
568           
569           if (global_parameters->score_threshold < 3.0)
570               score_coverage = info[i].bit_score / 
571                   (MAX(info[i].hsp_length1, info[i].hsp_length2));
572           else 
573               score_coverage = info[i].perc_identity;
574           
575           /* If coverage satisfies the input requirements, update the cluster
576              root information. */
577           if (length_coverage >= global_parameters->length_threshold && 
578               score_coverage >= global_parameters->score_threshold) {
579               root1 = info[i].id1;
580               while (root[root1] != root1)
581                   root1 = root[root1];
582               root2 = info[i].id2;
583               while (root[root2] != root2)
584                   root2 = root[root2];
585               
586               if (root1 < root2)
587                   root[root2] = root1;
588               else if (root1 > root2)
589                   root[root1] = root2;
590           }
591       } /* End loop on hits from a chunk */
592       last_seq = info[num_hits-1].id1;
593    } /* End loop on chunks of hits */
594    
595    /* Create the cluster structures and print out. */
596    if (outfp != NULL) {
597       global_fp = outfp;
598       BlastClusterNeighbours(num_queries, seq_len, id_list, gi_list, 
599                              used_id_index);
600    }
601    *gilp = gi_list;
602    *idlp = id_list;
603    *seqlp = seq_len;
604    MemFree(id_string);
605    MemFree(info);
606    return last_seq + 1;
607 }
608 
609 /** Calculates percent identity given a gapped alignment block.
610  * @param HSP gapped alignment structure, returned from a gapped alignment 
611  *        routine. [in]
612  * @return percent of identical matches in this alignment.
613  */ 
614 static FloatHi 
615 GapAlignPercentIdentity(GapAlignBlkPtr gabp)
616 {
617    GapXEditScriptPtr esp = gabp->edit_block->esp;
618    Int4 identical, total, q_index, s_index, i;
619    FloatHi perc_identity;
620    Uint1Ptr query, subject;
621 
622    q_index = gabp->query_start;
623    s_index = gabp->subject_start;
624    query = gabp->query + q_index;
625    subject = gabp->subject + s_index;
626    identical = total = 0;
627 
628    while (esp) {
629       if (esp->op_type == GAPALIGN_SUB || esp->op_type == GAPALIGN_DECLINE) {
630          for (i=0; i<esp->num; i++) {
631             if (*query == *subject)
632                identical++;
633             query++;
634             subject++;
635          }
636       } else if (esp->op_type == GAPALIGN_DEL) 
637          subject += esp->num;
638       else if (esp->op_type == GAPALIGN_INS)
639          query += esp->num;
640       total += esp->num;
641       esp = esp->next;
642    }
643    perc_identity = ((FloatHi)identical) / total * 100;
644 
645    return perc_identity;
646 }
647    
648 /** Line length for configuration file with advanced options. */
649 #define BUFFER_SIZE 80
650 
651 /** Callback for printing out HSP data into the hits file.
652  * @param ptr BlastSearchBlkPtr, containing current list of HSPs. [in]
653  */
654 static int LIBCALLBACK
655 PrintNeighbors(VoidPtr ptr)
656 {
657     BLAST_HitListPtr current_hitlist;
658     BLAST_HSPPtr hsp; 
659     Int4 index;
660     BlastSearchBlkPtr search;
661     Int4 id1, id2, root1, root2, hspcnt;
662     Int4 subject_length;
663     Uint1Ptr subject, query;
664     ClusterLogInfoPtr loginfo = NULL;
665     BlastClustHspInfoPtr PNTR hsp_info_array;
666     Int4 query_count;
667 
668     if (ptr == NULL)
669         return 0;       
670     
671     search = (BlastSearchBlkPtr) ptr;
672     
673     if (search->current_hitlist == NULL || search->current_hitlist->hspcnt <= 0) {
674         /* No hits to save. */
675         search->subject_info = BLASTSubjectInfoDestruct(search->subject_info);
676         return 0;
677     }
678     
679     current_hitlist = search->current_hitlist;
680     
681     if (search->prog_number == blast_type_blastp)
682         subject_length = readdb_get_sequence(search->rdfp, search->subject_id, &subject);
683     else { /* Mega BLAST saves ncbi4na-encoded sequence */
684         subject = search->subject->sequence_start + 1;
685         subject_length = search->subject->length;
686     }
687     hspcnt = current_hitlist->hspcnt;
688     
689     if (search->prog_number == blast_type_blastp)
690        id1 = SeqId2OrdinalId(search->rdfp, search->query_id);
691     else 
692        id1 = -1;
693     id2 = search->subject_id;
694 
695     /* For blastp, save only HSPs with first sequence index smaller than 
696        second. */
697     if (id1 < id2) { 
698 #define BUF_CHUNK_SIZE 1024
699         Int4 query_length, q_length, s_length;
700         FloatHi length_coverage, bit_score, score_coverage, perc_identity; 
701         BLAST_KarlinBlkPtr kbp;
702         GapAlignBlkPtr gap_align = search->gap_align;
703         Int2 context;
704         Int4 max_score = 0;
705 
706         if (search->prog_number == blast_type_blastp) {
707            query = search->context[0].query->sequence;
708            query_length = search->context[0].query->length;
709         }
710 
711         query_count = 1;
712 
713         /* For blastn, the current hit list contains HSPs from multiple "query"
714            sequences. Create an array of HSP information structures, and find
715            highest scoring HSP for every pair of sequences present in the 
716            current list. */
717         if (search->prog_number == blast_type_blastn) {
718            hsp_info_array = (BlastClustHspInfoPtr PNTR) 
719               Malloc(hspcnt*sizeof(BlastClustHspInfoPtr));
720            for (index=0; index<hspcnt; index++) {
721               hsp_info_array[index] = (BlastClustHspInfoPtr)
722                  Malloc(sizeof(BlastClustHspInfo));
723               hsp = current_hitlist->hsp_array[index];
724               hsp_info_array[index]->id = 
725                  SeqId2OrdinalId(search->rdfp,
726                                  search->qid_array[hsp->context/2]);
727               hsp_info_array[index]->score = hsp->score;
728               hsp_info_array[index]->index = index;
729            }
730            HeapSort(hsp_info_array, hspcnt, sizeof(BlastClustHspInfoPtr), 
731                     hsp_info_id_score_compare);
732            /* Leave only one highest scoring hsp per "query" sequence.
733               Deallocate memory for all others. */
734            for (index=1; index<hspcnt; index++) {
735               if (hsp_info_array[index]->id < id2 && 
736                   hsp_info_array[index]->id != 
737                   hsp_info_array[query_count-1]->id) {
738                  hsp_info_array[query_count] = hsp_info_array[index];
739                  query_count++;
740               } else 
741                  hsp_info_array[index] = MemFree(hsp_info_array[index]);
742            }
743            
744            hsp = current_hitlist->hsp_array[hsp_info_array[0]->index];
745         } else {
746             /* For blastp, just find the highest-scoring HSP. */
747            for (index=0; index<hspcnt; index++) {
748               if (current_hitlist->hsp_array[index]->score > max_score) {
749                  hsp = current_hitlist->hsp_array[index];
750                  max_score = hsp->score;
751               }
752            }
753         }
754 
755         if (global_parameters->logfp)
756             loginfo = (ClusterLogInfoPtr)
757                 MemNew(query_count*sizeof(ClusterLogInfo));
758         
759         index = 0;
760         while (hsp) {
761             /* Calculate the coverage values for this HSP. */
762             if (search->prog_number == blast_type_blastn) {
763                 context = hsp->context;
764                 id1 = hsp_info_array[index]->id;
765                 query_length = 
766                     search->query_context_offsets[context+1] -
767                     search->query_context_offsets[context] - 1;
768             }
769             q_length = hsp->query.length;
770             s_length = hsp->subject.length;
771             
772             if (global_parameters->bidirectional)
773                 length_coverage = MIN(((FloatHi)q_length) / query_length, 
774                                       ((FloatHi)s_length) / subject_length);
775             else
776                 length_coverage = MAX(((FloatHi)q_length) / query_length, 
777                                       ((FloatHi)s_length) / subject_length);
778             
779             if (search->prog_number == blast_type_blastp)
780                 kbp = search->sbp->kbp_gap[search->first_context];
781             else
782                 kbp = search->sbp->kbp[context];
783             bit_score = ((hsp->score*kbp->Lambda) -
784                          kbp->logK)/NCBIMATH_LN2;
785             
786             /* Calculate percent identity, performing traceback search, if
787                necessary. */
788             if (search->prog_number == blast_type_blastp) {
789                 gap_align->query_frame = ContextToFrame(search,
790                                                         hsp->context);
791                 gap_align->subject_frame = hsp->subject.frame;
792                 gap_align->q_start = hsp->query.gapped_start;
793                 gap_align->s_start = hsp->subject.gapped_start;
794                 PerformGappedAlignmentWithTraceback(gap_align);
795                 perc_identity = GapAlignPercentIdentity(gap_align);
796                 gap_align->state_struct = 
797                     GapXDropStateDestroy(gap_align->state_struct);
798                 gap_align->edit_block = 
799                     GapXEditBlockDelete(gap_align->edit_block);
800             } else {
801                 Int4Ptr length, start;
802                 Uint1Ptr strands;
803                 Int4 align_length = 0, numseg, i;
804                 GapXEditScriptPtr esp;
805                 query = search->context[context].query->sequence;
806                 esp = hsp->gap_info->esp;
807                 for (numseg=0; esp; esp = esp->next, numseg++);
808                 GXECollectDataForSeqalign(hsp->gap_info, 
809                                           hsp->gap_info->esp, numseg,
810                                           &start, &length, &strands, 
811                                           &hsp->query.offset, 
812                                           &hsp->subject.offset);
813                 perc_identity = 0;
814                 for (i=0; i<numseg; i++) {
815                     align_length += length[i];
816                     if (start[2*i] != -1 && start[2*i+1] != -1)
817                         perc_identity += 
818                             MegaBlastGetNumIdentical(query, subject, start[2*i],
819                                                      start[2*i+1], length[i], 
820                                                      FALSE);
821                 }
822                 perc_identity = perc_identity / align_length * 100;
823                 start = MemFree(start);
824                 length = MemFree(length);
825                 strands = MemFree(strands);
826             }
827             if (global_parameters->score_threshold < 3.0)
828                 score_coverage = bit_score / (MAX(q_length, s_length));
829             else
830                 score_coverage = perc_identity;
831             
832             /* If hits are being saved, save this HSP's information. */
833             if (global_parameters->logfp) {
834                 loginfo[index].id1 = id1;
835                 loginfo[index].id2 = id2;
836                 loginfo[index].hsp_length1 = q_length;
837                 loginfo[index].hsp_length2 = s_length;
838                 loginfo[index].bit_score = bit_score;
839                 loginfo[index].perc_identity = perc_identity;
840             }
841             
842             /* If this HSP satisfies the coverage criteria, update the 
843                cluster roots. */
844             if (length_coverage >= global_parameters->length_threshold && 
845                 score_coverage >= global_parameters->score_threshold) {
846                 root1 = id1;
847                 
848                 NlmMutexLockEx(&root_mutex);
849                 while (root[root1] != root1)
850                     root1 = root[root1];
851                 root2 = id2;
852                 while (root[root2] != root2)
853                     root2 = root[root2];
854                 
855                 if (root1 < root2)
856                     root[root2] = root1;
857                 else if (root1 > root2)
858                     root[root1] = root2;
859                 NlmMutexUnlock(root_mutex);
860             }
861 
862            /* For blastn, get to the highest scoring hsp for the next 
863               "query" */
864            if (search->prog_number == blast_type_blastn && 
865                ++index < query_count)
866               hsp =
867                  current_hitlist->hsp_array[hsp_info_array[index]->index];
868            else
869               hsp = NULL;
870         }
871 
872         /* Deallocate memory for the auxiliary array of HSP info 
873            structures. Also free all gap information structures, 
874            because they are not freed when hit list is cleaned. */
875         if (search->prog_number == blast_type_blastn) {
876            for (index=0; index<query_count; index++)
877               MemFree(hsp_info_array[index]);
878            hsp_info_array = MemFree(hsp_info_array);
879            
880            for (index=0; index < current_hitlist->hspcnt; ++index) {
881               hsp = current_hitlist->hsp_array[index];
882               hsp->gap_info = GapXEditBlockDelete(hsp->gap_info);
883            }
884         }
885 
886         if (global_parameters->logfp && loginfo) {
887             FileWrite(loginfo, sizeof(ClusterLogInfo), query_count, 
888                       global_parameters->logfp);
889             loginfo = MemFree(loginfo);
890             fflush(global_parameters->logfp);
891         }
892     } else {
893        /* This can't happen in normal situation. If it does, the most likely
894           reason is presense of non-unique identifiers in the input file */
895        ErrPostEx(SEV_FATAL, 1, 0, "Blastclust cannot process input files with"
896                  " non-unique sequence identifiers\n");
897        return 1;
898     }
899        
900     return 0;
901 }
902 
903 #ifdef NUMARG
904 #undef NUMARG
905 #endif
906 #define NUMARG (sizeof(myargs)/sizeof(myargs[0]))
907 
908 static Args myargs [] = {
909    { "FASTA input file (program will format the database and remove files in the end)",                                                         /* 0 */
910      "stdin", NULL, NULL, FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
911    { "Number of CPU's to use",                                   /* 1 */
912       "1", NULL, NULL, FALSE, 'a', ARG_INT, 0.0, 0, NULL},       
913    { "Output file for list of clusters",                         /* 2 */
914       "stdout", NULL, NULL, FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
915    { "Length coverage threshold",                                /* 3 */
916       "0.9", NULL, NULL, FALSE, 'L', ARG_FLOAT, 0.0, 0, NULL},   
917    { "Score coverage threshold (bit score / length if < 3.0, percentage of identities otherwise)",                                                              /* 4 */
918       "1.75", NULL, NULL, FALSE, 'S', ARG_FLOAT, 0.0, 0, NULL},  
919    { "Require coverage on both neighbours?",                     /* 5 */
920       "TRUE", NULL, NULL, FALSE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
921    { "File to save all neighbours",                              /* 6 */
922       NULL, NULL, NULL, TRUE, 's', ARG_FILE_OUT, 0.0, 0, NULL},
923    { "File to restore neighbors for reclustering",               /* 7 */
924       NULL, NULL, NULL, TRUE, 'r', ARG_FILE_IN, 0.0, 0, NULL}, 
925    { "Input as a database",                                      /* 8 */
926       NULL, NULL, NULL, TRUE, 'd', ARG_FILE_IN, 0.0, 0, NULL}, 
927    { "Print progress messages (verbose mode)",                   /* 9 */
928       "stdout", NULL, NULL, FALSE, 'v', ARG_FILE_OUT, 0.0, 0, NULL}, 
929    { "Complete unfinished clustering",                           /* 10 */
930       "FALSE", NULL, NULL, FALSE, 'C', ARG_BOOLEAN, 0.0, 0, NULL},   
931    { "Restrict reclustering to id list",                         /* 11 */
932       NULL, NULL, NULL, TRUE, 'l', ARG_FILE_IN, 0.0, 0, NULL},   
933    { "Is input proteins?",                                       /* 12 */
934       "TRUE", NULL, NULL, FALSE, 'p', ARG_BOOLEAN, 0.0, 0, NULL},
935    { "Enable id parsing in database formatting?",                /* 13 */
936       "TRUE", NULL, NULL, FALSE, 'e', ARG_BOOLEAN, 0.0, 0, NULL},
937    { "Configuration file with advanced options",                 /* 14 */
938      NULL, NULL, NULL, TRUE, 'c', ARG_FILE_IN, 0.0, 0, NULL},
939    { "Word size to use (0 for default: proteins 3, nucleotides 32)",/* 15 */
940      "0", NULL, NULL, FALSE, 'W', ARG_INT, 0.0, 0, NULL}       
941  
942 };
943 
944 /** Print progress messages after how many processed sequences? */
945 #define PROGRESS_INTERVAL 1000
946 /** Maximal number of sequences to concatenate for a blastn search. */
947 #define MAX_NUM_QUERIES 16383 /* == 1/2 INT2_MAX */
948 /** Maximal total length of concatenated sequences in a blastn search. */
949 #define MAX_TOTAL_LENGTH 5000000
950 
951 Int2 Main (void)
952 {
953     BLAST_OptionsBlkPtr options;
954     BlastSearchBlkPtr search;
955     Boolean db_is_na, query_is_na;
956     Int4 qsize, dbsize, first_seq;
957     ReadDBFILEPtr rdfp;
958     Uint1 align_type;
959     SeqIdPtr sip;
960     BioseqPtr query_bsp = NULL, PNTR query_bsp_array;
961     CharPtr blast_program, blast_inputfile, blast_outputfile, blast_database,
962        progress_file = NULL;
963     CharPtr logfile, info_file, input_name;
964     Int4 total_id_len = 0;
965     FILE *outfp, *infofp, *progressfp, *idfp;
966     Int4 index, i, num_queries, num_bsps, num_left;
967     Int8 total_length;
968     Int4Ptr gi_list = NULL, seq_len = NULL;
969     CharPtr PNTR id_list = NULL, id_string = NULL;
970     Boolean db_formatted = FALSE;
971     Boolean numeric_id_type = TRUE;
972     Char db_file[BUFFER_SIZE];
973     Boolean print_progress, finish_incomplete, is_prot, parse_mode;
974     FDB_optionsPtr fdb_options;
975     Char timestr[24];
976     CharPtr tmpdir;
977     Int4 total_query_length;
978     CharPtr title = NULL;
979     Char buf[256] = { '\0' };
980 
981     StringCpy(buf, "blastclust ");
982     StringNCat(buf, BlastGetVersionNumber(), sizeof(buf)-StringLen(buf)-1);
983 
984     if (! GetArgs (buf, NUMARG, myargs))
985        return (1);
986     
987     UseLocalAsnloadDataAndErrMsg ();
988     
989     if (! SeqEntryLoad())
990        return 1;
991     
992     ErrSetMessageLevel(SEV_WARNING);
993 
994     global_parameters = (ClusterParametersPtr) MemNew(sizeof(ClusterParameters));
995 
996     global_parameters->length_threshold = myargs[3].floatvalue;
997     global_parameters->score_threshold = myargs[4].floatvalue;
998     global_parameters->bidirectional = (Boolean) myargs[5].intvalue;
999     finish_incomplete = (Boolean) myargs[10].intvalue;
1000     
1001     print_progress = (Boolean) StrCmp(myargs[9].strvalue, "F");
1002     if (print_progress)
1003        progress_file = myargs[9].strvalue;
1004 
1005     if (progress_file != NULL &&
1006         (progressfp = FileOpen(progress_file, "w")) == NULL) {
1007        ErrPostEx(SEV_FATAL, 1, 0, "blastclust: Unable to open progress file %s\n",
1008                  progress_file);
1009        return (1);
1010     }
1011     
1012     blast_outputfile = myargs[2].strvalue;
1013     outfp = NULL;
1014     if (blast_outputfile != NULL) {
1015         if ((outfp = FileOpen(blast_outputfile, "w")) == NULL) {
1016             ErrPostEx(SEV_FATAL, 1, 0, "blastclust: Unable to open output file %s\n", blast_outputfile);
1017             return (1);
1018         }
1019     }
1020 
1021     info_file = myargs[7].strvalue;
1022     if (info_file) {
1023        /* Non-empty string means only retrieve neighbors for reclustering */
1024        if ((infofp = FileOpen(info_file, "rb")) == NULL) { 
1025           ErrPostEx(SEV_FATAL, 1, 0, "blastclust: Unable to open neighbors file %s for reading\n", info_file);
1026           return (1);
1027        }
1028        if (myargs[11].strvalue) {
1029           if ((idfp = FileOpen(myargs[11].strvalue, "r")) == NULL) {
1030              ErrPostEx(SEV_FATAL, 1, 0, "blastclust: Unable to open id file %s for reading\n", myargs[11].strvalue);
1031              return (1);
1032           }
1033        } else
1034           idfp = NULL;
1035        /* No need for another search, simply get all the neighbours
1036           and reculster them using new thresholds */
1037        ReclusterFromFile(infofp, outfp, &gi_list, &id_list, &seq_len, idfp);
1038        MemFree(gi_list);
1039        MemFree(id_list);
1040        MemFree(seq_len);
1041        FileClose(infofp);
1042        FileClose(outfp);
1043        return 0;
1044     } else
1045        infofp = NULL;
1046 
1047     is_prot = (Boolean) myargs[12].intvalue;
1048     parse_mode = (Boolean) myargs[13].intvalue;
1049     if (is_prot)
1050        blast_program = StringSave("blastp");
1051     else 
1052        blast_program = StringSave("blastn");
1053 
1054     if (myargs[8].strvalue)
1055        db_formatted = TRUE;
1056 
1057     if (db_formatted) {
1058        blast_database = myargs[8].strvalue;
1059     } else {
1060        /* Need to format the database */
1061        blast_inputfile = myargs[0].strvalue;
1062        input_name = FileNameFind(blast_inputfile);
1063 
1064        if (tmpdir = getenv("TMPDIR")) {
1065           blast_database = 
1066              Malloc(StringLen(input_name) + StringLen(tmpdir) + 2);
1067           sprintf(blast_database, "%s/%s", tmpdir, input_name);
1068        } else
1069           blast_database = blast_inputfile;
1070 
1071        fdb_options = FDBOptionsNew(blast_inputfile, is_prot, NULL, FALSE, 
1072                                    FALSE, FALSE, FALSE, TRUE, parse_mode, 
1073                                    blast_database, NULL, 0, 0, FORMATDB_VER,
1074                                    FALSE, FALSE);
1075        FastaToBlastDB(fdb_options, 0);
1076 
1077        fdb_options = FDBOptionsFree(fdb_options);
1078     }
1079 
1080     logfile = myargs[6].strvalue;
1081     if (logfile) { /* Empty string means do not write log information */
1082        if (finish_incomplete) {
1083           if ((global_parameters->logfp = FileOpen(logfile, "ab+")) == NULL) { 
1084              ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open log file %s for appending\n", logfile);
1085              return (1);
1086           }
1087        } else {
1088           if ((global_parameters->logfp = FileOpen(logfile, "wb+")) == NULL) { 
1089              ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open log file %s for writing\n", logfile);
1090              return (1);
1091           }
1092        }
1093     }
1094     global_fp = outfp;
1095 
1096     align_type = BlastGetTypes(blast_program, &query_is_na, &db_is_na);
1097     
1098     rdfp = readdb_new(blast_database, is_prot);
1099     
1100     options = BLASTOptionNew(blast_program, TRUE);
1101     
1102     if (options == NULL)
1103         return 3;
1104 
1105     /* Set the default search options. */
1106     options->use_real_db_size = TRUE;
1107     options->sort_gi_list = FALSE;
1108     
1109     options->expect_value  = 1e-6;      
1110     qsize = 300;
1111     dbsize = (20*1000*1000);
1112     options->searchsp_eff = (FloatHi) qsize * (FloatHi) dbsize;
1113     options->perform_culling = FALSE;
1114     options->do_not_reevaluate = TRUE;
1115     options->do_sum_stats = FALSE;
1116     options->number_of_cpus = myargs[1].intvalue;
1117     if (!is_prot) {
1118        options->is_megablast_search = TRUE;
1119        options->gap_open = options->gap_extend = 0;
1120        options->wordsize = 32;
1121        options->block_width = 0;
1122        options->window_size = 0;
1123     }    
1124 
1125     if (myargs[15].intvalue != 0)
1126        options->wordsize = myargs[15].intvalue;
1127 
1128     if (myargs[14].strvalue) {
1129        CharPtr string_options = NULL;
1130        Int2 string_options_len = 0;
1131        CharPtr error = NULL;
1132        FILE *config_file;
1133        Char opt_line[BUFFER_SIZE+1];
1134        
1135        if ((config_file = FileOpen(myargs[14].strvalue, "r")) == NULL) {
1136           ErrPostEx(SEV_FATAL, 1, 0, 
1137                     "Cannot open advanced options configuration file %s\n",
1138                     myargs[14].strvalue);
1139        }
1140 
1141        /* Read advanced options from a configuration file, one line at a
1142           time. */
1143        while (fgets(opt_line, BUFFER_SIZE, config_file) != 0) {
1144           if (*opt_line == '#')
1145              continue;
1146           if (!string_options) {
1147              string_options = (CharPtr) MemNew(BUFFER_SIZE+1);
1148              string_options_len = BUFFER_SIZE;
1149           } else {
1150              if (StrLen(string_options) + StrLen(opt_line) + 1 > 
1151                  string_options_len) {
1152                 CharPtr tmp = (CharPtr) Realloc(string_options, 
1153                                                 2*string_options_len);
1154                 if (tmp) {
1155                    string_options = tmp;
1156                    string_options_len *= 2;
1157                 }
1158              }
1159              StringCat(string_options, " ");
1160           }
1161           StringCat(string_options, opt_line);
1162        }
1163        if (string_options)
1164           parse_blast_options(options, string_options, &error, 
1165                               NULL, NULL, NULL);
1166        if (error) {
1167           ErrPostEx(SEV_WARNING, 0, 0, "blastclust: parse_blast_options: %s\n",
1168                     error);
1169        }
1170     }
1171 
1172     readdb_get_totals_ex(rdfp, &total_length, &num_queries, TRUE);
1173 
1174     root = (Int4Ptr) Malloc(num_queries*sizeof(Int4));
1175 
1176     /* If this is a continuation of a previous search that has not been 
1177        completed, read the previous search information and start from there. */
1178     if (is_prot && finish_incomplete) {
1179        first_seq = ReclusterFromFile(global_parameters->logfp, NULL, &gi_list,
1180                                     &id_list, &seq_len, NULL);
1181     } else {
1182        Uint4 header_size = 0;
1183        Uint4 header_numeric_id_type = 0;
1184        first_seq = 0;
1185        /* Put each sequence in its own one-element cluster. */
1186        for (index=0; index<num_queries; index++)
1187            root[index] = index;
1188        
1189        gi_list = (Int4Ptr) MemNew(num_queries*sizeof(Int4));
1190        id_list = (CharPtr PNTR) MemNew(num_queries*sizeof(CharPtr));
1191        seq_len = (Int4Ptr) MemNew(num_queries*sizeof(Int4));
1192        
1193        /* Get the query identificators, decide if they are string or 
1194           numeric. */
1195        for (index=0; index<num_queries; index++) {
1196            readdb_get_descriptor(rdfp, index, &sip, &title);
1197            seq_len[index] = readdb_get_sequence_length(rdfp, index);
1198           
1199           if (sip->choice != SEQID_GENERAL ||
1200               StringCmp(((DbtagPtr)sip->data.ptrvalue)->db, "BL_ORD_ID")) {
1201              numeric_id_type &= 
1202                 GetAccessionFromSeqId(sip, &gi_list[index], 
1203                                       &id_list[index]); 
1204           } else {
1205              CharPtr dummy_ptr = NULL;
1206              numeric_id_type = FALSE;
1207              if (title && *title != NULLB) {
1208                 id_list[index] = 
1209                    StringSave(StringTokMT(title, " ", &dummy_ptr));
1210              } else {
1211                 id_list[index] = (CharPtr) Malloc(BUFFER_SIZE+1);
1212                 SeqIdWrite(sip, id_list[index],
1213                            PRINTID_FASTA_SHORT, BUFFER_SIZE);
1214              }
1215           }
1216           title = MemFree(title);
1217           sip = SeqIdSetFree(sip);
1218        }
1219 
1220        if (numeric_id_type) {
1221            id_list = MemFree(id_list);
1222            header_size = num_queries;
1223            header_numeric_id_type = 1;
1224        } else {
1225            total_id_len = 0;
1226            /* Check if some ids were gis and convert them to strings */
1227            for (i=0; i<num_queries; i++) {
1228                if (gi_list[i] > 0) {
1229                    id_list[i] = (CharPtr) MemNew(10);
1230                    sprintf(id_list[i], "%ld", gi_list[i]);
1231                }
1232                total_id_len += StringLen(id_list[i]) + 1;
1233            }
1234            gi_list = MemFree(gi_list);
1235            id_string = (CharPtr) MemNew(total_id_len+1);
1236            for (i=0; i<num_queries; i++) {
1237                StringCat(id_string, id_list[i]);
1238                StringCat(id_string, " ");
1239            }
1240            header_size = total_id_len;
1241        }
1242        
1243        /* Write the sequence identifiers and lengths into the hits file
1244           header. */
1245        FileWrite(&header_numeric_id_type, sizeof(Uint4), 1, 
1246                  global_parameters->logfp);
1247        FileWrite(&header_size, sizeof(Uint4), 1, global_parameters->logfp);
1248 
1249        if (numeric_id_type) 
1250            FileWrite(gi_list, sizeof(Int4), num_queries, 
1251                      global_parameters->logfp);
1252        else {
1253            FileWrite(id_string, sizeof(Char), total_id_len, 
1254                      global_parameters->logfp);
1255            MemFree(id_string);
1256        }
1257        
1258        FileWrite(seq_len, sizeof(Int4), num_queries, global_parameters->logfp);
1259        fflush(global_parameters->logfp);
1260     }
1261 
1262     /* Print the first progress message, if necessary. */
1263     if (print_progress) {
1264         DayTimeStr(timestr, TRUE, TRUE);
1265         if (finish_incomplete)
1266             fprintf(progressfp, 
1267                "%s Finish clustering of %ld queries, starting from query %ld\n", 
1268                     timestr, num_queries, first_seq);
1269         else
1270             fprintf(progressfp, "%s Start clustering of %ld queries\n", 
1271                     timestr, num_queries);
1272     }
1273 
1274     if (!is_prot) {
1275        SeqAlignPtr PNTR head; /* Will not be used */
1276 
1277        query_bsp_array = (BioseqPtr PNTR) 
1278           Malloc((MIN(num_queries, MAX_NUM_QUERIES)+1)*sizeof(BioseqPtr));
1279        num_left = num_queries;
1280        /* For blastn, concatenate as many queries as possible and run the 
1281           search engine. */
1282        while (num_left>0) {
1283           num_bsps = MIN(num_left, MAX_NUM_QUERIES);
1284           total_query_length = 0;
1285           for (index=0; index<num_bsps; index++) {
1286              query_bsp_array[index] = readdb_get_bioseq(rdfp, index);
1287              total_query_length += query_bsp_array[index]->length;
1288              if (total_query_length > MAX_TOTAL_LENGTH)
1289                 break;
1290           }
1291           num_bsps = index;
1292           query_bsp_array[num_bsps] = NULL;
1293           
1294           head = BioseqMegaBlastEngine(query_bsp_array, blast_program, 
1295                                        blast_database, options, NULL, NULL,
1296                                        NULL, NULL, NULL, 0, PrintNeighbors);
1297           for (index=0; index<num_bsps; index++)
1298              query_bsp_array[index] = BioseqFree(query_bsp_array[index]);
1299           num_left -= num_bsps;
1300           ObjMgrFreeCache(0);
1301        }
1302        MemFree(query_bsp_array);
1303     } else {
1304         /* Loop over all sequences, searching each time only against 
1305            sequences with larger indices. */
1306        for (index=first_seq; index<num_queries; index++) {
1307           query_bsp = readdb_get_bioseq(rdfp, index);
1308           /* Set the starting database sequence for the search. */
1309           options->first_db_seq = index + 1;
1310           
1311           /* Set up search. */
1312           search = BLASTSetUpSearchWithReadDbInternal(NULL, query_bsp,
1313                       blast_program, seq_len[index], blast_database, 
1314                       options, NULL, NULL, NULL, 0, rdfp);
1315           if (search != NULL && !search->query_invalid) {
1316              search->handle_results = PrintNeighbors;
1317              
1318              /* Run BLAST. */
1319              do_the_blast_run(search);
1320           } else if (search) {
1321              BlastErrorPrint(search->error_return);
1322              ErrPostEx(SEV_ERROR, 1, 0, "Failed to process query number %ld", (long) index);
1323              return 1;
1324           }
1325           search = BlastSearchBlkDestruct(search);
1326           query_bsp = BioseqFree(query_bsp);
1327           
1328           if (print_progress && (index + 1)%PROGRESS_INTERVAL == 0) {
1329              DayTimeStr(timestr, TRUE, TRUE);
1330              fprintf(progressfp, "%s Finished processing of %ld queries\n", 
1331                      timestr, index+1);
1332           }
1333        } /* End of loop on queries */
1334     }
1335     rdfp = readdb_destruct(rdfp);
1336 
1337     /* Create the clusters and print them out. */
1338     BlastClusterNeighbours(num_queries, seq_len, id_list, gi_list, NULL);
1339 
1340     MemFree(seq_len);
1341     if (numeric_id_type)
1342        MemFree(gi_list);
1343     else {
1344        for (i=0; i<num_queries; i++)
1345           MemFree(id_list[i]);
1346        MemFree(id_list);
1347     }
1348     MemFree(root);
1349     
1350     fflush(global_fp);
1351     options = BLASTOptionDelete(options);
1352     blast_program = (CharPtr) MemFree(blast_program);
1353     FileClose(outfp);
1354     if (global_parameters->logfp)
1355        FileClose(global_parameters->logfp);
1356 
1357     /* Remove the temporary BLAST database files. */
1358     if (!db_formatted && StringLen(blast_database) > 0) {
1359        Char p_or_n = (is_prot) ? 'p' : 'n';
1360        sprintf(db_file, "%s.%chr", blast_database, p_or_n);
1361        FileRemove(db_file);
1362        sprintf(db_file, "%s.%cin", blast_database, p_or_n);
1363        FileRemove(db_file);
1364        sprintf(db_file, "%s.%csq", blast_database, p_or_n);
1365        FileRemove(db_file);
1366        if (parse_mode) {
1367           sprintf(db_file, "%s.%cnd", blast_database, p_or_n);
1368           FileRemove(db_file);
1369           sprintf(db_file, "%s.%cni", blast_database, p_or_n);
1370           FileRemove(db_file);
1371           sprintf(db_file, "%s.%csd", blast_database, p_or_n);
1372           FileRemove(db_file);
1373           sprintf(db_file, "%s.%csi", blast_database, p_or_n);
1374           FileRemove(db_file);
1375        }
1376     }
1377     MemFree(blast_database);
1378     return 0;
1379 }
1380 
1381 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.