NCBI C Toolkit Cross Reference

C/algo/blast/api/blast_tabular.c


  1 /* $Id: blast_tabular.c,v 1.41 2009/06/01 14:33:38 maning Exp $
  2 * ===========================================================================
  3 *
  4 *                            PUBLIC DOMAIN NOTICE
  5 *               National Center for Biotechnology Information
  6 *
  7 *  This software/database is a "United States Government Work" under the
  8 *  terms of the United States Copyright Act.  It was written as part of
  9 *  the author's official duties as a United States Government employee and
 10 *  thus cannot be copyrighted.  This software/database is freely available
 11 *  to the public for use. The National Library of Medicine and the U.S.
 12 *  Government have not placed any restriction on its use or reproduction.
 13 *
 14 *  Although all reasonable efforts have been taken to ensure the accuracy
 15 *  and reliability of the software and data, the NLM and the U.S.
 16 *  Government do not and cannot warrant the performance or results that
 17 *  may be obtained by using this software or data. The NLM and the U.S.
 18 *  Government disclaim all warranties, express or implied, including
 19 *  warranties of performance, merchantability or fitness for any particular
 20 *  purpose.
 21 *
 22 *  Please cite the author in any work or product based on this material.
 23 *
 24 *  Author: Ilya Dondoshansky
 25 * ===========================================================================*/
 26 
 27 /** @file blast_tabular.c
 28  * On-the-fly tabular formatting of BLAST results
 29  */
 30 
 31 #ifndef SKIP_DOXYGEN_PROCESSING
 32 static char const rcsid[] = "$Id: blast_tabular.c,v 1.41 2009/06/01 14:33:38 maning Exp $";
 33 #endif /* SKIP_DOXYGEN_PROCESSING */
 34 
 35 #include <algo/blast/api/blast_tabular.h>
 36 #include <algo/blast/core/blast_util.h>
 37 #include <algo/blast/core/blast_setup.h>
 38 #include <algo/blast/core/blast_engine.h>
 39 #include <algo/blast/core/blast_traceback.h>
 40 #include <algo/blast/api/blast_format.h>
 41 #include <algo/blast/api/hspfilter_queue.h>
 42 #include <algo/blast/api/blast_seqalign.h>
 43 #include <algo/blast/core/blast_seqsrc_impl.h>
 44 #include <algo/blast/core/gencode_singleton.h>
 45 
 46 #include <txalign.h>
 47 
 48 /** @addtogroup CToolkitAlgoBlast
 49  *
 50  * @{
 51  */
 52 
 53 BlastTabularFormatData*
 54 BlastTabularFormatDataNew(FILE* outfp, AsnIoPtr asn_outfp, SeqLoc* query_seqloc,
 55                           EBlastTabularFormatOptions format_option,
 56                           Boolean believe_query)
 57 {
 58    BlastTabularFormatData* tf_data = 
 59       (BlastTabularFormatData*) calloc(1, sizeof(BlastTabularFormatData));
 60    tf_data->outfp = outfp;
 61    tf_data->asn_outfp = asn_outfp;
 62    tf_data->query_slp = query_seqloc;
 63    tf_data->format_options = format_option;
 64    tf_data->believe_query = believe_query;
 65 
 66    return tf_data;
 67 }
 68 
 69 Int2
 70 Blast_TabularFormatDataSetUp(BlastTabularFormatData* tf_data,
 71                              EBlastProgramType program, 
 72                              BlastHSPStream* hsp_stream, 
 73                              const BlastSeqSrc* seq_src, 
 74                              BLAST_SequenceBlk* query, 
 75                              BlastQueryInfo* query_info,
 76                              const BlastScoringOptions* score_options, 
 77                              BlastScoreBlk* sbp,
 78                              const BlastEffectiveLengthsOptions* eff_len_options,
 79                              const BlastExtensionOptions* ext_options,
 80                              const BlastHitSavingOptions* hit_options,
 81                              const BlastDatabaseOptions* db_options)
 82 {
 83     Int2 status = 0;
 84 
 85     ASSERT(score_options && db_options);
 86 
 87     tf_data->perform_traceback = score_options->gapped_calculation;
 88     tf_data->program = program;
 89     tf_data->hsp_stream = hsp_stream;
 90     tf_data->query = query;
 91     tf_data->gen_code_string = GenCodeSingletonFind(db_options->genetic_code);
 92     /* Sequence source must be copied, to guarantee multi-thread safety. */
 93     tf_data->seq_src = BlastSeqSrcCopy(seq_src);
 94     /* Effective lengths must be duplicated in query info structure, because
 95        they might be changing in the preliminary search. */
 96     tf_data->query_info = BlastQueryInfoDup(query_info);
 97     
 98     /* If traceback will have to be performed before tabular output, 
 99        do the preparation for it here. */
100     if (tf_data->perform_traceback) {
101         status = 
102             BLAST_GapAlignSetUp(program, seq_src, score_options, 
103                                 eff_len_options, ext_options, hit_options, 
104                                 tf_data->query_info, sbp, &tf_data->score_params,
105                                 &tf_data->ext_params, &tf_data->hit_params, 
106                                 &tf_data->eff_len_params, &tf_data->gap_align);
107         tf_data->gap_align->gap_x_dropoff = tf_data->ext_params->gap_x_dropoff_final;
108     }
109     return status;
110 }
111 
112 void
113 BlastTabularFormatDataClean(BlastTabularFormatData* tf_data)
114 {
115     if (!tf_data)
116         return;
117 
118     /* Free the structures that have been allocated internally */
119     tf_data->query_info = BlastQueryInfoFree(tf_data->query_info);
120     tf_data->score_params = BlastScoringParametersFree(tf_data->score_params);
121     tf_data->ext_params = BlastExtensionParametersFree(tf_data->ext_params);
122     tf_data->hit_params = BlastHitSavingParametersFree(tf_data->hit_params);
123     tf_data->eff_len_params = 
124         BlastEffectiveLengthsParametersFree(tf_data->eff_len_params);
125     tf_data->gap_align = BLAST_GapAlignStructFree(tf_data->gap_align);
126     tf_data->seq_src = BlastSeqSrcFree(tf_data->seq_src);
127 }
128 
129 BlastTabularFormatData* 
130 BlastTabularFormatDataFree(BlastTabularFormatData* tf_data)
131 {
132     if (!tf_data)
133         return NULL;
134 
135     /* Free the internal structures, if they haven't been freed earlier. */
136     BlastTabularFormatDataClean(tf_data);
137 
138     sfree(tf_data);
139     return tf_data;
140 }
141 
142 /** Creates nucleotide sequence buffers corresponding to a local alignment.
143  * Used in tabular output with "print sequences" option.
144  * @param program Type of BLAST program [in]
145  * @param hsp Internal HSP structure [in]
146  * @param query_seq Query sequence in blastna encoding. [in]
147  * @param subject_seq Subject sequence in blastna encoding [in]
148  * @param query_length Length of query sequence [in]
149  * @param subject_length Length of subject sequence [in]
150  * @param query_buffer Preallocated buffer for text query sequence [in] [out]
151  * @param subject_buffer Preallocated buffer for text subject sequence [in] [out]
152  */
153 static void 
154 FillNuclSequenceBuffers(EBlastProgramType program, BlastHSP* hsp, 
155                         Uint1* query_seq, Uint1* subject_seq, Int4 query_length, 
156                         Int4 subject_length, char* query_buffer, 
157                         char* subject_buffer)
158 {
159    Int4 index, index1;
160    const char* blastna_to_iupacna     = "ACGTRYMKWSBDHVN-";
161    const char* blastna_to_iupacna_rev = "TGCAYRKMSWVHDBN-"; 
162    Uint1* query_ptr;
163    Uint1* subject_ptr;
164    Int4 numseg;
165    Int4* starts;
166    Int4* lengths;
167    Int4 offset;
168    Int4 start1, start2;
169    char* buffer;
170    Boolean reverse;
171    Boolean translate1, translate2;
172 
173    translate1 = Blast_QueryIsTranslated(program);
174    translate2 = Blast_SubjectIsTranslated(program);
175 
176    reverse = (hsp->query.frame != hsp->subject.frame);
177 
178    /* Calculate number of segments. */
179    numseg = hsp->gap_info->size;
180    /* Find the starts and lengths of each segment. */
181    start1 = hsp->query.offset;
182    start2 = hsp->subject.offset;
183    GapCollectDataForSeqalign(hsp, hsp->gap_info, 0, numseg, query_length,
184                              subject_length, translate1, translate2,
185                              &starts, &lengths, NULL, &start1, &start2);
186 
187    offset = 0;
188    if (!reverse) {
189       for (index = 0; index < numseg; ++index) {
190          buffer = &query_buffer[offset];
191          if (starts[2*index] != -1) {
192             query_ptr = &query_seq[starts[2*index]];
193             for (index1 = 0; index1 < lengths[index]; ++index1) {
194                *buffer = blastna_to_iupacna[*query_ptr];
195                buffer++;
196                query_ptr++;
197             }
198          } else {
199             memset(buffer, '-', lengths[index]);
200          }
201          buffer = &subject_buffer[offset];
202          if (starts[2*index+1] != -1) {
203             subject_ptr = &subject_seq[starts[2*index+1]];
204             for (index1 = 0; index1 < lengths[index]; ++index1) {
205                *buffer = blastna_to_iupacna[*subject_ptr];
206                buffer++;
207                subject_ptr++;
208             }
209          } else {
210             memset(buffer, '-', lengths[index]);
211          }
212          offset += lengths[index];
213       }
214    } else {
215       for (index = numseg-1; index >=0; --index) {
216          buffer = &query_buffer[offset];
217          if (starts[2*index] != -1) {
218             query_ptr = &query_seq[starts[2*index]];
219             for (index1 = 0; index1 < lengths[index]; ++index1) {
220                *buffer = blastna_to_iupacna[*query_ptr];
221                buffer++;
222                query_ptr++;
223             }
224          } else {
225             memset(buffer, '-', lengths[index]);
226          }
227          buffer = &subject_buffer[offset];
228          if (starts[2*index+1] != -1) {
229             subject_ptr = &subject_seq[starts[2*index+1]+lengths[index]-1];
230             for (index1 = 0; index1 < lengths[index]; ++index1) {
231                *buffer = blastna_to_iupacna_rev[*subject_ptr];
232                buffer++;
233                subject_ptr--;
234             }
235          } else {
236             memset(buffer, '-', lengths[index]);
237          }
238          offset += lengths[index];
239       }
240    }
241 
242    sfree(starts);
243    sfree(lengths);
244 }
245 
246 /** Maximal buffer length to use for a Seq-id in tabular output. */
247 #define SEQIDLEN_MAX 255
248 
249 /** For incremental ASN.1 output, the maximum number of seq-aligns
250     that are packed into a single seq-annot */
251 #define INCREMENTAL_ASN_BATCH_SIZE 50
252 
253 void* Blast_TabularFormatThread(void* data) 
254 {
255    BlastTabularFormatData* tf_data;
256    EBlastProgramType program;
257    BlastHSPList* hsp_list = NULL;
258    BlastSeqSrc* seq_src;
259    BLAST_SequenceBlk* query = NULL; 
260    BlastQueryInfo* query_info = NULL;
261    BlastScoringParameters* score_params = NULL;
262    BlastExtensionParameters* ext_params = NULL;
263    BlastHitSavingParameters* hit_params = NULL;
264    BlastEffectiveLengthsParameters* eff_len_params = NULL;
265    Uint1* gen_code_string = NULL;
266    BlastGapAlignStruct* gap_align = NULL;
267    Int4 query_index, index;
268    char* query_buffer = NULL;
269    char* subject_buffer = NULL;
270    Int4 q_start=0, q_end=0, s_start=0, s_end=0;
271    SeqLoc* slp;
272    char bit_score_buff[10], eval_buff[10];
273    char* eval_buff_ptr = NULL;
274    BlastHSP* hsp;
275    SeqId** query_id_array = NULL;
276    SeqId* subject_id = NULL;
277    Int4 align_length = 0;
278    Int4 num_gaps = 0, num_gap_opens = 0, num_mismatches = 0;
279    double perc_ident = 0;
280    BlastSeqSrcGetSeqArg seq_arg;
281    Boolean one_seq_update_params;
282    ReadDBFILE* rdfp = NULL;
283    char* descr;
284    Int4 num_queries;
285    Int4* query_lengths;
286    Boolean sequence_in_use = FALSE;
287    Int4 num_asn_results = 0;
288    SeqAlignPtr sap_head = NULL;
289    SeqAlignPtr sap_last = NULL;
290  
291    tf_data = (BlastTabularFormatData*) data;
292    if (!tf_data || !tf_data->query_slp || !tf_data->hsp_stream ||
293        !tf_data->seq_src || (!tf_data->outfp && !tf_data->asn_outfp)) 
294       return NULL;
295 
296    program = tf_data->program;
297    seq_src = tf_data->seq_src;
298    query = tf_data->query;
299    query_info = tf_data->query_info;
300 
301    seq_arg.seq = NULL;
302    seq_arg.oid = 0;
303 
304    if (tf_data->perform_traceback) {
305       score_params = tf_data->score_params;
306       ext_params = tf_data->ext_params;
307       hit_params = tf_data->hit_params;
308       eff_len_params = tf_data->eff_len_params;
309       gap_align = tf_data->gap_align;
310       gen_code_string = tf_data->gen_code_string;
311       seq_arg.encoding = Blast_TracebackGetEncoding(program);
312    }
313 
314    num_queries = ValNodeLen(tf_data->query_slp);
315    query_id_array = (SeqId**) malloc(num_queries*sizeof(SeqId*));
316    query_lengths = (Int4*) malloc(num_queries*sizeof(Int4));
317 
318    for (index = 0, slp = tf_data->query_slp; slp; ++index, slp = slp->next) {
319       BioseqPtr bsp = BioseqLockById(SeqLocId(slp));
320       query_id_array[index] = SeqIdSetDup(bsp->id);
321       query_lengths[index] = BioseqGetLen(bsp);
322       BioseqUnlockById(SeqLocId(slp));
323    }
324 
325    one_seq_update_params = (BlastSeqSrcGetTotLen(seq_src) == 0);
326 
327    /* The line below shouldn't have to access the BlastSeqSrc's data structure
328     * FIXME*/
329    rdfp = (ReadDBFILE*) _BlastSeqSrcImpl_GetDataStructure(seq_src);
330 
331    while (BlastHSPQueueRead(tf_data->hsp_stream->writer->data, &hsp_list) 
332           != kBlastHSPStream_Eof) {
333        Int4 subject_length; 
334       if (!hsp_list) {
335          /* This should not happen, but just in case */
336          continue;
337       }
338 
339       /* Perform traceback if necessary */
340       if (tf_data->perform_traceback) {
341          seq_arg.oid = hsp_list->oid;
342          if (BlastSeqSrcGetSequence(seq_src, (void*) &seq_arg) < 0)
343              continue;
344          
345          sequence_in_use = TRUE;
346          if (one_seq_update_params) {
347             Int2 status;
348             /* This is not a database search, so effective search spaces
349                need to be recalculated based on this subject sequence length */
350             if ((status = BLAST_OneSubjectUpdateParameters(program, 
351                              seq_arg.seq->length, 
352                              score_params->options, 
353                              query_info, gap_align->sbp, 
354                              hit_params, NULL, 
355                              eff_len_params)) != 0) {
356                hsp_list = Blast_HSPListFree(hsp_list);
357                BlastSeqSrcReleaseSequence(seq_src, (void*)&seq_arg);
358                sequence_in_use = FALSE;
359                continue;
360             }
361          }
362 
363          Blast_TracebackFromHSPList(program, hsp_list, query,
364             seq_arg.seq, query_info, gap_align, gap_align->sbp, score_params,
365             ext_params->options, hit_params, gen_code_string, NULL);
366          /* Return subject sequence unless it is needed for the sequence
367             printout */
368          if (tf_data->format_options != eBlastTabularAddSequences) {
369             BlastSeqSrcReleaseSequence(seq_src, (void*)&seq_arg);
370             sequence_in_use = FALSE;
371          }
372          /* Recalculate the bit scores, since they might have changed. */
373          Blast_HSPListGetBitScores(hsp_list, 
374             score_params->options->gapped_calculation, gap_align->sbp);
375       }
376 
377       if (!readdb_get_descriptor(rdfp, hsp_list->oid, &subject_id, &descr)) {
378           subject_buffer = strdup("Unknown");
379       } else if (subject_id->choice != SEQID_GENERAL ||
380                  strcmp(((DbtagPtr)subject_id->data.ptrvalue)->db, 
381                         "BL_ORD_ID")) {
382          /* All cases except when database was formatted without seqid indices. 
383             In that case all real Seq-id information is hidden in the 
384             description. */
385          if (tf_data->show_gi || tf_data->show_accession) {
386             Blast_SeqIdGetDefLine(subject_id, &subject_buffer, 
387                                   tf_data->show_gi, tf_data->show_accession,
388                                   TRUE); 
389          } else {
390             if ( !(subject_buffer = (char*) malloc(sizeof(char)*SEQIDLEN_MAX)))
391                return NULL;
392             SeqIdWrite(subject_id, subject_buffer, PRINTID_FASTA_LONG, 
393                        SEQIDLEN_MAX-1);
394          }
395          /* Found something for the seqid buffer; description can be 
396             discarded now. */
397          if (subject_buffer != NULL)
398             sfree(descr);
399       }
400 
401       /* Last chance to assign anything - take the first token from the 
402          description. */
403       if (!subject_buffer && descr)
404          subject_buffer = strtok(descr, " \t\n\r");
405 
406       /* Retrieve the subject sequence if it is needed and this has not 
407          already been done. */ 
408       if (tf_data->format_options == eBlastTabularAddSequences && 
409           !tf_data->perform_traceback) {
410           seq_arg.oid = hsp_list->oid;
411           seq_arg.encoding = eBlastEncodingNucleotide;
412           if (BlastSeqSrcGetSequence(seq_src, (void*) &seq_arg) < 0) {
413              if (subject_id)
414                 subject_id = SeqIdSetFree(subject_id);
415              continue;
416           }
417           sequence_in_use = TRUE;
418       }
419 
420       subject_length = BlastSeqSrcGetSeqLen(seq_src, (void*)&hsp_list->oid);
421 
422       for (index = 0; index < hsp_list->hspcnt; ++index) {
423          char* query_buffer_ptr=NULL;
424          hsp = hsp_list->hsp_array[index];
425          query_index = 
426             Blast_GetQueryIndexFromContext(hsp->context, program);
427 
428          /* handle incremental ASN.1 output */
429          if (tf_data->format_options == eBlastIncrementalASN) {
430             SeqAlignPtr sap = NULL;
431             if (tf_data->is_ooframe) {
432                sap = OOFBlastHSPToSeqAlign(program, hsp, 
433                                    query_id_array[query_index], subject_id,
434                                    query_lengths[query_index], subject_length);
435             }
436             else {
437                sap = BlastHSPToSeqAlign(program, hsp, 
438                                    query_id_array[query_index], subject_id,
439                                    query_lengths[query_index], subject_length);
440             }
441             sap->score = GetScoreSetFromBlastHsp(hsp);
442             /* add to the current batch of results */
443             if (sap_head == NULL) {
444                sap_head = sap_last = sap;
445             }
446             else {
447                sap_last->next = sap;
448                sap_last = sap;
449             }
450 
451             /* flush the current batch if enough alignments
452                have accumulated */
453             if (++num_asn_results == INCREMENTAL_ASN_BATCH_SIZE) {
454                SeqAnnot* seqannot = SeqAnnotNew();
455                Boolean unused; 
456                seqannot->type = 2;
457                AddAlignInfoToSeqAnnot(seqannot, 
458                              GetOldAlignType(program, &unused));
459                seqannot->data = sap_head;
460                SeqAnnotAsnWrite((SeqAnnot*) seqannot, tf_data->asn_outfp, NULL);
461                AsnIoReset(tf_data->asn_outfp);
462                num_asn_results = 0;
463                sap_head = sap_last = NULL;
464                seqannot = SeqAnnotFree(seqannot);
465             }
466             continue;
467          }
468 
469          /* handle ordinary tabular output */
470 
471          Blast_SeqIdGetDefLine(query_id_array[query_index], &query_buffer, 
472                                tf_data->show_gi, tf_data->show_accession,
473                                tf_data->believe_query);
474          
475          eval_buff_ptr = eval_buff;
476          ScoreAndEvalueToBuffers(hsp->bit_score, hsp->evalue, 
477                                  bit_score_buff, &eval_buff_ptr, 0);
478          
479          /* Calculate percentage of identities */
480          Blast_HSPCalcLengthAndGaps(hsp, &align_length, &num_gaps, 
481                                     &num_gap_opens);
482          perc_ident = ((double)hsp->num_ident)/align_length * 100;
483          num_mismatches = align_length - hsp->num_ident - num_gaps;
484          
485          Blast_HSPGetAdjustedOffsets(program, hsp, query_lengths[query_index],
486                                      subject_length, &q_start, &q_end, 
487                                      &s_start, &s_end);
488          
489          query_buffer_ptr = query_buffer;
490          if (strstr(query_buffer, "lcl|") == query_buffer)
491             query_buffer_ptr += 4;
492 
493 
494          if (tf_data->format_options == eBlastTabularAddSequences) {
495             char* query_seq_buffer = NULL, *subject_seq_buffer = NULL;
496             Uint1* query_seq = NULL;
497             Int4 context;
498             context = hsp->context - (hsp->context % 2);
499             query_seq =
500                 & query->sequence[query_info->contexts[context].query_offset];
501             
502             query_seq_buffer = MemNew((align_length+1));
503             subject_seq_buffer = MemNew((align_length+1));
504 
505             FillNuclSequenceBuffers(program, hsp, query_seq, 
506                                     seq_arg.seq->sequence, 
507                                     query_lengths[query_index], 
508                                     seq_arg.seq->length, query_seq_buffer, 
509                                     subject_seq_buffer);
510             
511             fprintf(tf_data->outfp, 
512                     "%s\t%s\t%.2f\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%s\t%s\t%s\t%s\n",
513                     query_buffer_ptr, subject_buffer, perc_ident, 
514                     (long) align_length, (long) num_mismatches, 
515                     (long) num_gap_opens, (long) q_start, (long) q_end, 
516                     (long) s_start, (long) s_end, eval_buff, bit_score_buff,
517                     query_seq_buffer, subject_seq_buffer);
518             sfree(query_seq_buffer);
519             sfree(subject_seq_buffer);
520          } else {
521             fprintf(tf_data->outfp, 
522                     "%s\t%s\t%.2f\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%s\t%s\n",
523                     query_buffer_ptr, subject_buffer, perc_ident, 
524                     (long) align_length, (long) num_mismatches, 
525                     (long) num_gap_opens, (long) q_start, (long) q_end, 
526                     (long) s_start, (long) s_end, eval_buff, bit_score_buff);
527          }
528          sfree(query_buffer);
529       }
530 
531       /* Return the subject sequence, if it hasn't yet been done. */
532       if (sequence_in_use)
533           BlastSeqSrcReleaseSequence(seq_src, (void*)&seq_arg);
534 
535       fflush(tf_data->outfp);
536       sfree(subject_buffer);
537       hsp_list = Blast_HSPListFree(hsp_list);
538       if (subject_id)
539          subject_id = SeqIdSetFree(subject_id);
540    }
541 
542    /* flush any leftover ASN.1 output */
543    if (sap_head != NULL) {
544       SeqAnnot* seqannot = SeqAnnotNew();
545       Boolean unused; 
546       seqannot->type = 2;
547       AddAlignInfoToSeqAnnot(seqannot, 
548                     GetOldAlignType(program, &unused));
549       seqannot->data = sap_head;
550       SeqAnnotAsnWrite((SeqAnnot*) seqannot, tf_data->asn_outfp, NULL);
551       AsnIoReset(tf_data->asn_outfp);
552       seqannot = SeqAnnotFree(seqannot);
553    }
554 
555    BlastSequenceBlkFree(seq_arg.seq);
556 
557    for (index = 0; index<num_queries; ++index)
558    {
559         SeqIdSetFree(query_id_array[index]);
560         query_id_array[index] = NULL;
561    }
562    sfree(query_lengths);
563    sfree(query_id_array);
564 
565    return NULL;
566 }
567 /* @} */
568 
569 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.