|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/algo/blast/api/blast_tabular.c |
source navigation diff markup identifier search freetext search file search |
1 /* $Id: blast_tabular.c,v 1.41 2009/06/01 14:33:38 maning Exp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * Author: Ilya Dondoshansky
25 * ===========================================================================*/
26
27 /** @file blast_tabular.c
28 * On-the-fly tabular formatting of BLAST results
29 */
30
31 #ifndef SKIP_DOXYGEN_PROCESSING
32 static char const rcsid[] = "$Id: blast_tabular.c,v 1.41 2009/06/01 14:33:38 maning Exp $";
33 #endif /* SKIP_DOXYGEN_PROCESSING */
34
35 #include <algo/blast/api/blast_tabular.h>
36 #include <algo/blast/core/blast_util.h>
37 #include <algo/blast/core/blast_setup.h>
38 #include <algo/blast/core/blast_engine.h>
39 #include <algo/blast/core/blast_traceback.h>
40 #include <algo/blast/api/blast_format.h>
41 #include <algo/blast/api/hspfilter_queue.h>
42 #include <algo/blast/api/blast_seqalign.h>
43 #include <algo/blast/core/blast_seqsrc_impl.h>
44 #include <algo/blast/core/gencode_singleton.h>
45
46 #include <txalign.h>
47
48 /** @addtogroup CToolkitAlgoBlast
49 *
50 * @{
51 */
52
53 BlastTabularFormatData*
54 BlastTabularFormatDataNew(FILE* outfp, AsnIoPtr asn_outfp, SeqLoc* query_seqloc,
55 EBlastTabularFormatOptions format_option,
56 Boolean believe_query)
57 {
58 BlastTabularFormatData* tf_data =
59 (BlastTabularFormatData*) calloc(1, sizeof(BlastTabularFormatData));
60 tf_data->outfp = outfp;
61 tf_data->asn_outfp = asn_outfp;
62 tf_data->query_slp = query_seqloc;
63 tf_data->format_options = format_option;
64 tf_data->believe_query = believe_query;
65
66 return tf_data;
67 }
68
69 Int2
70 Blast_TabularFormatDataSetUp(BlastTabularFormatData* tf_data,
71 EBlastProgramType program,
72 BlastHSPStream* hsp_stream,
73 const BlastSeqSrc* seq_src,
74 BLAST_SequenceBlk* query,
75 BlastQueryInfo* query_info,
76 const BlastScoringOptions* score_options,
77 BlastScoreBlk* sbp,
78 const BlastEffectiveLengthsOptions* eff_len_options,
79 const BlastExtensionOptions* ext_options,
80 const BlastHitSavingOptions* hit_options,
81 const BlastDatabaseOptions* db_options)
82 {
83 Int2 status = 0;
84
85 ASSERT(score_options && db_options);
86
87 tf_data->perform_traceback = score_options->gapped_calculation;
88 tf_data->program = program;
89 tf_data->hsp_stream = hsp_stream;
90 tf_data->query = query;
91 tf_data->gen_code_string = GenCodeSingletonFind(db_options->genetic_code);
92 /* Sequence source must be copied, to guarantee multi-thread safety. */
93 tf_data->seq_src = BlastSeqSrcCopy(seq_src);
94 /* Effective lengths must be duplicated in query info structure, because
95 they might be changing in the preliminary search. */
96 tf_data->query_info = BlastQueryInfoDup(query_info);
97
98 /* If traceback will have to be performed before tabular output,
99 do the preparation for it here. */
100 if (tf_data->perform_traceback) {
101 status =
102 BLAST_GapAlignSetUp(program, seq_src, score_options,
103 eff_len_options, ext_options, hit_options,
104 tf_data->query_info, sbp, &tf_data->score_params,
105 &tf_data->ext_params, &tf_data->hit_params,
106 &tf_data->eff_len_params, &tf_data->gap_align);
107 tf_data->gap_align->gap_x_dropoff = tf_data->ext_params->gap_x_dropoff_final;
108 }
109 return status;
110 }
111
112 void
113 BlastTabularFormatDataClean(BlastTabularFormatData* tf_data)
114 {
115 if (!tf_data)
116 return;
117
118 /* Free the structures that have been allocated internally */
119 tf_data->query_info = BlastQueryInfoFree(tf_data->query_info);
120 tf_data->score_params = BlastScoringParametersFree(tf_data->score_params);
121 tf_data->ext_params = BlastExtensionParametersFree(tf_data->ext_params);
122 tf_data->hit_params = BlastHitSavingParametersFree(tf_data->hit_params);
123 tf_data->eff_len_params =
124 BlastEffectiveLengthsParametersFree(tf_data->eff_len_params);
125 tf_data->gap_align = BLAST_GapAlignStructFree(tf_data->gap_align);
126 tf_data->seq_src = BlastSeqSrcFree(tf_data->seq_src);
127 }
128
129 BlastTabularFormatData*
130 BlastTabularFormatDataFree(BlastTabularFormatData* tf_data)
131 {
132 if (!tf_data)
133 return NULL;
134
135 /* Free the internal structures, if they haven't been freed earlier. */
136 BlastTabularFormatDataClean(tf_data);
137
138 sfree(tf_data);
139 return tf_data;
140 }
141
142 /** Creates nucleotide sequence buffers corresponding to a local alignment.
143 * Used in tabular output with "print sequences" option.
144 * @param program Type of BLAST program [in]
145 * @param hsp Internal HSP structure [in]
146 * @param query_seq Query sequence in blastna encoding. [in]
147 * @param subject_seq Subject sequence in blastna encoding [in]
148 * @param query_length Length of query sequence [in]
149 * @param subject_length Length of subject sequence [in]
150 * @param query_buffer Preallocated buffer for text query sequence [in] [out]
151 * @param subject_buffer Preallocated buffer for text subject sequence [in] [out]
152 */
153 static void
154 FillNuclSequenceBuffers(EBlastProgramType program, BlastHSP* hsp,
155 Uint1* query_seq, Uint1* subject_seq, Int4 query_length,
156 Int4 subject_length, char* query_buffer,
157 char* subject_buffer)
158 {
159 Int4 index, index1;
160 const char* blastna_to_iupacna = "ACGTRYMKWSBDHVN-";
161 const char* blastna_to_iupacna_rev = "TGCAYRKMSWVHDBN-";
162 Uint1* query_ptr;
163 Uint1* subject_ptr;
164 Int4 numseg;
165 Int4* starts;
166 Int4* lengths;
167 Int4 offset;
168 Int4 start1, start2;
169 char* buffer;
170 Boolean reverse;
171 Boolean translate1, translate2;
172
173 translate1 = Blast_QueryIsTranslated(program);
174 translate2 = Blast_SubjectIsTranslated(program);
175
176 reverse = (hsp->query.frame != hsp->subject.frame);
177
178 /* Calculate number of segments. */
179 numseg = hsp->gap_info->size;
180 /* Find the starts and lengths of each segment. */
181 start1 = hsp->query.offset;
182 start2 = hsp->subject.offset;
183 GapCollectDataForSeqalign(hsp, hsp->gap_info, 0, numseg, query_length,
184 subject_length, translate1, translate2,
185 &starts, &lengths, NULL, &start1, &start2);
186
187 offset = 0;
188 if (!reverse) {
189 for (index = 0; index < numseg; ++index) {
190 buffer = &query_buffer[offset];
191 if (starts[2*index] != -1) {
192 query_ptr = &query_seq[starts[2*index]];
193 for (index1 = 0; index1 < lengths[index]; ++index1) {
194 *buffer = blastna_to_iupacna[*query_ptr];
195 buffer++;
196 query_ptr++;
197 }
198 } else {
199 memset(buffer, '-', lengths[index]);
200 }
201 buffer = &subject_buffer[offset];
202 if (starts[2*index+1] != -1) {
203 subject_ptr = &subject_seq[starts[2*index+1]];
204 for (index1 = 0; index1 < lengths[index]; ++index1) {
205 *buffer = blastna_to_iupacna[*subject_ptr];
206 buffer++;
207 subject_ptr++;
208 }
209 } else {
210 memset(buffer, '-', lengths[index]);
211 }
212 offset += lengths[index];
213 }
214 } else {
215 for (index = numseg-1; index >=0; --index) {
216 buffer = &query_buffer[offset];
217 if (starts[2*index] != -1) {
218 query_ptr = &query_seq[starts[2*index]];
219 for (index1 = 0; index1 < lengths[index]; ++index1) {
220 *buffer = blastna_to_iupacna[*query_ptr];
221 buffer++;
222 query_ptr++;
223 }
224 } else {
225 memset(buffer, '-', lengths[index]);
226 }
227 buffer = &subject_buffer[offset];
228 if (starts[2*index+1] != -1) {
229 subject_ptr = &subject_seq[starts[2*index+1]+lengths[index]-1];
230 for (index1 = 0; index1 < lengths[index]; ++index1) {
231 *buffer = blastna_to_iupacna_rev[*subject_ptr];
232 buffer++;
233 subject_ptr--;
234 }
235 } else {
236 memset(buffer, '-', lengths[index]);
237 }
238 offset += lengths[index];
239 }
240 }
241
242 sfree(starts);
243 sfree(lengths);
244 }
245
246 /** Maximal buffer length to use for a Seq-id in tabular output. */
247 #define SEQIDLEN_MAX 255
248
249 /** For incremental ASN.1 output, the maximum number of seq-aligns
250 that are packed into a single seq-annot */
251 #define INCREMENTAL_ASN_BATCH_SIZE 50
252
253 void* Blast_TabularFormatThread(void* data)
254 {
255 BlastTabularFormatData* tf_data;
256 EBlastProgramType program;
257 BlastHSPList* hsp_list = NULL;
258 BlastSeqSrc* seq_src;
259 BLAST_SequenceBlk* query = NULL;
260 BlastQueryInfo* query_info = NULL;
261 BlastScoringParameters* score_params = NULL;
262 BlastExtensionParameters* ext_params = NULL;
263 BlastHitSavingParameters* hit_params = NULL;
264 BlastEffectiveLengthsParameters* eff_len_params = NULL;
265 Uint1* gen_code_string = NULL;
266 BlastGapAlignStruct* gap_align = NULL;
267 Int4 query_index, index;
268 char* query_buffer = NULL;
269 char* subject_buffer = NULL;
270 Int4 q_start=0, q_end=0, s_start=0, s_end=0;
271 SeqLoc* slp;
272 char bit_score_buff[10], eval_buff[10];
273 char* eval_buff_ptr = NULL;
274 BlastHSP* hsp;
275 SeqId** query_id_array = NULL;
276 SeqId* subject_id = NULL;
277 Int4 align_length = 0;
278 Int4 num_gaps = 0, num_gap_opens = 0, num_mismatches = 0;
279 double perc_ident = 0;
280 BlastSeqSrcGetSeqArg seq_arg;
281 Boolean one_seq_update_params;
282 ReadDBFILE* rdfp = NULL;
283 char* descr;
284 Int4 num_queries;
285 Int4* query_lengths;
286 Boolean sequence_in_use = FALSE;
287 Int4 num_asn_results = 0;
288 SeqAlignPtr sap_head = NULL;
289 SeqAlignPtr sap_last = NULL;
290
291 tf_data = (BlastTabularFormatData*) data;
292 if (!tf_data || !tf_data->query_slp || !tf_data->hsp_stream ||
293 !tf_data->seq_src || (!tf_data->outfp && !tf_data->asn_outfp))
294 return NULL;
295
296 program = tf_data->program;
297 seq_src = tf_data->seq_src;
298 query = tf_data->query;
299 query_info = tf_data->query_info;
300
301 seq_arg.seq = NULL;
302 seq_arg.oid = 0;
303
304 if (tf_data->perform_traceback) {
305 score_params = tf_data->score_params;
306 ext_params = tf_data->ext_params;
307 hit_params = tf_data->hit_params;
308 eff_len_params = tf_data->eff_len_params;
309 gap_align = tf_data->gap_align;
310 gen_code_string = tf_data->gen_code_string;
311 seq_arg.encoding = Blast_TracebackGetEncoding(program);
312 }
313
314 num_queries = ValNodeLen(tf_data->query_slp);
315 query_id_array = (SeqId**) malloc(num_queries*sizeof(SeqId*));
316 query_lengths = (Int4*) malloc(num_queries*sizeof(Int4));
317
318 for (index = 0, slp = tf_data->query_slp; slp; ++index, slp = slp->next) {
319 BioseqPtr bsp = BioseqLockById(SeqLocId(slp));
320 query_id_array[index] = SeqIdSetDup(bsp->id);
321 query_lengths[index] = BioseqGetLen(bsp);
322 BioseqUnlockById(SeqLocId(slp));
323 }
324
325 one_seq_update_params = (BlastSeqSrcGetTotLen(seq_src) == 0);
326
327 /* The line below shouldn't have to access the BlastSeqSrc's data structure
328 * FIXME*/
329 rdfp = (ReadDBFILE*) _BlastSeqSrcImpl_GetDataStructure(seq_src);
330
331 while (BlastHSPQueueRead(tf_data->hsp_stream->writer->data, &hsp_list)
332 != kBlastHSPStream_Eof) {
333 Int4 subject_length;
334 if (!hsp_list) {
335 /* This should not happen, but just in case */
336 continue;
337 }
338
339 /* Perform traceback if necessary */
340 if (tf_data->perform_traceback) {
341 seq_arg.oid = hsp_list->oid;
342 if (BlastSeqSrcGetSequence(seq_src, (void*) &seq_arg) < 0)
343 continue;
344
345 sequence_in_use = TRUE;
346 if (one_seq_update_params) {
347 Int2 status;
348 /* This is not a database search, so effective search spaces
349 need to be recalculated based on this subject sequence length */
350 if ((status = BLAST_OneSubjectUpdateParameters(program,
351 seq_arg.seq->length,
352 score_params->options,
353 query_info, gap_align->sbp,
354 hit_params, NULL,
355 eff_len_params)) != 0) {
356 hsp_list = Blast_HSPListFree(hsp_list);
357 BlastSeqSrcReleaseSequence(seq_src, (void*)&seq_arg);
358 sequence_in_use = FALSE;
359 continue;
360 }
361 }
362
363 Blast_TracebackFromHSPList(program, hsp_list, query,
364 seq_arg.seq, query_info, gap_align, gap_align->sbp, score_params,
365 ext_params->options, hit_params, gen_code_string, NULL);
366 /* Return subject sequence unless it is needed for the sequence
367 printout */
368 if (tf_data->format_options != eBlastTabularAddSequences) {
369 BlastSeqSrcReleaseSequence(seq_src, (void*)&seq_arg);
370 sequence_in_use = FALSE;
371 }
372 /* Recalculate the bit scores, since they might have changed. */
373 Blast_HSPListGetBitScores(hsp_list,
374 score_params->options->gapped_calculation, gap_align->sbp);
375 }
376
377 if (!readdb_get_descriptor(rdfp, hsp_list->oid, &subject_id, &descr)) {
378 subject_buffer = strdup("Unknown");
379 } else if (subject_id->choice != SEQID_GENERAL ||
380 strcmp(((DbtagPtr)subject_id->data.ptrvalue)->db,
381 "BL_ORD_ID")) {
382 /* All cases except when database was formatted without seqid indices.
383 In that case all real Seq-id information is hidden in the
384 description. */
385 if (tf_data->show_gi || tf_data->show_accession) {
386 Blast_SeqIdGetDefLine(subject_id, &subject_buffer,
387 tf_data->show_gi, tf_data->show_accession,
388 TRUE);
389 } else {
390 if ( !(subject_buffer = (char*) malloc(sizeof(char)*SEQIDLEN_MAX)))
391 return NULL;
392 SeqIdWrite(subject_id, subject_buffer, PRINTID_FASTA_LONG,
393 SEQIDLEN_MAX-1);
394 }
395 /* Found something for the seqid buffer; description can be
396 discarded now. */
397 if (subject_buffer != NULL)
398 sfree(descr);
399 }
400
401 /* Last chance to assign anything - take the first token from the
402 description. */
403 if (!subject_buffer && descr)
404 subject_buffer = strtok(descr, " \t\n\r");
405
406 /* Retrieve the subject sequence if it is needed and this has not
407 already been done. */
408 if (tf_data->format_options == eBlastTabularAddSequences &&
409 !tf_data->perform_traceback) {
410 seq_arg.oid = hsp_list->oid;
411 seq_arg.encoding = eBlastEncodingNucleotide;
412 if (BlastSeqSrcGetSequence(seq_src, (void*) &seq_arg) < 0) {
413 if (subject_id)
414 subject_id = SeqIdSetFree(subject_id);
415 continue;
416 }
417 sequence_in_use = TRUE;
418 }
419
420 subject_length = BlastSeqSrcGetSeqLen(seq_src, (void*)&hsp_list->oid);
421
422 for (index = 0; index < hsp_list->hspcnt; ++index) {
423 char* query_buffer_ptr=NULL;
424 hsp = hsp_list->hsp_array[index];
425 query_index =
426 Blast_GetQueryIndexFromContext(hsp->context, program);
427
428 /* handle incremental ASN.1 output */
429 if (tf_data->format_options == eBlastIncrementalASN) {
430 SeqAlignPtr sap = NULL;
431 if (tf_data->is_ooframe) {
432 sap = OOFBlastHSPToSeqAlign(program, hsp,
433 query_id_array[query_index], subject_id,
434 query_lengths[query_index], subject_length);
435 }
436 else {
437 sap = BlastHSPToSeqAlign(program, hsp,
438 query_id_array[query_index], subject_id,
439 query_lengths[query_index], subject_length);
440 }
441 sap->score = GetScoreSetFromBlastHsp(hsp);
442 /* add to the current batch of results */
443 if (sap_head == NULL) {
444 sap_head = sap_last = sap;
445 }
446 else {
447 sap_last->next = sap;
448 sap_last = sap;
449 }
450
451 /* flush the current batch if enough alignments
452 have accumulated */
453 if (++num_asn_results == INCREMENTAL_ASN_BATCH_SIZE) {
454 SeqAnnot* seqannot = SeqAnnotNew();
455 Boolean unused;
456 seqannot->type = 2;
457 AddAlignInfoToSeqAnnot(seqannot,
458 GetOldAlignType(program, &unused));
459 seqannot->data = sap_head;
460 SeqAnnotAsnWrite((SeqAnnot*) seqannot, tf_data->asn_outfp, NULL);
461 AsnIoReset(tf_data->asn_outfp);
462 num_asn_results = 0;
463 sap_head = sap_last = NULL;
464 seqannot = SeqAnnotFree(seqannot);
465 }
466 continue;
467 }
468
469 /* handle ordinary tabular output */
470
471 Blast_SeqIdGetDefLine(query_id_array[query_index], &query_buffer,
472 tf_data->show_gi, tf_data->show_accession,
473 tf_data->believe_query);
474
475 eval_buff_ptr = eval_buff;
476 ScoreAndEvalueToBuffers(hsp->bit_score, hsp->evalue,
477 bit_score_buff, &eval_buff_ptr, 0);
478
479 /* Calculate percentage of identities */
480 Blast_HSPCalcLengthAndGaps(hsp, &align_length, &num_gaps,
481 &num_gap_opens);
482 perc_ident = ((double)hsp->num_ident)/align_length * 100;
483 num_mismatches = align_length - hsp->num_ident - num_gaps;
484
485 Blast_HSPGetAdjustedOffsets(program, hsp, query_lengths[query_index],
486 subject_length, &q_start, &q_end,
487 &s_start, &s_end);
488
489 query_buffer_ptr = query_buffer;
490 if (strstr(query_buffer, "lcl|") == query_buffer)
491 query_buffer_ptr += 4;
492
493
494 if (tf_data->format_options == eBlastTabularAddSequences) {
495 char* query_seq_buffer = NULL, *subject_seq_buffer = NULL;
496 Uint1* query_seq = NULL;
497 Int4 context;
498 context = hsp->context - (hsp->context % 2);
499 query_seq =
500 & query->sequence[query_info->contexts[context].query_offset];
501
502 query_seq_buffer = MemNew((align_length+1));
503 subject_seq_buffer = MemNew((align_length+1));
504
505 FillNuclSequenceBuffers(program, hsp, query_seq,
506 seq_arg.seq->sequence,
507 query_lengths[query_index],
508 seq_arg.seq->length, query_seq_buffer,
509 subject_seq_buffer);
510
511 fprintf(tf_data->outfp,
512 "%s\t%s\t%.2f\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%s\t%s\t%s\t%s\n",
513 query_buffer_ptr, subject_buffer, perc_ident,
514 (long) align_length, (long) num_mismatches,
515 (long) num_gap_opens, (long) q_start, (long) q_end,
516 (long) s_start, (long) s_end, eval_buff, bit_score_buff,
517 query_seq_buffer, subject_seq_buffer);
518 sfree(query_seq_buffer);
519 sfree(subject_seq_buffer);
520 } else {
521 fprintf(tf_data->outfp,
522 "%s\t%s\t%.2f\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%s\t%s\n",
523 query_buffer_ptr, subject_buffer, perc_ident,
524 (long) align_length, (long) num_mismatches,
525 (long) num_gap_opens, (long) q_start, (long) q_end,
526 (long) s_start, (long) s_end, eval_buff, bit_score_buff);
527 }
528 sfree(query_buffer);
529 }
530
531 /* Return the subject sequence, if it hasn't yet been done. */
532 if (sequence_in_use)
533 BlastSeqSrcReleaseSequence(seq_src, (void*)&seq_arg);
534
535 fflush(tf_data->outfp);
536 sfree(subject_buffer);
537 hsp_list = Blast_HSPListFree(hsp_list);
538 if (subject_id)
539 subject_id = SeqIdSetFree(subject_id);
540 }
541
542 /* flush any leftover ASN.1 output */
543 if (sap_head != NULL) {
544 SeqAnnot* seqannot = SeqAnnotNew();
545 Boolean unused;
546 seqannot->type = 2;
547 AddAlignInfoToSeqAnnot(seqannot,
548 GetOldAlignType(program, &unused));
549 seqannot->data = sap_head;
550 SeqAnnotAsnWrite((SeqAnnot*) seqannot, tf_data->asn_outfp, NULL);
551 AsnIoReset(tf_data->asn_outfp);
552 seqannot = SeqAnnotFree(seqannot);
553 }
554
555 BlastSequenceBlkFree(seq_arg.seq);
556
557 for (index = 0; index<num_queries; ++index)
558 {
559 SeqIdSetFree(query_id_array[index]);
560 query_id_array[index] = NULL;
561 }
562 sfree(query_lengths);
563 sfree(query_id_array);
564
565 return NULL;
566 }
567 /* @} */
568
569 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |